7097546: Optimize use of CMOVE instructions

Avoid generating a CMove inside a loop when possible; a CMove may still be generated if it can be moved outside the loop.

Reviewed-by: never

parent 952f0b9b81
commit 669fa7396d
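For context, a minimal Java sketch (ours, not code from this commit) of the classic candidate this optimization weighs: an unpredictable, data-dependent diamond in a hot loop. The new logic scores each such Phi with the per-platform cost hooks added below and only CMOV's branches that are genuinely hard to predict, unless the result can escape the loop entirely:

    // Hypothetical example: a Math.max-style reduction. The ?: diamond
    // is the kind of pattern conditional_move() may flatten into a CMove.
    static int maxOf(int[] a) {
        int max = Integer.MIN_VALUE;
        for (int v : a) {
            max = (v > max) ? v : max; // CMove candidate
        }
        return max;
    }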
src/cpu/sparc/vm/sparc.ad
@@ -1860,6 +1860,14 @@ const bool Matcher::init_array_count_is_in_bytes = true;
 // Threshold size for cleararray.
 const int Matcher::init_array_short_size = 8 * BytesPerLong;
 
+// No additional cost for CMOVL.
+const int Matcher::long_cmove_cost() { return 0; }
+
+// CMOVF/CMOVD are expensive on T4 and on SPARC64.
+const int Matcher::float_cmove_cost() {
+  return (VM_Version::is_T4() || VM_Version::is_sparc64()) ? ConditionalMoveLimit : 0;
+}
+
 // Should the Matcher clone shifts on addressing modes, expecting them to
 // be subsumed into complex addressing expressions or compute them into
 // registers? True for Intel but false for most RISCs
src/cpu/x86/vm/x86_32.ad
@@ -1393,6 +1393,12 @@ const bool Matcher::init_array_count_is_in_bytes = false;
 // Threshold size for cleararray.
 const int Matcher::init_array_short_size = 8 * BytesPerLong;
 
+// Needs 2 CMOV's for longs.
+const int Matcher::long_cmove_cost() { return 1; }
+
+// No CMOVF/CMOVD with SSE/SSE2
+const int Matcher::float_cmove_cost() { return (UseSSE>=1) ? ConditionalMoveLimit : 0; }
+
 // Should the Matcher clone shifts on addressing modes, expecting them to
 // be subsumed into complex addressing expressions or compute them into
 // registers? True for Intel but false for most RISCs
@@ -7905,6 +7911,40 @@ instruct castP2X(eRegI dst, eRegP src ) %{
 
 //----------Conditional Move---------------------------------------------------
 // Conditional move
+instruct jmovI_reg(cmpOp cop, eFlagsReg cr, eRegI dst, eRegI src) %{
+  predicate(!VM_Version::supports_cmov() );
+  match(Set dst (CMoveI (Binary cop cr) (Binary dst src)));
+  ins_cost(200);
+  format %{ "J$cop,us skip\t# signed cmove\n\t"
+            "MOV $dst,$src\n"
+            "skip:" %}
+  ins_encode %{
+    Label Lskip;
+    // Invert sense of branch from sense of CMOV
+    __ jccb((Assembler::Condition)($cop$$cmpcode^1), Lskip);
+    __ movl($dst$$Register, $src$$Register);
+    __ bind(Lskip);
+  %}
+  ins_pipe( pipe_cmov_reg );
+%}
+
+instruct jmovI_regU(cmpOpU cop, eFlagsRegU cr, eRegI dst, eRegI src) %{
+  predicate(!VM_Version::supports_cmov() );
+  match(Set dst (CMoveI (Binary cop cr) (Binary dst src)));
+  ins_cost(200);
+  format %{ "J$cop,us skip\t# unsigned cmove\n\t"
+            "MOV $dst,$src\n"
+            "skip:" %}
+  ins_encode %{
+    Label Lskip;
+    // Invert sense of branch from sense of CMOV
+    __ jccb((Assembler::Condition)($cop$$cmpcode^1), Lskip);
+    __ movl($dst$$Register, $src$$Register);
+    __ bind(Lskip);
+  %}
+  ins_pipe( pipe_cmov_reg );
+%}
+
 instruct cmovI_reg(eRegI dst, eRegI src, eFlagsReg cr, cmpOp cop ) %{
   predicate(VM_Version::supports_cmov() );
   match(Set dst (CMoveI (Binary cop cr) (Binary dst src)));
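A note on `$cop$$cmpcode^1` in the two encodings above: x86 condition codes come in complementary pairs that differ only in bit 0 (equal 0x4 / not-equal 0x5, less 0xC / greater-or-equal 0xD), so XOR-ing the code with 1 negates the test, and the branch skips the MOV exactly when the CMOV would not have moved. A minimal sketch of ours, assuming those standard Intel encodings:

    // Hypothetical illustration of the inversion trick; HotSpot's
    // Assembler::Condition uses these same Intel tttn encodings.
    final class Cond {
        static final int EQ = 0x4, NE = 0x5, LT = 0xC, GE = 0xD;
        static int negate(int cc) { return cc ^ 1; } // as in $cop$$cmpcode^1
        public static void main(String[] args) {
            System.out.println(negate(EQ) == NE); // true
            System.out.println(negate(LT) == GE); // true
        }
    }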
src/cpu/x86/vm/x86_64.ad
@@ -1993,6 +1993,12 @@ const bool Matcher::init_array_count_is_in_bytes = false;
 // Threshold size for cleararray.
 const int Matcher::init_array_short_size = 8 * BytesPerLong;
 
+// No additional cost for CMOVL.
+const int Matcher::long_cmove_cost() { return 0; }
+
+// No CMOVF/CMOVD with SSE2
+const int Matcher::float_cmove_cost() { return ConditionalMoveLimit; }
+
 // Should the Matcher clone shifts on addressing modes, expecting them
 // to be subsumed into complex addressing expressions or compute them
 // into registers? True for Intel but false for most RISCs
src/share/vm/compiler/compileBroker.cpp
@@ -1722,11 +1722,11 @@ void CompileBroker::invoke_compiler_on_method(CompileTask* task) {
     if (PrintCompilation) {
       const char* reason = ci_env.failure_reason();
       if (compilable == ciEnv::MethodCompilable_not_at_tier) {
-        tty->print_cr("%3d COMPILE SKIPPED: %s (retry at different tier)", compile_id, reason);
+        tty->print_cr("%4d COMPILE SKIPPED: %s (retry at different tier)", compile_id, reason);
       } else if (compilable == ciEnv::MethodCompilable_never) {
-        tty->print_cr("%3d COMPILE SKIPPED: %s (not retryable)", compile_id, reason);
+        tty->print_cr("%4d COMPILE SKIPPED: %s (not retryable)", compile_id, reason);
       } else if (compilable == ciEnv::MethodCompilable) {
-        tty->print_cr("%3d COMPILE SKIPPED: %s", compile_id, reason);
+        tty->print_cr("%4d COMPILE SKIPPED: %s", compile_id, reason);
       }
     }
   } else {
@@ -1743,6 +1743,13 @@ void CompileBroker::invoke_compiler_on_method(CompileTask* task) {
 
   collect_statistics(thread, time, task);
 
+  if (PrintCompilation && PrintInlining) {
+    tty->print("%7d ", (int) tty->time_stamp().milliseconds()); // print timestamp
+    tty->print("%4d ", compile_id); // print compilation number
+    tty->print("%s ", (is_osr ? "%" : " "));
+    tty->print_cr("size: %d time: %d inlined: %d bytes", task->code()->total_size(), time.milliseconds(), task->num_inlined_bytecodes());
+  }
+
   if (compilable == ciEnv::MethodCompilable_never) {
     if (is_osr) {
       method->set_not_osr_compilable();
src/share/vm/opto/loopopts.cpp
@@ -28,6 +28,7 @@
 #include "opto/connode.hpp"
 #include "opto/divnode.hpp"
 #include "opto/loopnode.hpp"
+#include "opto/matcher.hpp"
 #include "opto/mulnode.hpp"
 #include "opto/rootnode.hpp"
 #include "opto/subnode.hpp"
@@ -472,46 +473,50 @@ Node *PhaseIdealLoop::remix_address_expressions( Node *n ) {
 // 1 or 2 items with a total of 1 or 2 ops executed speculatively.
 Node *PhaseIdealLoop::conditional_move( Node *region ) {
 
-  assert( region->is_Region(), "sanity check" );
-  if( region->req() != 3 ) return NULL;
+  assert(region->is_Region(), "sanity check");
+  if (region->req() != 3) return NULL;
 
   // Check for CFG diamond
   Node *lp = region->in(1);
   Node *rp = region->in(2);
-  if( !lp || !rp ) return NULL;
+  if (!lp || !rp) return NULL;
   Node *lp_c = lp->in(0);
-  if( lp_c == NULL || lp_c != rp->in(0) || !lp_c->is_If() ) return NULL;
+  if (lp_c == NULL || lp_c != rp->in(0) || !lp_c->is_If()) return NULL;
   IfNode *iff = lp_c->as_If();
 
-  // Check for highly predictable branch. No point in CMOV'ing if
-  // we are going to predict accurately all the time.
-  // %%% This hides patterns produced by utility methods like Math.min.
-  if( iff->_prob < PROB_UNLIKELY_MAG(3) ||
-      iff->_prob > PROB_LIKELY_MAG(3) )
-    return NULL;
-
   // Check for ops pinned in an arm of the diamond.
   // Can't remove the control flow in this case
-  if( lp->outcnt() > 1 ) return NULL;
-  if( rp->outcnt() > 1 ) return NULL;
+  if (lp->outcnt() > 1) return NULL;
+  if (rp->outcnt() > 1) return NULL;
 
+  IdealLoopTree* r_loop = get_loop(region);
+  assert(r_loop == get_loop(iff), "sanity");
+  // Always convert to CMOVE if all results are used only outside this loop.
+  bool used_inside_loop = (r_loop == _ltree_root);
+
   // Check profitability
   int cost = 0;
   int phis = 0;
   for (DUIterator_Fast imax, i = region->fast_outs(imax); i < imax; i++) {
     Node *out = region->fast_out(i);
-    if( !out->is_Phi() ) continue; // Ignore other control edges, etc
+    if (!out->is_Phi()) continue; // Ignore other control edges, etc
     phis++;
     PhiNode* phi = out->as_Phi();
-    switch (phi->type()->basic_type()) {
-    case T_LONG:
-      cost++; // Probably encodes as 2 CMOV's
-    case T_INT: // These all CMOV fine
+    BasicType bt = phi->type()->basic_type();
+    switch (bt) {
     case T_FLOAT:
-    case T_DOUBLE:
-    case T_ADDRESS: // (RawPtr)
+    case T_DOUBLE: {
+      cost += Matcher::float_cmove_cost(); // Could be very expensive
+      break;
+    }
+    case T_LONG: {
+      cost += Matcher::long_cmove_cost(); // May encodes as 2 CMOV's
+    }
+    case T_INT: // These all CMOV fine
+    case T_ADDRESS: { // (RawPtr)
       cost++;
       break;
+    }
     case T_NARROWOOP: // Fall through
     case T_OBJECT: { // Base oops are OK, but not derived oops
       const TypeOopPtr *tp = phi->type()->make_ptr()->isa_oopptr();
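Note the deliberate fall-through in the new switch: T_LONG adds long_cmove_cost() and then drops into the T_INT/T_ADDRESS arm's cost++, so with x86_32's long_cmove_cost() of 1 a long Phi costs 2, matching its "Needs 2 CMOV's for longs" comment. A small sketch of ours making that explicit (the char codes are JVM type descriptors, used here only for brevity):

    // Hypothetical mirror of the per-Phi cost switch above.
    static int phiCost(char kind, int floatCost, int longCost) {
        switch (kind) {
            case 'F': case 'D': return floatCost;    // float_cmove_cost()
            case 'J':           return longCost + 1; // fall-through adds 1
            case 'I': case 'A': return 1;
            default:            return 1;            // base oops also add 1
        }
    }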
@@ -524,7 +529,7 @@ Node *PhaseIdealLoop::conditional_move( Node *region ) {
       // relevant bases. This puts the allocator in the business of
       // manufacturing expensive instructions, generally a bad plan.
       // Just Say No to Conditionally-Moved Derived Pointers.
-      if( tp && tp->offset() != 0 )
+      if (tp && tp->offset() != 0)
         return NULL;
       cost++;
       break;
@@ -533,39 +538,64 @@ Node *PhaseIdealLoop::conditional_move( Node *region ) {
       return NULL; // In particular, can't do memory or I/O
     }
     // Add in cost any speculative ops
-    for( uint j = 1; j < region->req(); j++ ) {
+    for (uint j = 1; j < region->req(); j++) {
       Node *proj = region->in(j);
       Node *inp = phi->in(j);
       if (get_ctrl(inp) == proj) { // Found local op
         cost++;
         // Check for a chain of dependent ops; these will all become
         // speculative in a CMOV.
-        for( uint k = 1; k < inp->req(); k++ )
+        for (uint k = 1; k < inp->req(); k++)
           if (get_ctrl(inp->in(k)) == proj)
-            return NULL; // Too much speculative goo
+            cost += ConditionalMoveLimit; // Too much speculative goo
       }
     }
     // See if the Phi is used by a Cmp or Narrow oop Decode/Encode.
     // This will likely Split-If, a higher-payoff operation.
     for (DUIterator_Fast kmax, k = phi->fast_outs(kmax); k < kmax; k++) {
       Node* use = phi->fast_out(k);
-      if( use->is_Cmp() || use->is_DecodeN() || use->is_EncodeP() )
-        return NULL;
+      if (use->is_Cmp() || use->is_DecodeN() || use->is_EncodeP())
+        cost += ConditionalMoveLimit;
+      // Is there a use inside the loop?
+      // Note: check only basic types since CMoveP is pinned.
+      if (!used_inside_loop && is_java_primitive(bt)) {
+        IdealLoopTree* u_loop = get_loop(has_ctrl(use) ? get_ctrl(use) : use);
+        if (r_loop == u_loop || r_loop->is_member(u_loop)) {
+          used_inside_loop = true;
+        }
+      }
     }
   }
-  if( cost >= ConditionalMoveLimit ) return NULL; // Too much goo
   Node* bol = iff->in(1);
-  assert( bol->Opcode() == Op_Bool, "" );
+  assert(bol->Opcode() == Op_Bool, "");
   int cmp_op = bol->in(1)->Opcode();
   // It is expensive to generate flags from a float compare.
   // Avoid duplicated float compare.
-  if( phis > 1 && (cmp_op == Op_CmpF || cmp_op == Op_CmpD)) return NULL;
+  if (phis > 1 && (cmp_op == Op_CmpF || cmp_op == Op_CmpD)) return NULL;
 
+  float infrequent_prob = PROB_UNLIKELY_MAG(3);
+  // Ignore cost and blocks frequency if CMOVE can be moved outside the loop.
+  if (used_inside_loop) {
+    if (cost >= ConditionalMoveLimit) return NULL; // Too much goo
+
+    // BlockLayoutByFrequency optimization moves infrequent branch
+    // from hot path. No point in CMOV'ing in such case (110 is used
+    // instead of 100 to take into account not exactness of float value).
+    if (BlockLayoutByFrequency) {
+      infrequent_prob = MAX2(infrequent_prob, (float)BlockLayoutMinDiamondPercentage/110.0f);
+    }
+  }
+  // Check for highly predictable branch. No point in CMOV'ing if
+  // we are going to predict accurately all the time.
+  if (iff->_prob < infrequent_prob ||
+      iff->_prob > (1.0f - infrequent_prob))
+    return NULL;
+
   // --------------
   // Now replace all Phis with CMOV's
   Node *cmov_ctrl = iff->in(0);
   uint flip = (lp->Opcode() == Op_IfTrue);
-  while( 1 ) {
+  while (1) {
     PhiNode* phi = NULL;
     for (DUIterator_Fast imax, i = region->fast_outs(imax); i < imax; i++) {
       Node *out = region->fast_out(i);
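A worked example of the new probability gate, assuming HotSpot's default BlockLayoutMinDiamondPercentage of 20 and PROB_UNLIKELY_MAG(3) = 0.001: when the result is used inside the loop and BlockLayoutByFrequency is on, infrequent_prob rises to 20/110, about 0.18, so only branches taken between roughly 18% and 82% of the time (the genuinely unpredictable ones) are still converted. Sketched in Java under those assumptions:

    // Hypothetical recomputation of the cutoff used above.
    static boolean passesProbabilityGate(float prob, boolean usedInsideLoop,
                                         boolean blockLayoutByFrequency) {
        float infrequent = 0.001f; // PROB_UNLIKELY_MAG(3)
        if (usedInsideLoop && blockLayoutByFrequency) {
            infrequent = Math.max(infrequent, 20.0f / 110.0f); // ~0.1818
        }
        return prob >= infrequent && prob <= 1.0f - infrequent;
    }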
@@ -576,15 +606,15 @@ Node *PhaseIdealLoop::conditional_move( Node *region ) {
     }
     if (phi == NULL) break;
 #ifndef PRODUCT
-    if( PrintOpto && VerifyLoopOptimizations ) tty->print_cr("CMOV");
+    if (PrintOpto && VerifyLoopOptimizations) tty->print_cr("CMOV");
 #endif
     // Move speculative ops
-    for( uint j = 1; j < region->req(); j++ ) {
+    for (uint j = 1; j < region->req(); j++) {
       Node *proj = region->in(j);
       Node *inp = phi->in(j);
       if (get_ctrl(inp) == proj) { // Found local op
 #ifndef PRODUCT
-        if( PrintOpto && VerifyLoopOptimizations ) {
+        if (PrintOpto && VerifyLoopOptimizations) {
           tty->print(" speculate: ");
           inp->dump();
         }
@@ -596,7 +626,14 @@ Node *PhaseIdealLoop::conditional_move( Node *region ) {
     register_new_node( cmov, cmov_ctrl );
     _igvn.replace_node( phi, cmov );
 #ifndef PRODUCT
-    if( VerifyLoopOptimizations ) verify();
+    if (TraceLoopOpts) {
+      tty->print("CMOV ");
+      r_loop->dump_head();
+      if (Verbose)
+        bol->in(1)->dump(1);
+      cmov->dump(1);
+    }
+    if (VerifyLoopOptimizations) verify();
 #endif
   }
 
@@ -676,14 +713,14 @@ Node *PhaseIdealLoop::split_if_with_blocks_pre( Node *n ) {
 
   // Split 'n' through the merge point if it is profitable
   Node *phi = split_thru_phi( n, n_blk, policy );
-  if( !phi ) return n;
+  if (!phi) return n;
 
   // Found a Phi to split thru!
   // Replace 'n' with the new phi
   _igvn.replace_node( n, phi );
   // Moved a load around the loop, 'en-registering' something.
-  if( n_blk->Opcode() == Op_Loop && n->is_Load() &&
-      !phi->in(LoopNode::LoopBackControl)->is_Load() )
+  if (n_blk->is_Loop() && n->is_Load() &&
+      !phi->in(LoopNode::LoopBackControl)->is_Load())
     C->set_major_progress();
 
   return phi;
src/share/vm/opto/matcher.hpp
@@ -360,6 +360,12 @@ public:
   // Anything this size or smaller may get converted to discrete scalar stores.
   static const int init_array_short_size;
 
+  // Some hardware needs 2 CMOV's for longs.
+  static const int long_cmove_cost();
+
+  // Some hardware have expensive CMOV for float and double.
+  static const int float_cmove_cost();
+
   // Should the Matcher clone shifts on addressing modes, expecting them to
   // be subsumed into complex addressing expressions or compute them into
   // registers? True for Intel but false for most RISCs