8136414: Large performance penalty declaring a method strictfp on strict-only platforms
Reviewed-by: thartmann
parent c16040393c
commit 590f5996c6
Changed paths: src/hotspot/cpu/{aarch64, arm, ppc, s390, sparc, x86}, src/hotspot/share
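Background, sketched with a hypothetical example (the class name, method names, and loop below are illustrative only and are not part of this commit or its tests): strictfp only changes results on hardware that can carry extra exponent range, i.e. x87 code generated when SSE2 is unavailable. On strict-only platforms such as x86_64 and AArch64 every float/double operation is already strict, yet the C1 and C2 checks touched below were still keyed to the RoundFPResults/UseStrictFP flags and to strict/non-strict inlining bailouts, so a method could pay a large penalty merely for being declared strictfp.

// Hypothetical microbenchmark shape for JDK-8136414: on a strict-only platform,
// hot() and hotStrict() perform identical arithmetic, and after this change the
// strictfp declaration no longer prevents inlining between strict and
// non-strict methods.
public class StrictFpPenalty {
    static double hot(double x, double y) {
        return x * y + y;
    }

    static strictfp double hotStrict(double x, double y) {
        return x * y + y; // same arithmetic, merely declared strictfp
    }

    public static void main(String[] args) {
        double s = 0.0;
        for (int i = 0; i < 10_000_000; i++) {
            s += hot(i, 0.5) + hotStrict(i, 0.5);
        }
        System.out.println(s);
    }
}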
@@ -2257,8 +2257,7 @@ void Matcher::pd_implicit_null_fixup(MachNode *node, uint idx) {
   Unimplemented();
 }
 
-// Advertise here if the CPU requires explicit rounding operations to
-// implement the UseStrictFP mode.
+// Advertise here if the CPU requires explicit rounding operations to implement strictfp mode.
 const bool Matcher::strict_fp_requires_explicit_rounding = false;
 
 // Are floats converted to double when stored to stack during
@@ -61,7 +61,6 @@ define_pd_global(uint64_t, MaxRAM, 1ULL*G);
 define_pd_global(bool, CICompileOSR, true );
 #endif // !TIERED
 define_pd_global(bool, UseTypeProfile, false);
-define_pd_global(bool, RoundFPResults, true );
 
 define_pd_global(bool, LIRFillDelaySlots, false);
 define_pd_global(bool, OptimizeSinglePrecision, true );
@@ -1140,8 +1140,7 @@ const bool Matcher::misaligned_doubles_ok = false;
 void Matcher::pd_implicit_null_fixup(MachNode *node, uint idx) {
 }
 
-// Advertise here if the CPU requires explicit rounding operations
-// to implement the UseStrictFP mode.
+// Advertise here if the CPU requires explicit rounding operations to implement strictfp mode.
 const bool Matcher::strict_fp_requires_explicit_rounding = false;
 
 // Are floats converted to double when stored to stack during deoptimization?
@@ -62,8 +62,6 @@ define_pd_global(uint64_t, MaxRAM, 1ULL*G);
 define_pd_global(bool, CICompileOSR, true );
 #endif // COMPILER2
 define_pd_global(bool, UseTypeProfile, false);
-define_pd_global(bool, RoundFPResults, false);
-
 
 define_pd_global(bool, LIRFillDelaySlots, false);
 define_pd_global(bool, OptimizeSinglePrecision, true);
@@ -62,7 +62,6 @@ define_pd_global(uintx, InitialCodeCacheSize, 160*K);
 #endif // !TIERED
 
 define_pd_global(bool, UseTypeProfile, false);
-define_pd_global(bool, RoundFPResults, false);
 
 define_pd_global(bool, LIRFillDelaySlots, false);
 define_pd_global(bool, OptimizeSinglePrecision, false);
@@ -2501,8 +2501,7 @@ void Matcher::pd_implicit_null_fixup(MachNode *node, uint idx) {
   Unimplemented();
 }
 
-// Advertise here if the CPU requires explicit rounding operations
-// to implement the UseStrictFP mode.
+// Advertise here if the CPU requires explicit rounding operations to implement strictfp mode.
 const bool Matcher::strict_fp_requires_explicit_rounding = false;
 
 // Do floats take an entire double register or just half?
@@ -63,7 +63,6 @@ define_pd_global(uintx, InitialCodeCacheSize, 160*K);
 #endif // !TIERED
 
 define_pd_global(bool, UseTypeProfile, false);
-define_pd_global(bool, RoundFPResults, false);
 
 define_pd_global(bool, LIRFillDelaySlots, false);
 define_pd_global(bool, OptimizeSinglePrecision, false);
@@ -1710,8 +1710,7 @@ const bool Matcher::rematerialize_float_constants = false;
 // Java calling convention forces doubles to be aligned.
 const bool Matcher::misaligned_doubles_ok = true;
 
-// Advertise here if the CPU requires explicit rounding operations
-// to implement the UseStrictFP mode.
+// Advertise here if the CPU requires explicit rounding operations to implement strictfp mode.
 const bool Matcher::strict_fp_requires_explicit_rounding = false;
 
 // Do floats take an entire double register or just half?
@@ -61,7 +61,6 @@ define_pd_global(uintx, InitialCodeCacheSize, 160*K);
 #endif // !TIERED
 
 define_pd_global(bool, UseTypeProfile, false);
-define_pd_global(bool, RoundFPResults, false);
 
 define_pd_global(bool, LIRFillDelaySlots, true );
 define_pd_global(bool, OptimizeSinglePrecision, false);
@@ -1873,8 +1873,7 @@ const bool Matcher::misaligned_doubles_ok = true;
 void Matcher::pd_implicit_null_fixup(MachNode *node, uint idx) {
 }
 
-// Advertise here if the CPU requires explicit rounding operations
-// to implement the UseStrictFP mode.
+// Advertise here if the CPU requires explicit rounding operations to implement strictfp mode.
 const bool Matcher::strict_fp_requires_explicit_rounding = false;
 
 // Are floats converted to double when stored to stack during deoptimization?
@@ -33,7 +33,7 @@ enum {
 
 // explicit rounding operations are required to implement the strictFP mode
 enum {
-  pd_strict_fp_requires_explicit_rounding = true
+  pd_strict_fp_requires_explicit_rounding = LP64_ONLY( false ) NOT_LP64 ( true )
 };
 
 
@@ -60,7 +60,6 @@ define_pd_global(uint64_t, MaxRAM, 1ULL*G);
 define_pd_global(bool, CICompileOSR, true );
 #endif // !TIERED
 define_pd_global(bool, UseTypeProfile, false);
-define_pd_global(bool, RoundFPResults, true );
 
 define_pd_global(bool, LIRFillDelaySlots, false);
 define_pd_global(bool, OptimizeSinglePrecision, true );
@@ -1516,8 +1516,7 @@ void Matcher::pd_implicit_null_fixup(MachNode *node, uint idx) {
   node->_opnds[opcnt] = new_memory;
 }
 
-// Advertise here if the CPU requires explicit rounding operations
-// to implement the UseStrictFP mode.
+// Advertise here if the CPU requires explicit rounding operations to implement strictfp mode.
 const bool Matcher::strict_fp_requires_explicit_rounding = true;
 
 // Are floats conerted to double when stored to stack during deoptimization?
@@ -1700,9 +1700,8 @@ const bool Matcher::misaligned_doubles_ok = true;
 // No-op on amd64
 void Matcher::pd_implicit_null_fixup(MachNode *node, uint idx) {}
 
-// Advertise here if the CPU requires explicit rounding operations to
-// implement the UseStrictFP mode.
-const bool Matcher::strict_fp_requires_explicit_rounding = true;
+// Advertise here if the CPU requires explicit rounding operations to implement strictfp mode.
+const bool Matcher::strict_fp_requires_explicit_rounding = false;
 
 // Are floats conerted to double when stored to stack during deoptimization?
 // On x64 it is stored without convertion so we can use normal access.
@@ -10521,24 +10520,6 @@ instruct cmpD_imm(rRegI dst, regD src, immD con, rFlagsReg cr) %{
 
 //----------Arithmetic Conversion Instructions---------------------------------
 
-instruct roundFloat_nop(regF dst)
-%{
-  match(Set dst (RoundFloat dst));
-
-  ins_cost(0);
-  ins_encode();
-  ins_pipe(empty);
-%}
-
-instruct roundDouble_nop(regD dst)
-%{
-  match(Set dst (RoundDouble dst));
-
-  ins_cost(0);
-  ins_encode();
-  ins_pipe(empty);
-%}
-
 instruct convF2D_reg_reg(regD dst, regF src)
 %{
   match(Set dst (ConvF2D src));
@@ -607,10 +607,16 @@ class MemoryBuffer: public CompilationResourceObj {
       return load;
     }
 
-    if (RoundFPResults && UseSSE < 2 && load->type()->is_float_kind()) {
+    if (strict_fp_requires_explicit_rounding && load->type()->is_float_kind()) {
+#ifdef IA32
+      if (UseSSE < 2) {
       // can't skip load since value might get rounded as a side effect
       return load;
     }
+#else
+      Unimplemented();
+#endif // IA32
+    }
 
     ciField* field = load->field();
     Value object = load->obj();
@@ -2272,8 +2278,10 @@ void GraphBuilder::throw_op(int bci) {
 
 
 Value GraphBuilder::round_fp(Value fp_value) {
+  if (strict_fp_requires_explicit_rounding) {
+#ifdef IA32
   // no rounding needed if SSE2 is used
-  if (RoundFPResults && UseSSE < 2) {
+  if (UseSSE < 2) {
     // Must currently insert rounding node for doubleword values that
     // are results of expressions (i.e., not loads from memory or
     // constants)
@@ -2284,6 +2292,10 @@ Value GraphBuilder::round_fp(Value fp_value) {
       return append(new RoundFP(fp_value));
     }
   }
+#else
+    Unimplemented();
+#endif // IA32
+  }
   return fp_value;
 }
 
@@ -3766,12 +3778,18 @@ bool GraphBuilder::try_inline_full(ciMethod* callee, bool holder_known, bool ign
   // Proper inlining of methods with jsrs requires a little more work.
   if (callee->has_jsrs() ) INLINE_BAILOUT("jsrs not handled properly by inliner yet");
 
-  // When SSE2 is used on intel, then no special handling is needed
-  // for strictfp because the enum-constant is fixed at compile time,
-  // the check for UseSSE2 is needed here
-  if (strict_fp_requires_explicit_rounding && UseSSE < 2 && method()->is_strict() != callee->is_strict()) {
+  if (strict_fp_requires_explicit_rounding &&
+      method()->is_strict() != callee->is_strict()) {
+#ifdef IA32
+    // If explicit rounding is required, do not inline strict code into non-strict code (or the reverse).
+    // When SSE2 is present, no special handling is needed.
+    if (UseSSE < 2) {
     INLINE_BAILOUT("caller and callee have different strict fp requirements");
   }
+#else
+    Unimplemented();
+#endif // IA32
+  }
 
   if (is_profiling() && !callee->ensure_method_data()) {
     INLINE_BAILOUT("mdo allocation failed");
@@ -778,6 +778,7 @@ void LIR_Assembler::build_frame() {
 
 
 void LIR_Assembler::roundfp_op(LIR_Opr src, LIR_Opr tmp, LIR_Opr dest, bool pop_fpu_stack) {
+  assert(strict_fp_requires_explicit_rounding, "not required");
   assert((src->is_single_fpu() && dest->is_single_stack()) ||
          (src->is_double_fpu() && dest->is_double_stack()),
          "round_fp: rounds register -> stack location");
@@ -899,7 +899,9 @@ void LIRGenerator::arraycopy_helper(Intrinsic* x, int* flagsp, ciArrayKlass** ex
 LIR_Opr LIRGenerator::round_item(LIR_Opr opr) {
   assert(opr->is_register(), "why spill if item is not register?");
 
-  if (RoundFPResults && UseSSE < 1 && opr->is_single_fpu()) {
+  if (strict_fp_requires_explicit_rounding) {
+#ifdef IA32
+    if (UseSSE < 1 && opr->is_single_fpu()) {
     LIR_Opr result = new_register(T_FLOAT);
     set_vreg_flag(result, must_start_in_memory);
     assert(opr->is_register(), "only a register can be spilled");
@@ -907,6 +909,10 @@ LIR_Opr LIRGenerator::round_item(LIR_Opr opr) {
     __ roundfp(opr, LIR_OprFact::illegalOpr, result);
     return result;
   }
+#else
+    Unimplemented();
+#endif // IA32
+  }
   return opr;
 }
 
@@ -1951,6 +1957,8 @@ void LIRGenerator::do_Throw(Throw* x) {
 
 
 void LIRGenerator::do_RoundFP(RoundFP* x) {
+  assert(strict_fp_requires_explicit_rounding, "not required");
+
   LIRItem input(x->input(), this);
   input.load_item();
   LIR_Opr input_opr = input.result();
@@ -170,9 +170,6 @@
   develop(bool, UseTableRanges, true, \
           "Faster versions of lookup table using ranges") \
   \
-  develop_pd(bool, RoundFPResults, \
-          "Indicates whether rounding is needed for floating point results")\
-  \
   develop(intx, NestedInliningSizeRatio, 90, \
           "Percentage of prev. allowed inline size in recursive inlining") \
           range(0, 100) \
@@ -150,8 +150,9 @@ CallGenerator* Compile::call_generator(ciMethod* callee, int vtable_index, bool
     return cg;
   }
 
-  // Do not inline strict fp into non-strict code, or the reverse
-  if (caller->is_strict() ^ callee->is_strict()) {
+  // If explicit rounding is required, do not inline strict into non-strict code (or the reverse).
+  if (Matcher::strict_fp_requires_explicit_rounding &&
+      caller->is_strict() != callee->is_strict()) {
     allow_inline = false;
   }
 
@@ -2142,22 +2142,6 @@ Node* GraphKit::just_allocated_object(Node* current_control) {
 }
 
 
-void GraphKit::round_double_arguments(ciMethod* dest_method) {
-  // (Note: TypeFunc::make has a cache that makes this fast.)
-  const TypeFunc* tf = TypeFunc::make(dest_method);
-  int nargs = tf->domain()->cnt() - TypeFunc::Parms;
-  for (int j = 0; j < nargs; j++) {
-    const Type *targ = tf->domain()->field_at(j + TypeFunc::Parms);
-    if( targ->basic_type() == T_DOUBLE ) {
-      // If any parameters are doubles, they must be rounded before
-      // the call, dstore_rounding does gvn.transform
-      Node *arg = argument(j);
-      arg = dstore_rounding(arg);
-      set_argument(j, arg);
-    }
-  }
-}
-
 /**
  * Record profiling data exact_kls for Node n with the type system so
  * that it can propagate it (speculation)
@@ -2323,12 +2307,12 @@ void GraphKit::record_profiled_return_for_speculation() {
 }
 
 void GraphKit::round_double_result(ciMethod* dest_method) {
-  // A non-strict method may return a double value which has an extended
-  // exponent, but this must not be visible in a caller which is 'strict'
-  // If a strict caller invokes a non-strict callee, round a double result
+  if (Matcher::strict_fp_requires_explicit_rounding) {
+    // If a strict caller invokes a non-strict callee, round a double result.
+    // A non-strict method may return a double value which has an extended exponent,
+    // but this must not be visible in a caller which is strict.
   BasicType result_type = dest_method->return_type()->basic_type();
-  assert( method() != NULL, "must have caller context");
+  assert(method() != NULL, "must have caller context");
   if( result_type == T_DOUBLE && method()->is_strict() && !dest_method->is_strict() ) {
     // Destination method's return value is on top of stack
     // dstore_rounding() does gvn.transform
@@ -2336,30 +2320,67 @@ void GraphKit::round_double_result(ciMethod* dest_method) {
     result = dstore_rounding(result);
     push_pair(result);
   }
+  }
+}
+
+void GraphKit::round_double_arguments(ciMethod* dest_method) {
+  if (Matcher::strict_fp_requires_explicit_rounding) {
+    // (Note: TypeFunc::make has a cache that makes this fast.)
+    const TypeFunc* tf = TypeFunc::make(dest_method);
+    int nargs = tf->domain()->cnt() - TypeFunc::Parms;
+    for (int j = 0; j < nargs; j++) {
+      const Type *targ = tf->domain()->field_at(j + TypeFunc::Parms);
+      if (targ->basic_type() == T_DOUBLE) {
+        // If any parameters are doubles, they must be rounded before
+        // the call, dstore_rounding does gvn.transform
+        Node *arg = argument(j);
+        arg = dstore_rounding(arg);
+        set_argument(j, arg);
+      }
+    }
+  }
 }
 
 // rounding for strict float precision conformance
 Node* GraphKit::precision_rounding(Node* n) {
-  return UseStrictFP && _method->flags().is_strict()
-    && UseSSE == 0 && Matcher::strict_fp_requires_explicit_rounding
-    ? _gvn.transform( new RoundFloatNode(0, n) )
-    : n;
+  if (Matcher::strict_fp_requires_explicit_rounding) {
+#ifdef IA32
+    if (_method->flags().is_strict() && UseSSE == 0) {
+      return _gvn.transform(new RoundFloatNode(0, n));
+    }
+#else
+    Unimplemented();
+#endif // IA32
+  }
+  return n;
 }
 
 // rounding for strict double precision conformance
 Node* GraphKit::dprecision_rounding(Node *n) {
-  return UseStrictFP && _method->flags().is_strict()
-    && UseSSE <= 1 && Matcher::strict_fp_requires_explicit_rounding
-    ? _gvn.transform( new RoundDoubleNode(0, n) )
-    : n;
+  if (Matcher::strict_fp_requires_explicit_rounding) {
+#ifdef IA32
+    if (_method->flags().is_strict() && UseSSE < 2) {
+      return _gvn.transform(new RoundDoubleNode(0, n));
+    }
+#else
+    Unimplemented();
+#endif // IA32
+  }
+  return n;
 }
 
 // rounding for non-strict double stores
 Node* GraphKit::dstore_rounding(Node* n) {
-  return Matcher::strict_fp_requires_explicit_rounding
-    && UseSSE <= 1
-    ? _gvn.transform( new RoundDoubleNode(0, n) )
-    : n;
+  if (Matcher::strict_fp_requires_explicit_rounding) {
+#ifdef IA32
+    if (UseSSE < 2) {
+      return _gvn.transform(new RoundDoubleNode(0, n));
+    }
+#else
+    Unimplemented();
+#endif // IA32
+  }
+  return n;
 }
 
 //=============================================================================
@@ -1785,8 +1785,15 @@ bool LibraryCallKit::inline_string_char_access(bool is_store) {
 //--------------------------round_double_node--------------------------------
 // Round a double node if necessary.
 Node* LibraryCallKit::round_double_node(Node* n) {
-  if (Matcher::strict_fp_requires_explicit_rounding && UseSSE <= 1)
-    n = _gvn.transform(new RoundDoubleNode(0, n));
+  if (Matcher::strict_fp_requires_explicit_rounding) {
+#ifdef IA32
+    if (UseSSE < 2) {
+      n = _gvn.transform(new RoundDoubleNode(NULL, n));
+    }
+#else
+    Unimplemented();
+#endif // IA32
+  }
   return n;
 }
 
@@ -534,8 +534,7 @@ public:
   // on windows95 to take care of some unusual register constraints.
   void pd_implicit_null_fixup(MachNode *load, uint idx);
 
-  // Advertise here if the CPU requires explicit rounding operations
-  // to implement the UseStrictFP mode.
+  // Advertise here if the CPU requires explicit rounding operations to implement strictfp mode.
   static const bool strict_fp_requires_explicit_rounding;
 
   // Are floats conerted to double when stored to stack during deoptimization?
@@ -994,9 +994,6 @@ const size_t minimumSymbolTableSize = 1024;
           "proper StackOverflow handling; disable only to measure cost " \
           "of stackbanging)") \
   \
-  develop(bool, UseStrictFP, true, \
-          "use strict fp if modifier strictfp is set") \
-  \
   develop(bool, GenerateSynchronizationCode, true, \
           "generate locking/unlocking code for synchronized methods and " \
          "monitors") \