8217561: X86: Add floating-point Math.min/max intrinsics

The implementation preserves +/-0.0 and NaN semantics and uses a dedicated instruction pattern for reduction loops

Reviewed-by: aph, kvn, neliasso, sviswanathan, adinn
Jatin Bhateja 2019-03-07 15:27:42 +01:00 committed by Bernard Blaser
parent 367ae10733
commit c2ec1085e1
9 changed files with 718 additions and 38 deletions
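
For reference, the intrinsics must reproduce the exact java.lang.Math.min/max semantics: -0.0 is treated as strictly smaller than +0.0, and any NaN operand yields NaN. A minimal Java sketch of these corner cases (illustrative only, not part of the patch; run with -ea):

public class MinMaxSemanticsSketch {
    public static void main(String[] args) {
        // -0.0 orders below +0.0; the two are distinguishable only by their raw bits
        assert Double.doubleToRawLongBits(Math.min(+0.0, -0.0))
            == Double.doubleToRawLongBits(-0.0);
        assert Double.doubleToRawLongBits(Math.max(-0.0, +0.0))
            == Double.doubleToRawLongBits(+0.0);
        // Any NaN operand makes the result NaN
        assert Double.isNaN(Math.min(1.0, Double.NaN));
        assert Double.isNaN(Math.max(Double.NaN, -1.0));
        System.out.println("OK");
    }
}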

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -7765,9 +7765,43 @@ int Assembler::simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegis
}
}
void Assembler::vmaxss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
assert(VM_Version::supports_avx(), "");
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
emit_int8(0x5F);
emit_int8((unsigned char)(0xC0 | encode));
}
void Assembler::vmaxsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
assert(VM_Version::supports_avx(), "");
InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_rex_vex_w_reverted();
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
emit_int8(0x5F);
emit_int8((unsigned char)(0xC0 | encode));
}
void Assembler::vminss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
assert(VM_Version::supports_avx(), "");
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
emit_int8(0x5D);
emit_int8((unsigned char)(0xC0 | encode));
}
void Assembler::vminsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
assert(VM_Version::supports_avx(), "");
InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_rex_vex_w_reverted();
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
emit_int8(0x5D);
emit_int8((unsigned char)(0xC0 | encode));
}
void Assembler::cmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len) {
assert(VM_Version::supports_avx(), "");
assert(!VM_Version::supports_evex(), "");
assert(vector_len <= AVX_256bit, "");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true);
int encode = simd_prefix_and_encode(dst, nds, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int8((unsigned char)0xC2);
@@ -7777,7 +7811,7 @@ void Assembler::cmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop
void Assembler::blendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len) {
assert(VM_Version::supports_avx(), "");
assert(!VM_Version::supports_evex(), "");
assert(vector_len <= AVX_256bit, "");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true);
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src1->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
emit_int8((unsigned char)0x4B);
@@ -7788,7 +7822,7 @@ void Assembler::blendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMM
void Assembler::cmpps(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len) {
assert(VM_Version::supports_avx(), "");
assert(!VM_Version::supports_evex(), "");
assert(vector_len <= AVX_256bit, "");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true);
int encode = simd_prefix_and_encode(dst, nds, src, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
emit_int8((unsigned char)0xC2);
@@ -7798,7 +7832,7 @@ void Assembler::cmpps(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop
void Assembler::blendvps(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len) {
assert(VM_Version::supports_avx(), "");
assert(!VM_Version::supports_evex(), "");
assert(vector_len <= AVX_256bit, "");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true);
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src1->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
emit_int8((unsigned char)0x4A);

View File

@@ -1934,6 +1934,11 @@ private:
void vsubss(XMMRegister dst, XMMRegister nds, Address src);
void vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vmaxss(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vmaxsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vminss(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vminsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
void shlxl(Register dst, Register src1, Register src2);
void shlxq(Register dst, Register src1, Register src2);

View File

@@ -165,6 +165,7 @@ class MacroAssembler: public Assembler {
// Support optimal SSE move instructions.
void movflt(XMMRegister dst, XMMRegister src) {
if (dst-> encoding() == src->encoding()) return;
if (UseXmmRegToRegMoveAll) { movaps(dst, src); return; }
else { movss (dst, src); return; }
}
@@ -173,6 +174,7 @@ class MacroAssembler: public Assembler {
void movflt(Address dst, XMMRegister src) { movss(dst, src); }
void movdbl(XMMRegister dst, XMMRegister src) {
if (dst-> encoding() == src->encoding()) return;
if (UseXmmRegToRegMoveAll) { movapd(dst, src); return; }
else { movsd (dst, src); return; }
}

View File

@@ -1,5 +1,5 @@
//
// Copyright (c) 2011, 2018, Oracle and/or its affiliates. All rights reserved.
// Copyright (c) 2011, 2019, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
@@ -1450,6 +1450,15 @@ const bool Matcher::match_rule_supported(int opcode) {
if (UseSSE < 2)
ret_value = false;
break;
#ifdef _LP64
case Op_MaxD:
case Op_MaxF:
case Op_MinD:
case Op_MinF:
if (UseAVX < 1) // enabled for AVX only
ret_value = false;
break;
#endif
}
return ret_value; // Per default match rules are supported.

View File

@@ -698,6 +698,87 @@ void emit_cmpfp3(MacroAssembler& _masm, Register dst) {
__ bind(done);
}
// Math.min()      # Math.max()
// ---------------------------
//  ucomis[s/d]    #
//  ja   -> b      # a
//  jp   -> NaN    # NaN
//  jb   -> a      # b
//  je             #
//  |-jz -> a | b  # a & b
//  |    -> a      #
void emit_fp_min_max(MacroAssembler& _masm, XMMRegister dst,
XMMRegister a, XMMRegister b,
XMMRegister xmmt, Register rt,
bool min, bool single) {
Label nan, zero, below, above, done;
if (single)
__ ucomiss(a, b);
else
__ ucomisd(a, b);
if (dst->encoding() != (min ? b : a)->encoding())
__ jccb(Assembler::above, above); // CF=0 & ZF=0
else
__ jccb(Assembler::above, done);
__ jccb(Assembler::parity, nan); // PF=1
__ jccb(Assembler::below, below); // CF=1
// equal
__ vpxor(xmmt, xmmt, xmmt, Assembler::AVX_128bit);
if (single) {
__ ucomiss(a, xmmt);
__ jccb(Assembler::equal, zero);
__ movflt(dst, a);
__ jmp(done);
}
else {
__ ucomisd(a, xmmt);
__ jccb(Assembler::equal, zero);
__ movdbl(dst, a);
__ jmp(done);
}
__ bind(zero);
if (min)
__ vpor(dst, a, b, Assembler::AVX_128bit);
else
__ vpand(dst, a, b, Assembler::AVX_128bit);
__ jmp(done);
__ bind(above);
if (single)
__ movflt(dst, min ? b : a);
else
__ movdbl(dst, min ? b : a);
__ jmp(done);
__ bind(nan);
if (single) {
__ movl(rt, 0x7fc00000); // Float.NaN
__ movdl(dst, rt);
}
else {
__ mov64(rt, 0x7ff8000000000000L); // Double.NaN
__ movdq(dst, rt);
}
__ jmp(done);
__ bind(below);
if (single)
__ movflt(dst, min ? a : b);
else
__ movdbl(dst, min ? a : b);
__ bind(done);
}
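
For reference, the branch table above corresponds to roughly the following scalar Java logic for min; max mirrors it with the a/b results swapped and a bitwise AND instead of OR in the signed-zero case (the method name is illustrative, not part of the patch):

static double minLike(double a, double b) {
    if (Double.isNaN(a) || Double.isNaN(b)) return Double.NaN;  // jp: unordered
    if (a > b) return b;                                        // ja
    if (a < b) return a;                                        // jb
    // je: operands compare equal; OR the raw bits so that -0.0 wins over +0.0
    // (for equal non-zero values the OR leaves the bits unchanged)
    return Double.longBitsToDouble(Double.doubleToRawLongBits(a)
                                   | Double.doubleToRawLongBits(b));
}
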
//=============================================================================
const RegMask& MachConstantBaseNode::_out_RegMask = RegMask::Empty;
@@ -3547,6 +3628,15 @@ operand regF() %{
interface(REG_INTER);
%}
// Float register operands
operand legRegF() %{
constraint(ALLOC_IN_RC(float_reg_legacy));
match(RegF);
format %{ %}
interface(REG_INTER);
%}
// Float register operands
operand vlRegF() %{
constraint(ALLOC_IN_RC(float_reg_vl));
@@ -3565,6 +3655,15 @@ operand regD() %{
interface(REG_INTER);
%}
// Double register operands
operand legRegD() %{
constraint(ALLOC_IN_RC(double_reg_legacy));
match(RegD);
format %{ %}
interface(REG_INTER);
%}
// Double register operands
operand vlRegD() %{
constraint(ALLOC_IN_RC(double_reg_vl));
@@ -5303,6 +5402,16 @@ instruct MoveF2VL(vlRegF dst, regF src) %{
ins_pipe( fpu_reg_reg );
%}
// Load Float
instruct MoveF2LEG(legRegF dst, regF src) %{
match(Set dst src);
format %{ "movss $dst,$src\t# if src != dst load float (4 bytes)" %}
ins_encode %{
__ movflt($dst$$XMMRegister, $src$$XMMRegister);
%}
ins_pipe( fpu_reg_reg );
%}
// Load Float
instruct MoveVL2F(regF dst, vlRegF src) %{
match(Set dst src);
@@ -5313,6 +5422,16 @@ instruct MoveVL2F(regF dst, vlRegF src) %{
ins_pipe( fpu_reg_reg );
%}
// Load Float
instruct MoveLEG2F(regF dst, legRegF src) %{
match(Set dst src);
format %{ "movss $dst,$src\t# if src != dst load float (4 bytes)" %}
ins_encode %{
__ movflt($dst$$XMMRegister, $src$$XMMRegister);
%}
ins_pipe( fpu_reg_reg );
%}
// Load Double
instruct loadD_partial(regD dst, memory mem)
%{
@@ -5350,6 +5469,16 @@ instruct MoveD2VL(vlRegD dst, regD src) %{
ins_pipe( fpu_reg_reg );
%}
// Load Double
instruct MoveD2LEG(legRegD dst, regD src) %{
match(Set dst src);
format %{ "movsd $dst,$src\t# if src != dst load double (8 bytes)" %}
ins_encode %{
__ movdbl($dst$$XMMRegister, $src$$XMMRegister);
%}
ins_pipe( fpu_reg_reg );
%}
// Load Double
instruct MoveVL2D(regD dst, vlRegD src) %{
match(Set dst src);
@@ -5360,6 +5489,167 @@ instruct MoveVL2D(regD dst, vlRegD src) %{
ins_pipe( fpu_reg_reg );
%}
// Load Double
instruct MoveLEG2D(regD dst, legRegD src) %{
match(Set dst src);
format %{ "movsd $dst,$src\t# if src != dst load double (8 bytes)" %}
ins_encode %{
__ movdbl($dst$$XMMRegister, $src$$XMMRegister);
%}
ins_pipe( fpu_reg_reg );
%}
// The following pseudo code describes the algorithm for max[FD];
// the min algorithm follows the same lines:
// btmp = (b < +0.0) ? a : b
// atmp = (b < +0.0) ? b : a
// Tmp = Max_Float(atmp , btmp)
// Res = (atmp == NaN) ? atmp : Tmp
// max = java.lang.Math.max(float a, float b)
instruct maxF_reg(legRegF dst, legRegF a, legRegF b, legRegF tmp, legRegF atmp, legRegF btmp) %{
predicate(UseAVX > 0 && !n->is_reduction());
match(Set dst (MaxF a b));
effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
format %{
"blendvps $btmp,$b,$a,$b \n\t"
"blendvps $atmp,$a,$b,$b \n\t"
"vmaxss $tmp,$atmp,$btmp \n\t"
"cmpps.unordered $btmp,$atmp,$atmp \n\t"
"blendvps $dst,$tmp,$atmp,$btmp \n\t"
%}
ins_encode %{
int vector_len = Assembler::AVX_128bit;
__ blendvps($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vector_len);
__ blendvps($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $b$$XMMRegister, vector_len);
__ vmaxss($tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister);
__ cmpps($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::_false, vector_len);
__ blendvps($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
%}
ins_pipe( pipe_slow );
%}
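
Read as scalar Java, the pseudo code above is roughly the sketch below: the two blends key on the sign bit of b so that vmaxss (whose second operand wins when the operands are NaN or compare equal, including +0.0 == -0.0) sees them in an order that yields Math.max semantics, and the final unordered compare plus blend propagates NaN. The name is illustrative only:

static float maxLike(float a, float b) {
    boolean bNeg = (Float.floatToRawIntBits(b) & 0x80000000) != 0;  // blendv keys on b's sign bit
    float btmp = bNeg ? a : b;                   // blendvps $btmp,$b,$a,$b
    float atmp = bNeg ? b : a;                   // blendvps $atmp,$a,$b,$b
    float tmp  = (atmp > btmp) ? atmp : btmp;    // vmaxss: second operand wins on NaN/equal
    return Float.isNaN(atmp) ? atmp : tmp;       // cmpps.unordered + final blendvps
}
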
instruct maxF_reduction_reg(regF dst, regF a, regF b, regF xmmt, rRegI tmp, rFlagsReg cr) %{
predicate(UseAVX > 0 && n->is_reduction());
match(Set dst (MaxF a b));
effect(USE a, USE b, TEMP xmmt, TEMP tmp, KILL cr);
format %{ "$dst = max($a, $b)\t# intrinsic (float)" %}
ins_encode %{
emit_fp_min_max(_masm, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $xmmt$$XMMRegister, $tmp$$Register,
false /*min*/, true /*single*/);
%}
ins_pipe( pipe_slow );
%}
// max = java.lang.Math.max(double a, double b)
instruct maxD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD atmp, legRegD btmp) %{
predicate(UseAVX > 0 && !n->is_reduction());
match(Set dst (MaxD a b));
effect(USE a, USE b, TEMP atmp, TEMP btmp, TEMP tmp);
format %{
"blendvpd $btmp,$b,$a,$b \n\t"
"blendvpd $atmp,$a,$b,$b \n\t"
"vmaxsd $tmp,$atmp,$btmp \n\t"
"cmppd.unordered $btmp,$atmp,$atmp \n\t"
"blendvpd $dst,$tmp,$atmp,$btmp \n\t"
%}
ins_encode %{
int vector_len = Assembler::AVX_128bit;
__ blendvpd($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, vector_len);
__ blendvpd($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $b$$XMMRegister, vector_len);
__ vmaxsd($tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister);
__ cmppd($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::_false, vector_len);
__ blendvpd($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
%}
ins_pipe( pipe_slow );
%}
instruct maxD_reduction_reg(regD dst, regD a, regD b, regD xmmt, rRegL tmp, rFlagsReg cr) %{
predicate(UseAVX > 0 && n->is_reduction());
match(Set dst (MaxD a b));
effect(USE a, USE b, TEMP xmmt, TEMP tmp, KILL cr);
format %{ "$dst = max($a, $b)\t# intrinsic (double)" %}
ins_encode %{
emit_fp_min_max(_masm, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $xmmt$$XMMRegister, $tmp$$Register,
false /*min*/, false /*single*/);
%}
ins_pipe( pipe_slow );
%}
// min = java.lang.Math.min(float a, float b)
instruct minF_reg(legRegF dst, legRegF a, legRegF b, legRegF tmp, legRegF atmp, legRegF btmp) %{
predicate(UseAVX > 0 && !n->is_reduction());
match(Set dst (MinF a b));
effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
format %{
"blendvps $atmp,$a,$b,$a \n\t"
"blendvps $btmp,$b,$a,$a \n\t"
"vminss $tmp,$atmp,$btmp \n\t"
"cmpps.unordered $btmp,$atmp,$atmp \n\t"
"blendvps $dst,$tmp,$atmp,$btmp \n\t"
%}
ins_encode %{
int vector_len = Assembler::AVX_128bit;
__ blendvps($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, vector_len);
__ blendvps($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, vector_len);
__ vminss($tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister);
__ cmpps($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::_false, vector_len);
__ blendvps($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
%}
ins_pipe( pipe_slow );
%}
instruct minF_reduction_reg(regF dst, regF a, regF b, regF xmmt, rRegI tmp, rFlagsReg cr) %{
predicate(UseAVX > 0 && n->is_reduction());
match(Set dst (MinF a b));
effect(USE a, USE b, TEMP xmmt, TEMP tmp, KILL cr);
format %{ "$dst = min($a, $b)\t# intrinsic (float)" %}
ins_encode %{
emit_fp_min_max(_masm, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $xmmt$$XMMRegister, $tmp$$Register,
true /*min*/, true /*single*/);
%}
ins_pipe( pipe_slow );
%}
// min = java.lang.Math.min(double a, double b)
instruct minD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD atmp, legRegD btmp) %{
predicate(UseAVX > 0 && !n->is_reduction());
match(Set dst (MinD a b));
effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
format %{
"blendvpd $atmp,$a,$b,$a \n\t"
"blendvpd $btmp,$b,$a,$a \n\t"
"vminsd $tmp,$atmp,$btmp \n\t"
"cmppd.unordered $btmp,$atmp,$atmp \n\t"
"blendvpd $dst,$tmp,$atmp,$btmp \n\t"
%}
ins_encode %{
int vector_len = Assembler::AVX_128bit;
__ blendvpd($atmp$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, vector_len);
__ blendvpd($btmp$$XMMRegister, $b$$XMMRegister, $a$$XMMRegister, $a$$XMMRegister, vector_len);
__ vminsd($tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister);
__ cmppd($btmp$$XMMRegister, $atmp$$XMMRegister, $atmp$$XMMRegister, Assembler::_false, vector_len);
__ blendvpd($dst$$XMMRegister, $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vector_len);
%}
ins_pipe( pipe_slow );
%}
instruct minD_reduction_reg(regD dst, regD a, regD b, regD xmmt, rRegL tmp, rFlagsReg cr) %{
predicate(UseAVX > 0 && n->is_reduction());
match(Set dst (MinD a b));
effect(USE a, USE b, TEMP xmmt, TEMP tmp, KILL cr);
format %{ "$dst = min($a, $b)\t# intrinsic (double)" %}
ins_encode %{
emit_fp_min_max(_masm, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $xmmt$$XMMRegister, $tmp$$Register,
true /*min*/, false /*single*/);
%}
ins_pipe( pipe_slow );
%}
// Load Effective Address
instruct leaP8(rRegP dst, indOffset8 mem)
%{

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 1999, 2018, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1999, 2019, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -6609,6 +6609,40 @@ bool LibraryCallKit::inline_character_compare(vmIntrinsics::ID id) {
//------------------------------inline_fp_min_max------------------------------
bool LibraryCallKit::inline_fp_min_max(vmIntrinsics::ID id) {
/* DISABLED BECAUSE METHOD DATA ISN'T COLLECTED PER CALL-SITE, SEE JDK-8015416.
// The intrinsic should be used only when the API branches aren't predictable,
// the last one performing the most important comparison. The following heuristic
// uses the branch statistics to eventually bail out if necessary.
ciMethodData *md = callee()->method_data();
if ( md != NULL && md->is_mature() && md->invocation_count() > 0 ) {
ciCallProfile cp = caller()->call_profile_at_bci(bci());
if ( ((double)cp.count()) / ((double)md->invocation_count()) < 0.8 ) {
// Bail out if the call-site didn't contribute enough to the statistics.
return false;
}
uint taken = 0, not_taken = 0;
for (ciProfileData *p = md->first_data(); md->is_valid(p); p = md->next_data(p)) {
if (p->is_BranchData()) {
taken = ((ciBranchData*)p)->taken();
not_taken = ((ciBranchData*)p)->not_taken();
}
}
double balance = (((double)taken) - ((double)not_taken)) / ((double)md->invocation_count());
balance = balance < 0 ? -balance : balance;
if ( balance > 0.2 ) {
// Bail out if the most important branch is predictable enough.
return false;
}
}
*/
Node *a = NULL;
Node *b = NULL;
Node *n = NULL;

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2000, 2018, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2000, 2019, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -2039,7 +2039,8 @@ void PhaseIdealLoop::mark_reductions(IdealLoopTree *loop) {
if (n_ctrl != NULL && loop->is_member(get_loop(n_ctrl))) {
// Now test it to see if it fits the standard pattern for a reduction operator.
int opc = def_node->Opcode();
if (opc != ReductionNode::opcode(opc, def_node->bottom_type()->basic_type())) {
if (opc != ReductionNode::opcode(opc, def_node->bottom_type()->basic_type())
|| opc == Op_MinD || opc == Op_MinF || opc == Op_MaxD || opc == Op_MaxF) {
if (!def_node->is_reduction()) { // Not marked yet
// To be a reduction, the arithmetic node must have the phi as input and provide a def to it
bool ok = false;
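
With this change, scalar FP Min/Max nodes that sit in a loop-carried (phi) cycle are also flagged as reductions, so that the *_reduction_reg instructs shown earlier (predicated on n->is_reduction()) can match them. A typical loop of that shape, for illustration:

static double minReduce(double[] a) {
    double min = Double.POSITIVE_INFINITY;
    for (int i = 0; i < a.length; i++) {
        min = Math.min(min, a[i]);  // MinD fed by the loop phi -> marked as a reduction
    }
    return min;
}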

View File

@@ -1,6 +1,6 @@
/*
* Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, Arm Limited. All rights reserved.
* Copyright (c) 2018, 2019, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, 2019, Arm Limited. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -27,20 +27,41 @@
* @bug 8212043
* @summary Test compiler intrinsics of floating-point Math.min/max
*
* @run main/othervm -Xint compiler.intrinsics.math.TestFpMinMaxIntrinsics
* @run main/othervm -Xint compiler.intrinsics.math.TestFpMinMaxIntrinsics sanityTests 1
* @run main/othervm -XX:+UnlockDiagnosticVMOptions
* -Xcomp -XX:TieredStopAtLevel=1
* -XX:CompileOnly=java/lang/Math
* compiler.intrinsics.math.TestFpMinMaxIntrinsics
* compiler.intrinsics.math.TestFpMinMaxIntrinsics sanityTests 1
* @run main/othervm -XX:+UnlockDiagnosticVMOptions
* -Xcomp -XX:-TieredCompilation
* -XX:CompileOnly=java/lang/Math
* compiler.intrinsics.math.TestFpMinMaxIntrinsics
* compiler.intrinsics.math.TestFpMinMaxIntrinsics sanityTests 1
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+UnlockDiagnosticVMOptions
* -XX:-TieredCompilation -XX:CompileThresholdScaling=0.1
* -XX:CompileCommand=print,compiler/intrinsics/math/TestFpMinMaxIntrinsics.*Test*
* compiler.intrinsics.math.TestFpMinMaxIntrinsics sanityTests 10000
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+UnlockDiagnosticVMOptions
* -XX:-TieredCompilation -Xcomp
* -XX:CompileCommand=print,compiler/intrinsics/math/TestFpMinMaxIntrinsics.*Test*
* -XX:CompileCommand=compileonly,compiler/intrinsics/math/TestFpMinMaxIntrinsics.*Test*
* compiler.intrinsics.math.TestFpMinMaxIntrinsics reductionTests 100
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+UnlockDiagnosticVMOptions
* -XX:+TieredCompilation
* -XX:CompileCommand=print,compiler/intrinsics/math/TestFpMinMaxIntrinsics.min*
* -XX:CompileCommand=dontinline,compiler/intrinsics/math/TestFpMinMaxIntrinsics.min*
* compiler.intrinsics.math.TestFpMinMaxIntrinsics randomSearchTree 1
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+UnlockDiagnosticVMOptions
* -XX:+TieredCompilation
* -XX:CompileCommand=print,compiler/intrinsics/math/TestFpMinMaxIntrinsics.min*
* -XX:CompileCommand=dontinline,compiler/intrinsics/math/TestFpMinMaxIntrinsics.min*
* compiler.intrinsics.math.TestFpMinMaxIntrinsics sortedSearchTree 1
*/
package compiler.intrinsics.math;
import java.util.Arrays;
import java.util.Random;
import java.lang.reflect.Method;
public class TestFpMinMaxIntrinsics {
@@ -63,63 +84,220 @@ public class TestFpMinMaxIntrinsics {
private static final float[][] f_cases = {
// a b min max
{ fPos, fPos, fPos, fPos },
{ fNeg, fNeg, fNeg, fNeg },
{ fPos, fNeg, fNeg, fPos },
{ fNeg, fPos, fNeg, fPos },
{ fPosZero, fNegZero, fNegZero, fPosZero },
{ fNegZero, fPosZero, fNegZero, fPosZero },
{ fNegZero, fNegZero, fNegZero, fNegZero },
{ fPos, fPosInf, fPos, fPosInf },
{ fNeg, fNegInf, fNegInf, fNeg },
{ fPos, fNaN, fNaN, fNaN },
{ fNaN, fPos, fNaN, fNaN },
{ fNeg, fNaN, fNaN, fNaN },
{ fNaN, fNeg, fNaN, fNaN },
{ fPosInf, fNaN, fNaN, fNaN },
{ fNaN, fPosInf, fNaN, fNaN },
{ fNegInf, fNaN, fNaN, fNaN },
{ fNaN, fNegInf, fNaN, fNaN }
};
private static final double[][] d_cases = {
// a b min max
{ dPos, dPos, dPos, dPos },
{ dNeg, dNeg, dNeg, dNeg },
{ dPos, dNeg, dNeg, dPos },
{ dNeg, dPos, dNeg, dPos },
{ dPosZero, dNegZero, dNegZero, dPosZero },
{ dNegZero, dPosZero, dNegZero, dPosZero },
{ dNegZero, dNegZero, dNegZero, dNegZero },
{ dPos, dPosInf, dPos, dPosInf },
{ dNeg, dNegInf, dNegInf, dNeg },
{ dPos, dNaN, dNaN, dNaN },
{ dNaN, dPos, dNaN, dNaN },
{ dNeg, dNaN, dNaN, dNaN },
{ dNaN, dNeg, dNaN, dNaN },
{ dPosInf, dNaN, dNaN, dNaN },
{ dNaN, dPosInf, dNaN, dNaN },
{ dNegInf, dNaN, dNaN, dNaN },
{ dNaN, dNegInf, dNaN, dNaN }
};
private static void fTest(float[] row) {
float min = Math.min(row[0], row[1]);
float max = Math.max(row[0], row[1]);
if (Float.isNaN(min) && Float.isNaN(max)
&& Float.isNaN(row[2]) && Float.isNaN(row[3])) {
// Return if all of them are NaN
return;
fCheck(row[0], row[1], Math.min(row[0], row[1]), Math.max(row[0], row[1]), row[2], row[3]);
}
if (min != row[2] || max != row[3]) {
private static void fReductionTest(float[] row) {
float fmin = row[0], fmax = row[0];
for (int i=0; i<100; i++) {
fmin = Math.min(fmin, row[1]);
fmax = Math.max(fmax, row[1]);
}
fCheck(row[0], row[1], fmin, fmax, row[2], row[3]);
}
private static void fCheck(float a, float b, float fmin, float fmax, float efmin, float efmax) {
int min = Float.floatToRawIntBits(fmin);
int max = Float.floatToRawIntBits(fmax);
int emin = Float.floatToRawIntBits(efmin);
int emax = Float.floatToRawIntBits(efmax);
if (min != emin || max != emax) {
throw new AssertionError("Unexpected result of float min/max: " +
"a = " + row[0] + ", b = " + row[1] + ", " +
"result = (" + min + ", " + max + "), " +
"expected = (" + row[2] + ", " + row[3] + ")");
"a = " + a + ", b = " + b + ", " +
"result = (" + fmin + ", " + fmax + "), " +
"expected = (" + efmin + ", " + efmax + ")");
}
}
private static void dTest(double[] row) {
double min = Math.min(row[0], row[1]);
double max = Math.max(row[0], row[1]);
if (Double.isNaN(min) && Double.isNaN(max)
&& Double.isNaN(row[2]) && Double.isNaN(row[3])) {
// Return if all of them are NaN
return;
dCheck(row[0], row[1], Math.min(row[0], row[1]), Math.max(row[0], row[1]), row[2], row[3]);
}
if (min != row[2] || max != row[3]) {
throw new AssertionError("Unexpected result of double min/max" +
"a = " + row[0] + ", b = " + row[1] + ", " +
"result = (" + min + ", " + max + "), " +
"expected = (" + row[2] + ", " + row[3] + ")");
private static void dReductionTest(double[] row) {
double dmin = row[0], dmax = row[0];
for (int i=0; i<100; i++) {
dmin = Math.min(dmin, row[1]);
dmax = Math.max(dmax, row[1]);
}
dCheck(row[0], row[1], dmin, dmax, row[2], row[3]);
}
private static void dCheck(double a, double b, double dmin, double dmax, double edmin, double edmax) {
long min = Double.doubleToRawLongBits(dmin);
long max = Double.doubleToRawLongBits(dmax);
long emin = Double.doubleToRawLongBits(edmin);
long emax = Double.doubleToRawLongBits(edmax);
if (min != emin || max != emax) {
throw new AssertionError("Unexpected result of double min/max: " +
"a = " + a + ", b = " + b + ", " +
"result = (" + dmin + ", " + dmax + "), " +
"expected = (" + edmin + ", " + edmax + ")");
}
}
public static void main(String[] args) {
public static void sanityTests() {
Arrays.stream(f_cases).forEach(TestFpMinMaxIntrinsics::fTest);
Arrays.stream(d_cases).forEach(TestFpMinMaxIntrinsics::dTest);
System.out.println("PASS");
}
public static void reductionTests() {
Arrays.stream(f_cases).forEach(TestFpMinMaxIntrinsics::fReductionTest);
Arrays.stream(d_cases).forEach(TestFpMinMaxIntrinsics::dReductionTest);
}
public static void main(String[] args) throws Exception {
Method m = TestFpMinMaxIntrinsics.class.getDeclaredMethod(args[0]);
for (int i = 0 ; i < Integer.parseInt(args[1]) ; i++)
m.invoke(null);
}
private static final int COUNT = 1000;
private static final int LOOPS = 100;
private static Random r = new Random();
private static Node[] pool = new Node[COUNT];
private static long time = 0;
private static long times = 0;
public static void init() {
for (int i=0; i<COUNT; i++)
pool[i] = new Node(Double.NaN);
}
public static void finish() {
// String sorted = pool[0].toString();
// System.out.println("Sorted: {" + sorted.substring(0, Math.min(sorted.length(), 180)) + "... }");
System.out.println("Average time: " + (time/times) + " ns");
}
public static void randomSearchTree() {
init();
for (int l=0; l < LOOPS; l++) {
Node root = pool[0].reset(r.nextDouble());
for (int i=1; i<COUNT; i++)
insert(root, pool[i].reset(r.nextDouble()));
}
finish();
}
public static void sortedSearchTree() {
init();
for (int l=0; l < LOOPS; l++) {
Node root = pool[0].reset(-0.0);
for (int i=1; i<COUNT; i++)
insert(root, pool[i].reset(i-1));
}
finish();
}
private static class Node {
private double value;
private Node min;
private Node max;
public Node(double d) { value = d; }
public Node reset(double d) { value = d; min = max = null; return this; }
@Override
public String toString() {
return (min != null ? min + ", " : "") +
value +
(max != null ? ", " + max : "");
}
}
private static Node insert(Node root, Node d) {
for ( ; ; ) {
long rootBits = Double.doubleToRawLongBits(root.value);
long dBits = Double.doubleToRawLongBits(d.value);
// No duplicates
if (rootBits == dBits)
return root;
long delta = System.nanoTime();
double dmin = min(root.value, d.value);
time += System.nanoTime() - delta;
times++;
long minBits = Double.doubleToRawLongBits(dmin);
if (minBits == dBits)
if (root.min != null)
root = root.min;
else
return root.min = d;
else
if (root.max != null)
root = root.max;
else
return root.max = d;
}
}
// Wrapper method to prevent code reordering from affecting measures (JLS 17.4).
private static double min(double a, double b) {
return Math.min(a, b);
}
}

View File

@@ -0,0 +1,127 @@
/*
* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package org.openjdk.bench.vm.compiler;
import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.infra.*;
import java.util.concurrent.TimeUnit;
import java.util.Random;
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@State(Scope.Thread)
public class FpMinMaxIntrinsics {
private static final int COUNT = 1000;
private double[] doubles = new double[COUNT];
private float[] floats = new float[COUNT];
private int c1, c2, s1, s2;
private Random r = new Random();
@Setup
public void init() {
c1 = s1 = step();
c2 = COUNT - (s2 = step());
for (int i=0; i<COUNT; i++) {
floats[i] = r.nextFloat();
doubles[i] = r.nextDouble();
}
}
private int step() {
return (r.nextInt() & 0xf) + 1;
}
@Benchmark
public void dMax(Blackhole bh) {
for (int i=0; i<COUNT; i++)
bh.consume(dMaxBench());
}
@Benchmark
public void dMin(Blackhole bh) {
for (int i=0; i<COUNT; i++)
bh.consume(dMinBench());
}
@Benchmark
public void fMax(Blackhole bh) {
for (int i=0; i<COUNT; i++)
bh.consume(fMaxBench());
}
@Benchmark
public void fMin(Blackhole bh) {
for (int i=0; i<COUNT; i++)
bh.consume(fMinBench());
}
private double dMaxBench() {
inc();
return Math.max(doubles[c1], doubles[c2]);
}
private double dMinBench() {
inc();
return Math.min(doubles[c1], doubles[c2]);
}
private float fMaxBench() {
inc();
return Math.max(floats[c1], floats[c2]);
}
private float fMinBench() {
inc();
return Math.min(floats[c1], floats[c2]);
}
private void inc() {
c1 = c1 + s1 < COUNT ? c1 + s1 : (s1 = step());
c2 = c2 - s2 > 0 ? c2 - s2 : COUNT - (s2 = step());
}
@Benchmark
public float fMinReduce() {
float result = Float.MAX_VALUE;
for (int i=0; i<COUNT; i++)
result = Math.min(result, floats[i]);
return result;
}
@Benchmark
public double dMinReduce() {
double result = Double.MAX_VALUE;
for (int i=0; i<COUNT; i++)
result = Math.min(result, doubles[i]);
return result;
}
}