From b96c85c640ee4690279d91f6650e2a309fa8cd84 Mon Sep 17 00:00:00 2001 From: Razvan A Lupusoru Date: Tue, 5 Dec 2017 09:49:23 -0800 Subject: [PATCH] 8192846: Support cmov vectorization for float Reviewed-by: kvn --- src/hotspot/cpu/x86/assembler_x86.cpp | 21 +++++++++++++++++++++ src/hotspot/cpu/x86/assembler_x86.hpp | 4 +++- src/hotspot/cpu/x86/x86.ad | 20 ++++++++++++++++++++ src/hotspot/share/adlc/formssel.cpp | 2 +- src/hotspot/share/opto/c2_globals.hpp | 3 +++ src/hotspot/share/opto/classes.hpp | 1 + src/hotspot/share/opto/loopopts.cpp | 8 ++++---- src/hotspot/share/opto/matcher.cpp | 1 + src/hotspot/share/opto/superword.cpp | 21 +++++++++++++++------ src/hotspot/share/opto/vectornode.cpp | 3 +++ src/hotspot/share/opto/vectornode.hpp | 10 +++++++++- src/hotspot/share/runtime/vmStructs.cpp | 1 + 12 files changed, 82 insertions(+), 13 deletions(-) diff --git a/src/hotspot/cpu/x86/assembler_x86.cpp b/src/hotspot/cpu/x86/assembler_x86.cpp index 961ebb531be..b13a911398b 100644 --- a/src/hotspot/cpu/x86/assembler_x86.cpp +++ b/src/hotspot/cpu/x86/assembler_x86.cpp @@ -7449,6 +7449,27 @@ void Assembler::blendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMM emit_int8((unsigned char)(0xF0 & src2_enc<<4)); } +void Assembler::cmpps(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len) { + assert(VM_Version::supports_avx(), ""); + assert(!VM_Version::supports_evex(), ""); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false); + int encode = simd_prefix_and_encode(dst, nds, src, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); + emit_int8((unsigned char)0xC2); + emit_int8((unsigned char)(0xC0 | encode)); + emit_int8((unsigned char)(0xF & cop)); +} + +void Assembler::blendvps(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len) { + assert(VM_Version::supports_avx(), ""); + assert(!VM_Version::supports_evex(), ""); + InstructionAttr 
attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src1->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int8((unsigned char)0x4A); + emit_int8((unsigned char)(0xC0 | encode)); + int src2_enc = src2->encoding(); + emit_int8((unsigned char)(0xF0 & src2_enc<<4)); +} + void Assembler::vpblendd(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len) { assert(VM_Version::supports_avx2(), ""); InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false); diff --git a/src/hotspot/cpu/x86/assembler_x86.hpp b/src/hotspot/cpu/x86/assembler_x86.hpp index 2739cf3b5eb..dbab8fbcf52 100644 --- a/src/hotspot/cpu/x86/assembler_x86.hpp +++ b/src/hotspot/cpu/x86/assembler_x86.hpp @@ -2114,9 +2114,11 @@ private: // runtime code and native libraries. void vzeroupper(); - // AVX support for vectorized conditional move (double). The following two instructions used only coupled. + // AVX support for vectorized conditional move (float/double). Each cmp instruction below is only used together with its matching blendv instruction. 
void cmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len); void blendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len); + void cmpps(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len); + void blendvps(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len); void vpblendd(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len); protected: diff --git a/src/hotspot/cpu/x86/x86.ad b/src/hotspot/cpu/x86/x86.ad index 124e95a4dcd..b28e3215fe0 100644 --- a/src/hotspot/cpu/x86/x86.ad +++ b/src/hotspot/cpu/x86/x86.ad @@ -1263,6 +1263,7 @@ const bool Matcher::match_rule_supported(int opcode) { if (!VM_Version::supports_cx8()) ret_value = false; break; + case Op_CMoveVF: case Op_CMoveVD: if (UseAVX < 1 || UseAVX > 2) ret_value = false; @@ -1304,6 +1305,10 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen) { if ((vlen == 32) && (VM_Version::supports_avx512bw() == false)) ret_value = false; break; + case Op_CMoveVF: + if (vlen != 8) + ret_value = false; + break; case Op_CMoveVD: if (vlen != 4) ret_value = false; @@ -8170,6 +8175,22 @@ instruct vmul8D_mem(vecZ dst, vecZ src, memory mem) %{ ins_pipe( pipe_slow ); %} +instruct vcmov8F_reg(vecY dst, vecY src1, vecY src2, immI8 cop, cmpOp_vcmppd copnd) %{ + predicate(UseAVX > 0 && UseAVX < 3 && n->as_Vector()->length() == 8); + match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2))); + effect(TEMP dst, USE src1, USE src2); + format %{ "cmpps.$copnd $dst, $src1, $src2 ! vcmovevf, cond=$cop\n\t" + "blendvps $dst,$src1,$src2,$dst ! 
vcmovevf\n\t" + %} + ins_encode %{ + int vector_len = 1; + int cond = (Assembler::Condition)($copnd$$cmpcode); + __ cmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len); + __ blendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + instruct vcmov4D_reg(vecY dst, vecY src1, vecY src2, immI8 cop, cmpOp_vcmppd copnd) %{ predicate(UseAVX > 0 && UseAVX < 3 && n->as_Vector()->length() == 4); match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2))); diff --git a/src/hotspot/share/adlc/formssel.cpp b/src/hotspot/share/adlc/formssel.cpp index fafd0ce7b27..b72c25f6e0a 100644 --- a/src/hotspot/share/adlc/formssel.cpp +++ b/src/hotspot/share/adlc/formssel.cpp @@ -4164,7 +4164,7 @@ bool MatchRule::is_vector() const { "AddVB","AddVS","AddVI","AddVL","AddVF","AddVD", "SubVB","SubVS","SubVI","SubVL","SubVF","SubVD", "MulVS","MulVI","MulVL","MulVF","MulVD", - "CMoveVD", + "CMoveVD", "CMoveVF", "DivVF","DivVD", "AbsVF","AbsVD", "NegVF","NegVD", diff --git a/src/hotspot/share/opto/c2_globals.hpp b/src/hotspot/share/opto/c2_globals.hpp index 9f105299402..1d0e16532b3 100644 --- a/src/hotspot/share/opto/c2_globals.hpp +++ b/src/hotspot/share/opto/c2_globals.hpp @@ -195,6 +195,9 @@ product(bool, UseSubwordForMaxVector, true, \ "Use Subword Analysis to set maximum vector size") \ \ + product(bool, UseVectorCmov, false, \ + "Use Vectorized Cmov") \ + \ develop(intx, UnrollLimitForProfileCheck, 1, \ "Don't use profile_trip_cnt() to restrict unrolling until " \ "unrolling would push the number of unrolled iterations above " \ diff --git a/src/hotspot/share/opto/classes.hpp b/src/hotspot/share/opto/classes.hpp index 235021f8660..dcbee6b4f69 100644 --- a/src/hotspot/share/opto/classes.hpp +++ b/src/hotspot/share/opto/classes.hpp @@ -66,6 +66,7 @@ macro(ConstraintCast) macro(CMoveD) macro(CMoveVD) macro(CMoveF) +macro(CMoveVF) macro(CMoveI) macro(CMoveL) macro(CMoveP) diff --git 
a/src/hotspot/share/opto/loopopts.cpp b/src/hotspot/share/opto/loopopts.cpp index d6ddc5723fe..d7a46ada7cd 100644 --- a/src/hotspot/share/opto/loopopts.cpp +++ b/src/hotspot/share/opto/loopopts.cpp @@ -528,13 +528,12 @@ Node *PhaseIdealLoop::conditional_move( Node *region ) { BasicType bt = phi->type()->basic_type(); switch (bt) { case T_DOUBLE: + case T_FLOAT: if (C->use_cmove()) { continue; //TODO: maybe we want to add some cost } - case T_FLOAT: { cost += Matcher::float_cmove_cost(); // Could be very expensive break; - } case T_LONG: { cost += Matcher::long_cmove_cost(); // May encodes as 2 CMOV's } @@ -613,8 +612,9 @@ Node *PhaseIdealLoop::conditional_move( Node *region ) { } // Check for highly predictable branch. No point in CMOV'ing if // we are going to predict accurately all the time. - if (C->use_cmove() && cmp_op == Op_CmpD) ;//keep going - else if (iff->_prob < infrequent_prob || + if (C->use_cmove() && (cmp_op == Op_CmpF || cmp_op == Op_CmpD)) { + //keep going + } else if (iff->_prob < infrequent_prob || iff->_prob > (1.0f - infrequent_prob)) return NULL; diff --git a/src/hotspot/share/opto/matcher.cpp b/src/hotspot/share/opto/matcher.cpp index 19cdfb99584..b304bc3b3fa 100644 --- a/src/hotspot/share/opto/matcher.cpp +++ b/src/hotspot/share/opto/matcher.cpp @@ -2267,6 +2267,7 @@ void Matcher::find_shared( Node *n ) { case Op_CMoveL: case Op_CMoveN: case Op_CMoveP: + case Op_CMoveVF: case Op_CMoveVD: { // Restructure into a binary tree for Matching. 
It's possible that // we could move this code up next to the graph reshaping for IfNodes diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 39a525a1955..497f058a275 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -58,7 +58,7 @@ SuperWord::SuperWord(PhaseIdealLoop* phase) : _mem_slice_tail(arena(), 8, 0, NULL), // memory slice tails _node_info(arena(), 8, 0, SWNodeInfo::initial), // info needed per node _clone_map(phase->C->clone_map()), // map of nodes created in cloning - _cmovev_kit(_arena, this), // map to facilitate CMoveVD creation + _cmovev_kit(_arena, this), // map to facilitate CMoveV creation _align_to_ref(NULL), // memory reference to align vectors to _disjoint_ptrs(arena(), 8, 0, OrderedPair::initial), // runtime disambiguated pointer pairs _dg(_arena), // dependence graph @@ -511,8 +511,7 @@ void SuperWord::SLP_extract() { combine_packs(); construct_my_pack_map(); - - if (_do_vector_loop) { + if (UseVectorCmov) { merge_packs_to_cmovd(); } @@ -1249,8 +1248,8 @@ void SuperWord::set_alignment(Node* s1, Node* s2, int align) { //------------------------------data_size--------------------------- int SuperWord::data_size(Node* s) { - Node* use = NULL; //test if the node is a candidate for CMoveVD optimization, then return the size of CMov - if (_do_vector_loop) { + Node* use = NULL; //test if the node is a candidate for CMoveV optimization, then return the size of CMov + if (UseVectorCmov) { use = _cmovev_kit.is_Bool_candidate(s); if (use != NULL) { return data_size(use); @@ -1260,6 +1259,7 @@ int SuperWord::data_size(Node* s) { return data_size(use); } } + int bsize = type2aelembytes(velt_basic_type(s)); assert(bsize != 0, "valid size"); return bsize; @@ -1718,6 +1718,9 @@ Node_List* CMoveKit::make_cmovevd_pack(Node_List* cmovd_pk) { if (!cmovd->is_CMove()) { return NULL; } + if (cmovd->Opcode() != Op_CMoveF && cmovd->Opcode() != Op_CMoveD) { + return NULL; + } if 
(pack(cmovd) != NULL) { // already in the cmov pack return NULL; } @@ -2377,7 +2380,13 @@ void SuperWord::output() { } BasicType bt = velt_basic_type(n); const TypeVect* vt = TypeVect::make(bt, vlen); - vn = new CMoveVDNode(cc, src1, src2, vt); + assert(bt == T_FLOAT || bt == T_DOUBLE, "Only vectorization for FP cmovs is supported"); + if (bt == T_FLOAT) { + vn = new CMoveVFNode(cc, src1, src2, vt); + } else { + assert(bt == T_DOUBLE, "Expected double"); + vn = new CMoveVDNode(cc, src1, src2, vt); + } NOT_PRODUCT(if(is_trace_cmov()) {tty->print("SWPointer::output: created new CMove node %d: ", vn->_idx); vn->dump();}) } else if (opc == Op_FmaD || opc == Op_FmaF) { // Promote operands to vector diff --git a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp index 57b0ecf0e7b..5bf5623b558 100644 --- a/src/hotspot/share/opto/vectornode.cpp +++ b/src/hotspot/share/opto/vectornode.cpp @@ -92,6 +92,9 @@ int VectorNode::opcode(int sopc, BasicType bt) { case Op_FmaF: assert(bt == T_FLOAT, "must be"); return Op_FmaVF; + case Op_CMoveF: + assert(bt == T_FLOAT, "must be"); + return Op_CMoveVF; case Op_CMoveD: assert(bt == T_DOUBLE, "must be"); return Op_CMoveVD; diff --git a/src/hotspot/share/opto/vectornode.hpp b/src/hotspot/share/opto/vectornode.hpp index 93da7bc4b7c..a774eafc4df 100644 --- a/src/hotspot/share/opto/vectornode.hpp +++ b/src/hotspot/share/opto/vectornode.hpp @@ -277,8 +277,16 @@ public: virtual int Opcode() const; }; +//------------------------------CMoveVFNode-------------------------------------- +// Vector float conditional move +class CMoveVFNode : public VectorNode { +public: + CMoveVFNode(Node* in1, Node* in2, Node* in3, const TypeVect* vt) : VectorNode(in1, in2, in3, vt) {} + virtual int Opcode() const; +}; + //------------------------------CMoveVDNode-------------------------------------- -// Vector multiply double +// Vector double conditional move class CMoveVDNode : public VectorNode { public: CMoveVDNode(Node* in1, Node* 
in2, Node* in3, const TypeVect* vt) : VectorNode(in1, in2, in3, vt) {} diff --git a/src/hotspot/share/runtime/vmStructs.cpp b/src/hotspot/share/runtime/vmStructs.cpp index 9fb22233abb..28b5259c892 100644 --- a/src/hotspot/share/runtime/vmStructs.cpp +++ b/src/hotspot/share/runtime/vmStructs.cpp @@ -1991,6 +1991,7 @@ typedef PaddedEnd PaddedObjectMonitor; declare_c2_type(MulVDNode, VectorNode) \ declare_c2_type(FmaVDNode, VectorNode) \ declare_c2_type(FmaVFNode, VectorNode) \ + declare_c2_type(CMoveVFNode, VectorNode) \ declare_c2_type(CMoveVDNode, VectorNode) \ declare_c2_type(MulReductionVDNode, ReductionNode) \ declare_c2_type(DivVFNode, VectorNode) \