Igor Veresov 2015-11-09 22:43:30 +00:00
commit 51884084c1
25 changed files with 4872 additions and 2643 deletions


@ -1079,10 +1079,10 @@ source %{
// and for a volatile write we need
//
// stlr<x>
//
//
// Alternatively, we can implement them by pairing a normal
// load/store with a memory barrier. For a volatile read we need
//
//
// ldr<x>
// dmb ishld
//
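For orientation, here is a minimal C++ sketch of the two mappings this comment describes, using std::atomic as a stand-in for a Java volatile field (illustrative only; the names are assumptions, not part of the patch):

#include <atomic>

std::atomic<int> g_field;   // stand-in for a Java volatile field

// Volatile read: maps to ldar<x>, or alternatively a plain ldr<x> followed by dmb ishld.
int read_field() {
  return g_field.load(std::memory_order_seq_cst);
}

// Volatile write: maps to stlr<x>, or alternatively a plain str<x> paired with dmb barriers.
void write_field(int value) {
  g_field.store(value, std::memory_order_seq_cst);
}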
@ -1240,7 +1240,7 @@ source %{
// Alternatively, we can elide generation of the dmb instructions
// and plant the alternative CompareAndSwap macro-instruction
// sequence (which uses ldaxr<x>).
//
//
// Of course, the above only applies when we see these signature
// configurations. We still want to plant dmb instructions in any
// other cases where we may see a MemBarAcquire, MemBarRelease or
@ -1367,7 +1367,7 @@ source %{
opcode = parent->Opcode();
return opcode == Op_MemBarRelease;
}
// 2) card mark detection helper
// helper predicate which can be used to detect a volatile membar
@ -1383,7 +1383,7 @@ source %{
// true
//
// iii) the node's Mem projection feeds a StoreCM node.
bool is_card_mark_membar(const MemBarNode *barrier)
{
if (!UseG1GC && !(UseConcMarkSweepGC && UseCondCardMark)) {
@ -1402,7 +1402,7 @@ source %{
return true;
}
}
return false;
}
@ -1430,7 +1430,7 @@ source %{
// where
// || and \\ represent Ctl and Mem feeds via Proj nodes
// | \ and / indicate further routing of the Ctl and Mem feeds
//
//
// this is the graph we see for non-object stores. however, for a
// volatile Object store (StoreN/P) we may see other nodes below the
// leading membar because of the need for a GC pre- or post-write
@ -1592,7 +1592,7 @@ source %{
// ordering but neither will a releasing store (stlr). The latter
// guarantees that the object put is visible but does not guarantee
// that writes by other threads have also been observed.
//
//
// So, returning to the task of translating the object put and the
// leading/trailing membar nodes: what do the non-normal node graphs
// look like for these 2 special cases? and how can we determine the
@ -1731,7 +1731,7 @@ source %{
// | | | |
// C | M | M | M |
// \ | | /
// . . .
// . . .
// (post write subtree elided)
// . . .
// C \ M /
@ -1812,12 +1812,12 @@ source %{
// | | | / /
// | Region . . . Phi[M] _____/
// | / | /
// | | /
// | | /
// | . . . . . . | /
// | / | /
// Region | | Phi[M]
// | | | / Bot
// \ MergeMem
// \ MergeMem
// \ /
// MemBarVolatile
//
@ -1858,7 +1858,7 @@ source %{
// to a trailing barrier via a MergeMem. That feed is either direct
// (for CMS) or via 2 or 3 Phi nodes merging the leading barrier
// memory flow (for G1).
//
//
// The predicates controlling generation of instructions for store
// and barrier nodes employ a few simple helper functions (described
// below) which identify the presence or absence of all these
@ -2112,8 +2112,8 @@ source %{
x = x->in(MemNode::Memory);
} else {
// the merge should get its Bottom mem feed from the leading membar
x = mm->in(Compile::AliasIdxBot);
}
x = mm->in(Compile::AliasIdxBot);
}
// ensure this is a non control projection
if (!x->is_Proj() || x->is_CFG()) {
@ -2190,12 +2190,12 @@ source %{
// . . .
// |
// MemBarVolatile (card mark)
// | |
// | |
// | StoreCM
// | |
// | . . .
// Bot | /
// MergeMem
// Bot | /
// MergeMem
// |
// |
// MemBarVolatile {trailing}
@ -2203,10 +2203,10 @@ source %{
// 2)
// MemBarRelease/CPUOrder (leading)
// |
// |
// |
// |\ . . .
// | \ |
// | \ MemBarVolatile (card mark)
// | \ |
// | \ MemBarVolatile (card mark)
// | \ | |
// \ \ | StoreCM . . .
// \ \ |
@ -2231,7 +2231,7 @@ source %{
// | \ \ | StoreCM . . .
// | \ \ |
// \ \ Phi
// \ \ /
// \ \ /
// \ Phi
// \ /
// Phi . . .
@ -2506,7 +2506,7 @@ bool unnecessary_acquire(const Node *barrier)
return (x->is_Load() && x->as_Load()->is_acquire());
}
// now check for an unsafe volatile get
// need to check for
@ -2644,7 +2644,7 @@ bool needs_acquiring_load(const Node *n)
}
membar = child_membar(membar);
if (!membar || !membar->Opcode() == Op_MemBarCPUOrder) {
return false;
}
@ -2703,7 +2703,7 @@ bool unnecessary_volatile(const Node *n)
// first we check if this is part of a card mark. if so then we have
// to generate a StoreLoad barrier
if (is_card_mark_membar(mbvol)) {
return false;
}
@ -2769,7 +2769,7 @@ bool needs_releasing_store(const Node *n)
if (!is_card_mark_membar(mbvol)) {
return true;
}
// we found a card mark -- just make sure we have a trailing barrier
return (card_mark_to_trailing(mbvol) != NULL);
@ -2808,7 +2808,7 @@ bool needs_acquiring_load_exclusive(const Node *n)
assert(barrier->Opcode() == Op_MemBarCPUOrder,
"CAS not fed by cpuorder membar!");
MemBarNode *b = parent_membar(barrier);
assert ((b != NULL && b->Opcode() == Op_MemBarRelease),
"CAS not fed by cpuorder+release membar pair!");
@ -3463,6 +3463,17 @@ const bool Matcher::match_rule_supported(int opcode) {
return true; // Per default match rules are supported.
}
const bool Matcher::match_rule_supported_vector(int opcode, int vlen) {
// TODO
// identify extra cases that we might want to provide match rules for
// e.g. Op_ vector nodes and other intrinsics while guarding with vlen
bool ret_value = match_rule_supported(opcode);
// Add rules here.
return ret_value; // Per default match rules are supported.
}
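As a hedged illustration of the kind of vlen-guarded rule the TODO above anticipates, the sketch below mirrors the Op_CMoveD/vlen check that the vectornode.cpp hunk at the end of this commit moves behind this hook; the surrounding switch structure is an assumption for the sketch, not part of the patch:

const bool Matcher::match_rule_supported_vector(int opcode, int vlen) {
  bool ret_value = match_rule_supported(opcode);
  if (ret_value) {
    switch (opcode) {
      case Op_CMoveD:
        // hypothetical guard: only a 4-element (256-bit) double CMove is supported
        ret_value = (vlen == 4);
        break;
      default:
        break;
    }
  }
  return ret_value;
}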
const int Matcher::float_pressure(int default_pressure_threshold) {
return default_pressure_threshold;
}
@ -4663,7 +4674,7 @@ encode %{
call = __ trampoline_call(Address(addr, relocInfo::static_call_type), &cbuf);
}
if (call == NULL) {
ciEnv::current()->record_failure("CodeCache is full");
ciEnv::current()->record_failure("CodeCache is full");
return;
}
@ -4671,7 +4682,7 @@ encode %{
// Emit stub for static call
address stub = CompiledStaticCall::emit_to_interp_stub(cbuf);
if (stub == NULL) {
ciEnv::current()->record_failure("CodeCache is full");
ciEnv::current()->record_failure("CodeCache is full");
return;
}
}
@ -4681,7 +4692,7 @@ encode %{
MacroAssembler _masm(&cbuf);
address call = __ ic_call((address)$meth$$method);
if (call == NULL) {
ciEnv::current()->record_failure("CodeCache is full");
ciEnv::current()->record_failure("CodeCache is full");
return;
}
%}
@ -4706,7 +4717,7 @@ encode %{
if (cb) {
address call = __ trampoline_call(Address(entry, relocInfo::runtime_call_type));
if (call == NULL) {
ciEnv::current()->record_failure("CodeCache is full");
ciEnv::current()->record_failure("CodeCache is full");
return;
}
} else {


@ -73,6 +73,7 @@ define_pd_global(bool, UseCISCSpill, true);
define_pd_global(bool, OptoScheduling, false);
define_pd_global(bool, OptoBundling, false);
define_pd_global(bool, OptoRegScheduling, false);
define_pd_global(bool, SuperWordLoopUnrollAnalysis, false);
define_pd_global(intx, ReservedCodeCacheSize, 48*M);
define_pd_global(intx, NonProfiledCodeHeapSize, 21*M);


@ -61,6 +61,7 @@ define_pd_global(bool, OptoPeephole, false);
define_pd_global(bool, UseCISCSpill, false);
define_pd_global(bool, OptoBundling, false);
define_pd_global(bool, OptoRegScheduling, false);
define_pd_global(bool, SuperWordLoopUnrollAnalysis, false);
// GL:
// Detected a problem with unscaled compressed oops and
// narrow_oop_use_complex_address() == false.


@ -2064,6 +2064,17 @@ const bool Matcher::match_rule_supported(int opcode) {
return true; // Per default match rules are supported.
}
const bool Matcher::match_rule_supported_vector(int opcode, int vlen) {
// TODO
// identify extra cases that we might want to provide match rules for
// e.g. Op_ vector nodes and other intrinsics while guarding with vlen
bool ret_value = match_rule_supported(opcode);
// Add rules here.
return ret_value; // Per default match rules are supported.
}
const int Matcher::float_pressure(int default_pressure_threshold) {
return default_pressure_threshold;
}
@ -3416,7 +3427,7 @@ encode %{
// The stub for call to interpreter.
address stub = CompiledStaticCall::emit_to_interp_stub(cbuf);
if (stub == NULL) {
ciEnv::current()->record_failure("CodeCache is full");
ciEnv::current()->record_failure("CodeCache is full");
return;
}
}
@ -3465,7 +3476,7 @@ encode %{
// The stub for call to interpreter.
address stub = CompiledStaticCall::emit_to_interp_stub(cbuf);
if (stub == NULL) {
ciEnv::current()->record_failure("CodeCache is full");
ciEnv::current()->record_failure("CodeCache is full");
return;
}
@ -6912,7 +6923,7 @@ instruct decodeN_Disjoint_isel_Ex(iRegPdst dst, iRegNsrc src, flagsReg crx) %{
n_compare->_opnds[0] = op_crx;
n_compare->_opnds[1] = op_src;
n_compare->_opnds[2] = new immN_0Oper(TypeNarrowOop::NULL_PTR);
decodeN_mergeDisjointNode *n2 = new decodeN_mergeDisjointNode();
n2->add_req(n_region, n_src, n1);
n2->_opnds[0] = op_dst;
@ -10589,7 +10600,7 @@ instruct cmpP_reg_imm16(flagsReg crx, iRegPsrc src1, immL16 src2) %{
instruct cmpFUnordered_reg_reg(flagsReg crx, regF src1, regF src2) %{
// Needs matchrule, see cmpDUnordered.
match(Set crx (CmpF src1 src2));
match(Set crx (CmpF src1 src2));
// no match-rule, false predicate
predicate(false);
@ -10698,13 +10709,13 @@ instruct cmpF3_reg_reg_ExEx(iRegIdst dst, regF src1, regF src2) %{
%}
instruct cmpDUnordered_reg_reg(flagsReg crx, regD src1, regD src2) %{
// Needs matchrule so that ideal opcode is Cmp. This causes gcm to place the
// node right before the conditional move using it.
// Needs matchrule so that ideal opcode is Cmp. This causes gcm to place the
// node right before the conditional move using it.
// In jck test api/java_awt/geom/QuadCurve2DFloat/index.html#SetCurveTesttestCase7,
// compilation of java.awt.geom.RectangularShape::getBounds()Ljava/awt/Rectangle
// crashed in register allocation where the flags Reg between cmpDUnordered and a
// conditional move was supposed to be spilled.
match(Set crx (CmpD src1 src2));
match(Set crx (CmpD src1 src2));
// False predicate, shall not be matched.
predicate(false);


@ -65,6 +65,7 @@ define_pd_global(bool, UseCISCSpill, false);
define_pd_global(bool, OptoBundling, false);
define_pd_global(bool, OptoScheduling, true);
define_pd_global(bool, OptoRegScheduling, false);
define_pd_global(bool, SuperWordLoopUnrollAnalysis, false);
#ifdef _LP64
// We need to make sure that all generated code is within


@ -1860,6 +1860,17 @@ const bool Matcher::match_rule_supported(int opcode) {
return true; // Per default match rules are supported.
}
const bool Matcher::match_rule_supported_vector(int opcode, int vlen) {
// TODO
// identify extra cases that we might want to provide match rules for
// e.g. Op_ vector nodes and other intrinsics while guarding with vlen
bool ret_value = match_rule_supported(opcode);
// Add rules here.
return ret_value; // Per default match rules are supported.
}
const int Matcher::float_pressure(int default_pressure_threshold) {
return default_pressure_threshold;
}
@ -1905,7 +1916,7 @@ const bool Matcher::misaligned_vectors_ok() {
}
// Current (2013) SPARC platforms need to read original key
// to construct decryption expanded key
// to construct decryption expanded key
const bool Matcher::pass_original_key_for_aes() {
return true;
}
@ -2612,7 +2623,7 @@ encode %{
if (stub == NULL && !(TraceJumps && Compile::current()->in_scratch_emit_size())) {
ciEnv::current()->record_failure("CodeCache is full");
return;
}
}
}
%}
@ -3132,10 +3143,10 @@ ins_attrib ins_size(32); // Required size attribute (in bits)
// AVOID_NONE - instruction can be placed anywhere
// AVOID_BEFORE - instruction cannot be placed after an
// instruction with MachNode::AVOID_AFTER
// AVOID_AFTER - the next instruction cannot be the one
// AVOID_AFTER - the next instruction cannot be the one
// with MachNode::AVOID_BEFORE
// AVOID_BEFORE_AND_AFTER - BEFORE and AFTER attributes at
// the same time
// AVOID_BEFORE_AND_AFTER - BEFORE and AFTER attributes at
// the same time
ins_attrib ins_avoid_back_to_back(MachNode::AVOID_NONE);
ins_attrib ins_short_branch(0); // Required flag: is this instruction a

File diff suppressed because it is too large


@ -438,6 +438,8 @@ class ArrayAddress VALUE_OBJ_CLASS_SPEC {
};
class InstructionAttr;
// 64-bit reflects the fxsave size, which is 512 bytes, plus the new xsave area on EVEX, which is another 2176 bytes
// See fxsave and xsave(EVEX enabled) documentation for layout
const int FPUStateSizeInWords = NOT_LP64(27) LP64_ONLY(2688 / wordSize);
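A standalone sketch of the arithmetic behind the 2688 figure above, assuming the 8-byte word size of an LP64 build:

#include <cstddef>

constexpr std::size_t kFxSaveBytes    = 512;   // legacy fxsave image
constexpr std::size_t kEvexXsaveBytes = 2176;  // additional xsave state with EVEX enabled
constexpr std::size_t kWordSizeLP64   = 8;     // LP64 assumption

static_assert(kFxSaveBytes + kEvexXsaveBytes == 2688, "matches the constant above");
static_assert(2688 / kWordSizeLP64 == 336, "FPUStateSizeInWords on LP64");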
@ -568,7 +570,8 @@ class Assembler : public AbstractAssembler {
EVEX_8bit = 0,
EVEX_16bit = 1,
EVEX_32bit = 2,
EVEX_64bit = 3
EVEX_64bit = 3,
EVEX_NObit = 4
};
enum WhichOperand {
@ -598,16 +601,12 @@ class Assembler : public AbstractAssembler {
private:
int _evex_encoding;
int _input_size_in_bits;
int _avx_vector_len;
int _tuple_type;
bool _is_evex_instruction;
bool _legacy_mode_bw;
bool _legacy_mode_dq;
bool _legacy_mode_vl;
bool _legacy_mode_vlbw;
bool _instruction_uses_vl;
class InstructionAttr *_attributes;
// 64bit prefixes
int prefix_and_encode(int reg_enc, bool byteinst = false);
@ -637,181 +636,30 @@ private:
int rex_prefix_and_encode(int dst_enc, int src_enc,
VexSimdPrefix pre, VexOpcode opc, bool rex_w);
void vex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w,
int nds_enc, VexSimdPrefix pre, VexOpcode opc,
int vector_len);
void vex_prefix(bool vex_r, bool vex_b, bool vex_x, int nds_enc, VexSimdPrefix pre, VexOpcode opc);
void evex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w, bool evex_r, bool evex_v,
int nds_enc, VexSimdPrefix pre, VexOpcode opc,
bool is_extended_context, bool is_merge_context,
int vector_len, bool no_mask_reg );
void evex_prefix(bool vex_r, bool vex_b, bool vex_x, bool evex_r, bool evex_v,
int nds_enc, VexSimdPrefix pre, VexOpcode opc);
void vex_prefix(Address adr, int nds_enc, int xreg_enc,
VexSimdPrefix pre, VexOpcode opc,
bool vex_w, int vector_len,
bool legacy_mode = false, bool no_mask_reg = false);
void vex_prefix(XMMRegister dst, XMMRegister nds, Address src,
VexSimdPrefix pre, int vector_len = AVX_128bit,
bool no_mask_reg = false, bool legacy_mode = false) {
int dst_enc = dst->encoding();
int nds_enc = nds->is_valid() ? nds->encoding() : 0;
vex_prefix(src, nds_enc, dst_enc, pre, VEX_OPCODE_0F, false, vector_len, legacy_mode, no_mask_reg);
}
void vex_prefix_q(XMMRegister dst, XMMRegister nds, Address src,
VexSimdPrefix pre, int vector_len = AVX_128bit,
bool no_mask_reg = false) {
int dst_enc = dst->encoding();
int nds_enc = nds->is_valid() ? nds->encoding() : 0;
vex_prefix(src, nds_enc, dst_enc, pre, VEX_OPCODE_0F, true, vector_len, false, no_mask_reg);
}
void vex_prefix_0F38(Register dst, Register nds, Address src, bool no_mask_reg = false) {
bool vex_w = false;
int vector_len = AVX_128bit;
vex_prefix(src, nds->encoding(), dst->encoding(),
VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w,
vector_len, no_mask_reg);
}
void vex_prefix_0F38_legacy(Register dst, Register nds, Address src, bool no_mask_reg = false) {
bool vex_w = false;
int vector_len = AVX_128bit;
vex_prefix(src, nds->encoding(), dst->encoding(),
VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w,
vector_len, true, no_mask_reg);
}
void vex_prefix_0F38_q(Register dst, Register nds, Address src, bool no_mask_reg = false) {
bool vex_w = true;
int vector_len = AVX_128bit;
vex_prefix(src, nds->encoding(), dst->encoding(),
VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w,
vector_len, no_mask_reg);
}
void vex_prefix_0F38_q_legacy(Register dst, Register nds, Address src, bool no_mask_reg = false) {
bool vex_w = true;
int vector_len = AVX_128bit;
vex_prefix(src, nds->encoding(), dst->encoding(),
VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w,
vector_len, true, no_mask_reg);
}
InstructionAttr *attributes);
int vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc,
VexSimdPrefix pre, VexOpcode opc,
bool vex_w, int vector_len,
bool legacy_mode, bool no_mask_reg);
InstructionAttr *attributes);
int vex_prefix_0F38_and_encode(Register dst, Register nds, Register src, bool no_mask_reg = false) {
bool vex_w = false;
int vector_len = AVX_128bit;
return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector_len,
false, no_mask_reg);
}
void simd_prefix(XMMRegister xreg, XMMRegister nds, Address adr, VexSimdPrefix pre,
VexOpcode opc, InstructionAttr *attributes);
int vex_prefix_0F38_and_encode_legacy(Register dst, Register nds, Register src, bool no_mask_reg = false) {
bool vex_w = false;
int vector_len = AVX_128bit;
return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector_len,
true, no_mask_reg);
}
int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src, VexSimdPrefix pre,
VexOpcode opc, InstructionAttr *attributes);
int vex_prefix_0F38_and_encode_q(Register dst, Register nds, Register src, bool no_mask_reg = false) {
bool vex_w = true;
int vector_len = AVX_128bit;
return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector_len,
false, no_mask_reg);
}
int kreg_prefix_and_encode(KRegister dst, KRegister nds, KRegister src, VexSimdPrefix pre,
VexOpcode opc, InstructionAttr *attributes);
int vex_prefix_0F38_and_encode_q_legacy(Register dst, Register nds, Register src, bool no_mask_reg = false) {
bool vex_w = true;
int vector_len = AVX_128bit;
return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector_len,
true, no_mask_reg);
}
int vex_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src,
VexSimdPrefix pre, int vector_len = AVX_128bit,
VexOpcode opc = VEX_OPCODE_0F, bool legacy_mode = false,
bool no_mask_reg = false) {
int src_enc = src->encoding();
int dst_enc = dst->encoding();
int nds_enc = nds->is_valid() ? nds->encoding() : 0;
return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, false, vector_len, legacy_mode, no_mask_reg);
}
void simd_prefix(XMMRegister xreg, XMMRegister nds, Address adr,
VexSimdPrefix pre, bool no_mask_reg, VexOpcode opc = VEX_OPCODE_0F,
bool rex_w = false, int vector_len = AVX_128bit, bool legacy_mode = false);
void simd_prefix(XMMRegister dst, Address src, VexSimdPrefix pre,
bool no_mask_reg, VexOpcode opc = VEX_OPCODE_0F) {
simd_prefix(dst, xnoreg, src, pre, no_mask_reg, opc);
}
void simd_prefix(Address dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg) {
simd_prefix(src, dst, pre, no_mask_reg);
}
void simd_prefix_q(XMMRegister dst, XMMRegister nds, Address src,
VexSimdPrefix pre, bool no_mask_reg = false) {
bool rex_w = true;
simd_prefix(dst, nds, src, pre, no_mask_reg, VEX_OPCODE_0F, rex_w);
}
int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src,
VexSimdPrefix pre, bool no_mask_reg,
VexOpcode opc = VEX_OPCODE_0F,
bool rex_w = false, int vector_len = AVX_128bit,
bool legacy_mode = false);
int kreg_prefix_and_encode(KRegister dst, KRegister nds, KRegister src,
VexSimdPrefix pre, bool no_mask_reg,
VexOpcode opc = VEX_OPCODE_0F,
bool rex_w = false, int vector_len = AVX_128bit);
int kreg_prefix_and_encode(KRegister dst, KRegister nds, Register src,
VexSimdPrefix pre, bool no_mask_reg,
VexOpcode opc = VEX_OPCODE_0F,
bool rex_w = false, int vector_len = AVX_128bit);
// Move/convert 32-bit integer value.
int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, Register src,
VexSimdPrefix pre, bool no_mask_reg) {
// It is OK to cast from Register to XMMRegister to pass argument here
// since only encoding is used in simd_prefix_and_encode() and number of
// Gen and Xmm registers are the same.
return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre, no_mask_reg, VEX_OPCODE_0F);
}
int simd_prefix_and_encode(XMMRegister dst, Register src, VexSimdPrefix pre, bool no_mask_reg) {
return simd_prefix_and_encode(dst, xnoreg, src, pre, no_mask_reg);
}
int simd_prefix_and_encode(Register dst, XMMRegister src,
VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F,
bool no_mask_reg = false) {
return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, no_mask_reg, opc);
}
// Move/convert 64-bit integer value.
int simd_prefix_and_encode_q(XMMRegister dst, XMMRegister nds, Register src,
VexSimdPrefix pre, bool no_mask_reg = false) {
bool rex_w = true;
return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre, no_mask_reg, VEX_OPCODE_0F, rex_w);
}
int simd_prefix_and_encode_q(XMMRegister dst, Register src, VexSimdPrefix pre, bool no_mask_reg) {
return simd_prefix_and_encode_q(dst, xnoreg, src, pre, no_mask_reg);
}
int simd_prefix_and_encode_q(Register dst, XMMRegister src,
VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F,
bool no_mask_reg = false) {
bool rex_w = true;
return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, no_mask_reg, opc, rex_w);
}
int kreg_prefix_and_encode(KRegister dst, KRegister nds, Register src, VexSimdPrefix pre,
VexOpcode opc, InstructionAttr *attributes);
// Helper functions for groups of instructions
void emit_arith_b(int op1, int op2, Register dst, int imm8);
@ -821,27 +669,6 @@ private:
void emit_arith_imm32(int op1, int op2, Register dst, int32_t imm32);
void emit_arith(int op1, int op2, Register dst, Register src);
void emit_simd_arith(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool no_mask_reg = false, bool legacy_mode = false);
void emit_simd_arith_q(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool no_mask_reg = false);
void emit_simd_arith(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg = false, bool legacy_mode = false);
void emit_simd_arith_q(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg = false);
void emit_simd_arith_nonds(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool no_mask_reg = false);
void emit_simd_arith_nonds_q(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool no_mask_reg = false);
void emit_simd_arith_nonds(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg = false, bool legacy_mode = false);
void emit_simd_arith_nonds_q(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg = false);
void emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds,
Address src, VexSimdPrefix pre, int vector_len,
bool no_mask_reg = false, bool legacy_mode = false);
void emit_vex_arith_q(int opcode, XMMRegister dst, XMMRegister nds,
Address src, VexSimdPrefix pre, int vector_len,
bool no_mask_reg = false);
void emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds,
XMMRegister src, VexSimdPrefix pre, int vector_len,
bool no_mask_reg = false, bool legacy_mode = false);
void emit_vex_arith_q(int opcode, XMMRegister dst, XMMRegister nds,
XMMRegister src, VexSimdPrefix pre, int vector_len,
bool no_mask_reg = false);
bool emit_compressed_disp_byte(int &disp);
void emit_operand(Register reg,
@ -986,18 +813,16 @@ private:
// belong in macro assembler but there is no need for both varieties to exist
void init_attributes(void) {
_evex_encoding = 0;
_input_size_in_bits = 0;
_avx_vector_len = AVX_NoVec;
_tuple_type = EVEX_ETUP;
_is_evex_instruction = false;
_legacy_mode_bw = (VM_Version::supports_avx512bw() == false);
_legacy_mode_dq = (VM_Version::supports_avx512dq() == false);
_legacy_mode_vl = (VM_Version::supports_avx512vl() == false);
_legacy_mode_vlbw = (VM_Version::supports_avx512vlbw() == false);
_instruction_uses_vl = false;
_attributes = NULL;
}
void set_attributes(InstructionAttr *attributes) { _attributes = attributes; }
void clear_attributes(void) { _attributes = NULL; }
void lea(Register dst, Address src);
void mov(Register dst, Register src);
@ -2106,12 +1931,12 @@ private:
void vextracti128h(Address dst, XMMRegister src);
// Copy low 256bit into high 256bit of ZMM registers.
void vinserti64x4h(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vinsertf64x4h(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vextracti64x4h(XMMRegister dst, XMMRegister src);
void vextractf64x4h(XMMRegister dst, XMMRegister src);
void vextractf64x4h(Address dst, XMMRegister src);
void vinsertf64x4h(XMMRegister dst, Address src);
void vinserti64x4h(XMMRegister dst, XMMRegister nds, XMMRegister src, int value);
void vinsertf64x4h(XMMRegister dst, XMMRegister nds, XMMRegister src, int value);
void vextracti64x4h(XMMRegister dst, XMMRegister src, int value);
void vextractf64x4h(XMMRegister dst, XMMRegister src, int value);
void vextractf64x4h(Address dst, XMMRegister src, int value);
void vinsertf64x4h(XMMRegister dst, Address src, int value);
// Copy targeted 128bit segments of the ZMM registers
void vextracti64x2h(XMMRegister dst, XMMRegister src, int value);
@ -2173,4 +1998,95 @@ private:
};
// The Intel x86/Amd64 Assembler attributes: All fields enclosed here are to guide encoding level decisions.
// Specific set functions are for specialized use, else defaults or whatever was supplied to object construction
// are applied.
class InstructionAttr {
public:
InstructionAttr(
int vector_len,
bool rex_vex_w,
bool legacy_mode,
bool no_reg_mask,
bool uses_vl)
:
_avx_vector_len(vector_len),
_rex_vex_w(rex_vex_w),
_legacy_mode(legacy_mode),
_no_reg_mask(no_reg_mask),
_uses_vl(uses_vl),
_tuple_type(Assembler::EVEX_ETUP),
_input_size_in_bits(Assembler::EVEX_NObit),
_is_evex_instruction(false),
_evex_encoding(0),
_is_clear_context(false),
_is_extended_context(false),
_current_assembler(NULL) {
if (UseAVX < 3) _legacy_mode = true;
}
~InstructionAttr() {
if (_current_assembler != NULL) {
_current_assembler->clear_attributes();
}
_current_assembler = NULL;
}
private:
int _avx_vector_len;
bool _rex_vex_w;
bool _legacy_mode;
bool _no_reg_mask;
bool _uses_vl;
int _tuple_type;
int _input_size_in_bits;
bool _is_evex_instruction;
int _evex_encoding;
bool _is_clear_context;
bool _is_extended_context;
Assembler *_current_assembler;
public:
// query functions for field accessors
int get_vector_len(void) const { return _avx_vector_len; }
bool is_rex_vex_w(void) const { return _rex_vex_w; }
bool is_legacy_mode(void) const { return _legacy_mode; }
bool is_no_reg_mask(void) const { return _no_reg_mask; }
bool uses_vl(void) const { return _uses_vl; }
int get_tuple_type(void) const { return _tuple_type; }
int get_input_size(void) const { return _input_size_in_bits; }
int is_evex_instruction(void) const { return _is_evex_instruction; }
int get_evex_encoding(void) const { return _evex_encoding; }
bool is_clear_context(void) const { return _is_clear_context; }
bool is_extended_context(void) const { return _is_extended_context; }
// Set the vector len manually
void set_vector_len(int vector_len) { _avx_vector_len = vector_len; }
// Set the instruction to be encoded in AVX mode
void set_is_legacy_mode(void) { _legacy_mode = true; }
// Set the current instruction to be encoded as an EVEX instruction
void set_is_evex_instruction(void) { _is_evex_instruction = true; }
// Internal encoding data used in compressed immediate offset programming
void set_evex_encoding(int value) { _evex_encoding = value; }
// Set the Evex.Z field to be used to clear all non directed XMM/YMM/ZMM components
void set_is_clear_context(void) { _is_clear_context = true; }
// Map back to current assembler so that we can manage object level association
void set_current_assembler(Assembler *current_assembler) { _current_assembler = current_assembler; }
// Address modifiers used for compressed displacement calculation
void set_address_attributes(int tuple_type, int input_size_in_bits) {
if (VM_Version::supports_evex()) {
_tuple_type = tuple_type;
_input_size_in_bits = input_size_in_bits;
}
}
};
#endif // CPU_X86_VM_ASSEMBLER_X86_HPP
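To make the intended use of this attributes object concrete, here is a minimal hedged sketch of how an emitter might drive it; the argument values are illustrative, and the EVEX_FV tuple-type constant is an assumption not shown in this diff:

// Illustrative only: configure attributes for a hypothetical 512-bit EVEX instruction.
void emit_example_op(Assembler* masm) {
  InstructionAttr attributes(Assembler::AVX_512bit, /* rex_vex_w */ false,
                             /* legacy_mode */ false, /* no_reg_mask */ false,
                             /* uses_vl */ true);
  // Address modifiers feed the compressed-displacement calculation for memory operands.
  attributes.set_address_attributes(/* tuple_type */ Assembler::EVEX_FV,
                                    /* input_size_in_bits */ Assembler::EVEX_32bit);
  // The new-style prefix helpers then take the attributes object instead of the old
  // flag arguments (vector_len, legacy_mode, no_mask_reg, ...), e.g.:
  //   masm->vex_prefix(adr, nds_enc, xreg_enc, pre, opc, &attributes);
}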


@ -3714,7 +3714,7 @@ void LIR_Assembler::negate(LIR_Opr left, LIR_Opr dest) {
if (left->as_xmm_float_reg() != dest->as_xmm_float_reg()) {
__ movflt(dest->as_xmm_float_reg(), left->as_xmm_float_reg());
}
if (UseAVX > 1) {
if (UseAVX > 0) {
__ vnegatess(dest->as_xmm_float_reg(), dest->as_xmm_float_reg(),
ExternalAddress((address)float_signflip_pool));
} else {
@ -3725,7 +3725,7 @@ void LIR_Assembler::negate(LIR_Opr left, LIR_Opr dest) {
if (left->as_xmm_double_reg() != dest->as_xmm_double_reg()) {
__ movdbl(dest->as_xmm_double_reg(), left->as_xmm_double_reg());
}
if (UseAVX > 1) {
if (UseAVX > 0) {
__ vnegatesd(dest->as_xmm_double_reg(), dest->as_xmm_double_reg(),
ExternalAddress((address)double_signflip_pool));
} else {


@ -84,6 +84,7 @@ define_pd_global(bool, UseCISCSpill, true);
define_pd_global(bool, OptoScheduling, false);
define_pd_global(bool, OptoBundling, false);
define_pd_global(bool, OptoRegScheduling, true);
define_pd_global(bool, SuperWordLoopUnrollAnalysis, true);
define_pd_global(intx, ReservedCodeCacheSize, 48*M);
define_pd_global(intx, NonProfiledCodeHeapSize, 21*M);


@ -58,6 +58,4 @@ void Compile::pd_compiler2_init() {
OptoReg::invalidate(i);
}
}
SuperWordLoopUnrollAnalysis = true;
}

File diff suppressed because it is too large


@ -962,10 +962,15 @@ public:
void divss(XMMRegister dst, AddressLiteral src);
// Move Unaligned Double Quadword
void movdqu(Address dst, XMMRegister src) { Assembler::movdqu(dst, src); }
void movdqu(XMMRegister dst, Address src) { Assembler::movdqu(dst, src); }
void movdqu(XMMRegister dst, XMMRegister src) { Assembler::movdqu(dst, src); }
void movdqu(Address dst, XMMRegister src);
void movdqu(XMMRegister dst, Address src);
void movdqu(XMMRegister dst, XMMRegister src);
void movdqu(XMMRegister dst, AddressLiteral src);
// AVX Unaligned forms
void vmovdqu(Address dst, XMMRegister src);
void vmovdqu(XMMRegister dst, Address src);
void vmovdqu(XMMRegister dst, XMMRegister src);
void vmovdqu(XMMRegister dst, AddressLiteral src);
// Move Aligned Double Quadword
void movdqa(XMMRegister dst, Address src) { Assembler::movdqa(dst, src); }
@ -1024,12 +1029,12 @@ public:
void ucomisd(XMMRegister dst, AddressLiteral src);
// Bitwise Logical XOR of Packed Double-Precision Floating-Point Values
void xorpd(XMMRegister dst, XMMRegister src) { Assembler::xorpd(dst, src); }
void xorpd(XMMRegister dst, XMMRegister src);
void xorpd(XMMRegister dst, Address src) { Assembler::xorpd(dst, src); }
void xorpd(XMMRegister dst, AddressLiteral src);
// Bitwise Logical XOR of Packed Single-Precision Floating-Point Values
void xorps(XMMRegister dst, XMMRegister src) { Assembler::xorps(dst, src); }
void xorps(XMMRegister dst, XMMRegister src);
void xorps(XMMRegister dst, Address src) { Assembler::xorps(dst, src); }
void xorps(XMMRegister dst, AddressLiteral src);
@ -1047,6 +1052,39 @@ public:
void vaddss(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vaddss(dst, nds, src); }
void vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src);
void vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len);
void vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len);
void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
void vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
void vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
void vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len);
void vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
void vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len);
void vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
void vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len);
void punpcklbw(XMMRegister dst, XMMRegister src);
void punpcklbw(XMMRegister dst, Address src) { Assembler::punpcklbw(dst, src); }
void pshuflw(XMMRegister dst, XMMRegister src, int mode);
void pshuflw(XMMRegister dst, Address src, int mode) { Assembler::pshuflw(dst, src, mode); }
void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vandpd(dst, nds, src, vector_len); }
void vandpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vandpd(dst, nds, src, vector_len); }
void vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len);


@ -192,31 +192,22 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
}
} else if(UseSSE >= 2) {
// Save whole 128bit (16 bytes) XMM registers
if (VM_Version::supports_avx512novl()) {
for (int n = 0; n < num_xmm_regs; n++) {
__ vextractf32x4h(Address(rsp, off*wordSize), as_XMMRegister(n), 0);
off += delta;
}
} else {
for (int n = 0; n < num_xmm_regs; n++) {
__ movdqu(Address(rsp, off*wordSize), as_XMMRegister(n));
off += delta;
}
for (int n = 0; n < num_xmm_regs; n++) {
__ movdqu(Address(rsp, off*wordSize), as_XMMRegister(n));
off += delta;
}
}
if (vect_words > 0) {
assert(vect_words*wordSize == 128, "");
__ subptr(rsp, 128); // Save upper half of YMM registers
off = 0;
for (int n = 0; n < num_xmm_regs; n++) {
__ vextractf128h(Address(rsp, off++*16), as_XMMRegister(n));
__ vextractf128h(Address(rsp, n*16), as_XMMRegister(n));
}
if (UseAVX > 2) {
__ subptr(rsp, 256); // Save upper half of ZMM registers
off = 0;
for (int n = 0; n < num_xmm_regs; n++) {
__ vextractf64x4h(Address(rsp, off++*32), as_XMMRegister(n));
__ vextractf64x4h(Address(rsp, n*32), as_XMMRegister(n), 1);
}
}
}
@ -285,31 +276,23 @@ void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_ve
off += delta;
}
} else if (UseSSE >= 2) {
if (VM_Version::supports_avx512novl()) {
for (int n = 0; n < num_xmm_regs; n++) {
__ vinsertf32x4h(as_XMMRegister(n), Address(rsp, off*wordSize+additional_frame_bytes), 0);
off += delta;
}
} else {
for (int n = 0; n < num_xmm_regs; n++) {
__ movdqu(as_XMMRegister(n), Address(rsp, off*wordSize+additional_frame_bytes));
off += delta;
}
for (int n = 0; n < num_xmm_regs; n++) {
__ movdqu(as_XMMRegister(n), Address(rsp, off*wordSize+additional_frame_bytes));
off += delta;
}
}
if (restore_vectors) {
assert(additional_frame_bytes == 128, "");
if (UseAVX > 2) {
off = 0;
// Restore upper half of ZMM registers.
for (int n = 0; n < num_xmm_regs; n++) {
__ vinsertf64x4h(as_XMMRegister(n), Address(rsp, off++*32));
__ vinsertf64x4h(as_XMMRegister(n), Address(rsp, n*32), 1);
}
__ addptr(rsp, additional_frame_bytes*2); // Save upper half of ZMM registers
}
// Restore upper half of YMM registers.
assert(additional_frame_bytes == 128, "");
off = 0;
for (int n = 0; n < num_xmm_regs; n++) {
__ vinsertf128h(as_XMMRegister(n), Address(rsp, off++*16));
__ vinsertf128h(as_XMMRegister(n), Address(rsp, n*16));
}
__ addptr(rsp, additional_frame_bytes); // Save upper half of YMM registers
}


@ -72,45 +72,28 @@ class SimpleRuntimeFrame {
class RegisterSaver {
// Capture info about frame layout. Layout offsets are in jint
// units because compiler frame slots are jints.
#define HALF_ZMM_BANK_WORDS 128
#define XSAVE_AREA_BEGIN 160
#define XSAVE_AREA_YMM_BEGIN 576
#define XSAVE_AREA_ZMM_BEGIN 1152
#define XSAVE_AREA_UPPERBANK 1664
#define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
#define DEF_YMM_OFFS(regnum) ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
#define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
enum layout {
fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
xmm_off = fpu_state_off + 160/BytesPerInt, // offset in fxsave save area
xmm_off = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area
DEF_XMM_OFFS(0),
DEF_XMM_OFFS(1),
DEF_XMM_OFFS(2),
DEF_XMM_OFFS(3),
DEF_XMM_OFFS(4),
DEF_XMM_OFFS(5),
DEF_XMM_OFFS(6),
DEF_XMM_OFFS(7),
DEF_XMM_OFFS(8),
DEF_XMM_OFFS(9),
DEF_XMM_OFFS(10),
DEF_XMM_OFFS(11),
DEF_XMM_OFFS(12),
DEF_XMM_OFFS(13),
DEF_XMM_OFFS(14),
DEF_XMM_OFFS(15),
zmm_off = fpu_state_off + ((FPUStateSizeInWords - (HALF_ZMM_BANK_WORDS + 1))*wordSize / BytesPerInt),
// 2..15 are implied in range usage
ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
DEF_YMM_OFFS(0),
DEF_YMM_OFFS(1),
// 2..15 are implied in range usage
zmm_high = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
zmm_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
DEF_ZMM_OFFS(16),
DEF_ZMM_OFFS(17),
DEF_ZMM_OFFS(18),
DEF_ZMM_OFFS(19),
DEF_ZMM_OFFS(20),
DEF_ZMM_OFFS(21),
DEF_ZMM_OFFS(22),
DEF_ZMM_OFFS(23),
DEF_ZMM_OFFS(24),
DEF_ZMM_OFFS(25),
DEF_ZMM_OFFS(26),
DEF_ZMM_OFFS(27),
DEF_ZMM_OFFS(28),
DEF_ZMM_OFFS(29),
DEF_ZMM_OFFS(30),
DEF_ZMM_OFFS(31),
// 18..31 are implied in range usage
fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
fpu_stateH_end,
r15_off, r15H_off,
@ -160,8 +143,6 @@ class RegisterSaver {
};
OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) {
int vect_words = 0;
int ymmhi_offset = -1;
int off = 0;
int num_xmm_regs = XMMRegisterImpl::number_of_registers;
if (UseAVX < 3) {
@ -171,24 +152,15 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
if (save_vectors) {
assert(UseAVX > 0, "512bit vectors are supported only with EVEX");
assert(MaxVectorSize == 64, "only 512bit vectors are supported now");
// Save upper half of YMM registers
vect_words = 16 * num_xmm_regs / wordSize;
if (UseAVX < 3) {
ymmhi_offset = additional_frame_words;
additional_frame_words += vect_words;
}
}
#else
assert(!save_vectors, "vectors are generated only by C2 and JVMCI");
#endif
// Always make the frame size 16-byte aligned
int frame_size_in_bytes = round_to(additional_frame_words*wordSize +
reg_save_size*BytesPerInt, num_xmm_regs);
// Always make the frame size 16-byte aligned; both vector and non-vector stacks are always allocated
int frame_size_in_bytes = round_to(reg_save_size*BytesPerInt, num_xmm_regs);
// OopMap frame size is in compiler stack slots (jint's) not bytes or words
int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
// The caller will allocate additional_frame_words
int additional_frame_slots = additional_frame_words*wordSize / BytesPerInt;
// CodeBlob frame size is in words.
int frame_size_in_words = frame_size_in_bytes / wordSize;
*total_frame_words = frame_size_in_words;
@ -203,12 +175,34 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
__ push_CPU_state(); // Push a multiple of 16 bytes
// push cpu state handles this on EVEX enabled targets
if ((vect_words > 0) && (UseAVX < 3)) {
assert(vect_words*wordSize >= 256, "");
// Save upper half of YMM registers(0..num_xmm_regs)
__ subptr(rsp, num_xmm_regs*16);
for (int n = 0; n < num_xmm_regs; n++) {
__ vextractf128h(Address(rsp, off++*16), as_XMMRegister(n));
if (save_vectors) {
// Save upper half of YMM registers(0..15)
int base_addr = XSAVE_AREA_YMM_BEGIN;
for (int n = 0; n < 16; n++) {
__ vextractf128h(Address(rsp, base_addr+n*16), as_XMMRegister(n));
}
if (VM_Version::supports_evex()) {
// Save upper half of ZMM registers(0..15)
base_addr = XSAVE_AREA_ZMM_BEGIN;
for (int n = 0; n < 16; n++) {
__ vextractf64x4h(Address(rsp, base_addr+n*32), as_XMMRegister(n), 1);
}
// Save full ZMM registers(16..num_xmm_regs)
base_addr = XSAVE_AREA_UPPERBANK;
int off = 0;
int vector_len = Assembler::AVX_512bit;
for (int n = 16; n < num_xmm_regs; n++) {
__ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
}
}
} else {
if (VM_Version::supports_evex()) {
// Save upper bank of ZMM registers(16..31) for double/float usage
int base_addr = XSAVE_AREA_UPPERBANK;
int off = 0;
for (int n = 16; n < num_xmm_regs; n++) {
__ movsd(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n));
}
}
}
if (frame::arg_reg_save_area_bytes != 0) {
@ -224,8 +218,7 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
OopMapSet *oop_maps = new OopMapSet();
OopMap* map = new OopMap(frame_size_in_slots, 0);
#define STACK_OFFSET(x) VMRegImpl::stack2reg((x) + additional_frame_slots)
#define YMMHI_STACK_OFFSET(x) VMRegImpl::stack2reg((x / VMRegImpl::stack_slot_size) + ymmhi_offset)
#define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
@ -257,31 +250,21 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
off = zmm16_off;
delta = zmm17_off - off;
for (int n = 16; n < num_xmm_regs; n++) {
XMMRegister xmm_name = as_XMMRegister(n);
map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
XMMRegister zmm_name = as_XMMRegister(n);
map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
off += delta;
}
}
#if defined(COMPILER2) || INCLUDE_JVMCI
if (save_vectors) {
assert(ymmhi_offset != -1, "save area must exist");
map->set_callee_saved(YMMHI_STACK_OFFSET( 0), xmm0->as_VMReg()->next(4));
map->set_callee_saved(YMMHI_STACK_OFFSET( 16), xmm1->as_VMReg()->next(4));
map->set_callee_saved(YMMHI_STACK_OFFSET( 32), xmm2->as_VMReg()->next(4));
map->set_callee_saved(YMMHI_STACK_OFFSET( 48), xmm3->as_VMReg()->next(4));
map->set_callee_saved(YMMHI_STACK_OFFSET( 64), xmm4->as_VMReg()->next(4));
map->set_callee_saved(YMMHI_STACK_OFFSET( 80), xmm5->as_VMReg()->next(4));
map->set_callee_saved(YMMHI_STACK_OFFSET( 96), xmm6->as_VMReg()->next(4));
map->set_callee_saved(YMMHI_STACK_OFFSET(112), xmm7->as_VMReg()->next(4));
map->set_callee_saved(YMMHI_STACK_OFFSET(128), xmm8->as_VMReg()->next(4));
map->set_callee_saved(YMMHI_STACK_OFFSET(144), xmm9->as_VMReg()->next(4));
map->set_callee_saved(YMMHI_STACK_OFFSET(160), xmm10->as_VMReg()->next(4));
map->set_callee_saved(YMMHI_STACK_OFFSET(176), xmm11->as_VMReg()->next(4));
map->set_callee_saved(YMMHI_STACK_OFFSET(192), xmm12->as_VMReg()->next(4));
map->set_callee_saved(YMMHI_STACK_OFFSET(208), xmm13->as_VMReg()->next(4));
map->set_callee_saved(YMMHI_STACK_OFFSET(224), xmm14->as_VMReg()->next(4));
map->set_callee_saved(YMMHI_STACK_OFFSET(240), xmm15->as_VMReg()->next(4));
off = ymm0_off;
int delta = ymm1_off - off;
for (int n = 0; n < 16; n++) {
XMMRegister ymm_name = as_XMMRegister(n);
map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
off += delta;
}
}
#endif // COMPILER2 || INCLUDE_JVMCI
@ -316,8 +299,8 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
off = zmm16H_off;
delta = zmm17H_off - off;
for (int n = 16; n < num_xmm_regs; n++) {
XMMRegister xmm_name = as_XMMRegister(n);
map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
XMMRegister zmm_name = as_XMMRegister(n);
map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
off += delta;
}
}
@ -335,21 +318,48 @@ void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_ve
// Pop arg register save area
__ addptr(rsp, frame::arg_reg_save_area_bytes);
}
#if defined(COMPILER2) || INCLUDE_JVMCI
// On EVEX enabled targets everything is handled in pop fpu state
if ((restore_vectors) && (UseAVX < 3)) {
assert(UseAVX > 0, "256/512-bit vectors are supported only with AVX");
assert(MaxVectorSize == 64, "up to 512bit vectors are supported now");
int off = 0;
// Restore upper half of YMM registers (0..num_xmm_regs)
for (int n = 0; n < num_xmm_regs; n++) {
__ vinsertf128h(as_XMMRegister(n), Address(rsp, off++*16));
}
__ addptr(rsp, num_xmm_regs*16);
if (restore_vectors) {
assert(UseAVX > 0, "512bit vectors are supported only with EVEX");
assert(MaxVectorSize == 64, "only 512bit vectors are supported now");
}
#else
assert(!restore_vectors, "vectors are generated only by C2 and JVMCI");
assert(!save_vectors, "vectors are generated only by C2");
#endif
// On EVEX enabled targets everything is handled in pop fpu state
if (restore_vectors) {
// Restore upper half of YMM registers (0..15)
int base_addr = XSAVE_AREA_YMM_BEGIN;
for (int n = 0; n < 16; n++) {
__ vinsertf128h(as_XMMRegister(n), Address(rsp, base_addr+n*16));
}
if (VM_Version::supports_evex()) {
// Restore upper half of ZMM registers (0..15)
base_addr = XSAVE_AREA_ZMM_BEGIN;
for (int n = 0; n < 16; n++) {
__ vinsertf64x4h(as_XMMRegister(n), Address(rsp, base_addr+n*32), 1);
}
// Restore full ZMM registers(16..num_xmm_regs)
base_addr = XSAVE_AREA_UPPERBANK;
int vector_len = Assembler::AVX_512bit;
int off = 0;
for (int n = 16; n < num_xmm_regs; n++) {
__ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
}
}
} else {
if (VM_Version::supports_evex()) {
// Restore upper bank of ZMM registers(16..31) for double/float usage
int base_addr = XSAVE_AREA_UPPERBANK;
int off = 0;
for (int n = 16; n < num_xmm_regs; n++) {
__ movsd(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)));
}
}
}
// Recover CPU state
__ pop_CPU_state();
// Get the rbp described implicitly by the calling convention (no oopMap)


@ -273,7 +273,7 @@ class StubGenerator: public StubCodeGenerator {
if (UseAVX > 2) {
last_reg = 31;
}
if (VM_Version::supports_avx512novl()) {
if (VM_Version::supports_evex()) {
for (int i = xmm_save_first; i <= last_reg; i++) {
__ vextractf32x4h(xmm_save(i), as_XMMRegister(i), 0);
}
@ -391,7 +391,7 @@ class StubGenerator: public StubCodeGenerator {
// restore regs belonging to calling function
#ifdef _WIN64
// emit the restores for xmm regs
if (VM_Version::supports_avx512novl()) {
if (VM_Version::supports_evex()) {
for (int i = xmm_save_first; i <= last_reg; i++) {
__ vinsertf32x4h(as_XMMRegister(i), xmm_save(i), 0);
}


@ -891,7 +891,7 @@ void VM_Version::get_processor_features() {
UseNewLongLShift = true;
}
if( FLAG_IS_DEFAULT(UseXmmLoadAndClearUpper) ) {
if( supports_sse4a() ) {
if (supports_sse4a()) {
UseXmmLoadAndClearUpper = true; // use movsd only on '10h' Opteron
} else {
UseXmmLoadAndClearUpper = false;


@ -552,6 +552,19 @@ protected:
break;
}
}
// zmm_save will be set on an EVEX enabled machine even if we choose AVX code gen
if (retVal == false) {
// Verify that the OS saves/restores all bits of EVEX registers
// during signal processing.
int nreg = 2 LP64_ONLY(+2);
retVal = true;
for (int i = 0; i < 16 * nreg; i++) { // 64 bytes per zmm register
if (_cpuid_info.zmm_save[i] != ymm_test_value()) {
retVal = false;
break;
}
}
}
}
return retVal;
}
@ -706,6 +719,9 @@ public:
static bool supports_avx512vl() { return (_cpuFeatures & CPU_AVX512VL) != 0; }
static bool supports_avx512vlbw() { return (supports_avx512bw() && supports_avx512vl()); }
static bool supports_avx512novl() { return (supports_evex() && !supports_avx512vl()); }
static bool supports_avx512nobw() { return (supports_evex() && !supports_avx512bw()); }
static bool supports_avx256only() { return (supports_avx2() && !supports_evex()); }
static bool supports_avxonly() { return ((supports_avx2() || supports_avx()) && !supports_evex()); }
// Intel features
static bool is_intel_family_core() { return is_intel() &&
extended_cpu_family() == CPU_FAMILY_INTEL_CORE; }
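A brief hedged sketch of how code-generation setup might branch on the new predicates above (the function and its comments are illustrative, not taken from the patch):

// Illustrative: choose an encoding strategy from the new VM_Version predicates.
static void choose_vector_encoding() {
  if (VM_Version::supports_avx512vl()) {
    // EVEX with vector-length extensions: EVEX encodings usable at 128/256/512 bits.
  } else if (VM_Version::supports_avx512novl()) {
    // EVEX without VL: keep EVEX for 512-bit operations, fall back to VEX below that.
  } else if (VM_Version::supports_avxonly()) {
    // AVX/AVX2 without EVEX: stay on 256-bit VEX encodings.
  } else {
    // SSE-only fallback.
  }
}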

File diff suppressed because it is too large


@ -291,9 +291,7 @@ static int pre_call_resets_size() {
size += 6; // fldcw
}
if (C->max_vector_size() > 16) {
if(UseAVX <= 2) {
size += 3; // vzeroupper
}
size += 3; // vzeroupper
}
return size;
}
@ -1915,7 +1913,7 @@ encode %{
if (stub == NULL) {
ciEnv::current()->record_failure("CodeCache is full");
return;
}
}
}
%}


@ -536,11 +536,7 @@ source %{
#define __ _masm.
static int clear_avx_size() {
if(UseAVX > 2) {
return 0; // vzeroupper is ignored
} else {
return (Compile::current()->max_vector_size() > 16) ? 3 : 0; // vzeroupper
}
return (Compile::current()->max_vector_size() > 16) ? 3 : 0; // vzeroupper
}
// !!!!! Special hack to get all types of calls to specify the byte offset
@ -871,7 +867,7 @@ void MachPrologNode::format(PhaseRegAlloc* ra_, outputStream* st) const {
if (framesize > 0) {
st->print("\n\t");
st->print("addq rbp, #%d", framesize);
}
}
}
}


@ -186,9 +186,9 @@
"Maximum number of unrolls for main loop") \
range(0, max_jint) \
\
product(bool, SuperWordLoopUnrollAnalysis, false, \
"Map number of unrolls for main loop via " \
"Superword Level Parallelism analysis") \
product_pd(bool, SuperWordLoopUnrollAnalysis, \
"Map number of unrolls for main loop via " \
"Superword Level Parallelism analysis") \
\
notproduct(bool, TraceSuperWordLoopUnrollAnalysis, false, \
"Trace what Superword Level Parallelism analysis applies") \


@ -269,6 +269,10 @@ public:
// should generate this one.
static const bool match_rule_supported(int opcode);
// identify extra cases that we might want to provide match rules for
// e.g. Op_ vector nodes and other intrinsics while guarding with vlen
static const bool match_rule_supported_vector(int opcode, int vlen);
// Some uarchs have different sized float register resources
static const int float_pressure(int default_pressure_threshold);


@ -2247,7 +2247,10 @@ void SuperWord::output() {
NOT_PRODUCT(if (TraceSuperWordLoopUnrollAnalysis) tty->print_cr("vector loop(unroll=%d, len=%d)\n", max_vlen, max_vlen_in_bytes*BitsPerByte));
// For atomic unrolled loops which are vector mapped, instigate more unrolling.
cl->set_notpassed_slp();
C->set_major_progress();
// if vector resources are limited, do not allow additional unrolling
if (FLOATPRESSURE > 8) {
C->set_major_progress();
}
cl->mark_do_unroll_only();
}
}


@ -188,7 +188,7 @@ bool VectorNode::implemented(int opc, uint vlen, BasicType bt) {
(vlen > 1) && is_power_of_2(vlen) &&
Matcher::vector_size_supported(bt, vlen)) {
int vopc = VectorNode::opcode(opc, bt);
return vopc > 0 && Matcher::match_rule_supported(vopc) && (vopc != Op_CMoveD || vlen == 4);
return vopc > 0 && Matcher::match_rule_supported_vector(vopc, vlen);
}
return false;
}