Alejandro Murillo 2015-09-17 09:19:39 -07:00
commit 5ef8af7bec
63 changed files with 2512 additions and 1627 deletions

View File

@ -3803,82 +3803,38 @@ encode %{
enc_class aarch64_enc_cmpxchg(memory mem, iRegLNoSp oldval, iRegLNoSp newval) %{
MacroAssembler _masm(&cbuf);
Register old_reg = as_Register($oldval$$reg);
Register new_reg = as_Register($newval$$reg);
Register base = as_Register($mem$$base);
Register addr_reg;
int index = $mem$$index;
int scale = $mem$$scale;
int disp = $mem$$disp;
if (index == -1) {
if (disp != 0) {
__ lea(rscratch2, Address(base, disp));
addr_reg = rscratch2;
} else {
// TODO
// should we ever get anything other than this case?
addr_reg = base;
}
} else {
Register index_reg = as_Register(index);
if (disp == 0) {
__ lea(rscratch2, Address(base, index_reg, Address::lsl(scale)));
addr_reg = rscratch2;
} else {
__ lea(rscratch2, Address(base, disp));
__ lea(rscratch2, Address(rscratch2, index_reg, Address::lsl(scale)));
addr_reg = rscratch2;
}
}
Label retry_load, done;
__ bind(retry_load);
__ ldxr(rscratch1, addr_reg);
__ cmp(rscratch1, old_reg);
__ br(Assembler::NE, done);
__ stlxr(rscratch1, new_reg, addr_reg);
__ cbnzw(rscratch1, retry_load);
__ bind(done);
guarantee($mem$$index == -1 && $mem$$disp == 0, "impossible encoding");
__ cmpxchg($mem$$base$$Register, $oldval$$Register, $newval$$Register,
&Assembler::ldxr, &MacroAssembler::cmp, &Assembler::stlxr);
%}
enc_class aarch64_enc_cmpxchgw(memory mem, iRegINoSp oldval, iRegINoSp newval) %{
MacroAssembler _masm(&cbuf);
Register old_reg = as_Register($oldval$$reg);
Register new_reg = as_Register($newval$$reg);
Register base = as_Register($mem$$base);
Register addr_reg;
int index = $mem$$index;
int scale = $mem$$scale;
int disp = $mem$$disp;
if (index == -1) {
if (disp != 0) {
__ lea(rscratch2, Address(base, disp));
addr_reg = rscratch2;
} else {
// TODO
// should we ever get anything other than this case?
addr_reg = base;
}
} else {
Register index_reg = as_Register(index);
if (disp == 0) {
__ lea(rscratch2, Address(base, index_reg, Address::lsl(scale)));
addr_reg = rscratch2;
} else {
__ lea(rscratch2, Address(base, disp));
__ lea(rscratch2, Address(rscratch2, index_reg, Address::lsl(scale)));
addr_reg = rscratch2;
}
}
Label retry_load, done;
__ bind(retry_load);
__ ldxrw(rscratch1, addr_reg);
__ cmpw(rscratch1, old_reg);
__ br(Assembler::NE, done);
__ stlxrw(rscratch1, new_reg, addr_reg);
__ cbnzw(rscratch1, retry_load);
__ bind(done);
guarantee($mem$$index == -1 && $mem$$disp == 0, "impossible encoding");
__ cmpxchg($mem$$base$$Register, $oldval$$Register, $newval$$Register,
&Assembler::ldxrw, &MacroAssembler::cmpw, &Assembler::stlxrw);
%}
// The only difference between aarch64_enc_cmpxchg and
// aarch64_enc_cmpxchg_acq is that we use load-acquire in the
// CompareAndSwap sequence to serve as a barrier on acquiring a
// lock.
enc_class aarch64_enc_cmpxchg_acq(memory mem, iRegLNoSp oldval, iRegLNoSp newval) %{
MacroAssembler _masm(&cbuf);
guarantee($mem$$index == -1 && $mem$$disp == 0, "impossible encoding");
__ cmpxchg($mem$$base$$Register, $oldval$$Register, $newval$$Register,
&Assembler::ldaxr, &MacroAssembler::cmp, &Assembler::stlxr);
%}
enc_class aarch64_enc_cmpxchgw_acq(memory mem, iRegINoSp oldval, iRegINoSp newval) %{
MacroAssembler _masm(&cbuf);
guarantee($mem$$index == -1 && $mem$$disp == 0, "impossible encoding");
__ cmpxchg($mem$$base$$Register, $oldval$$Register, $newval$$Register,
&Assembler::ldaxrw, &MacroAssembler::cmpw, &Assembler::stlxrw);
%}
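For reference, a minimal sketch (not part of this patch; register names are illustrative) of what the acquire form expands to once the generic MacroAssembler::cmpxchg helper introduced later in this change is applied. The only difference from the plain form is the load-acquire exclusive load:

    Label retry_load, done;
    __ bind(retry_load);
    __ ldaxr(rscratch1, addr_reg);            // load-acquire exclusive (plain form uses ldxr)
    __ cmp(rscratch1, old_reg);               // compare against the expected value
    __ br(Assembler::NE, done);               // mismatch: exit with NE set
    __ stlxr(rscratch1, new_reg, addr_reg);   // store-release exclusive; writes 0 on success
    __ cbnzw(rscratch1, retry_load);          // lost the exclusive monitor: retry
    __ bind(done);                            // EQ flag reports success or failure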
// auxiliary used for CompareAndSwapX to set result register
enc_class aarch64_enc_cset_eq(iRegINoSp res) %{
MacroAssembler _masm(&cbuf);
@ -4398,13 +4354,10 @@ encode %{
// Compare object markOop with mark and if equal exchange scratch1
// with object markOop.
// Note that this is simply a CAS: it does not generate any
// barriers. These are separately generated by
// membar_acquire_lock().
{
Label retry_load;
__ bind(retry_load);
__ ldxr(tmp, oop);
__ ldaxr(tmp, oop);
__ cmp(tmp, disp_hdr);
__ br(Assembler::NE, cas_failed);
// use stlxr to ensure update is immediately visible
@ -4454,7 +4407,7 @@ encode %{
{
Label retry_load, fail;
__ bind(retry_load);
__ ldxr(rscratch1, tmp);
__ ldaxr(rscratch1, tmp);
__ cmp(disp_hdr, rscratch1);
__ br(Assembler::NE, fail);
// use stlxr to ensure update is immediately visible
@ -8017,10 +7970,10 @@ instruct membar_acquire_lock() %{
match(MemBarAcquireLock);
ins_cost(VOLATILE_REF_COST);
format %{ "membar_acquire_lock" %}
format %{ "membar_acquire_lock (elided)" %}
ins_encode %{
__ membar(Assembler::LoadLoad|Assembler::LoadStore);
__ block_comment("membar_acquire_lock (elided)");
%}
ins_pipe(pipe_serial);
@ -8080,10 +8033,10 @@ instruct membar_release_lock() %{
match(MemBarReleaseLock);
ins_cost(VOLATILE_REF_COST);
format %{ "membar_release_lock" %}
format %{ "membar_release_lock (elided)" %}
ins_encode %{
__ membar(Assembler::LoadStore|Assembler::StoreStore);
__ block_comment("membar_release_lock (elided)");
%}
ins_pipe(pipe_serial);
@ -8369,7 +8322,11 @@ instruct storePConditional(memory heap_top_ptr, iRegP oldval, iRegP newval, rFla
ins_pipe(pipe_serial);
%}
// this has to be implemented as a CAS
// storeLConditional is used by PhaseMacroExpand::expand_lock_node
// when attempting to rebias a lock towards the current thread. We
// must use the acquire form of cmpxchg in order to guarantee acquire
// semantics in this case.
instruct storeLConditional(indirect mem, iRegLNoSp oldval, iRegLNoSp newval, rFlagsReg cr)
%{
match(Set cr (StoreLConditional mem (Binary oldval newval)));
@ -8381,12 +8338,14 @@ instruct storeLConditional(indirect mem, iRegLNoSp oldval, iRegLNoSp newval, rFl
"cmpw rscratch1, zr\t# EQ on successful write"
%}
ins_encode(aarch64_enc_cmpxchg(mem, oldval, newval));
ins_encode(aarch64_enc_cmpxchg_acq(mem, oldval, newval));
ins_pipe(pipe_slow);
%}
// this has to be implemented as a CAS
// storeIConditional also has acquire semantics, for no better reason
// than matching storeLConditional. At the time of writing this
// comment storeIConditional was not used anywhere by AArch64.
instruct storeIConditional(indirect mem, iRegINoSp oldval, iRegINoSp newval, rFlagsReg cr)
%{
match(Set cr (StoreIConditional mem (Binary oldval newval)));
@ -8398,7 +8357,7 @@ instruct storeIConditional(indirect mem, iRegINoSp oldval, iRegINoSp newval, rFl
"cmpw rscratch1, zr\t# EQ on successful write"
%}
ins_encode(aarch64_enc_cmpxchgw(mem, oldval, newval));
ins_encode(aarch64_enc_cmpxchgw_acq(mem, oldval, newval));
ins_pipe(pipe_slow);
%}

View File

@ -917,6 +917,8 @@ public:
void cmpptr(Register src1, Address src2);
// Various forms of CAS
void cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
Label &suceed, Label *fail);
@ -938,6 +940,23 @@ public:
str(rscratch2, adr);
}
// A generic CAS; success or failure is in the EQ flag.
template <typename T1, typename T2>
void cmpxchg(Register addr, Register expected, Register new_val,
T1 load_insn,
void (MacroAssembler::*cmp_insn)(Register, Register),
T2 store_insn,
Register tmp = rscratch1) {
Label retry_load, done;
bind(retry_load);
(this->*load_insn)(tmp, addr);
(this->*cmp_insn)(tmp, expected);
br(Assembler::NE, done);
(this->*store_insn)(tmp, new_val, addr);
cbnzw(tmp, retry_load);
bind(done);
}
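A hedged usage sketch of the helper above (names are illustrative): T1 and T2 are deduced separately, so callers can pass member functions of different arity — a two-operand exclusive load and a three-operand exclusive store — exactly as the aarch64.ad encodings earlier in this change do:

    // 64-bit CAS, as used by aarch64_enc_cmpxchg:
    cmpxchg(base, expected, new_val,
            &Assembler::ldxr,        // (value, address) exclusive load
            &MacroAssembler::cmp,    // (Register, Register) sets the EQ flag
            &Assembler::stlxr);      // (status, value, address) store-release exclusive
    // The 32-bit variant passes ldxrw / cmpw / stlxrw instead (aarch64_enc_cmpxchgw).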
// Calls
address trampoline_call(Address entry, CodeBuffer *cbuf = NULL);

File diff suppressed because it is too large

View File

@ -438,7 +438,9 @@ class ArrayAddress VALUE_OBJ_CLASS_SPEC {
};
const int FPUStateSizeInWords = NOT_LP64(27) LP64_ONLY( 512*2 / wordSize);
// 64-bit reflects the fxsave size, which is 512 bytes, plus the new xsave area on EVEX, which is another 2176 bytes
// See fxsave and xsave(EVEX enabled) documentation for layout
const int FPUStateSizeInWords = NOT_LP64(27) LP64_ONLY(2688 / wordSize);
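As a quick cross-check of the new constant (plain arithmetic from the comment above):

    // legacy fxsave image         : 512 bytes
    // additional EVEX xsave state : 2176 bytes
    // total                       : 512 + 2176 = 2688 bytes
    // so on LP64 (wordSize == 8), FPUStateSizeInWords is 2688 / 8 = 336 words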
// The Intel x86/Amd64 Assembler: Pure assembler doing NO optimizations on the instruction
// level (e.g. mov rax, 0 is not translated into xor rax, rax!); i.e., what you write
@ -594,11 +596,16 @@ class Assembler : public AbstractAssembler {
private:
int evex_encoding;
int input_size_in_bits;
int avx_vector_len;
int tuple_type;
bool is_evex_instruction;
int _evex_encoding;
int _input_size_in_bits;
int _avx_vector_len;
int _tuple_type;
bool _is_evex_instruction;
bool _legacy_mode_bw;
bool _legacy_mode_dq;
bool _legacy_mode_vl;
bool _legacy_mode_vlbw;
bool _instruction_uses_vl;
// 64bit prefixes
int prefix_and_encode(int reg_enc, bool byteinst = false);
@ -972,11 +979,16 @@ private:
// belong in macro assembler but there is no need for both varieties to exist
void init_attributes(void) {
evex_encoding = 0;
input_size_in_bits = 0;
avx_vector_len = AVX_NoVec;
tuple_type = EVEX_ETUP;
is_evex_instruction = false;
_evex_encoding = 0;
_input_size_in_bits = 0;
_avx_vector_len = AVX_NoVec;
_tuple_type = EVEX_ETUP;
_is_evex_instruction = false;
_legacy_mode_bw = (VM_Version::supports_avx512bw() == false);
_legacy_mode_dq = (VM_Version::supports_avx512dq() == false);
_legacy_mode_vl = (VM_Version::supports_avx512vl() == false);
_legacy_mode_vlbw = (VM_Version::supports_avx512vlbw() == false);
_instruction_uses_vl = false;
}
void lea(Register dst, Address src);
@ -1344,8 +1356,10 @@ private:
void fxch(int i = 1);
void fxrstor(Address src);
void xrstor(Address src);
void fxsave(Address dst);
void xsave(Address dst);
void fyl2x();
void frndint();
@ -1479,11 +1493,12 @@ private:
void movb(Address dst, int imm8);
void movb(Register dst, Address src);
void kmovq(KRegister dst, KRegister src);
void kmovql(KRegister dst, KRegister src);
void kmovql(KRegister dst, Register src);
void kmovdl(KRegister dst, Register src);
void kmovq(Address dst, KRegister src);
void kmovq(KRegister dst, Address src);
void kmovwl(KRegister dst, Register src);
void kmovql(Address dst, KRegister src);
void kmovql(KRegister dst, Address src);
void movdl(XMMRegister dst, Register src);
void movdl(Register dst, XMMRegister src);
@ -1509,9 +1524,12 @@ private:
void vmovdqu(XMMRegister dst, XMMRegister src);
// Move Unaligned 512bit Vector
void evmovdqu(Address dst, XMMRegister src, int vector_len);
void evmovdqu(XMMRegister dst, Address src, int vector_len);
void evmovdqu(XMMRegister dst, XMMRegister src, int vector_len);
void evmovdqul(Address dst, XMMRegister src, int vector_len);
void evmovdqul(XMMRegister dst, Address src, int vector_len);
void evmovdqul(XMMRegister dst, XMMRegister src, int vector_len);
void evmovdquq(Address dst, XMMRegister src, int vector_len);
void evmovdquq(XMMRegister dst, Address src, int vector_len);
void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len);
// Move lower 64bit to high 64bit in 128bit register
void movlhps(XMMRegister dst, XMMRegister src);
@ -1643,6 +1661,7 @@ private:
// Pemutation of 64bit words
void vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len);
void vpermq(XMMRegister dst, XMMRegister src, int imm8);
void pause();
@ -1920,6 +1939,10 @@ private:
void vdivpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
void vdivps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
// Sqrt Packed Floating-Point Values - Double precision only
void vsqrtpd(XMMRegister dst, XMMRegister src, int vector_len);
void vsqrtpd(XMMRegister dst, Address src, int vector_len);
// Bitwise Logical AND of Packed Floating-Point Values
void andpd(XMMRegister dst, XMMRegister src);
void andps(XMMRegister dst, XMMRegister src);
@ -2057,6 +2080,9 @@ private:
void vextracti64x2h(XMMRegister dst, XMMRegister src, int value);
void vextractf64x2h(XMMRegister dst, XMMRegister src, int value);
void vextractf32x4h(XMMRegister dst, XMMRegister src, int value);
void vextractf32x4h(Address dst, XMMRegister src, int value);
void vinsertf32x4h(XMMRegister dst, XMMRegister nds, XMMRegister src, int value);
void vinsertf32x4h(XMMRegister dst, Address src, int value);
// duplicate 4-bytes integer data from src into 8 locations in dest
void vpbroadcastd(XMMRegister dst, XMMRegister src);

View File

@ -3798,16 +3798,24 @@ void LIR_Assembler::negate(LIR_Opr left, LIR_Opr dest) {
if (left->as_xmm_float_reg() != dest->as_xmm_float_reg()) {
__ movflt(dest->as_xmm_float_reg(), left->as_xmm_float_reg());
}
__ xorps(dest->as_xmm_float_reg(),
ExternalAddress((address)float_signflip_pool));
if (UseAVX > 1) {
__ vnegatess(dest->as_xmm_float_reg(), dest->as_xmm_float_reg(),
ExternalAddress((address)float_signflip_pool));
} else {
__ xorps(dest->as_xmm_float_reg(),
ExternalAddress((address)float_signflip_pool));
}
} else if (dest->is_double_xmm()) {
if (left->as_xmm_double_reg() != dest->as_xmm_double_reg()) {
__ movdbl(dest->as_xmm_double_reg(), left->as_xmm_double_reg());
}
__ xorpd(dest->as_xmm_double_reg(),
ExternalAddress((address)double_signflip_pool));
if (UseAVX > 1) {
__ vnegatesd(dest->as_xmm_double_reg(), dest->as_xmm_double_reg(),
ExternalAddress((address)double_signflip_pool));
} else {
__ xorpd(dest->as_xmm_double_reg(),
ExternalAddress((address)double_signflip_pool));
}
} else if (left->is_single_fpu() || left->is_double_fpu()) {
assert(left->fpu() == 0, "arg must be on TOS");
assert(dest->fpu() == 0, "dest must be TOS");

View File

@ -401,11 +401,9 @@ static OopMap* generate_oop_map(StubAssembler* sasm, int num_rt_args,
} else if (UseSSE == 1) {
int xmm_off = xmm_regs_as_doubles_off;
for (int n = 0; n < FrameMap::nof_xmm_regs; n++) {
if (n < xmm_bypass_limit) {
VMReg xmm_name_0 = as_XMMRegister(n)->as_VMReg();
map->set_callee_saved(VMRegImpl::stack2reg(xmm_off + num_rt_args), xmm_name_0);
}
for (int n = 0; n < FrameMap::nof_fpu_regs; n++) {
VMReg xmm_name_0 = as_XMMRegister(n)->as_VMReg();
map->set_callee_saved(VMRegImpl::stack2reg(xmm_off + num_rt_args), xmm_name_0);
xmm_off += 2;
}
assert(xmm_off == float_regs_as_doubles_off, "incorrect number of xmm registers");
@ -452,14 +450,11 @@ static OopMap* save_live_registers(StubAssembler* sasm, int num_rt_args,
__ frstor(Address(rsp, fpu_state_off * VMRegImpl::stack_slot_size));
// Save the FPU registers in de-opt-able form
__ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 0));
__ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 8));
__ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 16));
__ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 24));
__ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 32));
__ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 40));
__ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 48));
__ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + 56));
int offset = 0;
for (int n = 0; n < FrameMap::nof_fpu_regs; n++) {
__ fstp_d(Address(rsp, float_regs_as_doubles_off * VMRegImpl::stack_slot_size + offset));
offset += 8;
}
}
if (UseSSE >= 2) {
@ -468,52 +463,26 @@ static OopMap* save_live_registers(StubAssembler* sasm, int num_rt_args,
// so always save them as doubles.
// note that float values are _not_ converted automatically, so for float values
// the second word contains only garbage data.
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 0), xmm0);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 8), xmm1);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 16), xmm2);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 24), xmm3);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 32), xmm4);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 40), xmm5);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 48), xmm6);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 56), xmm7);
int xmm_bypass_limit = FrameMap::nof_xmm_regs;
int offset = 0;
#ifdef _LP64
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 64), xmm8);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 72), xmm9);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 80), xmm10);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 88), xmm11);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 96), xmm12);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 104), xmm13);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 112), xmm14);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 120), xmm15);
if (UseAVX > 2) {
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 128), xmm16);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 136), xmm17);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 144), xmm18);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 152), xmm19);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 160), xmm20);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 168), xmm21);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 176), xmm22);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 184), xmm23);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 192), xmm24);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 200), xmm25);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 208), xmm26);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 216), xmm27);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 224), xmm28);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 232), xmm29);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 240), xmm30);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 248), xmm31);
if (UseAVX < 3) {
xmm_bypass_limit = xmm_bypass_limit / 2;
}
#endif
for (int n = 0; n < xmm_bypass_limit; n++) {
XMMRegister xmm_name = as_XMMRegister(n);
__ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + offset), xmm_name);
offset += 8;
}
#endif // _LP64
} else if (UseSSE == 1) {
// save XMM registers as float because double not supported without SSE2
__ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 0), xmm0);
__ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 8), xmm1);
__ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 16), xmm2);
__ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 24), xmm3);
__ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 32), xmm4);
__ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 40), xmm5);
__ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 48), xmm6);
__ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 56), xmm7);
// save XMM registers as float because double is not supported without SSE2 (num MMX == num fpu)
int offset = 0;
for (int n = 0; n < FrameMap::nof_fpu_regs; n++) {
XMMRegister xmm_name = as_XMMRegister(n);
__ movflt(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + offset), xmm_name);
offset += 8;
}
}
}
@ -528,52 +497,26 @@ static void restore_fpu(StubAssembler* sasm, bool restore_fpu_registers = true)
if (restore_fpu_registers) {
if (UseSSE >= 2) {
// restore XMM registers
__ movdbl(xmm0, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 0));
__ movdbl(xmm1, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 8));
__ movdbl(xmm2, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 16));
__ movdbl(xmm3, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 24));
__ movdbl(xmm4, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 32));
__ movdbl(xmm5, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 40));
__ movdbl(xmm6, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 48));
__ movdbl(xmm7, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 56));
int xmm_bypass_limit = FrameMap::nof_xmm_regs;
#ifdef _LP64
__ movdbl(xmm8, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 64));
__ movdbl(xmm9, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 72));
__ movdbl(xmm10, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 80));
__ movdbl(xmm11, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 88));
__ movdbl(xmm12, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 96));
__ movdbl(xmm13, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 104));
__ movdbl(xmm14, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 112));
__ movdbl(xmm15, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 120));
if (UseAVX > 2) {
__ movdbl(xmm16, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 128));
__ movdbl(xmm17, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 136));
__ movdbl(xmm18, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 144));
__ movdbl(xmm19, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 152));
__ movdbl(xmm20, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 160));
__ movdbl(xmm21, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 168));
__ movdbl(xmm22, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 176));
__ movdbl(xmm23, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 184));
__ movdbl(xmm24, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 192));
__ movdbl(xmm25, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 200));
__ movdbl(xmm26, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 208));
__ movdbl(xmm27, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 216));
__ movdbl(xmm28, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 224));
__ movdbl(xmm29, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 232));
__ movdbl(xmm30, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 240));
__ movdbl(xmm31, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 248));
if (UseAVX < 3) {
xmm_bypass_limit = xmm_bypass_limit / 2;
}
#endif
int offset = 0;
for (int n = 0; n < xmm_bypass_limit; n++) {
XMMRegister xmm_name = as_XMMRegister(n);
__ movdbl(xmm_name, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + offset));
offset += 8;
}
#endif // _LP64
} else if (UseSSE == 1) {
// restore XMM registers
__ movflt(xmm0, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 0));
__ movflt(xmm1, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 8));
__ movflt(xmm2, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 16));
__ movflt(xmm3, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 24));
__ movflt(xmm4, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 32));
__ movflt(xmm5, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 40));
__ movflt(xmm6, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 48));
__ movflt(xmm7, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 56));
// restore XMM registers (num MMX == num fpu)
int offset = 0;
for (int n = 0; n < FrameMap::nof_fpu_regs; n++) {
XMMRegister xmm_name = as_XMMRegister(n);
__ movflt(xmm_name, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + offset));
offset += 8;
}
}
if (UseSSE < 2) {

View File

@ -3751,8 +3751,31 @@ void MacroAssembler::pop_CPU_state() {
}
void MacroAssembler::pop_FPU_state() {
NOT_LP64(frstor(Address(rsp, 0));)
LP64_ONLY(fxrstor(Address(rsp, 0));)
#ifndef _LP64
frstor(Address(rsp, 0));
#else
// AVX will continue to use the fxsave area.
// EVEX needs to utilize the xsave area, which is under different
// management.
if(VM_Version::supports_evex()) {
// EDX:EAX describe the XSAVE header and
// are obtained while fetching info for XCR0 via cpuid.
// These two registers make up 64-bits in the header for which bits
// 62:10 are currently reserved for future implementations and unused. Bit 63
// is unused for our implementation as we do not utilize
// compressed XSAVE areas. Bits 9..8 are currently ignored as we do not use
// the functionality for PKRU state and MSR tracing.
// Ergo we are primarily concerned with bits 7..0, which define
// which ISA extensions and features are enabled for a given machine. These
// bits are defined in XemXcr0Eax and are used to map the XSAVE area
// for restoring registers as described via XCR0.
movl(rdx,VM_Version::get_xsave_header_upper_segment());
movl(rax,VM_Version::get_xsave_header_lower_segment());
xrstor(Address(rsp, 0));
} else {
fxrstor(Address(rsp, 0));
}
#endif
addptr(rsp, FPUStateSizeInWords * wordSize);
}
@ -3769,13 +3792,49 @@ void MacroAssembler::push_CPU_state() {
push_FPU_state();
}
#ifdef _LP64
#define XSTATE_BV 0x200
#endif
void MacroAssembler::push_FPU_state() {
subptr(rsp, FPUStateSizeInWords * wordSize);
#ifndef _LP64
fnsave(Address(rsp, 0));
fwait();
#else
fxsave(Address(rsp, 0));
// AVX will continue to use the fxsave area.
// EVEX needs to utilize the xsave area, which is under different
// management.
if(VM_Version::supports_evex()) {
// Save a copy of EAX and EDX
push(rax);
push(rdx);
// EDX:EAX describe the XSAVE header and
// are obtained while fetching info for XCR0 via cpuid.
// These two registers make up 64-bits in the header for which bits
// 62:10 are currently reserved for future implementations and unused. Bit 63
// is unused for our implementation as we do not utilize
// compressed XSAVE areas. Bits 9..8 are currently ignored as we do not use
// the functionality for PKRU state and MSR tracing.
// Ergo we are primarily concerned with bits 7..0, which define
// which ISA extensions and features are enabled for a given machine. These
// bits are defined in XemXcr0Eax and are used to program the XSAVE area
// for saving the required registers as defined in XCR0.
int xcr0_edx = VM_Version::get_xsave_header_upper_segment();
int xcr0_eax = VM_Version::get_xsave_header_lower_segment();
movl(rdx,xcr0_edx);
movl(rax,xcr0_eax);
xsave(Address(rsp, wordSize*2));
// now apply the control bits and clear bytes 8..23 in the header
pop(rdx);
pop(rax);
movl(Address(rsp, XSTATE_BV), xcr0_eax);
movl(Address(rsp, XSTATE_BV+4), xcr0_edx);
andq(Address(rsp, XSTATE_BV+8), 0);
andq(Address(rsp, XSTATE_BV+16), 0);
} else {
fxsave(Address(rsp, 0));
}
#endif // LP64
}
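The XSTATE_BV fixups above assume the standard XSAVE memory layout; a sketch of the buffer at rsp once rax/rdx have been popped (offsets are the architectural ones, not taken from this patch):

    // rsp + 0x000 .. 0x1ff : legacy FXSAVE region (x87/MMX/SSE state, 512 bytes)
    // rsp + 0x200 .. 0x23f : XSAVE header (64 bytes)
    //   rsp + 0x200        : XSTATE_BV, rewritten above from XCR0 (EDX:EAX)
    //   rsp + 0x208..0x217 : header bytes 8..23, cleared by the two andq stores
    // rsp + 0x240 ..       : extended state components (upper YMM/ZMM halves, etc.)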
@ -4082,6 +4141,84 @@ void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src
}
}
void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
int nds_enc = nds->encoding();
int dst_enc = dst->encoding();
bool dst_upper_bank = (dst_enc > 15);
bool nds_upper_bank = (nds_enc > 15);
if (VM_Version::supports_avx512novl() &&
(nds_upper_bank || dst_upper_bank)) {
if (dst_upper_bank) {
subptr(rsp, 64);
evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
movflt(xmm0, nds);
if (reachable(src)) {
vxorps(xmm0, xmm0, as_Address(src), Assembler::AVX_128bit);
} else {
lea(rscratch1, src);
vxorps(xmm0, xmm0, Address(rscratch1, 0), Assembler::AVX_128bit);
}
movflt(dst, xmm0);
evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
addptr(rsp, 64);
} else {
movflt(dst, nds);
if (reachable(src)) {
vxorps(dst, dst, as_Address(src), Assembler::AVX_128bit);
} else {
lea(rscratch1, src);
vxorps(dst, dst, Address(rscratch1, 0), Assembler::AVX_128bit);
}
}
} else {
if (reachable(src)) {
vxorps(dst, nds, as_Address(src), Assembler::AVX_128bit);
} else {
lea(rscratch1, src);
vxorps(dst, nds, Address(rscratch1, 0), Assembler::AVX_128bit);
}
}
}
void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
int nds_enc = nds->encoding();
int dst_enc = dst->encoding();
bool dst_upper_bank = (dst_enc > 15);
bool nds_upper_bank = (nds_enc > 15);
if (VM_Version::supports_avx512novl() &&
(nds_upper_bank || dst_upper_bank)) {
if (dst_upper_bank) {
subptr(rsp, 64);
evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
movdbl(xmm0, nds);
if (reachable(src)) {
vxorps(xmm0, xmm0, as_Address(src), Assembler::AVX_128bit);
} else {
lea(rscratch1, src);
vxorps(xmm0, xmm0, Address(rscratch1, 0), Assembler::AVX_128bit);
}
movdbl(dst, xmm0);
evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
addptr(rsp, 64);
} else {
movdbl(dst, nds);
if (reachable(src)) {
vxorps(dst, dst, as_Address(src), Assembler::AVX_128bit);
} else {
lea(rscratch1, src);
vxorps(dst, dst, Address(rscratch1, 0), Assembler::AVX_128bit);
}
}
} else {
if (reachable(src)) {
vxorpd(dst, nds, as_Address(src), Assembler::AVX_128bit);
} else {
lea(rscratch1, src);
vxorpd(dst, nds, Address(rscratch1, 0), Assembler::AVX_128bit);
}
}
}
void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
if (reachable(src)) {
vxorpd(dst, nds, as_Address(src), vector_len);
@ -4318,7 +4455,6 @@ void MacroAssembler::store_check(Register obj, Address dst) {
void MacroAssembler::store_check(Register obj) {
// Does a store check for the oop in register obj. The content of
// register obj is destroyed afterwards.
BarrierSet* bs = Universe::heap()->barrier_set();
assert(bs->kind() == BarrierSet::CardTableForRS ||
bs->kind() == BarrierSet::CardTableExtension,
@ -4572,69 +4708,58 @@ void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int
// if we are coming from c1, xmm registers may be live
int off = 0;
int num_xmm_regs = LP64_ONLY(16) NOT_LP64(8);
if (UseAVX > 2) {
num_xmm_regs = LP64_ONLY(32) NOT_LP64(8);
}
if (UseSSE == 1) {
subptr(rsp, sizeof(jdouble)*8);
movflt(Address(rsp,off++*sizeof(jdouble)),xmm0);
movflt(Address(rsp,off++*sizeof(jdouble)),xmm1);
movflt(Address(rsp,off++*sizeof(jdouble)),xmm2);
movflt(Address(rsp,off++*sizeof(jdouble)),xmm3);
movflt(Address(rsp,off++*sizeof(jdouble)),xmm4);
movflt(Address(rsp,off++*sizeof(jdouble)),xmm5);
movflt(Address(rsp,off++*sizeof(jdouble)),xmm6);
movflt(Address(rsp,off++*sizeof(jdouble)),xmm7);
for (int n = 0; n < 8; n++) {
movflt(Address(rsp, off++*sizeof(jdouble)), as_XMMRegister(n));
}
} else if (UseSSE >= 2) {
if (UseAVX > 2) {
push(rbx);
movl(rbx, 0xffff);
#ifdef _LP64
kmovql(k1, rbx);
#else
kmovdl(k1, rbx);
#endif
kmovwl(k1, rbx);
pop(rbx);
}
#ifdef COMPILER2
if (MaxVectorSize > 16) {
assert(UseAVX > 0, "256bit vectors are supported only with AVX");
if(UseAVX > 2) {
// Save upper half of ZMM registers
subptr(rsp, 32*num_xmm_regs);
for (int n = 0; n < num_xmm_regs; n++) {
vextractf64x4h(Address(rsp, off++*32), as_XMMRegister(n));
}
off = 0;
}
assert(UseAVX > 0, "256 bit vectors are supported only with AVX");
// Save upper half of YMM registers
subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
vextractf128h(Address(rsp, 0),xmm0);
vextractf128h(Address(rsp, 16),xmm1);
vextractf128h(Address(rsp, 32),xmm2);
vextractf128h(Address(rsp, 48),xmm3);
vextractf128h(Address(rsp, 64),xmm4);
vextractf128h(Address(rsp, 80),xmm5);
vextractf128h(Address(rsp, 96),xmm6);
vextractf128h(Address(rsp,112),xmm7);
#ifdef _LP64
vextractf128h(Address(rsp,128),xmm8);
vextractf128h(Address(rsp,144),xmm9);
vextractf128h(Address(rsp,160),xmm10);
vextractf128h(Address(rsp,176),xmm11);
vextractf128h(Address(rsp,192),xmm12);
vextractf128h(Address(rsp,208),xmm13);
vextractf128h(Address(rsp,224),xmm14);
vextractf128h(Address(rsp,240),xmm15);
#endif
subptr(rsp, 16*num_xmm_regs);
for (int n = 0; n < num_xmm_regs; n++) {
vextractf128h(Address(rsp, off++*16), as_XMMRegister(n));
}
}
#endif
// Save whole 128bit (16 bytes) XMM regiters
subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
movdqu(Address(rsp,off++*16),xmm0);
movdqu(Address(rsp,off++*16),xmm1);
movdqu(Address(rsp,off++*16),xmm2);
movdqu(Address(rsp,off++*16),xmm3);
movdqu(Address(rsp,off++*16),xmm4);
movdqu(Address(rsp,off++*16),xmm5);
movdqu(Address(rsp,off++*16),xmm6);
movdqu(Address(rsp,off++*16),xmm7);
// Save whole 128bit (16 bytes) XMM registers
subptr(rsp, 16*num_xmm_regs);
off = 0;
#ifdef _LP64
movdqu(Address(rsp,off++*16),xmm8);
movdqu(Address(rsp,off++*16),xmm9);
movdqu(Address(rsp,off++*16),xmm10);
movdqu(Address(rsp,off++*16),xmm11);
movdqu(Address(rsp,off++*16),xmm12);
movdqu(Address(rsp,off++*16),xmm13);
movdqu(Address(rsp,off++*16),xmm14);
movdqu(Address(rsp,off++*16),xmm15);
if (VM_Version::supports_avx512novl()) {
for (int n = 0; n < num_xmm_regs; n++) {
vextractf32x4h(Address(rsp, off++*16), as_XMMRegister(n), 0);
}
} else {
for (int n = 0; n < num_xmm_regs; n++) {
movdqu(Address(rsp, off++*16), as_XMMRegister(n));
}
}
#else
for (int n = 0; n < num_xmm_regs; n++) {
movdqu(Address(rsp, off++*16), as_XMMRegister(n));
}
#endif
}
@ -4689,7 +4814,7 @@ void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int
movsd(Address(rsp, 0), xmm0);
fld_d(Address(rsp, 0));
#endif // _LP64
addptr(rsp, sizeof(jdouble) * nb_args);
addptr(rsp, sizeof(jdouble)*nb_args);
if (num_fpu_regs_in_use > 1) {
// Must save return value to stack and then restore entire FPU
// stack except incoming arguments
@ -4699,63 +4824,50 @@ void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int
addptr(rsp, sizeof(jdouble));
}
fld_d(Address(rsp, (nb_args-1)*sizeof(jdouble)));
addptr(rsp, sizeof(jdouble) * nb_args);
addptr(rsp, sizeof(jdouble)*nb_args);
}
off = 0;
if (UseSSE == 1) {
movflt(xmm0, Address(rsp,off++*sizeof(jdouble)));
movflt(xmm1, Address(rsp,off++*sizeof(jdouble)));
movflt(xmm2, Address(rsp,off++*sizeof(jdouble)));
movflt(xmm3, Address(rsp,off++*sizeof(jdouble)));
movflt(xmm4, Address(rsp,off++*sizeof(jdouble)));
movflt(xmm5, Address(rsp,off++*sizeof(jdouble)));
movflt(xmm6, Address(rsp,off++*sizeof(jdouble)));
movflt(xmm7, Address(rsp,off++*sizeof(jdouble)));
for (int n = 0; n < 8; n++) {
movflt(as_XMMRegister(n), Address(rsp, off++*sizeof(jdouble)));
}
addptr(rsp, sizeof(jdouble)*8);
} else if (UseSSE >= 2) {
// Restore whole 128bit (16 bytes) XMM registers
movdqu(xmm0, Address(rsp,off++*16));
movdqu(xmm1, Address(rsp,off++*16));
movdqu(xmm2, Address(rsp,off++*16));
movdqu(xmm3, Address(rsp,off++*16));
movdqu(xmm4, Address(rsp,off++*16));
movdqu(xmm5, Address(rsp,off++*16));
movdqu(xmm6, Address(rsp,off++*16));
movdqu(xmm7, Address(rsp,off++*16));
#ifdef _LP64
movdqu(xmm8, Address(rsp,off++*16));
movdqu(xmm9, Address(rsp,off++*16));
movdqu(xmm10, Address(rsp,off++*16));
movdqu(xmm11, Address(rsp,off++*16));
movdqu(xmm12, Address(rsp,off++*16));
movdqu(xmm13, Address(rsp,off++*16));
movdqu(xmm14, Address(rsp,off++*16));
movdqu(xmm15, Address(rsp,off++*16));
if (VM_Version::supports_avx512novl()) {
for (int n = 0; n < num_xmm_regs; n++) {
vinsertf32x4h(as_XMMRegister(n), Address(rsp, off++*16), 0);
}
}
else {
for (int n = 0; n < num_xmm_regs; n++) {
movdqu(as_XMMRegister(n), Address(rsp, off++*16));
}
}
#else
for (int n = 0; n < num_xmm_regs; n++) {
movdqu(as_XMMRegister(n), Address(rsp, off++ * 16));
}
#endif
addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
addptr(rsp, 16*num_xmm_regs);
#ifdef COMPILER2
if (MaxVectorSize > 16) {
// Restore upper half of YMM registers.
vinsertf128h(xmm0, Address(rsp, 0));
vinsertf128h(xmm1, Address(rsp, 16));
vinsertf128h(xmm2, Address(rsp, 32));
vinsertf128h(xmm3, Address(rsp, 48));
vinsertf128h(xmm4, Address(rsp, 64));
vinsertf128h(xmm5, Address(rsp, 80));
vinsertf128h(xmm6, Address(rsp, 96));
vinsertf128h(xmm7, Address(rsp,112));
#ifdef _LP64
vinsertf128h(xmm8, Address(rsp,128));
vinsertf128h(xmm9, Address(rsp,144));
vinsertf128h(xmm10, Address(rsp,160));
vinsertf128h(xmm11, Address(rsp,176));
vinsertf128h(xmm12, Address(rsp,192));
vinsertf128h(xmm13, Address(rsp,208));
vinsertf128h(xmm14, Address(rsp,224));
vinsertf128h(xmm15, Address(rsp,240));
#endif
addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
off = 0;
for (int n = 0; n < num_xmm_regs; n++) {
vinsertf128h(as_XMMRegister(n), Address(rsp, off++*16));
}
addptr(rsp, 16*num_xmm_regs);
if(UseAVX > 2) {
off = 0;
for (int n = 0; n < num_xmm_regs; n++) {
vinsertf64x4h(as_XMMRegister(n), Address(rsp, off++*32));
}
addptr(rsp, 32*num_xmm_regs);
}
}
#endif
}
@ -7095,11 +7207,7 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
if (UseAVX > 2) {
movl(rtmp, 0xffff);
#ifdef _LP64
kmovql(k1, rtmp);
#else
kmovdl(k1, rtmp);
#endif
kmovwl(k1, rtmp);
}
movdl(xtmp, value);
if (UseAVX > 2 && UseUnalignedLoadStores) {
@ -7112,7 +7220,7 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
align(16);
BIND(L_fill_64_bytes_loop);
evmovdqu(Address(to, 0), xtmp, Assembler::AVX_512bit);
evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
addptr(to, 64);
subl(count, 16 << shift);
jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
@ -7120,7 +7228,7 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
BIND(L_check_fill_32_bytes);
addl(count, 8 << shift);
jccb(Assembler::less, L_check_fill_8_bytes);
evmovdqu(Address(to, 0), xtmp, Assembler::AVX_256bit);
evmovdqul(Address(to, 0), xtmp, Assembler::AVX_256bit);
addptr(to, 32);
subl(count, 8 << shift);
@ -8399,6 +8507,14 @@ void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Regi
Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
// For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
// context for the registers used, since all instructions below use 128-bit mode.
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
movl(tmp, 0xffff);
kmovwl(k1, tmp);
}
lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
notl(crc); // ~crc
cmpl(len, 16);

View File

@ -1069,6 +1069,9 @@ public:
void vsubss(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vsubss(dst, nds, src); }
void vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src);
void vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src);
void vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
// AVX Vector instructions
void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); }

View File

@ -115,6 +115,7 @@ class RegisterSaver {
OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words,
int* total_frame_words, bool verify_fpu, bool save_vectors) {
int vect_words = 0;
int num_xmm_regs = XMMRegisterImpl::number_of_registers;
#ifdef COMPILER2
if (save_vectors) {
assert(UseAVX > 0, "512bit vectors are supported only with EVEX");
@ -173,59 +174,50 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
__ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
}
int off = st0_off;
int delta = st1_off - off;
// Save the FPU registers in de-opt-able form
for (int n = 0; n < FloatRegisterImpl::number_of_registers; n++) {
__ fstp_d(Address(rsp, off*wordSize));
off += delta;
}
__ fstp_d(Address(rsp, st0_off*wordSize)); // st(0)
__ fstp_d(Address(rsp, st1_off*wordSize)); // st(1)
__ fstp_d(Address(rsp, st2_off*wordSize)); // st(2)
__ fstp_d(Address(rsp, st3_off*wordSize)); // st(3)
__ fstp_d(Address(rsp, st4_off*wordSize)); // st(4)
__ fstp_d(Address(rsp, st5_off*wordSize)); // st(5)
__ fstp_d(Address(rsp, st6_off*wordSize)); // st(6)
__ fstp_d(Address(rsp, st7_off*wordSize)); // st(7)
if( UseSSE == 1 ) { // Save the XMM state
__ movflt(Address(rsp,xmm0_off*wordSize),xmm0);
__ movflt(Address(rsp,xmm1_off*wordSize),xmm1);
__ movflt(Address(rsp,xmm2_off*wordSize),xmm2);
__ movflt(Address(rsp,xmm3_off*wordSize),xmm3);
__ movflt(Address(rsp,xmm4_off*wordSize),xmm4);
__ movflt(Address(rsp,xmm5_off*wordSize),xmm5);
__ movflt(Address(rsp,xmm6_off*wordSize),xmm6);
__ movflt(Address(rsp,xmm7_off*wordSize),xmm7);
} else if( UseSSE >= 2 ) {
off = xmm0_off;
delta = xmm1_off - off;
if(UseSSE == 1) { // Save the XMM state
for (int n = 0; n < num_xmm_regs; n++) {
__ movflt(Address(rsp, off*wordSize), as_XMMRegister(n));
off += delta;
}
} else if(UseSSE >= 2) {
// Save whole 128bit (16 bytes) XMM registers
__ movdqu(Address(rsp,xmm0_off*wordSize),xmm0);
__ movdqu(Address(rsp,xmm1_off*wordSize),xmm1);
__ movdqu(Address(rsp,xmm2_off*wordSize),xmm2);
__ movdqu(Address(rsp,xmm3_off*wordSize),xmm3);
__ movdqu(Address(rsp,xmm4_off*wordSize),xmm4);
__ movdqu(Address(rsp,xmm5_off*wordSize),xmm5);
__ movdqu(Address(rsp,xmm6_off*wordSize),xmm6);
__ movdqu(Address(rsp,xmm7_off*wordSize),xmm7);
if (VM_Version::supports_avx512novl()) {
for (int n = 0; n < num_xmm_regs; n++) {
__ vextractf32x4h(Address(rsp, off*wordSize), as_XMMRegister(n), 0);
off += delta;
}
} else {
for (int n = 0; n < num_xmm_regs; n++) {
__ movdqu(Address(rsp, off*wordSize), as_XMMRegister(n));
off += delta;
}
}
}
if (vect_words > 0) {
assert(vect_words*wordSize == 128, "");
__ subptr(rsp, 128); // Save upper half of YMM registers
__ vextractf128h(Address(rsp, 0),xmm0);
__ vextractf128h(Address(rsp, 16),xmm1);
__ vextractf128h(Address(rsp, 32),xmm2);
__ vextractf128h(Address(rsp, 48),xmm3);
__ vextractf128h(Address(rsp, 64),xmm4);
__ vextractf128h(Address(rsp, 80),xmm5);
__ vextractf128h(Address(rsp, 96),xmm6);
__ vextractf128h(Address(rsp,112),xmm7);
off = 0;
for (int n = 0; n < num_xmm_regs; n++) {
__ vextractf128h(Address(rsp, off++*16), as_XMMRegister(n));
}
if (UseAVX > 2) {
__ subptr(rsp, 256); // Save upper half of ZMM registers
__ vextractf64x4h(Address(rsp, 0), xmm0);
__ vextractf64x4h(Address(rsp, 32), xmm1);
__ vextractf64x4h(Address(rsp, 64), xmm2);
__ vextractf64x4h(Address(rsp, 96), xmm3);
__ vextractf64x4h(Address(rsp, 128), xmm4);
__ vextractf64x4h(Address(rsp, 160), xmm5);
__ vextractf64x4h(Address(rsp, 192), xmm6);
__ vextractf64x4h(Address(rsp, 224), xmm7);
off = 0;
for (int n = 0; n < num_xmm_regs; n++) {
__ vextractf64x4h(Address(rsp, off++*32), as_XMMRegister(n));
}
}
}
@ -238,58 +230,40 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
OopMap* map = new OopMap( frame_words, 0 );
#define STACK_OFFSET(x) VMRegImpl::stack2reg((x) + additional_frame_words)
map->set_callee_saved(STACK_OFFSET( rax_off), rax->as_VMReg());
map->set_callee_saved(STACK_OFFSET( rcx_off), rcx->as_VMReg());
map->set_callee_saved(STACK_OFFSET( rdx_off), rdx->as_VMReg());
map->set_callee_saved(STACK_OFFSET( rbx_off), rbx->as_VMReg());
// rbp, location is known implicitly, no oopMap
map->set_callee_saved(STACK_OFFSET( rsi_off), rsi->as_VMReg());
map->set_callee_saved(STACK_OFFSET( rdi_off), rdi->as_VMReg());
map->set_callee_saved(STACK_OFFSET(st0_off), as_FloatRegister(0)->as_VMReg());
map->set_callee_saved(STACK_OFFSET(st1_off), as_FloatRegister(1)->as_VMReg());
map->set_callee_saved(STACK_OFFSET(st2_off), as_FloatRegister(2)->as_VMReg());
map->set_callee_saved(STACK_OFFSET(st3_off), as_FloatRegister(3)->as_VMReg());
map->set_callee_saved(STACK_OFFSET(st4_off), as_FloatRegister(4)->as_VMReg());
map->set_callee_saved(STACK_OFFSET(st5_off), as_FloatRegister(5)->as_VMReg());
map->set_callee_saved(STACK_OFFSET(st6_off), as_FloatRegister(6)->as_VMReg());
map->set_callee_saved(STACK_OFFSET(st7_off), as_FloatRegister(7)->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm0_off), xmm0->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm1_off), xmm1->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm2_off), xmm2->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm3_off), xmm3->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm4_off), xmm4->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm5_off), xmm5->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm6_off), xmm6->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm7_off), xmm7->as_VMReg());
// %%% This is really a waste but we'll keep things as they were for now
if (true) {
#define NEXTREG(x) (x)->as_VMReg()->next()
map->set_callee_saved(STACK_OFFSET(st0H_off), NEXTREG(as_FloatRegister(0)));
map->set_callee_saved(STACK_OFFSET(st1H_off), NEXTREG(as_FloatRegister(1)));
map->set_callee_saved(STACK_OFFSET(st2H_off), NEXTREG(as_FloatRegister(2)));
map->set_callee_saved(STACK_OFFSET(st3H_off), NEXTREG(as_FloatRegister(3)));
map->set_callee_saved(STACK_OFFSET(st4H_off), NEXTREG(as_FloatRegister(4)));
map->set_callee_saved(STACK_OFFSET(st5H_off), NEXTREG(as_FloatRegister(5)));
map->set_callee_saved(STACK_OFFSET(st6H_off), NEXTREG(as_FloatRegister(6)));
map->set_callee_saved(STACK_OFFSET(st7H_off), NEXTREG(as_FloatRegister(7)));
map->set_callee_saved(STACK_OFFSET(xmm0H_off), NEXTREG(xmm0));
map->set_callee_saved(STACK_OFFSET(xmm1H_off), NEXTREG(xmm1));
map->set_callee_saved(STACK_OFFSET(xmm2H_off), NEXTREG(xmm2));
map->set_callee_saved(STACK_OFFSET(xmm3H_off), NEXTREG(xmm3));
map->set_callee_saved(STACK_OFFSET(xmm4H_off), NEXTREG(xmm4));
map->set_callee_saved(STACK_OFFSET(xmm5H_off), NEXTREG(xmm5));
map->set_callee_saved(STACK_OFFSET(xmm6H_off), NEXTREG(xmm6));
map->set_callee_saved(STACK_OFFSET(xmm7H_off), NEXTREG(xmm7));
map->set_callee_saved(STACK_OFFSET(rax_off), rax->as_VMReg());
map->set_callee_saved(STACK_OFFSET(rcx_off), rcx->as_VMReg());
map->set_callee_saved(STACK_OFFSET(rdx_off), rdx->as_VMReg());
map->set_callee_saved(STACK_OFFSET(rbx_off), rbx->as_VMReg());
// rbp, location is known implicitly, no oopMap
map->set_callee_saved(STACK_OFFSET(rsi_off), rsi->as_VMReg());
map->set_callee_saved(STACK_OFFSET(rdi_off), rdi->as_VMReg());
// %%% This is really a waste but we'll keep things as they were for now for the upper component
off = st0_off;
delta = st1_off - off;
for (int n = 0; n < FloatRegisterImpl::number_of_registers; n++) {
FloatRegister freg_name = as_FloatRegister(n);
map->set_callee_saved(STACK_OFFSET(off), freg_name->as_VMReg());
map->set_callee_saved(STACK_OFFSET(off+1), NEXTREG(freg_name));
off += delta;
}
off = xmm0_off;
delta = xmm1_off - off;
for (int n = 0; n < num_xmm_regs; n++) {
XMMRegister xmm_name = as_XMMRegister(n);
map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
map->set_callee_saved(STACK_OFFSET(off+1), NEXTREG(xmm_name));
off += delta;
}
#undef NEXTREG
#undef STACK_OFFSET
}
return map;
}
void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
int num_xmm_regs = XMMRegisterImpl::number_of_registers;
// Recover XMM & FPU state
int additional_frame_bytes = 0;
#ifdef COMPILER2
@ -301,52 +275,43 @@ void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_ve
#else
assert(!restore_vectors, "vectors are generated only by C2");
#endif
int off = xmm0_off;
int delta = xmm1_off - off;
if (UseSSE == 1) {
assert(additional_frame_bytes == 0, "");
__ movflt(xmm0,Address(rsp,xmm0_off*wordSize));
__ movflt(xmm1,Address(rsp,xmm1_off*wordSize));
__ movflt(xmm2,Address(rsp,xmm2_off*wordSize));
__ movflt(xmm3,Address(rsp,xmm3_off*wordSize));
__ movflt(xmm4,Address(rsp,xmm4_off*wordSize));
__ movflt(xmm5,Address(rsp,xmm5_off*wordSize));
__ movflt(xmm6,Address(rsp,xmm6_off*wordSize));
__ movflt(xmm7,Address(rsp,xmm7_off*wordSize));
for (int n = 0; n < num_xmm_regs; n++) {
__ movflt(as_XMMRegister(n), Address(rsp, off*wordSize));
off += delta;
}
} else if (UseSSE >= 2) {
#define STACK_ADDRESS(x) Address(rsp,(x)*wordSize + additional_frame_bytes)
__ movdqu(xmm0,STACK_ADDRESS(xmm0_off));
__ movdqu(xmm1,STACK_ADDRESS(xmm1_off));
__ movdqu(xmm2,STACK_ADDRESS(xmm2_off));
__ movdqu(xmm3,STACK_ADDRESS(xmm3_off));
__ movdqu(xmm4,STACK_ADDRESS(xmm4_off));
__ movdqu(xmm5,STACK_ADDRESS(xmm5_off));
__ movdqu(xmm6,STACK_ADDRESS(xmm6_off));
__ movdqu(xmm7,STACK_ADDRESS(xmm7_off));
#undef STACK_ADDRESS
if (VM_Version::supports_avx512novl()) {
for (int n = 0; n < num_xmm_regs; n++) {
__ vinsertf32x4h(as_XMMRegister(n), Address(rsp, off*wordSize+additional_frame_bytes), 0);
off += delta;
}
} else {
for (int n = 0; n < num_xmm_regs; n++) {
__ movdqu(as_XMMRegister(n), Address(rsp, off*wordSize+additional_frame_bytes));
off += delta;
}
}
}
if (restore_vectors) {
if (UseAVX > 2) {
off = 0;
for (int n = 0; n < num_xmm_regs; n++) {
__ vinsertf64x4h(as_XMMRegister(n), Address(rsp, off++*32));
}
__ addptr(rsp, additional_frame_bytes*2); // Save upper half of ZMM registers
}
// Restore upper half of YMM registers.
assert(additional_frame_bytes == 128, "");
__ vinsertf128h(xmm0, Address(rsp, 0));
__ vinsertf128h(xmm1, Address(rsp, 16));
__ vinsertf128h(xmm2, Address(rsp, 32));
__ vinsertf128h(xmm3, Address(rsp, 48));
__ vinsertf128h(xmm4, Address(rsp, 64));
__ vinsertf128h(xmm5, Address(rsp, 80));
__ vinsertf128h(xmm6, Address(rsp, 96));
__ vinsertf128h(xmm7, Address(rsp,112));
__ addptr(rsp, additional_frame_bytes);
if (UseAVX > 2) {
additional_frame_bytes = 256;
__ vinsertf64x4h(xmm0, Address(rsp, 0));
__ vinsertf64x4h(xmm1, Address(rsp, 32));
__ vinsertf64x4h(xmm2, Address(rsp, 64));
__ vinsertf64x4h(xmm3, Address(rsp, 96));
__ vinsertf64x4h(xmm4, Address(rsp, 128));
__ vinsertf64x4h(xmm5, Address(rsp, 160));
__ vinsertf64x4h(xmm6, Address(rsp, 192));
__ vinsertf64x4h(xmm7, Address(rsp, 224));
__ addptr(rsp, additional_frame_bytes);
off = 0;
for (int n = 0; n < num_xmm_regs; n++) {
__ vinsertf128h(as_XMMRegister(n), Address(rsp, off++*16));
}
__ addptr(rsp, additional_frame_bytes); // Save upper half of YMM registers
}
__ pop_FPU_state();
__ addptr(rsp, FPU_regs_live*wordSize); // Pop FPU registers

View File

@ -69,7 +69,9 @@ class SimpleRuntimeFrame {
class RegisterSaver {
// Capture info about frame layout. Layout offsets are in jint
// units because compiler frame slots are jints.
#define HALF_ZMM_BANK_WORDS 128
#define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
#define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
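For illustration (hypothetical expansion of the macros above, assuming BytesPerInt == 4 as elsewhere in this code):

    // DEF_XMM_OFFS(1)  expands to:  xmm1_off  = xmm_off + 1*16/BytesPerInt,  xmm1H_off
    //   -> consecutive XMM slots are 16 bytes (4 jint slots) apart in the fxsave image
    // DEF_ZMM_OFFS(17) expands to:  zmm17_off = zmm_off + (17-16)*64/BytesPerInt, zmm17H_off
    //   -> each of xmm16..xmm31 gets a full 64-byte ZMM slot in the xsave area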
enum layout {
fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
xmm_off = fpu_state_off + 160/BytesPerInt, // offset in fxsave save area
@ -89,23 +91,24 @@ class RegisterSaver {
DEF_XMM_OFFS(13),
DEF_XMM_OFFS(14),
DEF_XMM_OFFS(15),
DEF_XMM_OFFS(16),
DEF_XMM_OFFS(17),
DEF_XMM_OFFS(18),
DEF_XMM_OFFS(19),
DEF_XMM_OFFS(20),
DEF_XMM_OFFS(21),
DEF_XMM_OFFS(22),
DEF_XMM_OFFS(23),
DEF_XMM_OFFS(24),
DEF_XMM_OFFS(25),
DEF_XMM_OFFS(26),
DEF_XMM_OFFS(27),
DEF_XMM_OFFS(28),
DEF_XMM_OFFS(29),
DEF_XMM_OFFS(30),
DEF_XMM_OFFS(31),
fpu_state_end = fpu_state_off + ((FPUStateSizeInWords - 1)*wordSize / BytesPerInt),
zmm_off = fpu_state_off + ((FPUStateSizeInWords - (HALF_ZMM_BANK_WORDS + 1))*wordSize / BytesPerInt),
DEF_ZMM_OFFS(16),
DEF_ZMM_OFFS(17),
DEF_ZMM_OFFS(18),
DEF_ZMM_OFFS(19),
DEF_ZMM_OFFS(20),
DEF_ZMM_OFFS(21),
DEF_ZMM_OFFS(22),
DEF_ZMM_OFFS(23),
DEF_ZMM_OFFS(24),
DEF_ZMM_OFFS(25),
DEF_ZMM_OFFS(26),
DEF_ZMM_OFFS(27),
DEF_ZMM_OFFS(28),
DEF_ZMM_OFFS(29),
DEF_ZMM_OFFS(30),
DEF_ZMM_OFFS(31),
fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
fpu_stateH_end,
r15_off, r15H_off,
r14_off, r14H_off,
@ -155,9 +158,10 @@ class RegisterSaver {
OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) {
int vect_words = 0;
int num_xmm_regs = 16;
if (UseAVX > 2) {
num_xmm_regs = 32;
int off = 0;
int num_xmm_regs = XMMRegisterImpl::number_of_registers;
if (UseAVX < 3) {
num_xmm_regs = num_xmm_regs/2;
}
#ifdef COMPILER2
if (save_vectors) {
@ -165,9 +169,7 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
assert(MaxVectorSize == 64, "only 512bit vectors are supported now");
// Save upper half of YMM registers
vect_words = 16 * num_xmm_regs / wordSize;
additional_frame_words += vect_words;
if (UseAVX > 2) {
// Save upper half of ZMM registers as well
if (UseAVX < 3) {
additional_frame_words += vect_words;
}
}
@ -195,77 +197,13 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
__ enter(); // rsp becomes 16-byte aligned here
__ push_CPU_state(); // Push a multiple of 16 bytes
if (vect_words > 0) {
// push cpu state handles this on EVEX enabled targets
if ((vect_words > 0) && (UseAVX < 3)) {
assert(vect_words*wordSize >= 256, "");
__ subptr(rsp, 256); // Save upper half of YMM registes(0..15)
__ vextractf128h(Address(rsp, 0), xmm0);
__ vextractf128h(Address(rsp, 16), xmm1);
__ vextractf128h(Address(rsp, 32), xmm2);
__ vextractf128h(Address(rsp, 48), xmm3);
__ vextractf128h(Address(rsp, 64), xmm4);
__ vextractf128h(Address(rsp, 80), xmm5);
__ vextractf128h(Address(rsp, 96), xmm6);
__ vextractf128h(Address(rsp, 112), xmm7);
__ vextractf128h(Address(rsp, 128), xmm8);
__ vextractf128h(Address(rsp, 144), xmm9);
__ vextractf128h(Address(rsp, 160), xmm10);
__ vextractf128h(Address(rsp, 176), xmm11);
__ vextractf128h(Address(rsp, 192), xmm12);
__ vextractf128h(Address(rsp, 208), xmm13);
__ vextractf128h(Address(rsp, 224), xmm14);
__ vextractf128h(Address(rsp, 240), xmm15);
if (UseAVX > 2) {
__ subptr(rsp, 256); // Save upper half of YMM registes(16..31)
__ vextractf128h(Address(rsp, 0), xmm16);
__ vextractf128h(Address(rsp, 16), xmm17);
__ vextractf128h(Address(rsp, 32), xmm18);
__ vextractf128h(Address(rsp, 48), xmm19);
__ vextractf128h(Address(rsp, 64), xmm20);
__ vextractf128h(Address(rsp, 80), xmm21);
__ vextractf128h(Address(rsp, 96), xmm22);
__ vextractf128h(Address(rsp, 112), xmm23);
__ vextractf128h(Address(rsp, 128), xmm24);
__ vextractf128h(Address(rsp, 144), xmm25);
__ vextractf128h(Address(rsp, 160), xmm26);
__ vextractf128h(Address(rsp, 176), xmm27);
__ vextractf128h(Address(rsp, 192), xmm28);
__ vextractf128h(Address(rsp, 208), xmm29);
__ vextractf128h(Address(rsp, 224), xmm30);
__ vextractf128h(Address(rsp, 240), xmm31);
// Now handle the ZMM registers (0..31)
__ subptr(rsp, 1024); // Save upper half of ZMM registes
__ vextractf64x4h(Address(rsp, 0), xmm0);
__ vextractf64x4h(Address(rsp, 32), xmm1);
__ vextractf64x4h(Address(rsp, 64), xmm2);
__ vextractf64x4h(Address(rsp, 96), xmm3);
__ vextractf64x4h(Address(rsp, 128), xmm4);
__ vextractf64x4h(Address(rsp, 160), xmm5);
__ vextractf64x4h(Address(rsp, 192), xmm6);
__ vextractf64x4h(Address(rsp, 224), xmm7);
__ vextractf64x4h(Address(rsp, 256), xmm8);
__ vextractf64x4h(Address(rsp, 288), xmm9);
__ vextractf64x4h(Address(rsp, 320), xmm10);
__ vextractf64x4h(Address(rsp, 352), xmm11);
__ vextractf64x4h(Address(rsp, 384), xmm12);
__ vextractf64x4h(Address(rsp, 416), xmm13);
__ vextractf64x4h(Address(rsp, 448), xmm14);
__ vextractf64x4h(Address(rsp, 480), xmm15);
__ vextractf64x4h(Address(rsp, 512), xmm16);
__ vextractf64x4h(Address(rsp, 544), xmm17);
__ vextractf64x4h(Address(rsp, 576), xmm18);
__ vextractf64x4h(Address(rsp, 608), xmm19);
__ vextractf64x4h(Address(rsp, 640), xmm20);
__ vextractf64x4h(Address(rsp, 672), xmm21);
__ vextractf64x4h(Address(rsp, 704), xmm22);
__ vextractf64x4h(Address(rsp, 736), xmm23);
__ vextractf64x4h(Address(rsp, 768), xmm24);
__ vextractf64x4h(Address(rsp, 800), xmm25);
__ vextractf64x4h(Address(rsp, 832), xmm26);
__ vextractf64x4h(Address(rsp, 864), xmm27);
__ vextractf64x4h(Address(rsp, 896), xmm28);
__ vextractf64x4h(Address(rsp, 928), xmm29);
__ vextractf64x4h(Address(rsp, 960), xmm30);
__ vextractf64x4h(Address(rsp, 992), xmm31);
// Save upper half of YMM registers (0..num_xmm_regs)
__ subptr(rsp, num_xmm_regs*16);
for (int n = 0; n < num_xmm_regs; n++) {
__ vextractf128h(Address(rsp, off++*16), as_XMMRegister(n));
}
}
if (frame::arg_reg_save_area_bytes != 0) {
@ -299,39 +237,24 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm0_off ), xmm0->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm1_off ), xmm1->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm2_off ), xmm2->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm3_off ), xmm3->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm4_off ), xmm4->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm5_off ), xmm5->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm6_off ), xmm6->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm7_off ), xmm7->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm8_off ), xmm8->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm9_off ), xmm9->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm10_off), xmm10->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm11_off), xmm11->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm12_off), xmm12->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm13_off), xmm13->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm14_off), xmm14->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm15_off), xmm15->as_VMReg());
if (UseAVX > 2) {
map->set_callee_saved(STACK_OFFSET(xmm16_off), xmm16->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm17_off), xmm17->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm18_off), xmm18->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm19_off), xmm19->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm20_off), xmm20->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm21_off), xmm21->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm22_off), xmm22->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm23_off), xmm23->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm24_off), xmm24->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm25_off), xmm25->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm26_off), xmm26->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm27_off), xmm27->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm28_off), xmm28->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm29_off), xmm29->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm30_off), xmm30->as_VMReg());
map->set_callee_saved(STACK_OFFSET(xmm31_off), xmm31->as_VMReg());
// For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
// on EVEX-enabled targets it is included in the xsave area.
off = xmm0_off;
int delta = xmm1_off - off;
for (int n = 0; n < 16; n++) {
XMMRegister xmm_name = as_XMMRegister(n);
map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
off += delta;
}
if(UseAVX > 2) {
// Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
off = zmm16_off;
delta = zmm17_off - off;
for (int n = 16; n < num_xmm_regs; n++) {
XMMRegister xmm_name = as_XMMRegister(n);
map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
off += delta;
}
}
// %%% These should all be a waste but we'll keep things as they were for now
@ -351,39 +274,24 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm0H_off ), xmm0->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm1H_off ), xmm1->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm2H_off ), xmm2->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm3H_off ), xmm3->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm4H_off ), xmm4->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm5H_off ), xmm5->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm6H_off ), xmm6->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm7H_off ), xmm7->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm8H_off ), xmm8->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm9H_off ), xmm9->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm10H_off), xmm10->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm11H_off), xmm11->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm12H_off), xmm12->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm13H_off), xmm13->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm14H_off), xmm14->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm15H_off), xmm15->as_VMReg()->next());
// For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
// on EVEX-enabled targets it is included in the xsave area.
off = xmm0H_off;
delta = xmm1H_off - off;
for (int n = 0; n < 16; n++) {
XMMRegister xmm_name = as_XMMRegister(n);
map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
off += delta;
}
if (UseAVX > 2) {
map->set_callee_saved(STACK_OFFSET(xmm16H_off), xmm16->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm17H_off), xmm17->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm18H_off), xmm18->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm19H_off), xmm19->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm20H_off), xmm20->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm21H_off), xmm21->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm22H_off), xmm22->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm23H_off), xmm23->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm24H_off), xmm24->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm25H_off), xmm25->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm26H_off), xmm26->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm27H_off), xmm27->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm28H_off), xmm28->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm29H_off), xmm29->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm30H_off), xmm30->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm31H_off), xmm31->as_VMReg()->next());
// Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
off = zmm16H_off;
delta = zmm17H_off - off;
for (int n = 16; n < num_xmm_regs; n++) {
XMMRegister xmm_name = as_XMMRegister(n);
map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
off += delta;
}
}
}
@ -391,86 +299,25 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
}
void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
int num_xmm_regs = XMMRegisterImpl::number_of_registers;
if (UseAVX < 3) {
num_xmm_regs = num_xmm_regs/2;
}
if (frame::arg_reg_save_area_bytes != 0) {
// Pop arg register save area
__ addptr(rsp, frame::arg_reg_save_area_bytes);
}
#ifdef COMPILER2
if (restore_vectors) {
// Restore upper half of YMM registers (0..15)
assert(UseAVX > 0, "512bit vectors are supported only with AVX");
assert(MaxVectorSize == 64, "only 512bit vectors are supported now");
__ vinsertf128h(xmm0, Address(rsp, 0));
__ vinsertf128h(xmm1, Address(rsp, 16));
__ vinsertf128h(xmm2, Address(rsp, 32));
__ vinsertf128h(xmm3, Address(rsp, 48));
__ vinsertf128h(xmm4, Address(rsp, 64));
__ vinsertf128h(xmm5, Address(rsp, 80));
__ vinsertf128h(xmm6, Address(rsp, 96));
__ vinsertf128h(xmm7, Address(rsp,112));
__ vinsertf128h(xmm8, Address(rsp,128));
__ vinsertf128h(xmm9, Address(rsp,144));
__ vinsertf128h(xmm10, Address(rsp,160));
__ vinsertf128h(xmm11, Address(rsp,176));
__ vinsertf128h(xmm12, Address(rsp,192));
__ vinsertf128h(xmm13, Address(rsp,208));
__ vinsertf128h(xmm14, Address(rsp,224));
__ vinsertf128h(xmm15, Address(rsp,240));
__ addptr(rsp, 256);
if (UseAVX > 2) {
// Restore upper half of YMM registers (16..31)
__ vinsertf128h(xmm16, Address(rsp, 0));
__ vinsertf128h(xmm17, Address(rsp, 16));
__ vinsertf128h(xmm18, Address(rsp, 32));
__ vinsertf128h(xmm19, Address(rsp, 48));
__ vinsertf128h(xmm20, Address(rsp, 64));
__ vinsertf128h(xmm21, Address(rsp, 80));
__ vinsertf128h(xmm22, Address(rsp, 96));
__ vinsertf128h(xmm23, Address(rsp,112));
__ vinsertf128h(xmm24, Address(rsp,128));
__ vinsertf128h(xmm25, Address(rsp,144));
__ vinsertf128h(xmm26, Address(rsp,160));
__ vinsertf128h(xmm27, Address(rsp,176));
__ vinsertf128h(xmm28, Address(rsp,192));
__ vinsertf128h(xmm29, Address(rsp,208));
__ vinsertf128h(xmm30, Address(rsp,224));
__ vinsertf128h(xmm31, Address(rsp,240));
__ addptr(rsp, 256);
// Restore upper half of ZMM registers.
__ vinsertf64x4h(xmm0, Address(rsp, 0));
__ vinsertf64x4h(xmm1, Address(rsp, 32));
__ vinsertf64x4h(xmm2, Address(rsp, 64));
__ vinsertf64x4h(xmm3, Address(rsp, 96));
__ vinsertf64x4h(xmm4, Address(rsp, 128));
__ vinsertf64x4h(xmm5, Address(rsp, 160));
__ vinsertf64x4h(xmm6, Address(rsp, 192));
__ vinsertf64x4h(xmm7, Address(rsp, 224));
__ vinsertf64x4h(xmm8, Address(rsp, 256));
__ vinsertf64x4h(xmm9, Address(rsp, 288));
__ vinsertf64x4h(xmm10, Address(rsp, 320));
__ vinsertf64x4h(xmm11, Address(rsp, 352));
__ vinsertf64x4h(xmm12, Address(rsp, 384));
__ vinsertf64x4h(xmm13, Address(rsp, 416));
__ vinsertf64x4h(xmm14, Address(rsp, 448));
__ vinsertf64x4h(xmm15, Address(rsp, 480));
__ vinsertf64x4h(xmm16, Address(rsp, 512));
__ vinsertf64x4h(xmm17, Address(rsp, 544));
__ vinsertf64x4h(xmm18, Address(rsp, 576));
__ vinsertf64x4h(xmm19, Address(rsp, 608));
__ vinsertf64x4h(xmm20, Address(rsp, 640));
__ vinsertf64x4h(xmm21, Address(rsp, 672));
__ vinsertf64x4h(xmm22, Address(rsp, 704));
__ vinsertf64x4h(xmm23, Address(rsp, 736));
__ vinsertf64x4h(xmm24, Address(rsp, 768));
__ vinsertf64x4h(xmm25, Address(rsp, 800));
__ vinsertf64x4h(xmm26, Address(rsp, 832));
__ vinsertf64x4h(xmm27, Address(rsp, 864));
__ vinsertf64x4h(xmm28, Address(rsp, 896));
__ vinsertf64x4h(xmm29, Address(rsp, 928));
__ vinsertf64x4h(xmm30, Address(rsp, 960));
__ vinsertf64x4h(xmm31, Address(rsp, 992));
__ addptr(rsp, 1024);
// On EVEX enabled targets everything is handled in pop fpu state
if ((restore_vectors) && (UseAVX < 3)) {
assert(UseAVX > 0, "256/512-bit vectors are supported only with AVX");
assert(MaxVectorSize == 64, "up to 512bit vectors are supported now");
int off = 0;
// Restore upper half of YMM registers (0..num_xmm_regs)
for (int n = 0; n < num_xmm_regs; n++) {
__ vinsertf128h(as_XMMRegister(n), Address(rsp, off++*16));
}
__ addptr(rsp, num_xmm_regs*16);
}
#else
assert(!restore_vectors, "vectors are generated only by C2");

View File

@ -795,6 +795,12 @@ class StubGenerator: public StubCodeGenerator {
void xmm_copy_forward(Register from, Register to_from, Register qword_count) {
assert( UseSSE >= 2, "supported cpu only" );
Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
if (UseAVX > 2) {
__ push(rbx);
__ movl(rbx, 0xffff);
__ kmovdl(k1, rbx);
__ pop(rbx);
}
// Copy 64-byte chunks
__ jmpb(L_copy_64_bytes);
__ align(OptoLoopAlignment);
@ -802,8 +808,8 @@ class StubGenerator: public StubCodeGenerator {
if (UseUnalignedLoadStores) {
if (UseAVX > 2) {
__ evmovdqu(xmm0, Address(from, 0), Assembler::AVX_512bit);
__ evmovdqu(Address(from, to_from, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
__ evmovdqul(xmm0, Address(from, 0), Assembler::AVX_512bit);
__ evmovdqul(Address(from, to_from, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
} else if (UseAVX == 2) {
__ vmovdqu(xmm0, Address(from, 0));
__ vmovdqu(Address(from, to_from, Address::times_1, 0), xmm0);
@ -2217,6 +2223,15 @@ class StubGenerator: public StubCodeGenerator {
const XMMRegister xmm_temp4 = xmm5;
__ enter(); // required for proper stackwalking of RuntimeStub frame
// For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
// context for the registers used, since all instructions below use 128-bit mode.
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rdx, 0xffff);
__ kmovdl(k1, rdx);
}
__ movptr(from, from_param);
__ movptr(key, key_param);
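The 0xffff value loaded into k1 in the stubs above matters because of EVEX merge-masking: a clear mask bit makes an EVEX-encoded instruction keep the old destination lane instead of writing it. The snippet below is a minimal, purely illustrative model of that behaviour (the function name and the 4-lane width are assumptions, not HotSpot code); with k1 = 0xffff every mask bit is set, so no lane of the 128-bit AES state is unintentionally suppressed.

#include <cstdint>

// Illustrative model of EVEX merge-masking for one 4 x 32-bit (128-bit) operation.
static void masked_merge_epi32(uint32_t dst[4], const uint32_t src[4], uint16_t k) {
  for (int lane = 0; lane < 4; lane++) {
    if (k & (1u << lane)) {
      dst[lane] = src[lane];   // mask bit set: the lane is written
    }                          // mask bit clear: the lane keeps its previous value
  }
}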
@ -2315,6 +2330,15 @@ class StubGenerator: public StubCodeGenerator {
const XMMRegister xmm_temp4 = xmm5;
__ enter(); // required for proper stackwalking of RuntimeStub frame
// For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
// context for the registers used, since all instructions below use 128-bit mode.
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rdx, 0xffff);
__ kmovdl(k1, rdx);
}
__ movptr(from, from_param);
__ movptr(key, key_param);
@ -2441,6 +2465,14 @@ class StubGenerator: public StubCodeGenerator {
__ enter(); // required for proper stackwalking of RuntimeStub frame
handleSOERegisters(true /*saving*/);
// For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
// context for the registers used, since all instructions below use 128-bit mode.
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rdx, 0xffff);
__ kmovdl(k1, rdx);
}
// load registers from incoming parameters
const Address from_param(rbp, 8+0);
const Address to_param (rbp, 8+4);
@ -2602,6 +2634,14 @@ class StubGenerator: public StubCodeGenerator {
__ enter(); // required for proper stackwalking of RuntimeStub frame
handleSOERegisters(true /*saving*/);
// For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
// context for the registers used, since all instructions below use 128-bit mode.
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rdx, 0xffff);
__ kmovdl(k1, rdx);
}
// load registers from incoming parameters
const Address from_param(rbp, 8+0);
const Address to_param (rbp, 8+4);
@ -2782,6 +2822,14 @@ class StubGenerator: public StubCodeGenerator {
__ enter();
handleSOERegisters(true); // Save registers
// For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
// context for the registers used, since all instructions below use 128-bit mode.
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rdx, 0xffff);
__ kmovdl(k1, rdx);
}
__ movptr(state, state_param);
__ movptr(subkeyH, subkeyH_param);
__ movptr(data, data_param);

View File

@ -269,12 +269,16 @@ class StubGenerator: public StubCodeGenerator {
__ kmovql(k1, rbx);
}
#ifdef _WIN64
int last_reg = 15;
if (UseAVX > 2) {
for (int i = 6; i <= 31; i++) {
__ movdqu(xmm_save(i), as_XMMRegister(i));
last_reg = 31;
}
if (VM_Version::supports_avx512novl()) {
for (int i = xmm_save_first; i <= last_reg; i++) {
__ vextractf32x4h(xmm_save(i), as_XMMRegister(i), 0);
}
} else {
for (int i = 6; i <= 15; i++) {
for (int i = xmm_save_first; i <= last_reg; i++) {
__ movdqu(xmm_save(i), as_XMMRegister(i));
}
}
@ -386,13 +390,15 @@ class StubGenerator: public StubCodeGenerator {
// restore regs belonging to calling function
#ifdef _WIN64
int xmm_ub = 15;
if (UseAVX > 2) {
xmm_ub = 31;
}
// emit the restores for xmm regs
for (int i = 6; i <= xmm_ub; i++) {
__ movdqu(as_XMMRegister(i), xmm_save(i));
if (VM_Version::supports_avx512novl()) {
for (int i = xmm_save_first; i <= last_reg; i++) {
__ vinsertf32x4h(as_XMMRegister(i), xmm_save(i), 0);
}
} else {
for (int i = xmm_save_first; i <= last_reg; i++) {
__ movdqu(as_XMMRegister(i), xmm_save(i));
}
}
#endif
__ movptr(r15, r15_save);
@ -1342,11 +1348,15 @@ class StubGenerator: public StubCodeGenerator {
__ align(OptoLoopAlignment);
if (UseUnalignedLoadStores) {
Label L_end;
if (UseAVX > 2) {
__ movl(to, 0xffff);
__ kmovql(k1, to);
}
// Copy 64-bytes per iteration
__ BIND(L_loop);
if (UseAVX > 2) {
__ evmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit);
__ evmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit);
__ evmovdqul(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit);
__ evmovdqul(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit);
} else if (UseAVX == 2) {
__ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
__ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
@ -1422,11 +1432,15 @@ class StubGenerator: public StubCodeGenerator {
__ align(OptoLoopAlignment);
if (UseUnalignedLoadStores) {
Label L_end;
if (UseAVX > 2) {
__ movl(to, 0xffff);
__ kmovql(k1, to);
}
// Copy 64-bytes per iteration
__ BIND(L_loop);
if (UseAVX > 2) {
__ evmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32), Assembler::AVX_512bit);
__ evmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0, Assembler::AVX_512bit);
__ evmovdqul(xmm0, Address(from, qword_count, Address::times_8, 32), Assembler::AVX_512bit);
__ evmovdqul(Address(dest, qword_count, Address::times_8, 32), xmm0, Assembler::AVX_512bit);
} else if (UseAVX == 2) {
__ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
__ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
@ -3106,6 +3120,14 @@ class StubGenerator: public StubCodeGenerator {
__ enter(); // required for proper stackwalking of RuntimeStub frame
// For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
// context for the registers used, since all instructions below use 128-bit mode.
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rax, 0xffff);
__ kmovql(k1, rax);
}
// keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
__ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
@ -3200,6 +3222,14 @@ class StubGenerator: public StubCodeGenerator {
__ enter(); // required for proper stackwalking of RuntimeStub frame
// For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
// context for the registers used, since all instructions below use 128-bit mode.
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rax, 0xffff);
__ kmovql(k1, rax);
}
// keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
__ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
@ -3312,6 +3342,14 @@ class StubGenerator: public StubCodeGenerator {
__ enter(); // required for proper stackwalking of RuntimeStub frame
// For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
// context for the registers used, since all instructions below use 128-bit mode.
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rax, 0xffff);
__ kmovql(k1, rax);
}
#ifdef _WIN64
// on win64, fill len_reg from stack position
__ movl(len_reg, len_mem);
@ -3508,6 +3546,14 @@ class StubGenerator: public StubCodeGenerator {
__ enter(); // required for proper stackwalking of RuntimeStub frame
// For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
// context for the registers used, since all instructions below use 128-bit mode.
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rax, 0xffff);
__ kmovql(k1, rax);
}
#ifdef _WIN64
// on win64, fill len_reg from stack position
__ movl(len_reg, len_mem);
@ -3746,6 +3792,14 @@ class StubGenerator: public StubCodeGenerator {
__ enter();
// For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
// context for the registers used, since all instructions below use 128-bit mode.
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rax, 0xffff);
__ kmovql(k1, rax);
}
#ifdef _WIN64
// save the xmm registers which must be preserved 6-10
__ subptr(rsp, -rsp_after_call_off * wordSize);

View File

@ -31,7 +31,7 @@
enum platform_dependent_constants {
code_size1 = 9000, // simply increase if too small (assembler will crash if too small)
code_size2 = 22000 // simply increase if too small (assembler will crash if too small)
code_size2 = 30000 // simply increase if too small (assembler will crash if too small)
};
class x86 {

View File

@ -33,7 +33,7 @@ static bool returns_to_call_stub(address return_pc) { return return_pc == _
enum platform_dependent_constants {
code_size1 = 19000, // simply increase if too small (assembler will crash if too small)
code_size2 = 24000 // simply increase if too small (assembler will crash if too small)
code_size2 = 32000 // simply increase if too small (assembler will crash if too small)
};
class x86 {

View File

@ -367,16 +367,12 @@ class VM_Version_StubGenerator: public StubCodeGenerator {
__ movl(rcx, VM_Version::ymm_test_value());
__ movdl(xmm0, rcx);
__ movl(rcx, 0xffff);
#ifdef _LP64
__ kmovql(k1, rcx);
#else
__ kmovdl(k1, rcx);
#endif
__ kmovwl(k1, rcx);
__ evpbroadcastd(xmm0, xmm0, Assembler::AVX_512bit);
__ evmovdqu(xmm7, xmm0, Assembler::AVX_512bit);
__ evmovdqul(xmm7, xmm0, Assembler::AVX_512bit);
#ifdef _LP64
__ evmovdqu(xmm8, xmm0, Assembler::AVX_512bit);
__ evmovdqu(xmm31, xmm0, Assembler::AVX_512bit);
__ evmovdqul(xmm8, xmm0, Assembler::AVX_512bit);
__ evmovdqul(xmm31, xmm0, Assembler::AVX_512bit);
#endif
VM_Version::clean_cpuFeatures();
__ jmp(save_restore_except);
@ -427,11 +423,11 @@ class VM_Version_StubGenerator: public StubCodeGenerator {
UseAVX = 3;
UseSSE = 2;
__ lea(rsi, Address(rbp, in_bytes(VM_Version::zmm_save_offset())));
__ evmovdqu(Address(rsi, 0), xmm0, Assembler::AVX_512bit);
__ evmovdqu(Address(rsi, 64), xmm7, Assembler::AVX_512bit);
__ evmovdqul(Address(rsi, 0), xmm0, Assembler::AVX_512bit);
__ evmovdqul(Address(rsi, 64), xmm7, Assembler::AVX_512bit);
#ifdef _LP64
__ evmovdqu(Address(rsi, 128), xmm8, Assembler::AVX_512bit);
__ evmovdqu(Address(rsi, 192), xmm31, Assembler::AVX_512bit);
__ evmovdqul(Address(rsi, 128), xmm8, Assembler::AVX_512bit);
__ evmovdqul(Address(rsi, 192), xmm31, Assembler::AVX_512bit);
#endif
VM_Version::clean_cpuFeatures();
UseAVX = saved_useavx;

View File

@ -227,14 +227,15 @@ public:
union XemXcr0Eax {
uint32_t value;
struct {
uint32_t x87 : 1,
sse : 1,
ymm : 1,
: 2,
opmask : 1,
zmm512 : 1,
zmm32 : 1,
: 24;
uint32_t x87 : 1,
sse : 1,
ymm : 1,
bndregs : 1,
bndcsr : 1,
opmask : 1,
zmm512 : 1,
zmm32 : 1,
: 24;
} bits;
};
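The XCR0 layout above is what the VM consults to decide whether the OS saves and restores AVX-512 state across context switches. A hedged sketch of how those bits would typically be tested follows; the helper name is hypothetical and not part of this change.

#include <cstdint>

// XCR0 bit positions, matching the bitfield above.
static bool os_supports_avx512_state(uint32_t xcr0_lo) {
  const uint32_t ymm    = 1u << 2;  // YMM upper halves
  const uint32_t opmask = 1u << 5;  // k0-k7 opmask registers
  const uint32_t zmm512 = 1u << 6;  // upper 256 bits of ZMM0-ZMM15
  const uint32_t zmm32  = 1u << 7;  // ZMM16-ZMM31
  const uint32_t required = ymm | opmask | zmm512 | zmm32;
  return (xcr0_lo & required) == required;
}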
@ -703,6 +704,7 @@ public:
static bool supports_avx512bw() { return (_cpuFeatures & CPU_AVX512BW) != 0; }
static bool supports_avx512vl() { return (_cpuFeatures & CPU_AVX512VL) != 0; }
static bool supports_avx512vlbw() { return (supports_avx512bw() && supports_avx512vl()); }
static bool supports_avx512novl() { return (supports_evex() && !supports_avx512vl()); }
// Intel features
static bool is_intel_family_core() { return is_intel() &&
extended_cpu_family() == CPU_FAMILY_INTEL_CORE; }
@ -817,6 +819,12 @@ public:
intx count = PrefetchFieldsAhead;
return count >= 0 ? count : 1;
}
static uint32_t get_xsave_header_lower_segment() {
return _cpuid_info.xem_xcr0_eax.value;
}
static uint32_t get_xsave_header_upper_segment() {
return _cpuid_info.xem_xcr0_edx;
}
};
#endif // CPU_X86_VM_VM_VERSION_X86_HPP

View File

@ -1661,46 +1661,55 @@ const bool Matcher::match_rule_supported(int opcode) {
if (!has_match_rule(opcode))
return false;
bool ret_value = true;
switch (opcode) {
case Op_PopCountI:
case Op_PopCountL:
if (!UsePopCountInstruction)
return false;
break;
ret_value = false;
break;
case Op_MulVI:
if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
return false;
break;
ret_value = false;
break;
case Op_MulVL:
case Op_MulReductionVL:
if (VM_Version::supports_avx512dq() == false)
return false;
ret_value = false;
break;
case Op_AddReductionVL:
if (UseAVX < 3) // only EVEX : vector connectivity becomes an issue here
return false;
ret_value = false;
break;
case Op_AddReductionVI:
if (UseSSE < 3) // requires at least SSE3
return false;
ret_value = false;
break;
case Op_MulReductionVI:
if (UseSSE < 4) // requires at least SSE4
return false;
ret_value = false;
break;
case Op_AddReductionVF:
case Op_AddReductionVD:
case Op_MulReductionVF:
case Op_MulReductionVD:
if (UseSSE < 1) // requires at least SSE
return false;
break;
ret_value = false;
break;
case Op_SqrtVD:
if (UseAVX < 1) // enabled for AVX only
ret_value = false;
break;
case Op_CompareAndSwapL:
#ifdef _LP64
case Op_CompareAndSwapP:
#endif
if (!VM_Version::supports_cx8())
return false;
break;
ret_value = false;
break;
}
return true; // Per default match rules are supported.
return ret_value; // Per default match rules are supported.
}
// Max vector size in bytes. 0 if not supported.
@ -1721,14 +1730,24 @@ const int Matcher::vector_width_in_bytes(BasicType bt) {
case T_DOUBLE:
case T_LONG:
if (size < 16) return 0;
break;
case T_FLOAT:
case T_INT:
if (size < 8) return 0;
break;
case T_BOOLEAN:
case T_BYTE:
if (size < 4) return 0;
break;
case T_CHAR:
if (size < 4) return 0;
break;
case T_BYTE:
if (size < 4) return 0;
if ((size > 32) && !VM_Version::supports_avx512bw()) return 0;
break;
case T_SHORT:
if (size < 4) return 0;
if ((size > 16) && !VM_Version::supports_avx512bw()) return 0;
break;
default:
ShouldNotReachHere();
@ -1800,7 +1819,7 @@ static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo
__ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
break;
case Op_VecZ:
__ evmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
__ evmovdqul(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
break;
default:
ShouldNotReachHere();
@ -1855,7 +1874,7 @@ static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
__ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
break;
case Op_VecZ:
__ evmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
__ evmovdqul(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
break;
default:
ShouldNotReachHere();
@ -1875,7 +1894,7 @@ static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
__ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
break;
case Op_VecZ:
__ evmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
__ evmovdqul(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
break;
default:
ShouldNotReachHere();
@ -1929,9 +1948,40 @@ static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
}
#endif
}
int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : (UseAVX > 2) ? 6 : 4);
bool is_single_byte = false;
int vec_len = 0;
if ((UseAVX > 2) && (stack_offset != 0)) {
switch (ireg) {
case Op_VecS:
case Op_VecD:
case Op_VecX:
break;
case Op_VecY:
vec_len = 1;
break;
case Op_VecZ:
vec_len = 2;
break;
}
is_single_byte = Assembler::query_compressed_disp_byte(stack_offset, true, vec_len, Assembler::EVEX_FVM, Assembler::EVEX_32bit, 0);
}
int offset_size = 0;
int size = 5;
if (UseAVX > 2 ) {
if ((VM_Version::supports_avx512vl() == false) && (vec_len == 2)) {
offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
size += 2; // Need an additional two bytes for EVEX encoding
} else if ((VM_Version::supports_avx512vl() == false) && (vec_len < 2)) {
offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
} else {
offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
size += 2; // Need an additional two bytes for EVEX encoding
}
} else {
offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
}
// VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
return 5+offset_size;
return size+offset_size;
}
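The sizing logic above leans on EVEX's compressed displacement (disp8*N): for a full-vector memory operand the encoded displacement is the byte offset divided by the vector width, so many offsets that would need four bytes under VEX still fit in one. Below is a hedged sketch of that rule with a hypothetical helper name, assuming the EVEX_FVM tuple used above.

// Illustrative only; mirrors the spirit of query_compressed_disp_byte above.
static int evex_disp_bytes(int stack_offset, int vector_bytes) {
  if (stack_offset == 0) return 0;
  int scaled = stack_offset / vector_bytes;
  if ((stack_offset % vector_bytes) == 0 && scaled >= -128 && scaled <= 127) {
    return 1;                 // disp8*N: one byte covers e.g. rsp+128 for a ZMM spill
  }
  return 4;                   // otherwise fall back to a full 32-bit displacement
}

For example, a 64-byte ZMM spill at rsp+128 scales to 128/64 = 2, so the encoding is 5 (base) + 2 (extra EVEX prefix bytes) + 1 (displacement) = 8 bytes, which matches what the size computation above returns for that case.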
static inline jfloat replicate4_imm(int con, int width) {
@ -2675,11 +2725,10 @@ instruct negF_reg_reg(regF dst, regF src) %{
predicate(UseAVX > 0);
match(Set dst (NegF src));
ins_cost(150);
format %{ "vxorps $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
format %{ "vnegatess $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
ins_encode %{
int vector_len = 0;
__ vxorps($dst$$XMMRegister, $src$$XMMRegister,
ExternalAddress(float_signflip()), vector_len);
__ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
ExternalAddress(float_signflip()));
%}
ins_pipe(pipe_slow);
%}
@ -2700,12 +2749,11 @@ instruct negD_reg_reg(regD dst, regD src) %{
predicate(UseAVX > 0);
match(Set dst (NegD src));
ins_cost(150);
format %{ "vxorpd $dst, $src, [0x8000000000000000]\t"
format %{ "vnegatess $dst, $src, [0x8000000000000000]\t"
"# neg double by sign flipping" %}
ins_encode %{
int vector_len = 0;
__ vxorpd($dst$$XMMRegister, $src$$XMMRegister,
ExternalAddress(double_signflip()), vector_len);
__ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
ExternalAddress(double_signflip()));
%}
ins_pipe(pipe_slow);
%}
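The negF/negD rules above negate by flipping the IEEE sign bit with the mask loaded from float_signflip()/double_signflip(). A scalar model of that trick, for illustration only (this function is not HotSpot code):

#include <cstdint>
#include <cstring>

// Negate a float by xor-ing its sign bit, the same trick vnegatess encodes.
static float negate_by_signflip(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  bits ^= 0x80000000u;          // the float_signflip() mask
  std::memcpy(&x, &bits, sizeof(bits));
  return x;
}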
@ -2838,7 +2886,7 @@ instruct loadV64(vecZ dst, memory mem) %{
format %{ "vmovdqu $dst k0,$mem\t! load vector (64 bytes)" %}
ins_encode %{
int vector_len = 2;
__ evmovdqu($dst$$XMMRegister, $mem$$Address, vector_len);
__ evmovdqul($dst$$XMMRegister, $mem$$Address, vector_len);
%}
ins_pipe( pipe_slow );
%}
@ -2895,7 +2943,7 @@ instruct storeV64(memory mem, vecZ src) %{
format %{ "vmovdqu $mem k0,$src\t! store vector (64 bytes)" %}
ins_encode %{
int vector_len = 2;
__ evmovdqu($mem$$Address, $src$$XMMRegister, vector_len);
__ evmovdqul($mem$$Address, $src$$XMMRegister, vector_len);
%}
ins_pipe( pipe_slow );
%}
@ -3315,6 +3363,37 @@ instruct Repl8F_mem(vecY dst, memory mem) %{
ins_pipe( pipe_slow );
%}
instruct Repl2F_zero(vecD dst, immF0 zero) %{
predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
match(Set dst (ReplicateF zero));
format %{ "xorps $dst,$dst\t! replicate2F zero" %}
ins_encode %{
__ xorps($dst$$XMMRegister, $dst$$XMMRegister);
%}
ins_pipe( fpu_reg_reg );
%}
instruct Repl4F_zero(vecX dst, immF0 zero) %{
predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
match(Set dst (ReplicateF zero));
format %{ "xorps $dst,$dst\t! replicate4F zero" %}
ins_encode %{
__ xorps($dst$$XMMRegister, $dst$$XMMRegister);
%}
ins_pipe( fpu_reg_reg );
%}
instruct Repl8F_zero(vecY dst, immF0 zero) %{
predicate(n->as_Vector()->length() == 8 && UseAVX < 3);
match(Set dst (ReplicateF zero));
format %{ "vxorps $dst,$dst,$dst\t! replicate8F zero" %}
ins_encode %{
int vector_len = 1;
__ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
%}
ins_pipe( fpu_reg_reg );
%}
instruct Repl2D_mem(vecX dst, memory mem) %{
predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
match(Set dst (ReplicateD (LoadD mem)));
@ -3349,6 +3428,28 @@ instruct Repl4D_mem(vecY dst, memory mem) %{
ins_pipe( pipe_slow );
%}
// Replicate double (8 byte) scalar zero to be vector
instruct Repl2D_zero(vecX dst, immD0 zero) %{
predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
match(Set dst (ReplicateD zero));
format %{ "xorpd $dst,$dst\t! replicate2D zero" %}
ins_encode %{
__ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
%}
ins_pipe( fpu_reg_reg );
%}
instruct Repl4D_zero(vecY dst, immD0 zero) %{
predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
match(Set dst (ReplicateD zero));
format %{ "vxorpd $dst,$dst,$dst,vect256\t! replicate4D zero" %}
ins_encode %{
int vector_len = 1;
__ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
%}
ins_pipe( fpu_reg_reg );
%}
// ====================GENERIC REPLICATE==========================================
// Replicate byte scalar to be vector
@ -3680,38 +3781,6 @@ instruct Repl4F(vecX dst, regF src) %{
ins_pipe( pipe_slow );
%}
// Replicate float (4 byte) scalar zero to be vector
instruct Repl2F_zero(vecD dst, immF0 zero) %{
predicate(n->as_Vector()->length() == 2);
match(Set dst (ReplicateF zero));
format %{ "xorps $dst,$dst\t! replicate2F zero" %}
ins_encode %{
__ xorps($dst$$XMMRegister, $dst$$XMMRegister);
%}
ins_pipe( fpu_reg_reg );
%}
instruct Repl4F_zero(vecX dst, immF0 zero) %{
predicate(n->as_Vector()->length() == 4);
match(Set dst (ReplicateF zero));
format %{ "xorps $dst,$dst\t! replicate4F zero" %}
ins_encode %{
__ xorps($dst$$XMMRegister, $dst$$XMMRegister);
%}
ins_pipe( fpu_reg_reg );
%}
instruct Repl8F_zero(vecY dst, immF0 zero) %{
predicate(n->as_Vector()->length() == 8);
match(Set dst (ReplicateF zero));
format %{ "vxorps $dst,$dst,$dst\t! replicate8F zero" %}
ins_encode %{
int vector_len = 1;
__ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
%}
ins_pipe( fpu_reg_reg );
%}
// Replicate double (8 bytes) scalar to be vector
instruct Repl2D(vecX dst, regD src) %{
predicate(n->as_Vector()->length() == 2);
@ -3723,28 +3792,6 @@ instruct Repl2D(vecX dst, regD src) %{
ins_pipe( pipe_slow );
%}
// Replicate double (8 byte) scalar zero to be vector
instruct Repl2D_zero(vecX dst, immD0 zero) %{
predicate(n->as_Vector()->length() == 2);
match(Set dst (ReplicateD zero));
format %{ "xorpd $dst,$dst\t! replicate2D zero" %}
ins_encode %{
__ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
%}
ins_pipe( fpu_reg_reg );
%}
instruct Repl4D_zero(vecY dst, immD0 zero) %{
predicate(n->as_Vector()->length() == 4);
match(Set dst (ReplicateD zero));
format %{ "vxorpd $dst,$dst,$dst,vect256\t! replicate4D zero" %}
ins_encode %{
int vector_len = 1;
__ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
%}
ins_pipe( fpu_reg_reg );
%}
// ====================EVEX REPLICATE=============================================
instruct Repl4B_mem_evex(vecS dst, memory mem) %{
@ -3814,7 +3861,7 @@ instruct Repl32B_mem_evex(vecY dst, memory mem) %{
%}
instruct Repl64B_evex(vecZ dst, rRegI src) %{
predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
match(Set dst (ReplicateB src));
format %{ "vpbroadcastb $dst,$src\t! upper replicate64B" %}
ins_encode %{
@ -3825,7 +3872,7 @@ instruct Repl64B_evex(vecZ dst, rRegI src) %{
%}
instruct Repl64B_mem_evex(vecZ dst, memory mem) %{
predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512vlbw());
predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
match(Set dst (ReplicateB (LoadB mem)));
format %{ "vpbroadcastb $dst,$mem\t! replicate64B" %}
ins_encode %{
@ -3862,7 +3909,7 @@ instruct Repl32B_imm_evex(vecY dst, immI con) %{
%}
instruct Repl64B_imm_evex(vecZ dst, immI con) %{
predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
match(Set dst (ReplicateB con));
format %{ "movq $dst,[$constantaddress]\n\t"
"vpbroadcastb $dst,$dst\t! upper replicate64B" %}
@ -3953,7 +4000,7 @@ instruct Repl16S_mem_evex(vecY dst, memory mem) %{
%}
instruct Repl32S_evex(vecZ dst, rRegI src) %{
predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
match(Set dst (ReplicateS src));
format %{ "vpbroadcastw $dst,$src\t! replicate32S" %}
ins_encode %{
@ -3964,7 +4011,7 @@ instruct Repl32S_evex(vecZ dst, rRegI src) %{
%}
instruct Repl32S_mem_evex(vecZ dst, memory mem) %{
predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
match(Set dst (ReplicateS (LoadS mem)));
format %{ "vpbroadcastw $dst,$mem\t! replicate32S" %}
ins_encode %{
@ -4001,7 +4048,7 @@ instruct Repl16S_imm_evex(vecY dst, immI con) %{
%}
instruct Repl32S_imm_evex(vecZ dst, immI con) %{
predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
match(Set dst (ReplicateS con));
format %{ "movq $dst,[$constantaddress]\n\t"
"vpbroadcastw $dst,$dst\t! replicate32S" %}
@ -4318,13 +4365,50 @@ instruct Repl16F_mem_evex(vecZ dst, memory mem) %{
ins_pipe( pipe_slow );
%}
instruct Repl2F_zero_evex(vecD dst, immF0 zero) %{
predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
match(Set dst (ReplicateF zero));
format %{ "vpxor $dst k0,$dst,$dst\t! replicate2F zero" %}
ins_encode %{
// Use vpxor in place of vxorps since EVEX has a constraint on dq for vxorps: this is a 512-bit operation
int vector_len = 2;
__ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
%}
ins_pipe( fpu_reg_reg );
%}
instruct Repl4F_zero_evex(vecX dst, immF0 zero) %{
predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
match(Set dst (ReplicateF zero));
format %{ "vpxor $dst k0,$dst,$dst\t! replicate4F zero" %}
ins_encode %{
// Use vpxor in place of vxorps since EVEX has a constraint on dq for vxorps: this is a 512-bit operation
int vector_len = 2;
__ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
%}
ins_pipe( fpu_reg_reg );
%}
instruct Repl8F_zero_evex(vecY dst, immF0 zero) %{
predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
match(Set dst (ReplicateF zero));
format %{ "vpxor $dst k0,$dst,$dst\t! replicate8F zero" %}
ins_encode %{
// Use vpxor in place of vxorps since EVEX has a constraint on dq for vxorps: this is a 512-bit operation
int vector_len = 2;
__ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
%}
ins_pipe( fpu_reg_reg );
%}
instruct Repl16F_zero_evex(vecZ dst, immF0 zero) %{
predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
match(Set dst (ReplicateF zero));
format %{ "vxorps $dst k0,$dst,$dst\t! replicate16F zero" %}
format %{ "vpxor $dst k0,$dst,$dst\t! replicate16F zero" %}
ins_encode %{
// Use vpxor in place of vxorps since EVEX has a constraint on dq for vxorps: this is a 512-bit operation
int vector_len = 2;
__ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
__ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
%}
ins_pipe( fpu_reg_reg );
%}
@ -4373,13 +4457,38 @@ instruct Repl8D_mem_evex(vecZ dst, memory mem) %{
ins_pipe( pipe_slow );
%}
instruct Repl2D_zero_evex(vecX dst, immD0 zero) %{
predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
match(Set dst (ReplicateD zero));
format %{ "vpxor $dst k0,$dst,$dst\t! replicate2D zero" %}
ins_encode %{
// Use vpxor in place of vxorpd since EVEX has a constraint on dq for vxorpd: this is a 512-bit operation
int vector_len = 2;
__ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
%}
ins_pipe( fpu_reg_reg );
%}
instruct Repl4D_zero_evex(vecY dst, immD0 zero) %{
predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
match(Set dst (ReplicateD zero));
format %{ "vpxor $dst k0,$dst,$dst\t! replicate4D zero" %}
ins_encode %{
// Use vpxor in place of vxorpd since EVEX has a constraint on dq for vxorpd: this is a 512-bit operation
int vector_len = 2;
__ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
%}
ins_pipe( fpu_reg_reg );
%}
instruct Repl8D_zero_evex(vecZ dst, immD0 zero) %{
predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
match(Set dst (ReplicateD zero));
format %{ "vxorpd $dst k0,$dst,$dst,vect512\t! replicate8D zero" %}
format %{ "vpxor $dst k0,$dst,$dst,vect512\t! replicate8D zero" %}
ins_encode %{
// Use vpxor in place of vxorpd since EVEX has a constraint on dq for vxorpd: this is a 512-bit operation
int vector_len = 2;
__ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
__ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
%}
ins_pipe( fpu_reg_reg );
%}
@ -7474,6 +7583,75 @@ instruct vshiftcnt(vecS dst, rRegI cnt) %{
ins_pipe( pipe_slow );
%}
// --------------------------------- Sqrt --------------------------------------
// Floating point vector sqrt - double precision only
instruct vsqrt2D_reg(vecX dst, vecX src) %{
predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
match(Set dst (SqrtVD src));
format %{ "vsqrtpd $dst,$src\t! sqrt packed2D" %}
ins_encode %{
int vector_len = 0;
__ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
%}
ins_pipe( pipe_slow );
%}
instruct vsqrt2D_mem(vecX dst, memory mem) %{
predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
match(Set dst (SqrtVD (LoadVector mem)));
format %{ "vsqrtpd $dst,$mem\t! sqrt packed2D" %}
ins_encode %{
int vector_len = 0;
__ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
%}
ins_pipe( pipe_slow );
%}
instruct vsqrt4D_reg(vecY dst, vecY src) %{
predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
match(Set dst (SqrtVD src));
format %{ "vsqrtpd $dst,$src\t! sqrt packed4D" %}
ins_encode %{
int vector_len = 1;
__ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
%}
ins_pipe( pipe_slow );
%}
instruct vsqrt4D_mem(vecY dst, memory mem) %{
predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
match(Set dst (SqrtVD (LoadVector mem)));
format %{ "vsqrtpd $dst,$mem\t! sqrt packed4D" %}
ins_encode %{
int vector_len = 1;
__ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
%}
ins_pipe( pipe_slow );
%}
instruct vsqrt8D_reg(vecZ dst, vecZ src) %{
predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
match(Set dst (SqrtVD src));
format %{ "vsqrtpd $dst,$src\t! sqrt packed8D" %}
ins_encode %{
int vector_len = 2;
__ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
%}
ins_pipe( pipe_slow );
%}
instruct vsqrt8D_mem(vecZ dst, memory mem) %{
predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
match(Set dst (SqrtVD (LoadVector mem)));
format %{ "vsqrtpd $dst,$mem\t! sqrt packed8D" %}
ins_encode %{
int vector_len = 2;
__ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
%}
ins_pipe( pipe_slow );
%}
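The SqrtVD rules above emit vsqrtpd over 2, 4 or 8 double lanes depending on the vector register class. Per lane they compute nothing more exotic than the sketch below (illustrative only, not HotSpot code):

#include <cmath>

// Scalar model of the packed double-precision square root emitted above.
static void sqrt_packed_doubles(double* dst, const double* src, int lanes) {
  for (int i = 0; i < lanes; i++) {   // lanes = 2 (XMM), 4 (YMM) or 8 (ZMM)
    dst[i] = std::sqrt(src[i]);
  }
}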
// ------------------------------ LeftShift -----------------------------------
// Shorts/Chars vector left shift

View File

@ -1004,10 +1004,10 @@ static int vec_stack_to_stack_helper(CodeBuffer *cbuf, bool do_size, int src_off
__ vmovdqu(Address(rsp, dst_offset), xmm0);
__ vmovdqu(xmm0, Address(rsp, -32));
case Op_VecZ:
__ evmovdqu(Address(rsp, -64), xmm0, 2);
__ evmovdqu(xmm0, Address(rsp, src_offset), 2);
__ evmovdqu(Address(rsp, dst_offset), xmm0, 2);
__ evmovdqu(xmm0, Address(rsp, -64), 2);
__ evmovdqul(Address(rsp, -64), xmm0, 2);
__ evmovdqul(xmm0, Address(rsp, src_offset), 2);
__ evmovdqul(Address(rsp, dst_offset), xmm0, 2);
__ evmovdqul(xmm0, Address(rsp, -64), 2);
break;
default:
ShouldNotReachHere();

View File

@ -1075,10 +1075,10 @@ static void vec_stack_to_stack_helper(CodeBuffer *cbuf, int src_offset,
__ vmovdqu(Address(rsp, dst_offset), xmm0);
__ vmovdqu(xmm0, Address(rsp, -32));
case Op_VecZ:
__ evmovdqu(Address(rsp, -64), xmm0, 2);
__ evmovdqu(xmm0, Address(rsp, src_offset), 2);
__ evmovdqu(Address(rsp, dst_offset), xmm0, 2);
__ evmovdqu(xmm0, Address(rsp, -64), 2);
__ evmovdqul(Address(rsp, -64), xmm0, 2);
__ evmovdqul(xmm0, Address(rsp, src_offset), 2);
__ evmovdqul(Address(rsp, dst_offset), xmm0, 2);
__ evmovdqul(xmm0, Address(rsp, -64), 2);
break;
default:
ShouldNotReachHere();

View File

@ -2211,9 +2211,13 @@ void os::pd_print_cpu_info(outputStream* st, char* buf, size_t buflen) {
}
}
const char* search_string = IA32_ONLY("model name") AMD64_ONLY("model name")
IA64_ONLY("") SPARC_ONLY("cpu")
ARM32_ONLY("Processor") PPC_ONLY("Processor") AARCH64_ONLY("Processor");
#if defined(AMD64) || defined(IA32) || defined(X32)
const char* search_string = "model name";
#elif defined(SPARC)
const char* search_string = "cpu";
#else
const char* search_string = "Processor";
#endif
// Parses the cpuinfo file for string representing the model name.
void os::get_summary_cpu_info(char* cpuinfo, size_t length) {
@ -2248,9 +2252,25 @@ void os::get_summary_cpu_info(char* cpuinfo, size_t length) {
}
// cpuinfo not found or parsing failed, just print generic string. The entire
// /proc/cpuinfo file will be printed later in the file (or enough of it for x86)
strncpy(cpuinfo, IA32_ONLY("x86_32") AMD64_ONLY("x86_32")
IA64_ONLY("IA64") SPARC_ONLY("sparcv9")
ARM32_ONLY("ARM") PPC_ONLY("PPC64") AARCH64_ONLY("AArch64"), length);
#if defined(AMD64)
strncpy(cpuinfo, "x86_64", length);
#elif defined(IA32)
strncpy(cpuinfo, "x86_32", length);
#elif defined(IA64)
strncpy(cpuinfo, "IA64", length);
#elif defined(SPARC)
strncpy(cpuinfo, "sparcv9", length);
#elif defined(AARCH64)
strncpy(cpuinfo, "AArch64", length);
#elif defined(ARM)
strncpy(cpuinfo, "ARM", length);
#elif defined(PPC)
strncpy(cpuinfo, "PPC64", length);
#elif defined(ZERO_LIBARCH)
strncpy(cpuinfo, ZERO_LIBARCH, length);
#else
strncpy(cpuinfo, "unknown", length);
#endif
}
void os::print_siginfo(outputStream* st, void* siginfo) {

View File

@ -4877,6 +4877,26 @@ char* os::pd_remap_memory(int fd, const char* file_name, size_t file_offset,
// Returns true=success, otherwise false.
bool os::pd_unmap_memory(char* addr, size_t bytes) {
MEMORY_BASIC_INFORMATION mem_info;
if (VirtualQuery(addr, &mem_info, sizeof(mem_info)) == 0) {
if (PrintMiscellaneous && Verbose) {
DWORD err = GetLastError();
tty->print_cr("VirtualQuery() failed: GetLastError->%ld.", err);
}
return false;
}
// Executable memory was not mapped using CreateFileMapping/MapViewOfFileEx.
// Instead, the executable region was allocated using VirtualAlloc(). See
// pd_map_memory() above.
//
// The following flags should match the 'exec_access' flags used for
// VirtualProtect() in pd_map_memory().
if (mem_info.Protect == PAGE_EXECUTE_READ ||
mem_info.Protect == PAGE_EXECUTE_READWRITE) {
return pd_release_memory(addr, bytes);
}
BOOL result = UnmapViewOfFile(addr);
if (result == 0) {
if (PrintMiscellaneous && Verbose) {

View File

@ -4143,6 +4143,7 @@ bool MatchRule::is_vector() const {
"SubVB","SubVS","SubVI","SubVL","SubVF","SubVD",
"MulVS","MulVI","MulVL","MulVF","MulVD",
"DivVF","DivVD",
"SqrtVD",
"AndV" ,"XorV" ,"OrV",
"AddReductionVI", "AddReductionVL",
"AddReductionVF", "AddReductionVD",

View File

@ -304,8 +304,7 @@ AdaptiveSizePolicy* CMSCollector::size_policy() {
void ConcurrentMarkSweepGeneration::initialize_performance_counters() {
const char* gen_name = "old";
GenCollectorPolicy* gcp = (GenCollectorPolicy*) GenCollectedHeap::heap()->collector_policy();
GenCollectorPolicy* gcp = GenCollectedHeap::heap()->gen_policy();
// Generation Counters - generation 1, 1 subspace
_gen_counters = new GenerationCounters(gen_name, 1, 1,
gcp->min_old_size(), gcp->max_old_size(), &_virtual_space);

View File

@ -83,7 +83,7 @@ CollectionSetChooser::CollectionSetChooser() :
_regions((ResourceObj::set_allocation_type((address) &_regions,
ResourceObj::C_HEAP),
100), true /* C_Heap */),
_curr_index(0), _length(0), _first_par_unreserved_idx(0),
_front(0), _end(0), _first_par_unreserved_idx(0),
_region_live_threshold_bytes(0), _remaining_reclaimable_bytes(0) {
_region_live_threshold_bytes =
HeapRegion::GrainBytes * (size_t) G1MixedGCLiveThresholdPercent / 100;
@ -91,19 +91,19 @@ CollectionSetChooser::CollectionSetChooser() :
#ifndef PRODUCT
void CollectionSetChooser::verify() {
guarantee(_length <= regions_length(),
err_msg("_length: %u regions length: %u", _length, regions_length()));
guarantee(_curr_index <= _length,
err_msg("_curr_index: %u _length: %u", _curr_index, _length));
guarantee(_end <= regions_length(),
err_msg("_end: %u regions length: %u", _end, regions_length()));
guarantee(_front <= _end,
err_msg("_front: %u _end: %u", _front, _end));
uint index = 0;
size_t sum_of_reclaimable_bytes = 0;
while (index < _curr_index) {
while (index < _front) {
guarantee(regions_at(index) == NULL,
"all entries before _curr_index should be NULL");
"all entries before _front should be NULL");
index += 1;
}
HeapRegion *prev = NULL;
while (index < _length) {
while (index < _end) {
HeapRegion *curr = regions_at(index++);
guarantee(curr != NULL, "Regions in _regions array cannot be NULL");
guarantee(!curr->is_young(), "should not be young!");
@ -132,15 +132,15 @@ void CollectionSetChooser::sort_regions() {
regions_trunc_to(_first_par_unreserved_idx);
}
_regions.sort(order_regions);
assert(_length <= regions_length(), "Requirement");
assert(_end <= regions_length(), "Requirement");
#ifdef ASSERT
for (uint i = 0; i < _length; i++) {
for (uint i = 0; i < _end; i++) {
assert(regions_at(i) != NULL, "Should be true by sorting!");
}
#endif // ASSERT
if (G1PrintRegionLivenessInfo) {
G1PrintRegionLivenessInfoClosure cl(gclog_or_tty, "Post-Sorting");
for (uint i = 0; i < _length; ++i) {
for (uint i = 0; i < _end; ++i) {
HeapRegion* r = regions_at(i);
cl.doHeapRegion(r);
}
@ -154,11 +154,19 @@ void CollectionSetChooser::add_region(HeapRegion* hr) {
err_msg("Pinned region shouldn't be added to the collection set (index %u)", hr->hrm_index()));
assert(!hr->is_young(), "should not be young!");
_regions.append(hr);
_length++;
_end++;
_remaining_reclaimable_bytes += hr->reclaimable_bytes();
hr->calc_gc_efficiency();
}
void CollectionSetChooser::push(HeapRegion* hr) {
assert(hr != NULL, "Can't put back a NULL region");
assert(_front >= 1, "Too many regions have been put back");
_front--;
regions_at_put(_front, hr);
_remaining_reclaimable_bytes += hr->reclaimable_bytes();
}
void CollectionSetChooser::prepare_for_par_region_addition(uint n_threads,
uint n_regions,
uint chunk_size) {
@ -193,7 +201,7 @@ void CollectionSetChooser::update_totals(uint region_num,
// We could have just used atomics instead of taking the
// lock. However, we currently don't have an atomic add for size_t.
MutexLockerEx x(ParGCRareEvent_lock, Mutex::_no_safepoint_check_flag);
_length += region_num;
_end += region_num;
_remaining_reclaimable_bytes += reclaimable_bytes;
} else {
assert(reclaimable_bytes == 0, "invariant");
@ -202,7 +210,7 @@ void CollectionSetChooser::update_totals(uint region_num,
void CollectionSetChooser::clear() {
_regions.clear();
_curr_index = 0;
_length = 0;
_front = 0;
_end = 0;
_remaining_reclaimable_bytes = 0;
};

View File

@ -48,12 +48,10 @@ class CollectionSetChooser: public CHeapObj<mtGC> {
// The index of the next candidate old region to be considered for
// addition to the CSet.
uint _curr_index;
uint _front;
// The number of candidate old regions added to the CSet chooser.
// Note: this is not updated when removing a region using
// remove_and_move_to_next() below.
uint _length;
// The index of the last candidate old region
uint _end;
// Keeps track of the start of the next array chunk to be claimed by
// parallel GC workers.
@ -73,31 +71,33 @@ public:
// collection without removing it from the CSet chooser.
HeapRegion* peek() {
HeapRegion* res = NULL;
if (_curr_index < _length) {
res = regions_at(_curr_index);
if (_front < _end) {
res = regions_at(_front);
assert(res != NULL,
err_msg("Unexpected NULL hr in _regions at index %u",
_curr_index));
_front));
}
return res;
}
// Remove the given region from the CSet chooser and move to the
// next one. The given region should be the current candidate region
// in the CSet chooser.
void remove_and_move_to_next(HeapRegion* hr) {
// next one.
HeapRegion* pop() {
HeapRegion* hr = regions_at(_front);
assert(hr != NULL, "pre-condition");
assert(_curr_index < _length, "pre-condition");
assert(regions_at(_curr_index) == hr, "pre-condition");
regions_at_put(_curr_index, NULL);
assert(_front < _end, "pre-condition");
regions_at_put(_front, NULL);
assert(hr->reclaimable_bytes() <= _remaining_reclaimable_bytes,
err_msg("remaining reclaimable bytes inconsistent "
"from region: " SIZE_FORMAT " remaining: " SIZE_FORMAT,
hr->reclaimable_bytes(), _remaining_reclaimable_bytes));
_remaining_reclaimable_bytes -= hr->reclaimable_bytes();
_curr_index += 1;
_front += 1;
return hr;
}
void push(HeapRegion* hr);
CollectionSetChooser();
void sort_regions();
@ -113,7 +113,7 @@ public:
}
// Returns the number candidate old regions added
uint length() { return _length; }
uint length() { return _end; }
// Serial version.
void add_region(HeapRegion *hr);
@ -135,7 +135,7 @@ public:
void clear();
// Return the number of candidate regions that remain to be collected.
uint remaining_regions() { return _length - _curr_index; }
uint remaining_regions() { return _end - _front; }
// Determine whether the CSet chooser has more candidate regions or not.
bool is_empty() { return remaining_regions() == 0; }
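The new pop()/push() pair replaces peek()/remove_and_move_to_next(): a caller claims the front candidate with pop() and can hand it back with push() if it decides not to collect it after all. Below is a hedged caller-side sketch; the helper names are assumptions, not the actual G1 policy code.

// Illustrative use of the CollectionSetChooser front/end protocol.
static bool have_time_for(HeapRegion* hr);          // assumed predicate
static void add_old_region_to_cset(HeapRegion* hr); // assumed consumer

void take_old_region_candidates(CollectionSetChooser* chooser) {
  while (!chooser->is_empty()) {
    HeapRegion* hr = chooser->pop();   // claim the candidate at _front
    if (!have_time_for(hr)) {
      chooser->push(hr);               // give it back; _front moves down again
      break;
    }
    add_old_region_to_cset(hr);
  }
}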

View File

@ -29,7 +29,7 @@
#include "gc/g1/g1HotCardCache.hpp"
#include "runtime/java.hpp"
ConcurrentG1Refine::ConcurrentG1Refine(G1CollectedHeap* g1h, CardTableEntryClosure* refine_closure) :
ConcurrentG1Refine::ConcurrentG1Refine(G1CollectedHeap* g1h) :
_threads(NULL), _n_threads(0),
_hot_card_cache(g1h)
{
@ -48,29 +48,46 @@ ConcurrentG1Refine::ConcurrentG1Refine(G1CollectedHeap* g1h, CardTableEntryClosu
FLAG_SET_DEFAULT(G1ConcRefinementRedZone, yellow_zone() * 2);
}
set_red_zone(MAX2<int>(G1ConcRefinementRedZone, yellow_zone()));
}
_n_worker_threads = thread_num();
ConcurrentG1Refine* ConcurrentG1Refine::create(G1CollectedHeap* g1h, CardTableEntryClosure* refine_closure, jint* ecode) {
ConcurrentG1Refine* cg1r = new ConcurrentG1Refine(g1h);
if (cg1r == NULL) {
*ecode = JNI_ENOMEM;
vm_shutdown_during_initialization("Could not create ConcurrentG1Refine");
return NULL;
}
cg1r->_n_worker_threads = thread_num();
// We need one extra thread to do the young gen rset size sampling.
_n_threads = _n_worker_threads + 1;
cg1r->_n_threads = cg1r->_n_worker_threads + 1;
reset_threshold_step();
cg1r->reset_threshold_step();
_threads = NEW_C_HEAP_ARRAY(ConcurrentG1RefineThread*, _n_threads, mtGC);
cg1r->_threads = NEW_C_HEAP_ARRAY_RETURN_NULL(ConcurrentG1RefineThread*, cg1r->_n_threads, mtGC);
if (cg1r->_threads == NULL) {
*ecode = JNI_ENOMEM;
vm_shutdown_during_initialization("Could not allocate an array for ConcurrentG1RefineThread");
return NULL;
}
uint worker_id_offset = DirtyCardQueueSet::num_par_ids();
ConcurrentG1RefineThread *next = NULL;
for (uint i = _n_threads - 1; i != UINT_MAX; i--) {
ConcurrentG1RefineThread* t = new ConcurrentG1RefineThread(this, next, refine_closure, worker_id_offset, i);
for (uint i = cg1r->_n_threads - 1; i != UINT_MAX; i--) {
ConcurrentG1RefineThread* t = new ConcurrentG1RefineThread(cg1r, next, refine_closure, worker_id_offset, i);
assert(t != NULL, "Conc refine should have been created");
if (t->osthread() == NULL) {
vm_shutdown_during_initialization("Could not create ConcurrentG1RefineThread");
*ecode = JNI_ENOMEM;
vm_shutdown_during_initialization("Could not create ConcurrentG1RefineThread");
return NULL;
}
assert(t->cg1r() == this, "Conc refine thread should refer to this");
_threads[i] = t;
assert(t->cg1r() == cg1r, "Conc refine thread should refer to this");
cg1r->_threads[i] = t;
next = t;
}
*ecode = JNI_OK;
return cg1r;
}
void ConcurrentG1Refine::reset_threshold_step() {

View File

@ -71,10 +71,15 @@ class ConcurrentG1Refine: public CHeapObj<mtGC> {
// Reset the threshold step value based of the current zone boundaries.
void reset_threshold_step();
ConcurrentG1Refine(G1CollectedHeap* g1h);
public:
ConcurrentG1Refine(G1CollectedHeap* g1h, CardTableEntryClosure* refine_closure);
~ConcurrentG1Refine();
// Returns a ConcurrentG1Refine instance if it succeeds in creating/initializing ConcurrentG1Refine and ConcurrentG1RefineThread.
// Otherwise, returns NULL with an error code.
static ConcurrentG1Refine* create(G1CollectedHeap* g1h, CardTableEntryClosure* refine_closure, jint* ecode);
void init(G1RegionToSpaceMapper* card_counts_storage);
void stop();

View File

@ -2025,7 +2025,6 @@ G1CollectedHeap::G1CollectedHeap(G1CollectorPolicy* policy_) :
_survivor_evac_stats(YoungPLABSize, PLABWeight),
_old_evac_stats(OldPLABSize, PLABWeight),
_expand_heap_after_alloc_failure(true),
_surviving_young_words(NULL),
_old_marking_cycles_started(0),
_old_marking_cycles_completed(0),
_heap_summary_sent(false),
@ -2126,7 +2125,11 @@ jint G1CollectedHeap::initialize() {
_refine_cte_cl = new RefineCardTableEntryClosure();
_cg1r = new ConcurrentG1Refine(this, _refine_cte_cl);
jint ecode = JNI_OK;
_cg1r = ConcurrentG1Refine::create(this, _refine_cte_cl, &ecode);
if (_cg1r == NULL) {
return ecode;
}
// Reserve the maximum.
@ -2397,6 +2400,10 @@ void G1CollectedHeap::ref_processing_init() {
// (for efficiency/performance)
}
CollectorPolicy* G1CollectedHeap::collector_policy() const {
return g1_policy();
}
size_t G1CollectedHeap::capacity() const {
return _hrm.length() * HeapRegion::GrainBytes;
}
@ -3694,10 +3701,6 @@ size_t G1CollectedHeap::pending_card_num() {
return (buffer_size * buffer_num + extra_cards) / oopSize;
}
size_t G1CollectedHeap::cards_scanned() {
return g1_rem_set()->cardsScanned();
}
class RegisterHumongousWithInCSetFastTestClosure : public HeapRegionClosure {
private:
size_t _total_humongous;
@ -3838,36 +3841,6 @@ void G1CollectedHeap::register_humongous_regions_with_cset() {
cl.flush_rem_set_entries();
}
void G1CollectedHeap::setup_surviving_young_words() {
assert(_surviving_young_words == NULL, "pre-condition");
uint array_length = g1_policy()->young_cset_region_length();
_surviving_young_words = NEW_C_HEAP_ARRAY(size_t, (size_t) array_length, mtGC);
if (_surviving_young_words == NULL) {
vm_exit_out_of_memory(sizeof(size_t) * array_length, OOM_MALLOC_ERROR,
"Not enough space for young surv words summary.");
}
memset(_surviving_young_words, 0, (size_t) array_length * sizeof(size_t));
#ifdef ASSERT
for (uint i = 0; i < array_length; ++i) {
assert( _surviving_young_words[i] == 0, "memset above" );
}
#endif // !ASSERT
}
void G1CollectedHeap::update_surviving_young_words(size_t* surv_young_words) {
assert_at_safepoint(true);
uint array_length = g1_policy()->young_cset_region_length();
for (uint i = 0; i < array_length; ++i) {
_surviving_young_words[i] += surv_young_words[i];
}
}
void G1CollectedHeap::cleanup_surviving_young_words() {
guarantee( _surviving_young_words != NULL, "pre-condition" );
FREE_C_HEAP_ARRAY(size_t, _surviving_young_words);
_surviving_young_words = NULL;
}
#ifdef ASSERT
class VerifyCSetClosure: public HeapRegionClosure {
public:
@ -4129,7 +4102,8 @@ G1CollectedHeap::do_collection_pause_at_safepoint(double target_pause_time_ms) {
g1_policy()->print_collection_set(g1_policy()->inc_cset_head(), gclog_or_tty);
#endif // YOUNG_LIST_VERBOSE
g1_policy()->finalize_cset(target_pause_time_ms);
double time_remaining_ms = g1_policy()->finalize_young_cset_part(target_pause_time_ms);
g1_policy()->finalize_old_cset_part(time_remaining_ms);
evacuation_info.set_collectionset_regions(g1_policy()->cset_region_length());
@ -4155,22 +4129,20 @@ G1CollectedHeap::do_collection_pause_at_safepoint(double target_pause_time_ms) {
collection_set_iterate(&cl);
#endif // ASSERT
setup_surviving_young_words();
// Initialize the GC alloc regions.
_allocator->init_gc_alloc_regions(evacuation_info);
G1ParScanThreadStateSet per_thread_states(this, workers()->active_workers(), g1_policy()->young_cset_region_length());
// Actually do the work...
evacuate_collection_set(evacuation_info);
evacuate_collection_set(evacuation_info, &per_thread_states);
free_collection_set(g1_policy()->collection_set(), evacuation_info);
const size_t* surviving_young_words = per_thread_states.surviving_young_words();
free_collection_set(g1_policy()->collection_set(), evacuation_info, surviving_young_words);
eagerly_reclaim_humongous_regions();
g1_policy()->clear_collection_set();
cleanup_surviving_young_words();
// Start a new incremental collection set for the next pause.
g1_policy()->start_incremental_cset_building();
@ -4255,7 +4227,8 @@ G1CollectedHeap::do_collection_pause_at_safepoint(double target_pause_time_ms) {
// investigate this in CR 7178365.
double sample_end_time_sec = os::elapsedTime();
double pause_time_ms = (sample_end_time_sec - sample_start_time_sec) * MILLIUNITS;
g1_policy()->record_collection_pause_end(pause_time_ms);
size_t total_cards_scanned = per_thread_states.total_cards_scanned();
g1_policy()->record_collection_pause_end(pause_time_ms, total_cards_scanned);
evacuation_info.set_collectionset_used_before(g1_policy()->collection_set_bytes_used_before());
evacuation_info.set_bytes_copied(g1_policy()->bytes_copied_during_gc());
@ -4541,15 +4514,15 @@ class G1KlassScanClosure : public KlassClosure {
class G1ParTask : public AbstractGangTask {
protected:
G1CollectedHeap* _g1h;
G1ParScanThreadState** _pss;
RefToScanQueueSet* _queues;
G1RootProcessor* _root_processor;
ParallelTaskTerminator _terminator;
uint _n_workers;
G1CollectedHeap* _g1h;
G1ParScanThreadStateSet* _pss;
RefToScanQueueSet* _queues;
G1RootProcessor* _root_processor;
ParallelTaskTerminator _terminator;
uint _n_workers;
public:
G1ParTask(G1CollectedHeap* g1h, G1ParScanThreadState** per_thread_states, RefToScanQueueSet *task_queues, G1RootProcessor* root_processor, uint n_workers)
G1ParTask(G1CollectedHeap* g1h, G1ParScanThreadStateSet* per_thread_states, RefToScanQueueSet *task_queues, G1RootProcessor* root_processor, uint n_workers)
: AbstractGangTask("G1 collection"),
_g1h(g1h),
_pss(per_thread_states),
@ -4607,7 +4580,7 @@ public:
ReferenceProcessor* rp = _g1h->ref_processor_stw();
G1ParScanThreadState* pss = _pss[worker_id];
G1ParScanThreadState* pss = _pss->state_for_worker(worker_id);
pss->set_ref_processor(rp);
bool only_young = _g1h->collector_state()->gcs_are_young();
@ -4664,9 +4637,12 @@ public:
worker_id);
G1ParPushHeapRSClosure push_heap_rs_cl(_g1h, pss);
_g1h->g1_rem_set()->oops_into_collection_set_do(&push_heap_rs_cl,
weak_root_cl,
worker_id);
size_t cards_scanned = _g1h->g1_rem_set()->oops_into_collection_set_do(&push_heap_rs_cl,
weak_root_cl,
worker_id);
_pss->add_cards_scanned(worker_id, cards_scanned);
double strong_roots_sec = os::elapsedTime() - start_strong_roots_sec;
double term_sec = 0.0;
@ -5263,15 +5239,15 @@ public:
class G1STWRefProcTaskExecutor: public AbstractRefProcTaskExecutor {
private:
G1CollectedHeap* _g1h;
G1ParScanThreadState** _pss;
RefToScanQueueSet* _queues;
WorkGang* _workers;
uint _active_workers;
G1CollectedHeap* _g1h;
G1ParScanThreadStateSet* _pss;
RefToScanQueueSet* _queues;
WorkGang* _workers;
uint _active_workers;
public:
G1STWRefProcTaskExecutor(G1CollectedHeap* g1h,
G1ParScanThreadState** per_thread_states,
G1ParScanThreadStateSet* per_thread_states,
WorkGang* workers,
RefToScanQueueSet *task_queues,
uint n_workers) :
@ -5295,14 +5271,14 @@ class G1STWRefProcTaskProxy: public AbstractGangTask {
typedef AbstractRefProcTaskExecutor::ProcessTask ProcessTask;
ProcessTask& _proc_task;
G1CollectedHeap* _g1h;
G1ParScanThreadState** _pss;
G1ParScanThreadStateSet* _pss;
RefToScanQueueSet* _task_queues;
ParallelTaskTerminator* _terminator;
public:
G1STWRefProcTaskProxy(ProcessTask& proc_task,
G1CollectedHeap* g1h,
G1ParScanThreadState** per_thread_states,
G1ParScanThreadStateSet* per_thread_states,
RefToScanQueueSet *task_queues,
ParallelTaskTerminator* terminator) :
AbstractGangTask("Process reference objects in parallel"),
@ -5320,7 +5296,7 @@ public:
G1STWIsAliveClosure is_alive(_g1h);
G1ParScanThreadState* pss = _pss[worker_id];
G1ParScanThreadState* pss = _pss->state_for_worker(worker_id);
pss->set_ref_processor(NULL);
G1ParScanExtRootClosure only_copy_non_heap_cl(_g1h, pss);
@ -5399,14 +5375,14 @@ void G1STWRefProcTaskExecutor::execute(EnqueueTask& enq_task) {
class G1ParPreserveCMReferentsTask: public AbstractGangTask {
protected:
G1CollectedHeap* _g1h;
G1ParScanThreadState** _pss;
RefToScanQueueSet* _queues;
ParallelTaskTerminator _terminator;
uint _n_workers;
G1CollectedHeap* _g1h;
G1ParScanThreadStateSet* _pss;
RefToScanQueueSet* _queues;
ParallelTaskTerminator _terminator;
uint _n_workers;
public:
G1ParPreserveCMReferentsTask(G1CollectedHeap* g1h, G1ParScanThreadState** per_thread_states, int workers, RefToScanQueueSet *task_queues) :
G1ParPreserveCMReferentsTask(G1CollectedHeap* g1h, G1ParScanThreadStateSet* per_thread_states, int workers, RefToScanQueueSet *task_queues) :
AbstractGangTask("ParPreserveCMReferents"),
_g1h(g1h),
_pss(per_thread_states),
@ -5419,7 +5395,7 @@ public:
ResourceMark rm;
HandleMark hm;
G1ParScanThreadState* pss = _pss[worker_id];
G1ParScanThreadState* pss = _pss->state_for_worker(worker_id);
pss->set_ref_processor(NULL);
assert(pss->queue_is_empty(), "both queue and overflow should be empty");
@ -5480,7 +5456,7 @@ public:
};
// Weak Reference processing during an evacuation pause (part 1).
void G1CollectedHeap::process_discovered_references(G1ParScanThreadState** per_thread_states) {
void G1CollectedHeap::process_discovered_references(G1ParScanThreadStateSet* per_thread_states) {
double ref_proc_start = os::elapsedTime();
ReferenceProcessor* rp = _ref_processor_stw;
@ -5525,7 +5501,7 @@ void G1CollectedHeap::process_discovered_references(G1ParScanThreadState** per_t
// JNI refs.
// Use only a single queue for this PSS.
G1ParScanThreadState* pss = per_thread_states[0];
G1ParScanThreadState* pss = per_thread_states->state_for_worker(0);
pss->set_ref_processor(NULL);
assert(pss->queue_is_empty(), "pre-condition");
@ -5586,7 +5562,7 @@ void G1CollectedHeap::process_discovered_references(G1ParScanThreadState** per_t
}
// Weak Reference processing during an evacuation pause (part 2).
void G1CollectedHeap::enqueue_discovered_references(G1ParScanThreadState** per_thread_states) {
void G1CollectedHeap::enqueue_discovered_references(G1ParScanThreadStateSet* per_thread_states) {
double ref_enq_start = os::elapsedTime();
ReferenceProcessor* rp = _ref_processor_stw;
@ -5621,7 +5597,7 @@ void G1CollectedHeap::enqueue_discovered_references(G1ParScanThreadState** per_t
g1_policy()->phase_times()->record_ref_enq_time(ref_enq_time * 1000.0);
}
void G1CollectedHeap::evacuate_collection_set(EvacuationInfo& evacuation_info) {
void G1CollectedHeap::evacuate_collection_set(EvacuationInfo& evacuation_info, G1ParScanThreadStateSet* per_thread_states) {
_expand_heap_after_alloc_failure = true;
_evacuation_failed = false;
@ -5641,11 +5617,6 @@ void G1CollectedHeap::evacuate_collection_set(EvacuationInfo& evacuation_info) {
double start_par_time_sec = os::elapsedTime();
double end_par_time_sec;
G1ParScanThreadState** per_thread_states = NEW_C_HEAP_ARRAY(G1ParScanThreadState*, n_workers, mtGC);
for (uint i = 0; i < n_workers; i++) {
per_thread_states[i] = new_par_scan_state(i);
}
{
G1RootProcessor root_processor(this, n_workers);
G1ParTask g1_par_task(this, per_thread_states, _task_queues, &root_processor, n_workers);
@ -5699,11 +5670,7 @@ void G1CollectedHeap::evacuate_collection_set(EvacuationInfo& evacuation_info) {
_allocator->release_gc_alloc_regions(evacuation_info);
g1_rem_set()->cleanup_after_oops_into_collection_set_do();
for (uint i = 0; i < n_workers; i++) {
G1ParScanThreadState* pss = per_thread_states[i];
delete pss;
}
FREE_C_HEAP_ARRAY(G1ParScanThreadState*, per_thread_states);
per_thread_states->flush();
record_obj_copy_mem_stats();
@ -6054,7 +6021,7 @@ void G1CollectedHeap::cleanUpCardTable() {
g1_policy()->phase_times()->record_clear_ct_time(elapsed * 1000.0);
}
void G1CollectedHeap::free_collection_set(HeapRegion* cs_head, EvacuationInfo& evacuation_info) {
void G1CollectedHeap::free_collection_set(HeapRegion* cs_head, EvacuationInfo& evacuation_info, const size_t* surviving_young_words) {
size_t pre_used = 0;
FreeRegionList local_free_list("Local List for CSet Freeing");
@ -6108,7 +6075,7 @@ void G1CollectedHeap::free_collection_set(HeapRegion* cs_head, EvacuationInfo& e
int index = cur->young_index_in_cset();
assert(index != -1, "invariant");
assert((uint) index < policy->young_cset_region_length(), "invariant");
size_t words_survived = _surviving_young_words[index];
size_t words_survived = surviving_young_words[index];
cur->record_surv_words_in_group(words_survived);
// At this point the we have 'popped' cur from the collection set

View File

@ -56,6 +56,7 @@ class HRRSCleanupTask;
class GenerationSpec;
class OopsInHeapRegionClosure;
class G1ParScanThreadState;
class G1ParScanThreadStateSet;
class G1KlassScanClosure;
class G1ParScanThreadState;
class ObjectClosure;
@ -192,6 +193,7 @@ class G1CollectedHeap : public CollectedHeap {
// Closures used in implementation.
friend class G1ParScanThreadState;
friend class G1ParScanThreadStateSet;
friend class G1ParTask;
friend class G1PLABAllocator;
friend class G1PrepareCompactClosure;
@ -309,14 +311,8 @@ private:
volatile unsigned _gc_time_stamp;
size_t* _surviving_young_words;
G1HRPrinter _hr_printer;
void setup_surviving_young_words();
void update_surviving_young_words(size_t* surv_young_words);
void cleanup_surviving_young_words();
// It decides whether an explicit GC should start a concurrent cycle
// instead of doing a STW GC. Currently, a concurrent cycle is
// explicitly started if:
@ -584,11 +580,11 @@ protected:
// Process any reference objects discovered during
// an incremental evacuation pause.
void process_discovered_references(G1ParScanThreadState** per_thread_states);
void process_discovered_references(G1ParScanThreadStateSet* per_thread_states);
// Enqueue any remaining discovered references
// after processing.
void enqueue_discovered_references(G1ParScanThreadState** per_thread_states);
void enqueue_discovered_references(G1ParScanThreadStateSet* per_thread_states);
public:
WorkGang* workers() const { return _workers; }
@ -683,9 +679,6 @@ public:
// Allocates a new heap region instance.
HeapRegion* new_heap_region(uint hrs_index, MemRegion mr);
// Allocates a new per thread par scan state for the given thread id.
G1ParScanThreadState* new_par_scan_state(uint worker_id);
// Allocate the highest free region in the reserved heap. This will commit
// regions as necessary.
HeapRegion* alloc_highest_free_region();
@ -799,7 +792,7 @@ protected:
bool do_collection_pause_at_safepoint(double target_pause_time_ms);
// Actually do the work of evacuating the collection set.
void evacuate_collection_set(EvacuationInfo& evacuation_info);
void evacuate_collection_set(EvacuationInfo& evacuation_info, G1ParScanThreadStateSet* per_thread_states);
// Print the header for the per-thread termination statistics.
static void print_termination_stats_hdr(outputStream* const st);
@ -833,7 +826,7 @@ protected:
// After a collection pause, make the regions in the CS into free
// regions.
void free_collection_set(HeapRegion* cs_head, EvacuationInfo& evacuation_info);
void free_collection_set(HeapRegion* cs_head, EvacuationInfo& evacuation_info, const size_t* surviving_young_words);
// Abandon the current collection set without recording policy
// statistics or updating free lists.
@ -1057,7 +1050,7 @@ public:
// The current policy object for the collector.
G1CollectorPolicy* g1_policy() const { return _g1_policy; }
virtual CollectorPolicy* collector_policy() const { return (CollectorPolicy*) g1_policy(); }
virtual CollectorPolicy* collector_policy() const;
// Adaptive size policy. No such thing for g1.
virtual AdaptiveSizePolicy* size_policy() { return NULL; }
@ -1610,7 +1603,6 @@ public:
public:
size_t pending_card_num();
size_t cards_scanned();
protected:
size_t _max_heap_capacity;

View File

@ -38,7 +38,3 @@ HeapRegion* G1CollectedHeap::new_heap_region(uint hrs_index,
MemRegion mr) {
return new HeapRegion(hrs_index, bot_shared(), mr);
}
G1ParScanThreadState* G1CollectedHeap::new_par_scan_state(uint worker_id) {
return new G1ParScanThreadState(this, worker_id);
}

View File

@ -923,7 +923,7 @@ bool G1CollectorPolicy::need_to_start_conc_mark(const char* source, size_t alloc
// Anything below that is considered to be zero
#define MIN_TIMER_GRANULARITY 0.0000001
void G1CollectorPolicy::record_collection_pause_end(double pause_time_ms) {
void G1CollectorPolicy::record_collection_pause_end(double pause_time_ms, size_t cards_scanned) {
double end_time_sec = os::elapsedTime();
assert(_cur_collection_pause_used_regions_at_start >= cset_region_length(),
"otherwise, the subtraction below does not make sense");
@ -1052,8 +1052,6 @@ void G1CollectorPolicy::record_collection_pause_end(double pause_time_ms) {
_cost_per_card_ms_seq->add(cost_per_card_ms);
}
size_t cards_scanned = _g1->cards_scanned();
double cost_per_entry_ms = 0.0;
if (cards_scanned > 10) {
cost_per_entry_ms = phase_times()->average_time_ms(G1GCPhaseTimes::ScanRS) / (double) cards_scanned;
@ -1871,7 +1869,7 @@ uint G1CollectorPolicy::calc_max_old_cset_length() {
}
void G1CollectorPolicy::finalize_cset(double target_pause_time_ms) {
double G1CollectorPolicy::finalize_young_cset_part(double target_pause_time_ms) {
double young_start_time_sec = os::elapsedTime();
YoungList* young_list = _g1->young_list();
@ -1883,7 +1881,6 @@ void G1CollectorPolicy::finalize_cset(double target_pause_time_ms) {
guarantee(_collection_set == NULL, "Precondition");
double base_time_ms = predict_base_elapsed_time_ms(_pending_cards);
double predicted_pause_time_ms = base_time_ms;
double time_remaining_ms = MAX2(target_pause_time_ms - base_time_ms, 0.0);
ergo_verbose4(ErgoCSetConstruction | ErgoHigh,
@ -1927,15 +1924,16 @@ void G1CollectorPolicy::finalize_cset(double target_pause_time_ms) {
_collection_set = _inc_cset_head;
_collection_set_bytes_used_before = _inc_cset_bytes_used_before;
time_remaining_ms = MAX2(time_remaining_ms - _inc_cset_predicted_elapsed_time_ms, 0.0);
predicted_pause_time_ms += _inc_cset_predicted_elapsed_time_ms;
ergo_verbose3(ErgoCSetConstruction | ErgoHigh,
ergo_verbose4(ErgoCSetConstruction | ErgoHigh,
"add young regions to CSet",
ergo_format_region("eden")
ergo_format_region("survivors")
ergo_format_ms("predicted young region time"),
ergo_format_ms("predicted young region time")
ergo_format_ms("target pause time"),
eden_region_length, survivor_region_length,
_inc_cset_predicted_elapsed_time_ms);
_inc_cset_predicted_elapsed_time_ms,
target_pause_time_ms);
// The number of recorded young regions is the incremental
// collection set's current size
@ -1944,8 +1942,13 @@ void G1CollectorPolicy::finalize_cset(double target_pause_time_ms) {
double young_end_time_sec = os::elapsedTime();
phase_times()->record_young_cset_choice_time_ms((young_end_time_sec - young_start_time_sec) * 1000.0);
// Set the start of the non-young choice time.
double non_young_start_time_sec = young_end_time_sec;
return time_remaining_ms;
}
void G1CollectorPolicy::finalize_old_cset_part(double time_remaining_ms) {
double non_young_start_time_sec = os::elapsedTime();
double predicted_old_time_ms = 0.0;
if (!collector_state()->gcs_are_young()) {
CollectionSetChooser* cset_chooser = _collectionSetChooser;
@ -2033,8 +2036,8 @@ void G1CollectorPolicy::finalize_cset(double target_pause_time_ms) {
// We will add this region to the CSet.
time_remaining_ms = MAX2(time_remaining_ms - predicted_time_ms, 0.0);
predicted_pause_time_ms += predicted_time_ms;
cset_chooser->remove_and_move_to_next(hr);
predicted_old_time_ms += predicted_time_ms;
cset_chooser->pop(); // already have region via peek()
_g1->old_set_remove(hr);
add_old_region_to_cset(hr);
@ -2068,16 +2071,13 @@ void G1CollectorPolicy::finalize_cset(double target_pause_time_ms) {
stop_incremental_cset_building();
ergo_verbose5(ErgoCSetConstruction,
ergo_verbose3(ErgoCSetConstruction,
"finish choosing CSet",
ergo_format_region("eden")
ergo_format_region("survivors")
ergo_format_region("old")
ergo_format_ms("predicted pause time")
ergo_format_ms("target pause time"),
eden_region_length, survivor_region_length,
ergo_format_ms("predicted old region time")
ergo_format_ms("time remaining"),
old_cset_region_length(),
predicted_pause_time_ms, target_pause_time_ms);
predicted_old_time_ms, time_remaining_ms);
double non_young_end_time_sec = os::elapsedTime();
phase_times()->record_non_young_cset_choice_time_ms((non_young_end_time_sec - non_young_start_time_sec) * 1000.0);

View File

@ -473,7 +473,7 @@ private:
// The number of bytes in the collection set before the pause. Set from
// the incrementally built collection set at the start of an evacuation
// pause, and incremented in finalize_cset() when adding old regions
// pause, and incremented in finalize_old_cset_part() when adding old regions
// (if any) to the collection set.
size_t _collection_set_bytes_used_before;
@ -634,7 +634,7 @@ public:
// Record the start and end of an evacuation pause.
void record_collection_pause_start(double start_time_sec);
void record_collection_pause_end(double pause_time_ms);
void record_collection_pause_end(double pause_time_ms, size_t cards_scanned);
// Record the start and end of a full collection.
void record_full_collection_start();
@ -689,7 +689,8 @@ public:
// Choose a new collection set. Marks the chosen regions as being
// "in_collection_set", and links them together. The head and number of
// the collection set are available via access methods.
void finalize_cset(double target_pause_time_ms);
double finalize_young_cset_part(double target_pause_time_ms);
virtual void finalize_old_cset_part(double time_remaining_ms);
// The head of the list (via "next_in_collection_set()") representing the
// current collection set.
@ -865,8 +866,8 @@ public:
return _recorded_survivor_regions;
}
void record_thread_age_table(ageTable* age_table) {
_survivors_age_table.merge_par(age_table);
void record_age_table(ageTable* age_table) {
_survivors_age_table.merge(age_table);
}
void update_max_gc_locker_expansion();

View File

@ -46,11 +46,11 @@ void G1EvacStats::adjust_desired_plab_sz() {
if (_allocated == 0) {
assert((_unused == 0),
err_msg("Inconsistency in PLAB stats: "
"_allocated: "SIZE_FORMAT", "
"_wasted: "SIZE_FORMAT", "
"_region_end_waste: "SIZE_FORMAT", "
"_unused: "SIZE_FORMAT", "
"_used : "SIZE_FORMAT,
"_allocated: " SIZE_FORMAT ", "
"_wasted: " SIZE_FORMAT ", "
"_region_end_waste: " SIZE_FORMAT ", "
"_unused: " SIZE_FORMAT ", "
"_used : " SIZE_FORMAT,
_allocated, _wasted, _region_end_waste, _unused, used()));
_allocated = 1;
}

View File

@ -32,7 +32,7 @@
#include "oops/oop.inline.hpp"
#include "runtime/prefetch.inline.hpp"
G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h, uint worker_id)
G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h, uint worker_id, size_t young_cset_length)
: _g1h(g1h),
_refs(g1h->task_queue(worker_id)),
_dcq(&g1h->dirty_card_queue_set()),
@ -51,8 +51,8 @@ G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h, uint worker_id)
// non-young regions (where the age is -1)
// We also add a few elements at the beginning and at the end in
// an attempt to eliminate cache contention
uint real_length = 1 + _g1h->g1_policy()->young_cset_region_length();
uint array_length = PADDING_ELEM_NUM +
size_t real_length = 1 + young_cset_length;
size_t array_length = PADDING_ELEM_NUM +
real_length +
PADDING_ELEM_NUM;
_surviving_young_words_base = NEW_C_HEAP_ARRAY(size_t, array_length, mtGC);
@ -60,7 +60,7 @@ G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h, uint worker_id)
vm_exit_out_of_memory(array_length * sizeof(size_t), OOM_MALLOC_ERROR,
"Not enough space for young surv histo.");
_surviving_young_words = _surviving_young_words_base + PADDING_ELEM_NUM;
memset(_surviving_young_words, 0, (size_t) real_length * sizeof(size_t));
memset(_surviving_young_words, 0, real_length * sizeof(size_t));
_plab_allocator = G1PLABAllocator::create_allocator(_g1h->allocator());
@ -71,13 +71,21 @@ G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h, uint worker_id)
_dest[InCSetState::Old] = InCSetState::Old;
}
G1ParScanThreadState::~G1ParScanThreadState() {
// Pass locally gathered statistics to global state.
void G1ParScanThreadState::flush(size_t* surviving_young_words) {
_dcq.flush();
// Update allocation statistics.
_plab_allocator->flush_and_retire_stats();
_g1h->g1_policy()->record_age_table(&_age_table);
uint length = _g1h->g1_policy()->young_cset_region_length();
for (uint region_index = 0; region_index < length; region_index++) {
surviving_young_words[region_index] += _surviving_young_words[region_index];
}
}
G1ParScanThreadState::~G1ParScanThreadState() {
delete _plab_allocator;
_g1h->g1_policy()->record_thread_age_table(&_age_table);
// Update heap statistics.
_g1h->update_surviving_young_words(_surviving_young_words);
FREE_C_HEAP_ARRAY(size_t, _surviving_young_words_base);
}
@ -314,6 +322,42 @@ oop G1ParScanThreadState::copy_to_survivor_space(InCSetState const state,
}
}
G1ParScanThreadState* G1ParScanThreadStateSet::state_for_worker(uint worker_id) {
assert(worker_id < _n_workers, "out of bounds access");
return _states[worker_id];
}
void G1ParScanThreadStateSet::add_cards_scanned(uint worker_id, size_t cards_scanned) {
assert(worker_id < _n_workers, "out of bounds access");
_cards_scanned[worker_id] += cards_scanned;
}
size_t G1ParScanThreadStateSet::total_cards_scanned() const {
assert(_flushed, "thread local state from the per thread states should have been flushed");
return _total_cards_scanned;
}
const size_t* G1ParScanThreadStateSet::surviving_young_words() const {
assert(_flushed, "thread local state from the per thread states should have been flushed");
return _surviving_young_words_total;
}
void G1ParScanThreadStateSet::flush() {
assert(!_flushed, "thread local state from the per thread states should be flushed once");
assert(_total_cards_scanned == 0, "should have been cleared");
for (uint worker_index = 0; worker_index < _n_workers; ++worker_index) {
G1ParScanThreadState* pss = _states[worker_index];
_total_cards_scanned += _cards_scanned[worker_index];
pss->flush(_surviving_young_words_total);
delete pss;
_states[worker_index] = NULL;
}
_flushed = true;
}
oop G1ParScanThreadState::handle_evacuation_failure_par(oop old, markOop m) {
assert(_g1h->obj_in_cs(old),
err_msg("Object " PTR_FORMAT " should be in the CSet", p2i(old)));

View File

@ -82,7 +82,7 @@ class G1ParScanThreadState : public CHeapObj<mtGC> {
}
public:
G1ParScanThreadState(G1CollectedHeap* g1h, uint worker_id);
G1ParScanThreadState(G1CollectedHeap* g1h, uint worker_id, size_t young_cset_length);
~G1ParScanThreadState();
void set_ref_processor(ReferenceProcessor* rp) { _scanner.set_ref_processor(rp); }
@ -121,6 +121,8 @@ class G1ParScanThreadState : public CHeapObj<mtGC> {
return _surviving_young_words + 1;
}
void flush(size_t* surviving_young_words);
private:
#define G1_PARTIAL_ARRAY_MASK 0x2
@ -189,4 +191,48 @@ class G1ParScanThreadState : public CHeapObj<mtGC> {
oop handle_evacuation_failure_par(oop obj, markOop m);
};
class G1ParScanThreadStateSet : public StackObj {
G1CollectedHeap* _g1h;
G1ParScanThreadState** _states;
size_t* _surviving_young_words_total;
size_t* _cards_scanned;
size_t _total_cards_scanned;
uint _n_workers;
bool _flushed;
public:
G1ParScanThreadStateSet(G1CollectedHeap* g1h, uint n_workers, size_t young_cset_length) :
_g1h(g1h),
_states(NEW_C_HEAP_ARRAY(G1ParScanThreadState*, n_workers, mtGC)),
_surviving_young_words_total(NEW_C_HEAP_ARRAY(size_t, young_cset_length, mtGC)),
_cards_scanned(NEW_C_HEAP_ARRAY(size_t, n_workers, mtGC)),
_total_cards_scanned(0),
_n_workers(n_workers),
_flushed(false) {
for (uint i = 0; i < n_workers; ++i) {
_states[i] = new_par_scan_state(i, young_cset_length);
}
memset(_surviving_young_words_total, 0, young_cset_length * sizeof(size_t));
memset(_cards_scanned, 0, n_workers * sizeof(size_t));
}
~G1ParScanThreadStateSet() {
assert(_flushed, "thread local state from the per thread states should have been flushed");
FREE_C_HEAP_ARRAY(G1ParScanThreadState*, _states);
FREE_C_HEAP_ARRAY(size_t, _surviving_young_words_total);
FREE_C_HEAP_ARRAY(size_t, _cards_scanned);
}
void flush();
G1ParScanThreadState* state_for_worker(uint worker_id);
void add_cards_scanned(uint worker_id, size_t cards_scanned);
size_t total_cards_scanned() const;
const size_t* surviving_young_words() const;
private:
G1ParScanThreadState* new_par_scan_state(uint worker_id, size_t young_cset_length);
};
#endif // SHARE_VM_GC_G1_G1PARSCANTHREADSTATE_HPP
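A minimal, self-contained sketch of the accumulate-then-flush idiom behind G1ParScanThreadStateSet, with plain standard-library types standing in for the GC classes; only the shape of state_for_worker()/flush()/total_cards_scanned() mirrors the header above. Reading the totals is only meaningful after flush(), which is why the asserts on _flushed are kept.

// Sketch only: per-worker counters merged once into shared totals.
#include <cassert>
#include <cstdio>
#include <vector>

class WorkerStats {
  size_t _cards_scanned;
 public:
  WorkerStats() : _cards_scanned(0) {}
  void add_cards_scanned(size_t n) { _cards_scanned += n; }
  size_t cards_scanned() const { return _cards_scanned; }
};

class StatsSet {
  std::vector<WorkerStats> _states;
  size_t _total_cards_scanned;
  bool _flushed;
 public:
  explicit StatsSet(unsigned n_workers)
    : _states(n_workers), _total_cards_scanned(0), _flushed(false) {}

  WorkerStats* state_for_worker(unsigned worker_id) {
    assert(worker_id < _states.size());
    return &_states[worker_id];
  }

  // Merge the thread-local values into the shared totals exactly once,
  // after all workers have finished.
  void flush() {
    assert(!_flushed);
    for (size_t i = 0; i < _states.size(); ++i) {
      _total_cards_scanned += _states[i].cards_scanned();
    }
    _flushed = true;
  }

  size_t total_cards_scanned() const {
    assert(_flushed);        // only valid after flush()
    return _total_cards_scanned;
  }
};

int main() {
  StatsSet set(2);
  set.state_for_worker(0)->add_cards_scanned(10);   // worker 0
  set.state_for_worker(1)->add_cards_scanned(32);   // worker 1
  set.flush();
  std::printf("total cards scanned: %zu\n", set.total_cards_scanned());
  return 0;
}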

View File

@ -0,0 +1,31 @@
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "precompiled.hpp"
#include "gc/g1/g1ParScanThreadState.hpp"
G1ParScanThreadState* G1ParScanThreadStateSet::new_par_scan_state(uint worker_id, size_t young_cset_length) {
return new G1ParScanThreadState(_g1h, worker_id, young_cset_length);
}

View File

@ -76,7 +76,6 @@ G1RemSet::G1RemSet(G1CollectedHeap* g1, CardTableModRefBS* ct_bs)
_ct_bs(ct_bs), _g1p(_g1->g1_policy()),
_cg1r(g1->concurrent_g1_refine()),
_cset_rs_update_cl(NULL),
_cards_scanned(NULL), _total_cards_scanned(0),
_prev_period_summary()
{
_cset_rs_update_cl = NEW_C_HEAP_ARRAY(G1ParPushHeapRSClosure*, n_workers(), mtGC);
@ -228,9 +227,9 @@ public:
size_t cards_looked_up() { return _cards;}
};
void G1RemSet::scanRS(G1ParPushHeapRSClosure* oc,
OopClosure* non_heap_roots,
uint worker_i) {
size_t G1RemSet::scanRS(G1ParPushHeapRSClosure* oc,
OopClosure* non_heap_roots,
uint worker_i) {
double rs_time_start = os::elapsedTime();
G1CodeBlobClosure code_root_cl(non_heap_roots);
@ -246,11 +245,10 @@ void G1RemSet::scanRS(G1ParPushHeapRSClosure* oc,
double scan_rs_time_sec = (os::elapsedTime() - rs_time_start)
- scanRScl.strong_code_root_scan_time_sec();
assert(_cards_scanned != NULL, "invariant");
_cards_scanned[worker_i] = scanRScl.cards_done();
_g1p->phase_times()->record_time_secs(G1GCPhaseTimes::ScanRS, worker_i, scan_rs_time_sec);
_g1p->phase_times()->record_time_secs(G1GCPhaseTimes::CodeRoots, worker_i, scanRScl.strong_code_root_scan_time_sec());
return scanRScl.cards_done();
}
// Closure used for updating RSets and recording references that
@ -298,9 +296,9 @@ void G1RemSet::cleanupHRRS() {
HeapRegionRemSet::cleanup();
}
void G1RemSet::oops_into_collection_set_do(G1ParPushHeapRSClosure* oc,
OopClosure* non_heap_roots,
uint worker_i) {
size_t G1RemSet::oops_into_collection_set_do(G1ParPushHeapRSClosure* oc,
OopClosure* non_heap_roots,
uint worker_i) {
#if CARD_REPEAT_HISTO
ct_freq_update_histo_and_reset();
#endif
@ -322,10 +320,11 @@ void G1RemSet::oops_into_collection_set_do(G1ParPushHeapRSClosure* oc,
DirtyCardQueue into_cset_dcq(&_g1->into_cset_dirty_card_queue_set());
updateRS(&into_cset_dcq, worker_i);
scanRS(oc, non_heap_roots, worker_i);
size_t cards_scanned = scanRS(oc, non_heap_roots, worker_i);
// We now clear the cached values of _cset_rs_update_cl for this worker
_cset_rs_update_cl[worker_i] = NULL;
return cards_scanned;
}
void G1RemSet::prepare_for_oops_into_collection_set_do() {
@ -333,23 +332,9 @@ void G1RemSet::prepare_for_oops_into_collection_set_do() {
_g1->set_refine_cte_cl_concurrency(false);
DirtyCardQueueSet& dcqs = JavaThread::dirty_card_queue_set();
dcqs.concatenate_logs();
guarantee( _cards_scanned == NULL, "invariant" );
_cards_scanned = NEW_C_HEAP_ARRAY(size_t, n_workers(), mtGC);
for (uint i = 0; i < n_workers(); ++i) {
_cards_scanned[i] = 0;
}
_total_cards_scanned = 0;
}
void G1RemSet::cleanup_after_oops_into_collection_set_do() {
guarantee( _cards_scanned != NULL, "invariant" );
_total_cards_scanned = 0;
for (uint i = 0; i < n_workers(); ++i) {
_total_cards_scanned += _cards_scanned[i];
}
FREE_C_HEAP_ARRAY(size_t, _cards_scanned);
_cards_scanned = NULL;
// Cleanup after copy
_g1->set_refine_cte_cl_concurrency(true);
// Set all cards back to clean.

View File

@ -62,9 +62,6 @@ protected:
ConcurrentG1Refine* _cg1r;
size_t* _cards_scanned;
size_t _total_cards_scanned;
// Used for caching the closure that is responsible for scanning
// references into the collection set.
G1ParPushHeapRSClosure** _cset_rs_update_cl;
@ -94,9 +91,12 @@ public:
// partitioning the work to be done. It should be the same as
// the "i" passed to the calling thread's work(i) function.
// In the sequential case this param will be ignored.
void oops_into_collection_set_do(G1ParPushHeapRSClosure* blk,
OopClosure* non_heap_roots,
uint worker_i);
//
// Returns the number of cards scanned while looking for pointers
// into the collection set.
size_t oops_into_collection_set_do(G1ParPushHeapRSClosure* blk,
OopClosure* non_heap_roots,
uint worker_i);
// Prepare for and cleanup after an oops_into_collection_set_do
// call. Must call each of these once before and after (in sequential
@ -106,14 +106,13 @@ public:
void prepare_for_oops_into_collection_set_do();
void cleanup_after_oops_into_collection_set_do();
void scanRS(G1ParPushHeapRSClosure* oc,
OopClosure* non_heap_roots,
uint worker_i);
size_t scanRS(G1ParPushHeapRSClosure* oc,
OopClosure* non_heap_roots,
uint worker_i);
void updateRS(DirtyCardQueue* into_cset_dcq, uint worker_i);
CardTableModRefBS* ct_bs() { return _ct_bs; }
size_t cardsScanned() { return _total_cards_scanned; }
// Record, if necessary, the fact that *p (where "p" is in region "from",
// which is required to be non-NULL) has changed to a new non-NULL value.

View File

@ -87,7 +87,7 @@ class ParallelScavengeHeap : public CollectedHeap {
return CollectedHeap::ParallelScavengeHeap;
}
virtual CollectorPolicy* collector_policy() const { return (CollectorPolicy*) _collector_policy; }
virtual CollectorPolicy* collector_policy() const { return _collector_policy; }
static PSYoungGen* young_gen() { return _young_gen; }
static PSOldGen* old_gen() { return _old_gen; }

View File

@ -213,7 +213,7 @@ DefNewGeneration::DefNewGeneration(ReservedSpace rs,
_max_eden_size = size - (2*_max_survivor_size);
// allocate the performance counters
GenCollectorPolicy* gcp = (GenCollectorPolicy*)gch->collector_policy();
GenCollectorPolicy* gcp = gch->gen_policy();
// Generation counters -- generation 0, 3 subspaces
_gen_counters = new GenerationCounters("new", 0, 3,

View File

@ -57,8 +57,7 @@ TenuredGeneration::TenuredGeneration(ReservedSpace rs,
// initialize performance counters
const char* gen_name = "old";
GenCollectorPolicy* gcp = (GenCollectorPolicy*) GenCollectedHeap::heap()->collector_policy();
GenCollectorPolicy* gcp = GenCollectedHeap::heap()->gen_policy();
// Generation Counters -- generation 1, 1 subspace
_gen_counters = new GenerationCounters(gen_name, 1, 1,
gcp->min_old_size(), gcp->max_old_size(), &_virtual_space);

View File

@ -28,7 +28,6 @@
#include "gc/shared/collectorPolicy.hpp"
#include "gc/shared/gcPolicyCounters.hpp"
#include "memory/resourceArea.hpp"
#include "runtime/atomic.inline.hpp"
#include "utilities/copy.hpp"
/* Copyright (c) 1992, 2015, Oracle and/or its affiliates, and Stanford University.
@ -73,12 +72,6 @@ void ageTable::merge(ageTable* subTable) {
}
}
void ageTable::merge_par(ageTable* subTable) {
for (int i = 0; i < table_size; i++) {
Atomic::add_ptr(subTable->sizes[i], &sizes[i]);
}
}
uint ageTable::compute_tenuring_threshold(size_t survivor_capacity, GCPolicyCounters* gc_counters) {
size_t desired_survivor_size = (size_t)((((double) survivor_capacity)*TargetSurvivorRatio)/100);
uint result;

View File

@ -68,7 +68,6 @@ class ageTable VALUE_OBJ_CLASS_SPEC {
// Merge another age table with the current one. Used
// for parallel young generation gc.
void merge(ageTable* subTable);
void merge_par(ageTable* subTable);
// calculate new tenuring threshold based on age information
uint compute_tenuring_threshold(size_t survivor_capacity, GCPolicyCounters* gc_counters);

View File

@ -447,14 +447,16 @@ void BlockOffsetArrayNonContigSpace::split_block(HeapWord* blk,
} else {
// Unilaterally fix the first (num_pref_cards - 1) following
// the "offset card" in the suffix block.
const size_t right_most_fixed_index = suff_index + num_pref_cards - 1;
set_remainder_to_point_to_start_incl(suff_index + 1,
suff_index + num_pref_cards - 1, true /* reducing */);
right_most_fixed_index, true /* reducing */);
// Fix the appropriate cards in the remainder of the
// suffix block -- these are the last num_pref_cards
// cards in each power block of the "new" range plumbed
// from suff_addr.
bool more = true;
uint i = 1;
// Fix the first power block with back_by > num_pref_cards.
while (more && (i < N_powers)) {
size_t back_by = power_to_cards_back(i);
size_t right_index = suff_index + back_by - 1;
@ -463,6 +465,9 @@ void BlockOffsetArrayNonContigSpace::split_block(HeapWord* blk,
right_index = end_index - 1;
more = false;
}
if (left_index <= right_most_fixed_index) {
left_index = right_most_fixed_index + 1;
}
if (back_by > num_pref_cards) {
// Fill in the remainder of this "power block", if it
// is non-null.
@ -471,12 +476,14 @@ void BlockOffsetArrayNonContigSpace::split_block(HeapWord* blk,
N_words + i - 1, true /* reducing */);
} else {
more = false; // we are done
assert((end_index - 1) == right_index, "Must be at the end.");
}
i++;
break;
}
i++;
}
// Fix the rest of the power blocks.
while (more && (i < N_powers)) {
size_t back_by = power_to_cards_back(i);
size_t right_index = suff_index + back_by - 1;

View File

@ -172,8 +172,6 @@ char* GenCollectedHeap::allocate(size_t alignment,
void GenCollectedHeap::post_initialize() {
CollectedHeap::post_initialize();
ref_processing_init();
GenCollectorPolicy *policy = (GenCollectorPolicy *)collector_policy();
guarantee(policy->is_generation_policy(), "Illegal policy type");
assert((_young_gen->kind() == Generation::DefNew) ||
(_young_gen->kind() == Generation::ParNew),
"Wrong youngest generation type");
@ -183,10 +181,10 @@ void GenCollectedHeap::post_initialize() {
_old_gen->kind() == Generation::MarkSweepCompact,
"Wrong generation kind");
policy->initialize_size_policy(def_new_gen->eden()->capacity(),
_old_gen->capacity(),
def_new_gen->from()->capacity());
policy->initialize_gc_policy_counters();
_gen_policy->initialize_size_policy(def_new_gen->eden()->capacity(),
_old_gen->capacity(),
def_new_gen->from()->capacity());
_gen_policy->initialize_gc_policy_counters();
}
void GenCollectedHeap::ref_processing_init() {
@ -822,10 +820,11 @@ bool GenCollectedHeap::create_cms_collector() {
"Unexpected generation kinds");
// Skip two header words in the block content verification
NOT_PRODUCT(_skip_header_HeapWords = CMSCollector::skip_header_HeapWords();)
CMSCollector* collector = new CMSCollector(
(ConcurrentMarkSweepGeneration*)_old_gen,
_rem_set->as_CardTableRS(),
(ConcurrentMarkSweepPolicy*) collector_policy());
assert(_gen_policy->is_concurrent_mark_sweep_policy(), "Unexpected policy type");
CMSCollector* collector =
new CMSCollector((ConcurrentMarkSweepGeneration*)_old_gen,
_rem_set->as_CardTableRS(),
_gen_policy->as_concurrent_mark_sweep_policy());
if (collector == NULL || !collector->completed_initialization()) {
if (collector) {

View File

@ -153,7 +153,7 @@ public:
// The generational collector policy.
GenCollectorPolicy* gen_policy() const { return _gen_policy; }
virtual CollectorPolicy* collector_policy() const { return (CollectorPolicy*) gen_policy(); }
virtual CollectorPolicy* collector_policy() const { return gen_policy(); }
// Adaptive size policy
virtual AdaptiveSizePolicy* size_policy() {

View File

@ -290,6 +290,7 @@ macro(MulVD)
macro(MulReductionVD)
macro(DivVF)
macro(DivVD)
macro(SqrtVD)
macro(LShiftCntV)
macro(RShiftCntV)
macro(LShiftVB)

View File

@ -858,18 +858,29 @@ bool IfNode::fold_compares_helper(ProjNode* proj, ProjNode* success, ProjNode* f
// this_bool = <=
// dom_bool = >= (proj = True) or dom_bool = < (proj = False)
// x in [a, b] on the fail (= True) projection, b+1 > a-1:
// lo = a, hi = b, adjusted_lim = b-a, cond = <=u
// lo = a, hi = b, adjusted_lim = b-a+1, cond = <u
// lo = a, hi = b, adjusted_lim = b-a, cond = <=u doesn't work because b = a - 1 is possible, then b-a = -1
// dom_bool = > (proj = True) or dom_bool = <= (proj = False)
// x in ]a, b] on the fail (= True) projection b+1 > a:
// lo = a+1, hi = b, adjusted_lim = b-a, cond = <u
// lo = a+1, hi = b, adjusted_lim = b-a-1, cond = <=u doesn't work because a = b is possible, then hi-lo = -1
// lo = a+1, hi = b, adjusted_lim = b-a-1, cond = <=u doesn't work because a = b is possible, then b-a-1 = -1
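// A worked instance of the pitfall noted above, assuming the folded test is
// (x - lo) compared unsigned against adjusted_lim: take a = 0, b = a - 1 = -1,
// the empty range that b+1 > a-1 still allows. The old form (x - 0) <=u (b - a)
// is x <=u -1, true for every x since -1 is the largest unsigned value; the
// fixed form (x - 0) <u (b - a + 1) is x <u 0, false for every x, which
// matches the empty range [0, -1].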
if (lo_test == BoolTest::gt || lo_test == BoolTest::le) {
if (hi_test == BoolTest::le) {
if (hi_test == BoolTest::lt) {
if (lo_test == BoolTest::gt || lo_test == BoolTest::le) {
lo = igvn->transform(new AddINode(lo, igvn->intcon(1)));
}
} else {
assert(hi_test == BoolTest::le, "bad test");
if (lo_test == BoolTest::ge || lo_test == BoolTest::lt) {
adjusted_lim = igvn->transform(new SubINode(hi, lo));
adjusted_lim = igvn->transform(new AddINode(adjusted_lim, igvn->intcon(1)));
cond = BoolTest::lt;
} else {
assert(lo_test == BoolTest::gt || lo_test == BoolTest::le, "bad test");
adjusted_lim = igvn->transform(new SubINode(hi, lo));
lo = igvn->transform(new AddINode(lo, igvn->intcon(1)));
cond = BoolTest::lt;
}
lo = igvn->transform(new AddINode(lo, igvn->intcon(1)));
}
} else if (lo_type->_lo > hi_type->_hi && lo_type->_hi == max_jint && hi_type->_lo == min_jint) {
@ -879,7 +890,8 @@ bool IfNode::fold_compares_helper(ProjNode* proj, ProjNode* success, ProjNode* f
// lo = b, hi = a, adjusted_lim = a-b, cond = >=u
// dom_bool = <= (proj = True) or dom_bool = > (proj = False)
// x in [b, a] on the fail (= False) projection, a+1 > b-1:
// lo = b, hi = a, adjusted_lim = a-b, cond = >u
// lo = b, hi = a, adjusted_lim = a-b+1, cond = >=u
// lo = b, hi = a, adjusted_lim = a-b, cond = >u doesn't work because a = b - 1 is possible, then b-a = -1
// this_bool = <=
// dom_bool = < (proj = True) or dom_bool = >= (proj = False)
// x in ]b, a[ on the fail (= False) projection, a > b:
@ -887,7 +899,7 @@ bool IfNode::fold_compares_helper(ProjNode* proj, ProjNode* success, ProjNode* f
// dom_bool = <= (proj = True) or dom_bool = > (proj = False)
// x in ]b, a] on the fail (= False) projection, a+1 > b:
// lo = b+1, hi = a, adjusted_lim = a-b, cond = >=u
// lo = b+1, hi = a, adjusted_lim = a-b-1, cond = >u doesn't work because a = b is possible, then hi-lo = -1
// lo = b+1, hi = a, adjusted_lim = a-b-1, cond = >u doesn't work because a = b is possible, then b-a-1 = -1
swap(lo, hi);
swap(lo_type, hi_type);
@ -900,14 +912,26 @@ bool IfNode::fold_compares_helper(ProjNode* proj, ProjNode* success, ProjNode* f
cond = (hi_test == BoolTest::le || hi_test == BoolTest::gt) ? BoolTest::gt : BoolTest::ge;
if (lo_test == BoolTest::le) {
if (cond == BoolTest::gt) {
if (lo_test == BoolTest::lt) {
if (hi_test == BoolTest::lt || hi_test == BoolTest::ge) {
cond = BoolTest::ge;
} else {
assert(hi_test == BoolTest::le || hi_test == BoolTest::gt, "bad test");
adjusted_lim = igvn->transform(new SubINode(hi, lo));
adjusted_lim = igvn->transform(new AddINode(adjusted_lim, igvn->intcon(1)));
cond = BoolTest::ge;
}
} else if (lo_test == BoolTest::le) {
if (hi_test == BoolTest::lt || hi_test == BoolTest::ge) {
lo = igvn->transform(new AddINode(lo, igvn->intcon(1)));
cond = BoolTest::ge;
} else {
assert(hi_test == BoolTest::le || hi_test == BoolTest::gt, "bad test");
adjusted_lim = igvn->transform(new SubINode(hi, lo));
lo = igvn->transform(new AddINode(lo, igvn->intcon(1)));
cond = BoolTest::ge;
}
lo = igvn->transform(new AddINode(lo, igvn->intcon(1)));
}
} else {
const TypeInt* failtype = filtered_int_type(igvn, n, proj);
if (failtype != NULL) {

View File

@ -112,6 +112,13 @@ ProjNode* PhaseIdealLoop::create_new_if_for_predicate(ProjNode* cont_proj, Node*
if (_idom != NULL) {
set_idom(call, rgn, dom_depth(rgn));
}
for (DUIterator_Fast imax, i = uncommon_proj->fast_outs(imax); i < imax; i++) {
Node* n = uncommon_proj->fast_out(i);
if (n->is_Load() || n->is_Store()) {
_igvn.replace_input_of(n, 0, rgn);
--i; --imax;
}
}
} else {
// Find region's edge corresponding to uncommon_proj
for (; proj_index < rgn->req(); proj_index++)

View File

@ -1901,7 +1901,7 @@ void IdealLoopTree::dump_head( ) const {
if (stride_con > 0) tty->print("+");
tty->print("%d", stride_con);
tty->print(" (%d iters) ", (int)cl->profile_trip_cnt());
tty->print(" (%0.f iters) ", cl->profile_trip_cnt());
if (cl->is_pre_loop ()) tty->print(" pre" );
if (cl->is_main_loop()) tty->print(" main");

View File

@ -1858,6 +1858,11 @@ void SuperWord::output() {
vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
}
} else if (opc == Op_SqrtD) {
// Promote operand to vector (Sqrt is a 2 address instruction)
Node* in = vector_opd(p, 1);
vn = VectorNode::make(opc, in, NULL, vlen, velt_basic_type(n));
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
} else {
ShouldNotReachHere();
}

View File

@ -92,6 +92,9 @@ int VectorNode::opcode(int sopc, BasicType bt) {
case Op_DivD:
assert(bt == T_DOUBLE, "must be");
return Op_DivVD;
case Op_SqrtD:
assert(bt == T_DOUBLE, "must be");
return Op_SqrtVD;
case Op_LShiftI:
switch (bt) {
case T_BOOLEAN:
@ -277,6 +280,9 @@ VectorNode* VectorNode::make(int opc, Node* n1, Node* n2, uint vlen, BasicType b
case Op_DivVF: return new DivVFNode(n1, n2, vt);
case Op_DivVD: return new DivVDNode(n1, n2, vt);
// Currently only supports double precision sqrt
case Op_SqrtVD: return new SqrtVDNode(n1, vt);
case Op_LShiftVB: return new LShiftVBNode(n1, n2, vt);
case Op_LShiftVS: return new LShiftVSNode(n1, n2, vt);
case Op_LShiftVI: return new LShiftVINode(n1, n2, vt);

View File

@ -309,6 +309,14 @@ class DivVDNode : public VectorNode {
virtual int Opcode() const;
};
//------------------------------SqrtVDNode--------------------------------------
// Vector Sqrt double
class SqrtVDNode : public VectorNode {
public:
SqrtVDNode(Node* in, const TypeVect* vt) : VectorNode(in,vt) {}
virtual int Opcode() const;
};
//------------------------------LShiftVBNode-----------------------------------
// Vector left shift bytes
class LShiftVBNode : public VectorNode {

View File

@ -1041,11 +1041,18 @@ CodeBlob* WhiteBox::allocate_code_blob(int size, int blob_type) {
}
WB_ENTRY(jlong, WB_AllocateCodeBlob(JNIEnv* env, jobject o, jint size, jint blob_type))
return (jlong) WhiteBox::allocate_code_blob(size, blob_type);
if (size < 0) {
THROW_MSG_0(vmSymbols::java_lang_IllegalArgumentException(),
err_msg("WB_AllocateCodeBlob: size is negative: " INT32_FORMAT, size));
}
return (jlong) WhiteBox::allocate_code_blob(size, blob_type);
WB_END
WB_ENTRY(void, WB_FreeCodeBlob(JNIEnv* env, jobject o, jlong addr))
BufferBlob::free((BufferBlob*) addr);
if (addr == 0) {
return;
}
BufferBlob::free((BufferBlob*) addr);
WB_END
WB_ENTRY(jobjectArray, WB_GetCodeHeapEntries(JNIEnv* env, jobject o, jint blob_type))
@ -1090,9 +1097,13 @@ WB_ENTRY(jint, WB_GetCompilationActivityMode(JNIEnv* env, jobject o))
WB_END
WB_ENTRY(jobjectArray, WB_GetCodeBlob(JNIEnv* env, jobject o, jlong addr))
ThreadToNativeFromVM ttn(thread);
CodeBlobStub stub((CodeBlob*) addr);
return codeBlob2objectArray(thread, env, &stub);
if (addr == 0) {
THROW_MSG_NULL(vmSymbols::java_lang_NullPointerException(),
"WB_GetCodeBlob: addr is null");
}
ThreadToNativeFromVM ttn(thread);
CodeBlobStub stub((CodeBlob*) addr);
return codeBlob2objectArray(thread, env, &stub);
WB_END
WB_ENTRY(jlong, WB_GetThreadStackSize(JNIEnv* env, jobject o))

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2003, 2014, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -200,9 +200,10 @@ SensorInfo::SensorInfo() {
// any clears unless the usage becomes greater than or equal
// to the high threshold.
//
// If the current level is between high and low threhsold, no change.
// If the current level is between high and low threshold, no change.
//
void SensorInfo::set_gauge_sensor_level(MemoryUsage usage, ThresholdSupport* high_low_threshold) {
assert(Service_lock->owned_by_self(), "Must own Service_lock");
assert(high_low_threshold->is_high_threshold_supported(), "just checking");
bool is_over_high = high_low_threshold->is_high_threshold_crossed(usage);
@ -257,6 +258,7 @@ void SensorInfo::set_gauge_sensor_level(MemoryUsage usage, ThresholdSupport* hig
// the sensor will be on (i.e. sensor is currently off
// and has pending trigger requests).
void SensorInfo::set_counter_sensor_level(MemoryUsage usage, ThresholdSupport* counter_threshold) {
assert(Service_lock->owned_by_self(), "Must own Service_lock");
assert(counter_threshold->is_high_threshold_supported(), "just checking");
bool is_over_high = counter_threshold->is_high_threshold_crossed(usage);
@ -278,9 +280,7 @@ void SensorInfo::oops_do(OopClosure* f) {
}
void SensorInfo::process_pending_requests(TRAPS) {
if (!has_pending_requests()) {
return;
}
assert(has_pending_requests(), "Must have pending request");
int pending_count = pending_trigger_count();
if (pending_clear_count() > 0) {
@ -293,7 +293,6 @@ void SensorInfo::process_pending_requests(TRAPS) {
void SensorInfo::trigger(int count, TRAPS) {
assert(count <= _pending_trigger_count, "just checking");
if (_sensor_obj != NULL) {
Klass* k = Management::sun_management_Sensor_klass(CHECK);
instanceKlassHandle sensorKlass (THREAD, k);
@ -316,6 +315,7 @@ void SensorInfo::trigger(int count, TRAPS) {
{
// Holds Service_lock and update the sensor state
MutexLockerEx ml(Service_lock, Mutex::_no_safepoint_check_flag);
assert(_pending_trigger_count > 0, "Must have pending trigger");
_sensor_on = true;
_sensor_count += count;
_pending_trigger_count = _pending_trigger_count - count;
@ -323,6 +323,20 @@ void SensorInfo::trigger(int count, TRAPS) {
}
void SensorInfo::clear(int count, TRAPS) {
{
// Holds Service_lock and update the sensor state
MutexLockerEx ml(Service_lock, Mutex::_no_safepoint_check_flag);
if (_pending_clear_count == 0) {
// Bail out if we lost a race to set_*_sensor_level() which may have
// reactivated the sensor in the meantime because it was triggered again.
return;
}
_sensor_on = false;
_sensor_count += count;
_pending_clear_count = 0;
_pending_trigger_count = _pending_trigger_count - count;
}
if (_sensor_obj != NULL) {
Klass* k = Management::sun_management_Sensor_klass(CHECK);
instanceKlassHandle sensorKlass (THREAD, k);
@ -338,14 +352,6 @@ void SensorInfo::clear(int count, TRAPS) {
&args,
CHECK);
}
{
// Holds Service_lock and update the sensor state
MutexLockerEx ml(Service_lock, Mutex::_no_safepoint_check_flag);
_sensor_on = false;
_pending_clear_count = 0;
_pending_trigger_count = _pending_trigger_count - count;
}
}
//--------------------------------------------------------------

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2003, 2012, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -180,7 +180,7 @@ public:
// any clears unless the usage becomes greater than or equal
// to the high threshold.
//
// If the current level is between high and low threhsold, no change.
// If the current level is between high and low threshold, no change.
//
void set_gauge_sensor_level(MemoryUsage usage, ThresholdSupport* high_low_threshold);

View File

@ -0,0 +1,53 @@
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
* @test
* @bug 8134974
* @summary Cannot pin eliminated arraycopy loads for deopt state in uncommon trap path if it is a loop predicate unc
* @run main/othervm -XX:-BackgroundCompilation -XX:-UseOnStackReplacement TestEliminatedArrayLoopPredicateCopyDeopt
*
*/
public class TestEliminatedArrayLoopPredicateCopyDeopt {
static boolean test(int[] array_src) {
int[] array_dst = new int[10];
System.arraycopy(array_src, 0, array_dst, 0, 10);
for (int i = 0; i < 100; i++) {
array_src[i] = i;
}
if (array_dst[0] == 0) {
return true;
}
return false;
}
static public void main(String[] args) {
int[] array_src = new int[100];
for (int i = 0; i < 20000; i++) {
test(array_src);
}
}
}

View File

@ -0,0 +1,95 @@
/*
* Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
/**
* @test
* @summary Add C2 x86 Superword support for scalar sum reduction optimizations : double sqrt test
*
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 SumRedSqrt_Double
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 SumRedSqrt_Double
*
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 SumRedSqrt_Double
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 SumRedSqrt_Double
*
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 SumRedSqrt_Double
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 SumRedSqrt_Double
*
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 SumRedSqrt_Double
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 SumRedSqrt_Double
*/
public class SumRedSqrt_Double
{
public static void main(String[] args) throws Exception {
double[] a = new double[256*1024];
double[] b = new double[256*1024];
double[] c = new double[256*1024];
double[] d = new double[256*1024];
sumReductionInit(a,b,c);
double total = 0;
double valid = 2.06157643776E14;
for(int j = 0; j < 2000; j++) {
total = sumReductionImplement(a,b,c,d,total);
}
if(total == valid) {
System.out.println("Success");
} else {
System.out.println("Invalid sum of elements variable in total: " + total);
System.out.println("Expected value = " + valid);
throw new Exception("Failed");
}
}
public static void sumReductionInit(
double[] a,
double[] b,
double[] c)
{
for(int j = 0; j < 1; j++)
{
for(int i = 0; i < a.length; i++)
{
a[i] = i * 1 + j;
b[i] = i * 1 - j;
c[i] = i + j;
}
}
}
public static double sumReductionImplement(
double[] a,
double[] b,
double[] c,
double[] d,
double total)
{
for(int i = 0; i < a.length; i++)
{
d[i]= Math.sqrt(a[i] * b[i]) + Math.sqrt(a[i] * c[i]) + Math.sqrt(b[i] * c[i]);
total += d[i];
}
return total;
}
}

View File

@ -24,7 +24,8 @@
/*
* @test
* @bug 8085832
* @summary x <= 0 || x > 0 wrongly folded as (x-1) >u -1
* @bug 8135069
* @summary x <= 0 || x > 0 wrongly folded as (x-1) >u -1 and x < 0 || x > -1 wrongly folded as x >u -1
* @run main/othervm -XX:-BackgroundCompilation -XX:-UseOnStackReplacement TestBadFoldCompare
*/
@ -58,6 +59,34 @@ public class TestBadFoldCompare {
helper2(i, 0, 0, flag);
}
static boolean test3_taken;
static void helper3(int i, int a, int b, boolean flag) {
if (flag) {
if (i < a || i > b - 1) {
test3_taken = true;
}
}
}
static void test3(int i, boolean flag) {
helper3(i, 0, 0, flag);
}
static boolean test4_taken;
static void helper4(int i, int a, int b, boolean flag) {
if (flag) {
if (i > b - 1 || i < a) {
test4_taken = true;
}
}
}
static void test4(int i, boolean flag) {
helper4(i, 0, 0, flag);
}
static public void main(String[] args) {
boolean success = true;
@ -87,6 +116,35 @@ public class TestBadFoldCompare {
System.out.println("Test2 failed");
success = false;
}
for (int i = 0; i < 20000; i++) {
helper3(5, 0, 10, (i%2)==0);
helper3(-1, 0, 10, (i%2)==0);
helper3(15, 0, 10, (i%2)==0);
test3(0, false);
}
test3_taken = false;
test3(0, true);
if (!test3_taken) {
System.out.println("Test3 failed");
success = false;
}
for (int i = 0; i < 20000; i++) {
helper4(5, 0, 10, (i%2)==0);
helper4(-1, 0, 10, (i%2)==0);
helper4(15, 0, 10, (i%2)==0);
test4(0, false);
}
test4_taken = false;
test4(0, true);
if (!test4_taken) {
System.out.println("Test4 failed");
success = false;
}
if (!success) {
throw new RuntimeException("Some tests failed");
}

View File

@ -56,11 +56,11 @@ import sun.hotspot.WhiteBox;
*                   gc.g1.humongousObjects.TestHumongousThreshold
*
* @run main/othervm -XX:+UseG1GC -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI -Xbootclasspath/a:.
*                   -XX:G1HeapRegionSize=16M
*                   -Xms128M -XX:G1HeapRegionSize=16M
*                   gc.g1.humongousObjects.TestHumongousThreshold
*
* @run main/othervm -XX:+UseG1GC -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI -Xbootclasspath/a:.
*                   -XX:G1HeapRegionSize=32M
*                   -Xms200M -XX:G1HeapRegionSize=32M
*                   gc.g1.humongousObjects.TestHumongousThreshold
*
*/

View File

@@ -90,6 +90,9 @@ public class CodelistTest {
if (methodPrintedInLogFormat.contains("MethodHandle")) {
continue;
}
if (methodPrintedInLogFormat.contains("sun.misc.Unsafe.getUnsafe")) {
continue;
}
MethodIdentifierParser mf = new MethodIdentifierParser(methodPrintedInLogFormat);
Method m = null;

View File

@@ -428,4 +428,28 @@ public final class Utils {
    public static long adjustTimeout(long tOut) {
        return Math.round(tOut * Utils.TIMEOUT_FACTOR);
    }

    /**
     * Runs runnable and checks that it throws the expected exception. If expectedException is null it means
     * that we expect no exception to be thrown.
     * @param runnable what we run
     * @param expectedException expected exception
     */
    public static void runAndCheckException(Runnable runnable, Class<? extends Throwable> expectedException) {
        try {
            runnable.run();
            if (expectedException != null) {
                throw new AssertionError("Didn't get expected exception " + expectedException.getSimpleName());
            }
        } catch (Throwable t) {
            if (expectedException == null) {
                throw new AssertionError("Got unexpected exception ", t);
            }
            if (!expectedException.isAssignableFrom(t.getClass())) {
                throw new AssertionError(String.format("Got unexpected exception %s instead of %s",
                        t.getClass().getSimpleName(), expectedException.getSimpleName()), t);
            }
        }
    }
}
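The BlobSanityTest added below is the first caller of this helper; as a rough usage sketch (class name and lambdas are illustrative, not part of the change), the interesting outcomes look like this:

import jdk.test.lib.Utils;

public class RunAndCheckExceptionDemo {
    public static void main(String[] args) {
        // Expected type (or a subclass) is thrown -> the helper returns normally.
        Utils.runAndCheckException(() -> { throw new IllegalStateException(); },
                                   IllegalStateException.class);
        // null means "no exception expected" -> also passes when the Runnable completes.
        Utils.runAndCheckException(() -> { }, null);
        // Nothing thrown although an exception was expected -> AssertionError from the helper.
        try {
            Utils.runAndCheckException(() -> { }, IllegalStateException.class);
        } catch (AssertionError reported) {
            System.out.println("Helper reported: " + reported.getMessage());
        }
    }
}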

View File

@@ -0,0 +1,60 @@
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
* @test BlobSanityTest
* @bug 8132980
* @library /testlibrary /../../test/lib
* @modules java.management/sun.management
* @build BlobSanityTest
* @run main ClassFileInstaller sun.hotspot.WhiteBox
* sun.hotspot.WhiteBox$WhiteBoxPermission
* @run main/othervm -Xbootclasspath/a:. -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI BlobSanityTest
* @summary sanity testing of allocateCodeBlob, freeCodeBlob and getCodeBlob
*/
import sun.hotspot.WhiteBox;

import java.util.function.Consumer;

import jdk.test.lib.Utils;

public class BlobSanityTest {

    private static void runTest(Consumer<Integer> consumer, int val, String testCaseName,
            Class<? extends Throwable> expectedException) {
        System.out.println("Calling " + testCaseName);
        Utils.runAndCheckException(() -> consumer.accept(val), expectedException);
        System.out.println("Looks ok");
    }

    public static void main(String[] args) throws Exception {
        System.out.println("Crash means that sanity check failed");

        WhiteBox wb = WhiteBox.getWhiteBox();

        runTest(wb::freeCodeBlob, 0, "wb::freeCodeBlob(0)", null);
        runTest(wb::getCodeBlob, 0, "wb::getCodeBlob(0)", NullPointerException.class);
        runTest(x -> wb.allocateCodeBlob(x, 0), -1, "wb::allocateCodeBlob(-1,0)", IllegalArgumentException.class);
    }
}