diff --git a/.hgtags-top-repo b/.hgtags-top-repo
index af2cc8250c7..a531df300ad 100644
--- a/.hgtags-top-repo
+++ b/.hgtags-top-repo
@@ -142,3 +142,4 @@ a4f28069d44a379cda99dd1d921d19f819726d22 jdk8-b15
 7010bd24cdd07bc7daef80702f39124854dec36c jdk8-b18
 237bc29afbfc6f56a4fe4a6008e2befb59c44bac jdk8-b19
 5a5eaf6374bcbe23530899579fed17a05b7705f3 jdk8-b20
+cc771d92284f71765eca14d6d08703c4af254c04 jdk8-b21
diff --git a/hotspot/.hgtags b/hotspot/.hgtags
index b55f2017c48..214b0ee6025 100644
--- a/hotspot/.hgtags
+++ b/hotspot/.hgtags
@@ -209,3 +209,5 @@ a2fef924d8e6f37dac2a887315e3502876cc8e24 hs23-b08
 4bcf61041217f8677dcec18e90e9196acc945bba hs23-b09
 9232e0ecbc2cec54dcc8f93004fb00c214446460 jdk8-b19
 fe2c8764998112b7fefcd7d41599714813ae4327 jdk8-b20
+9952d1c439d64c5fd4ad1236a63a62bd5a49d4c3 jdk8-b21
+513351373923f74a7c91755748b95c9771e59f96 hs23-b10
diff --git a/hotspot/make/bsd/makefiles/adlc.make b/hotspot/make/bsd/makefiles/adlc.make
index 69797ab734e..7686c488623 100644
--- a/hotspot/make/bsd/makefiles/adlc.make
+++ b/hotspot/make/bsd/makefiles/adlc.make
@@ -39,9 +39,16 @@ OS = $(Platform_os_family)
 
 SOURCE.AD = $(OUTDIR)/$(OS)_$(Platform_arch_model).ad 
 
-SOURCES.AD = \
+ifeq ("${Platform_arch_model}", "${Platform_arch}")
+  SOURCES.AD = \
   $(call altsrc-replace,$(HS_COMMON_SRC)/cpu/$(ARCH)/vm/$(Platform_arch_model).ad) \
   $(call altsrc-replace,$(HS_COMMON_SRC)/os_cpu/$(OS)_$(ARCH)/vm/$(OS)_$(Platform_arch_model).ad)
+else
+  SOURCES.AD = \
+  $(call altsrc-replace,$(HS_COMMON_SRC)/cpu/$(ARCH)/vm/$(Platform_arch_model).ad) \
+  $(call altsrc-replace,$(HS_COMMON_SRC)/cpu/$(ARCH)/vm/$(Platform_arch).ad) \
+  $(call altsrc-replace,$(HS_COMMON_SRC)/os_cpu/$(OS)_$(ARCH)/vm/$(OS)_$(Platform_arch_model).ad)
+endif
 
 EXEC	= $(OUTDIR)/adlc
 
diff --git a/hotspot/make/hotspot_version b/hotspot/make/hotspot_version
index a72f4c22d43..75a0f4c2b14 100644
--- a/hotspot/make/hotspot_version
+++ b/hotspot/make/hotspot_version
@@ -35,7 +35,7 @@ HOTSPOT_VM_COPYRIGHT=Copyright 2011
 
 HS_MAJOR_VER=23
 HS_MINOR_VER=0
-HS_BUILD_NUMBER=09
+HS_BUILD_NUMBER=10
 
 JDK_MAJOR_VER=1
 JDK_MINOR_VER=8
diff --git a/hotspot/make/linux/makefiles/adlc.make b/hotspot/make/linux/makefiles/adlc.make
index 0c15c1c6589..33a28eef86b 100644
--- a/hotspot/make/linux/makefiles/adlc.make
+++ b/hotspot/make/linux/makefiles/adlc.make
@@ -39,9 +39,16 @@ OS = $(Platform_os_family)
 
 SOURCE.AD = $(OUTDIR)/$(OS)_$(Platform_arch_model).ad 
 
-SOURCES.AD = \
+ifeq ("${Platform_arch_model}", "${Platform_arch}")
+  SOURCES.AD = \
   $(call altsrc-replace,$(HS_COMMON_SRC)/cpu/$(ARCH)/vm/$(Platform_arch_model).ad) \
   $(call altsrc-replace,$(HS_COMMON_SRC)/os_cpu/$(OS)_$(ARCH)/vm/$(OS)_$(Platform_arch_model).ad)
+else
+  SOURCES.AD = \
+  $(call altsrc-replace,$(HS_COMMON_SRC)/cpu/$(ARCH)/vm/$(Platform_arch_model).ad) \
+  $(call altsrc-replace,$(HS_COMMON_SRC)/cpu/$(ARCH)/vm/$(Platform_arch).ad) \
+  $(call altsrc-replace,$(HS_COMMON_SRC)/os_cpu/$(OS)_$(ARCH)/vm/$(OS)_$(Platform_arch_model).ad)
+endif
 
 EXEC	= $(OUTDIR)/adlc
 
diff --git a/hotspot/make/solaris/makefiles/adlc.make b/hotspot/make/solaris/makefiles/adlc.make
index 4bcecf607cd..b14a1879477 100644
--- a/hotspot/make/solaris/makefiles/adlc.make
+++ b/hotspot/make/solaris/makefiles/adlc.make
@@ -40,9 +40,16 @@ OS = $(Platform_os_family)
 
 SOURCE.AD = $(OUTDIR)/$(OS)_$(Platform_arch_model).ad 
 
-SOURCES.AD = \
+ifeq ("${Platform_arch_model}", "${Platform_arch}")
+  SOURCES.AD = \
   $(call altsrc-replace,$(HS_COMMON_SRC)/cpu/$(ARCH)/vm/$(Platform_arch_model).ad) \
   $(call altsrc-replace,$(HS_COMMON_SRC)/os_cpu/$(OS)_$(ARCH)/vm/$(OS)_$(Platform_arch_model).ad)
+else
+  SOURCES.AD = \
+  $(call altsrc-replace,$(HS_COMMON_SRC)/cpu/$(ARCH)/vm/$(Platform_arch_model).ad) \
+  $(call altsrc-replace,$(HS_COMMON_SRC)/cpu/$(ARCH)/vm/$(Platform_arch).ad) \
+  $(call altsrc-replace,$(HS_COMMON_SRC)/os_cpu/$(OS)_$(ARCH)/vm/$(OS)_$(Platform_arch_model).ad)
+endif
 
 EXEC	= $(OUTDIR)/adlc
 
diff --git a/hotspot/make/windows/makefiles/adlc.make b/hotspot/make/windows/makefiles/adlc.make
index d03e73373ea..de607ec52d4 100644
--- a/hotspot/make/windows/makefiles/adlc.make
+++ b/hotspot/make/windows/makefiles/adlc.make
@@ -53,6 +53,17 @@ CPP_INCLUDE_DIRS=\
   /I "$(WorkSpace)\src\os\windows\vm" \
   /I "$(WorkSpace)\src\cpu\$(Platform_arch)\vm"
 
+!if "$(Platform_arch_model)" == "$(Platform_arch)"
+SOURCES_AD=\
+  $(WorkSpace)/src/cpu/$(Platform_arch)/vm/$(Platform_arch_model).ad \
+  $(WorkSpace)/src/os_cpu/windows_$(Platform_arch)/vm/windows_$(Platform_arch_model).ad
+!else
+SOURCES_AD=\
+  $(WorkSpace)/src/cpu/$(Platform_arch)/vm/$(Platform_arch_model).ad \
+  $(WorkSpace)/src/cpu/$(Platform_arch)/vm/$(Platform_arch).ad \
+  $(WorkSpace)/src/os_cpu/windows_$(Platform_arch)/vm/windows_$(Platform_arch_model).ad
+!endif
+
 # NOTE! If you add any files here, you must also update GENERATED_NAMES_IN_DIR
 # and ProjectCreatorIDEOptions in projectcreator.make. 
 GENERATED_NAMES=\
@@ -105,7 +116,6 @@ $(GENERATED_NAMES_IN_DIR): $(Platform_arch_model).ad adlc.exe
 	$(ADLC) $(ADLCFLAGS) $(Platform_arch_model).ad
 	mv $(GENERATED_NAMES) $(AdlcOutDir)/
 
-$(Platform_arch_model).ad: $(WorkSpace)/src/cpu/$(Platform_arch)/vm/$(Platform_arch_model).ad $(WorkSpace)/src/os_cpu/windows_$(Platform_arch)/vm/windows_$(Platform_arch_model).ad
+$(Platform_arch_model).ad: $(SOURCES_AD)
 	rm -f $(Platform_arch_model).ad
-	cat $(WorkSpace)/src/cpu/$(Platform_arch)/vm/$(Platform_arch_model).ad  \
-	    $(WorkSpace)/src/os_cpu/windows_$(Platform_arch)/vm/windows_$(Platform_arch_model).ad >$(Platform_arch_model).ad
+	cat $(SOURCES_AD) >$(Platform_arch_model).ad
diff --git a/hotspot/src/cpu/sparc/vm/assembler_sparc.cpp b/hotspot/src/cpu/sparc/vm/assembler_sparc.cpp
index 8fe11550f28..71c8e074532 100644
--- a/hotspot/src/cpu/sparc/vm/assembler_sparc.cpp
+++ b/hotspot/src/cpu/sparc/vm/assembler_sparc.cpp
@@ -3036,10 +3036,8 @@ void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                    Label* L_failure,
                                                    Label* L_slow_path,
                                         RegisterOrConstant super_check_offset) {
-  int sc_offset = (klassOopDesc::header_size() * HeapWordSize +
-                   Klass::secondary_super_cache_offset_in_bytes());
-  int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
-                    Klass::super_check_offset_offset_in_bytes());
+  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
+  int sco_offset = in_bytes(Klass::super_check_offset_offset());
 
   bool must_load_sco  = (super_check_offset.constant_or_zero() == -1);
   bool need_slow_path = (must_load_sco ||
@@ -3159,10 +3157,8 @@ void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
   assert(label_nulls <= 1, "at most one NULL in the batch");
 
   // a couple of useful fields in sub_klass:
-  int ss_offset = (klassOopDesc::header_size() * HeapWordSize +
-                   Klass::secondary_supers_offset_in_bytes());
-  int sc_offset = (klassOopDesc::header_size() * HeapWordSize +
-                   Klass::secondary_super_cache_offset_in_bytes());
+  int ss_offset = in_bytes(Klass::secondary_supers_offset());
+  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
 
   // Do a linear scan of the secondary super-klass chain.
   // This code is rarely used, so simplicity is a virtue here.
@@ -3336,7 +3332,7 @@ void MacroAssembler::biased_locking_enter(Register obj_reg, Register mark_reg,
   cmp_and_brx_short(temp_reg, markOopDesc::biased_lock_pattern, Assembler::notEqual, Assembler::pn, cas_label);
 
   load_klass(obj_reg, temp_reg);
-  ld_ptr(Address(temp_reg, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()), temp_reg);
+  ld_ptr(Address(temp_reg, Klass::prototype_header_offset()), temp_reg);
   or3(G2_thread, temp_reg, temp_reg);
   xor3(mark_reg, temp_reg, temp_reg);
   andcc(temp_reg, ~((int) markOopDesc::age_mask_in_place), temp_reg);
@@ -3413,7 +3409,7 @@ void MacroAssembler::biased_locking_enter(Register obj_reg, Register mark_reg,
   // FIXME: due to a lack of registers we currently blow away the age
   // bits in this situation. Should attempt to preserve them.
   load_klass(obj_reg, temp_reg);
-  ld_ptr(Address(temp_reg, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()), temp_reg);
+  ld_ptr(Address(temp_reg, Klass::prototype_header_offset()), temp_reg);
   or3(G2_thread, temp_reg, temp_reg);
   casn(mark_addr.base(), mark_reg, temp_reg);
   // If the biasing toward our thread failed, this means that
@@ -3443,7 +3439,7 @@ void MacroAssembler::biased_locking_enter(Register obj_reg, Register mark_reg,
   // FIXME: due to a lack of registers we currently blow away the age
   // bits in this situation. Should attempt to preserve them.
   load_klass(obj_reg, temp_reg);
-  ld_ptr(Address(temp_reg, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()), temp_reg);
+  ld_ptr(Address(temp_reg, Klass::prototype_header_offset()), temp_reg);
   casn(mark_addr.base(), mark_reg, temp_reg);
   // Fall through to the normal CAS-based lock, because no matter what
   // the result of the above CAS, some thread must have succeeded in
diff --git a/hotspot/src/cpu/sparc/vm/c1_CodeStubs_sparc.cpp b/hotspot/src/cpu/sparc/vm/c1_CodeStubs_sparc.cpp
index a54b68c4513..837488c23e9 100644
--- a/hotspot/src/cpu/sparc/vm/c1_CodeStubs_sparc.cpp
+++ b/hotspot/src/cpu/sparc/vm/c1_CodeStubs_sparc.cpp
@@ -302,7 +302,7 @@ void PatchingStub::emit_code(LIR_Assembler* ce) {
     assert(_obj != noreg, "must be a valid register");
     assert(_oop_index >= 0, "must have oop index");
     __ load_heap_oop(_obj, java_lang_Class::klass_offset_in_bytes(), G3);
-    __ ld_ptr(G3, instanceKlass::init_thread_offset_in_bytes() + sizeof(klassOopDesc), G3);
+    __ ld_ptr(G3, in_bytes(instanceKlass::init_thread_offset()), G3);
     __ cmp_and_brx_short(G2_thread, G3, Assembler::notEqual, Assembler::pn, call_patch);
 
     // load_klass patches may execute the patched code before it's
@@ -471,7 +471,7 @@ void G1UnsafeGetObjSATBBarrierStub::emit_code(LIR_Assembler* ce) {
 
   __ load_klass(src_reg, tmp_reg);
 
-  Address ref_type_adr(tmp_reg, instanceKlass::reference_type_offset_in_bytes() + sizeof(oopDesc));
+  Address ref_type_adr(tmp_reg, instanceKlass::reference_type_offset());
   __ ld(ref_type_adr, tmp_reg);
 
   // _reference_type field is of type ReferenceType (enum)
diff --git a/hotspot/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp b/hotspot/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp
index 97e86fd789f..96953eebf5a 100644
--- a/hotspot/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp
+++ b/hotspot/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp
@@ -2202,8 +2202,7 @@ void LIR_Assembler::emit_arraycopy(LIR_OpArrayCopy* op) {
           } else if (!(flags & LIR_OpArrayCopy::dst_objarray)) {
             __ load_klass(dst, tmp);
           }
-          int lh_offset = klassOopDesc::header_size() * HeapWordSize +
-            Klass::layout_helper_offset_in_bytes();
+          int lh_offset = in_bytes(Klass::layout_helper_offset());
 
           __ lduw(tmp, lh_offset, tmp2);
 
@@ -2238,12 +2237,10 @@ void LIR_Assembler::emit_arraycopy(LIR_OpArrayCopy* op) {
         __ mov(length, len);
         __ load_klass(dst, tmp);
 
-        int ek_offset = (klassOopDesc::header_size() * HeapWordSize +
-                         objArrayKlass::element_klass_offset_in_bytes());
+        int ek_offset = in_bytes(objArrayKlass::element_klass_offset());
         __ ld_ptr(tmp, ek_offset, super_k);
 
-        int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
-                          Klass::super_check_offset_offset_in_bytes());
+        int sco_offset = in_bytes(Klass::super_check_offset_offset());
         __ lduw(super_k, sco_offset, chk_off);
 
         __ call_VM_leaf(tmp, copyfunc_addr);
@@ -2455,8 +2452,8 @@ void LIR_Assembler::emit_alloc_obj(LIR_OpAllocObj* op) {
          op->obj()->as_register()   == O0 &&
          op->klass()->as_register() == G5, "must be");
   if (op->init_check()) {
-    __ ld(op->klass()->as_register(),
-          instanceKlass::init_state_offset_in_bytes() + sizeof(oopDesc),
+    __ ldub(op->klass()->as_register(),
+          in_bytes(instanceKlass::init_state_offset()),
           op->tmp1()->as_register());
     add_debug_info_for_null_check_here(op->stub()->info());
     __ cmp(op->tmp1()->as_register(), instanceKlass::fully_initialized);
@@ -2627,7 +2624,7 @@ void LIR_Assembler::emit_typecheck_helper(LIR_OpTypeCheck *op, Label* success, L
   } else {
     bool need_slow_path = true;
     if (k->is_loaded()) {
-      if (k->super_check_offset() != sizeof(oopDesc) + Klass::secondary_super_cache_offset_in_bytes())
+      if ((int) k->super_check_offset() != in_bytes(Klass::secondary_super_cache_offset()))
         need_slow_path = false;
       // perform the fast part of the checking logic
       __ check_klass_subtype_fast_path(klass_RInfo, k_RInfo, Rtmp1, noreg,
@@ -2731,7 +2728,7 @@ void LIR_Assembler::emit_opTypeCheck(LIR_OpTypeCheck* op) {
     __ load_klass(value, klass_RInfo);
 
     // get instance klass
-    __ ld_ptr(Address(k_RInfo, objArrayKlass::element_klass_offset_in_bytes() + sizeof(oopDesc)), k_RInfo);
+    __ ld_ptr(Address(k_RInfo, objArrayKlass::element_klass_offset()), k_RInfo);
     // perform the fast part of the checking logic
     __ check_klass_subtype_fast_path(klass_RInfo, k_RInfo, Rtmp1, O7, success_target, failure_target, NULL);
 
diff --git a/hotspot/src/cpu/sparc/vm/c1_MacroAssembler_sparc.cpp b/hotspot/src/cpu/sparc/vm/c1_MacroAssembler_sparc.cpp
index 43722254879..47f82cf878d 100644
--- a/hotspot/src/cpu/sparc/vm/c1_MacroAssembler_sparc.cpp
+++ b/hotspot/src/cpu/sparc/vm/c1_MacroAssembler_sparc.cpp
@@ -181,7 +181,7 @@ void C1_MacroAssembler::try_allocate(
 void C1_MacroAssembler::initialize_header(Register obj, Register klass, Register len, Register t1, Register t2) {
   assert_different_registers(obj, klass, len, t1, t2);
   if (UseBiasedLocking && !len->is_valid()) {
-    ld_ptr(klass, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes(), t1);
+    ld_ptr(klass, in_bytes(Klass::prototype_header_offset()), t1);
   } else {
     set((intx)markOopDesc::prototype(), t1);
   }
@@ -252,7 +252,7 @@ void C1_MacroAssembler::initialize_object(
 #ifdef ASSERT
   {
     Label ok;
-    ld(klass, klassOopDesc::header_size() * HeapWordSize + Klass::layout_helper_offset_in_bytes(), t1);
+    ld(klass, in_bytes(Klass::layout_helper_offset()), t1);
     if (var_size_in_bytes != noreg) {
       cmp_and_brx_short(t1, var_size_in_bytes, Assembler::equal, Assembler::pt, ok);
     } else {
diff --git a/hotspot/src/cpu/sparc/vm/c1_Runtime1_sparc.cpp b/hotspot/src/cpu/sparc/vm/c1_Runtime1_sparc.cpp
index 64f15e0b64a..d0ddc37c867 100644
--- a/hotspot/src/cpu/sparc/vm/c1_Runtime1_sparc.cpp
+++ b/hotspot/src/cpu/sparc/vm/c1_Runtime1_sparc.cpp
@@ -398,14 +398,14 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {
 
           if (id == fast_new_instance_init_check_id) {
             // make sure the klass is initialized
-            __ ld(G5_klass, instanceKlass::init_state_offset_in_bytes() + sizeof(oopDesc), G3_t1);
+            __ ldub(G5_klass, in_bytes(instanceKlass::init_state_offset()), G3_t1);
             __ cmp_and_br_short(G3_t1, instanceKlass::fully_initialized, Assembler::notEqual, Assembler::pn, slow_path);
           }
 #ifdef ASSERT
           // assert object can be fast path allocated
           {
             Label ok, not_ok;
-          __ ld(G5_klass, Klass::layout_helper_offset_in_bytes() + sizeof(oopDesc), G1_obj_size);
+          __ ld(G5_klass, in_bytes(Klass::layout_helper_offset()), G1_obj_size);
           // make sure it's an instance (LH > 0)
           __ cmp_and_br_short(G1_obj_size, 0, Assembler::lessEqual, Assembler::pn, not_ok);
           __ btst(Klass::_lh_instance_slow_path_bit, G1_obj_size);
@@ -425,7 +425,7 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {
           __ bind(retry_tlab);
 
           // get the instance size
-          __ ld(G5_klass, klassOopDesc::header_size() * HeapWordSize + Klass::layout_helper_offset_in_bytes(), G1_obj_size);
+          __ ld(G5_klass, in_bytes(Klass::layout_helper_offset()), G1_obj_size);
 
           __ tlab_allocate(O0_obj, G1_obj_size, 0, G3_t1, slow_path);
 
@@ -437,7 +437,7 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {
 
           __ bind(try_eden);
           // get the instance size
-          __ ld(G5_klass, klassOopDesc::header_size() * HeapWordSize + Klass::layout_helper_offset_in_bytes(), G1_obj_size);
+          __ ld(G5_klass, in_bytes(Klass::layout_helper_offset()), G1_obj_size);
           __ eden_allocate(O0_obj, G1_obj_size, 0, G3_t1, G4_t2, slow_path);
           __ incr_allocated_bytes(G1_obj_size, G3_t1, G4_t2);
 
@@ -471,8 +471,7 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {
         Register G4_length = G4; // Incoming
         Register O0_obj   = O0; // Outgoing
 
-        Address klass_lh(G5_klass, ((klassOopDesc::header_size() * HeapWordSize)
-                                    + Klass::layout_helper_offset_in_bytes()));
+        Address klass_lh(G5_klass, Klass::layout_helper_offset());
         assert(Klass::_lh_header_size_shift % BitsPerByte == 0, "bytewise");
         assert(Klass::_lh_header_size_mask == 0xFF, "bytewise");
         // Use this offset to pick out an individual byte of the layout_helper:
@@ -592,7 +591,7 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {
         Label register_finalizer;
         Register t = O1;
         __ load_klass(O0, t);
-        __ ld(t, Klass::access_flags_offset_in_bytes() + sizeof(oopDesc), t);
+        __ ld(t, in_bytes(Klass::access_flags_offset()), t);
         __ set(JVM_ACC_HAS_FINALIZER, G3);
         __ andcc(G3, t, G0);
         __ br(Assembler::notZero, false, Assembler::pt, register_finalizer);
diff --git a/hotspot/src/cpu/sparc/vm/cppInterpreter_sparc.cpp b/hotspot/src/cpu/sparc/vm/cppInterpreter_sparc.cpp
index 2c2a93f9959..f402d622f35 100644
--- a/hotspot/src/cpu/sparc/vm/cppInterpreter_sparc.cpp
+++ b/hotspot/src/cpu/sparc/vm/cppInterpreter_sparc.cpp
@@ -766,7 +766,7 @@ address InterpreterGenerator::generate_native_entry(bool synchronized) {
       // get native function entry point(O0 is a good temp until the very end)
        ld_ptr(Address(G5_method, 0, in_bytes(methodOopDesc::native_function_offset())), O0);
     // for static methods insert the mirror argument
-    const int mirror_offset = klassOopDesc::klass_part_offset_in_bytes() + Klass::java_mirror_offset_in_bytes();
+    const int mirror_offset = in_bytes(Klass::java_mirror_offset());
 
     __ ld_ptr(Address(G5_method, 0, in_bytes(methodOopDesc:: constants_offset())), O1);
     __ ld_ptr(Address(O1, 0, constantPoolOopDesc::pool_holder_offset_in_bytes()), O1);
@@ -1173,7 +1173,7 @@ void CppInterpreterGenerator::generate_compute_interpreter_state(const Register
     __ btst(JVM_ACC_SYNCHRONIZED, O1);
     __ br( Assembler::zero, false, Assembler::pt, done);
 
-    const int mirror_offset = klassOopDesc::klass_part_offset_in_bytes() + Klass::java_mirror_offset_in_bytes();
+    const int mirror_offset = in_bytes(Klass::java_mirror_offset());
     __ delayed()->btst(JVM_ACC_STATIC, O1);
     __ ld_ptr(XXX_STATE(_locals), O1);
     __ br( Assembler::zero, true, Assembler::pt, got_obj);
diff --git a/hotspot/src/cpu/sparc/vm/methodHandles_sparc.cpp b/hotspot/src/cpu/sparc/vm/methodHandles_sparc.cpp
index d2a94d17edb..5bdf88e53ba 100644
--- a/hotspot/src/cpu/sparc/vm/methodHandles_sparc.cpp
+++ b/hotspot/src/cpu/sparc/vm/methodHandles_sparc.cpp
@@ -1098,7 +1098,7 @@ void MethodHandles::generate_method_handle_stub(MacroAssembler* _masm, MethodHan
   Address G3_amh_argument ( G3_method_handle, java_lang_invoke_AdapterMethodHandle::argument_offset_in_bytes());
   Address G3_amh_conversion(G3_method_handle, java_lang_invoke_AdapterMethodHandle::conversion_offset_in_bytes());
 
-  const int java_mirror_offset = klassOopDesc::klass_part_offset_in_bytes() + Klass::java_mirror_offset_in_bytes();
+  const int java_mirror_offset = in_bytes(Klass::java_mirror_offset());
 
   if (have_entry(ek)) {
     __ nop();  // empty stubs make SG sick
diff --git a/hotspot/src/cpu/sparc/vm/sparc.ad b/hotspot/src/cpu/sparc/vm/sparc.ad
index fe5f992e889..3063b2d9714 100644
--- a/hotspot/src/cpu/sparc/vm/sparc.ad
+++ b/hotspot/src/cpu/sparc/vm/sparc.ad
@@ -6773,6 +6773,16 @@ instruct unnecessary_membar_volatile() %{
   ins_pipe(empty);
 %}
 
+instruct membar_storestore() %{
+  match(MemBarStoreStore);
+  ins_cost(0);
+
+  size(0);
+  format %{ "!MEMBAR-storestore (empty encoding)" %}
+  ins_encode( );
+  ins_pipe(empty);
+%}
+
 //----------Register Move Instructions-----------------------------------------
 instruct roundDouble_nop(regD dst) %{
   match(Set dst (RoundDouble dst));
@@ -9273,6 +9283,7 @@ instruct cmpD_reg(iRegI dst, regD src1, regD src2, flagsRegF0 fcc0) %{
 // (compare 'operand indIndex' and 'instruct addP_reg_reg' above)
 instruct jumpXtnd(iRegX switch_val, o7RegI table) %{
   match(Jump switch_val);
+  effect(TEMP table);
 
   ins_cost(350);
 
@@ -10263,24 +10274,24 @@ instruct partialSubtypeCheck_vs_zero( flagsRegP pcc, o1RegP sub, o2RegP super, i
 // ============================================================================
 // inlined locking and unlocking
 
-instruct cmpFastLock(flagsRegP pcc, iRegP object, iRegP box, iRegP scratch2, o7RegP scratch ) %{
+instruct cmpFastLock(flagsRegP pcc, iRegP object, o1RegP box, iRegP scratch2, o7RegP scratch ) %{
   match(Set pcc (FastLock object box));
 
-  effect(KILL scratch, TEMP scratch2);
+  effect(TEMP scratch2, USE_KILL box, KILL scratch);
   ins_cost(100);
 
-  format %{ "FASTLOCK  $object, $box; KILL $scratch, $scratch2, $box" %}
+  format %{ "FASTLOCK  $object,$box\t! kills $box,$scratch,$scratch2" %}
   ins_encode( Fast_Lock(object, box, scratch, scratch2) );
   ins_pipe(long_memory_op);
 %}
 
 
-instruct cmpFastUnlock(flagsRegP pcc, iRegP object, iRegP box, iRegP scratch2, o7RegP scratch ) %{
+instruct cmpFastUnlock(flagsRegP pcc, iRegP object, o1RegP box, iRegP scratch2, o7RegP scratch ) %{
   match(Set pcc (FastUnlock object box));
-  effect(KILL scratch, TEMP scratch2);
+  effect(TEMP scratch2, USE_KILL box, KILL scratch);
   ins_cost(100);
 
-  format %{ "FASTUNLOCK  $object, $box; KILL $scratch, $scratch2, $box" %}
+  format %{ "FASTUNLOCK  $object,$box\t! kills $box,$scratch,$scratch2" %}
   ins_encode( Fast_Unlock(object, box, scratch, scratch2) );
   ins_pipe(long_memory_op);
 %}
diff --git a/hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp b/hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp
index 9415c7b6c4c..0e076dbbd5b 100644
--- a/hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp
+++ b/hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp
@@ -3046,8 +3046,7 @@ class StubGenerator: public StubCodeGenerator {
     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
     //
 
-    int lh_offset = klassOopDesc::header_size() * HeapWordSize +
-                    Klass::layout_helper_offset_in_bytes();
+    int lh_offset = in_bytes(Klass::layout_helper_offset());
 
     // Load 32-bits signed value. Use br() instruction with it to check icc.
     __ lduw(G3_src_klass, lh_offset, G5_lh);
@@ -3194,15 +3193,13 @@ class StubGenerator: public StubCodeGenerator {
                                  G4_dst_klass, G3_src_klass);
 
       // Generate the type check.
-      int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
-                        Klass::super_check_offset_offset_in_bytes());
+      int sco_offset = in_bytes(Klass::super_check_offset_offset());
       __ lduw(G4_dst_klass, sco_offset, sco_temp);
       generate_type_check(G3_src_klass, sco_temp, G4_dst_klass,
                           O5_temp, L_plain_copy);
 
       // Fetch destination element klass from the objArrayKlass header.
-      int ek_offset = (klassOopDesc::header_size() * HeapWordSize +
-                       objArrayKlass::element_klass_offset_in_bytes());
+      int ek_offset = in_bytes(objArrayKlass::element_klass_offset());
 
       // the checkcast_copy loop needs two extra arguments:
       __ ld_ptr(G4_dst_klass, ek_offset, O4);   // dest elem klass
@@ -3414,6 +3411,9 @@ class StubGenerator: public StubCodeGenerator {
       generate_throw_exception("WrongMethodTypeException throw_exception",
                                CAST_FROM_FN_PTR(address, SharedRuntime::throw_WrongMethodTypeException),
                                G5_method_type, G3_method_handle);
+
+    // Build this early so it's available for the interpreter.
+    StubRoutines::_throw_StackOverflowError_entry          = generate_throw_exception("StackOverflowError throw_exception",           CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
   }
 
 
@@ -3427,7 +3427,6 @@ class StubGenerator: public StubCodeGenerator {
     StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
     StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
     StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));
-    StubRoutines::_throw_StackOverflowError_entry          = generate_throw_exception("StackOverflowError throw_exception",           CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
 
     StubRoutines::_handler_for_unsafe_access_entry =
       generate_handler_for_unsafe_access();
diff --git a/hotspot/src/cpu/sparc/vm/templateInterpreter_sparc.cpp b/hotspot/src/cpu/sparc/vm/templateInterpreter_sparc.cpp
index 8ec7d0690b6..ceb6a5994b9 100644
--- a/hotspot/src/cpu/sparc/vm/templateInterpreter_sparc.cpp
+++ b/hotspot/src/cpu/sparc/vm/templateInterpreter_sparc.cpp
@@ -366,7 +366,7 @@ void InterpreterGenerator::lock_method(void) {
 
   // get synchronization object to O0
   { Label done;
-    const int mirror_offset = klassOopDesc::klass_part_offset_in_bytes() + Klass::java_mirror_offset_in_bytes();
+    const int mirror_offset = in_bytes(Klass::java_mirror_offset());
     __ btst(JVM_ACC_STATIC, O0);
     __ br( Assembler::zero, true, Assembler::pt, done);
     __ delayed()->ld_ptr(Llocals, Interpreter::local_offset_in_bytes(0), O0); // get receiver for not-static case
@@ -396,7 +396,6 @@ void TemplateInterpreterGenerator::generate_stack_overflow_check(Register Rframe
                                                          Register Rscratch,
                                                          Register Rscratch2) {
   const int page_size = os::vm_page_size();
-  Address saved_exception_pc(G2_thread, JavaThread::saved_exception_pc_offset());
   Label after_frame_check;
 
   assert_different_registers(Rframe_size, Rscratch, Rscratch2);
@@ -436,11 +435,19 @@ void TemplateInterpreterGenerator::generate_stack_overflow_check(Register Rframe
   // the bottom of the stack
   __ cmp_and_brx_short(SP, Rscratch, Assembler::greater, Assembler::pt, after_frame_check);
 
-  // Save the return address as the exception pc
-  __ st_ptr(O7, saved_exception_pc);
-
   // the stack will overflow, throw an exception
-  __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::throw_StackOverflowError));
+
+  // Note that SP is restored to sender's sp (in the delay slot). This
+  // is necessary if the sender's frame is an extended compiled frame
+  // (see gen_c2i_adapter()) and safer anyway in case of JSR292
+  // adaptations.
+
+  // Note also that the restored frame is not necessarily interpreted.
+  // Use the shared runtime version of the StackOverflowError.
+  assert(StubRoutines::throw_StackOverflowError_entry() != NULL, "stub not yet generated");
+  AddressLiteral stub(StubRoutines::throw_StackOverflowError_entry());
+  __ jump_to(stub, Rscratch);
+  __ delayed()->mov(O5_savedSP, SP);
 
   // if you get to here, then there is enough stack space
   __ bind( after_frame_check );
@@ -984,7 +991,7 @@ address InterpreterGenerator::generate_native_entry(bool synchronized) {
     // get native function entry point(O0 is a good temp until the very end)
     __ delayed()->ld_ptr(Lmethod, in_bytes(methodOopDesc::native_function_offset()), O0);
     // for static methods insert the mirror argument
-    const int mirror_offset = klassOopDesc::klass_part_offset_in_bytes() + Klass::java_mirror_offset_in_bytes();
+    const int mirror_offset = in_bytes(Klass::java_mirror_offset());
 
     __ ld_ptr(Lmethod, methodOopDesc:: constants_offset(), O1);
     __ ld_ptr(O1, constantPoolOopDesc::pool_holder_offset_in_bytes(), O1);
diff --git a/hotspot/src/cpu/sparc/vm/templateTable_sparc.cpp b/hotspot/src/cpu/sparc/vm/templateTable_sparc.cpp
index 850dbe62a25..af6829b3d71 100644
--- a/hotspot/src/cpu/sparc/vm/templateTable_sparc.cpp
+++ b/hotspot/src/cpu/sparc/vm/templateTable_sparc.cpp
@@ -888,7 +888,7 @@ void TemplateTable::aastore() {
 
   // do fast instanceof cache test
 
-  __ ld_ptr(O4,     sizeof(oopDesc) + objArrayKlass::element_klass_offset_in_bytes(),  O4);
+  __ ld_ptr(O4,     in_bytes(objArrayKlass::element_klass_offset()),  O4);
 
   assert(Otos_i == O0, "just checking");
 
@@ -2031,7 +2031,7 @@ void TemplateTable::_return(TosState state) {
     __ access_local_ptr(G3_scratch, Otos_i);
     __ load_klass(Otos_i, O2);
     __ set(JVM_ACC_HAS_FINALIZER, G3);
-    __ ld(O2, Klass::access_flags_offset_in_bytes() + sizeof(oopDesc), O2);
+    __ ld(O2, in_bytes(Klass::access_flags_offset()), O2);
     __ andcc(G3, O2, G0);
     Label skip_register_finalizer;
     __ br(Assembler::zero, false, Assembler::pn, skip_register_finalizer);
@@ -3350,13 +3350,13 @@ void TemplateTable::_new() {
   __ ld_ptr(Rscratch, Roffset, RinstanceKlass);
 
   // make sure klass is fully initialized:
-  __ ld(RinstanceKlass, instanceKlass::init_state_offset_in_bytes() + sizeof(oopDesc), G3_scratch);
+  __ ldub(RinstanceKlass, in_bytes(instanceKlass::init_state_offset()), G3_scratch);
   __ cmp(G3_scratch, instanceKlass::fully_initialized);
   __ br(Assembler::notEqual, false, Assembler::pn, slow_case);
-  __ delayed()->ld(RinstanceKlass, Klass::layout_helper_offset_in_bytes() + sizeof(oopDesc), Roffset);
+  __ delayed()->ld(RinstanceKlass, in_bytes(Klass::layout_helper_offset()), Roffset);
 
   // get instance_size in instanceKlass (already aligned)
-  //__ ld(RinstanceKlass, Klass::layout_helper_offset_in_bytes() + sizeof(oopDesc), Roffset);
+  //__ ld(RinstanceKlass, in_bytes(Klass::layout_helper_offset()), Roffset);
 
   // make sure klass does not have has_finalizer, or is abstract, or interface or java/lang/Class
   __ btst(Klass::_lh_instance_slow_path_bit, Roffset);
@@ -3483,7 +3483,7 @@ void TemplateTable::_new() {
   __ bind(initialize_header);
 
   if (UseBiasedLocking) {
-    __ ld_ptr(RinstanceKlass, Klass::prototype_header_offset_in_bytes() + sizeof(oopDesc), G4_scratch);
+    __ ld_ptr(RinstanceKlass, in_bytes(Klass::prototype_header_offset()), G4_scratch);
   } else {
     __ set((intptr_t)markOopDesc::prototype(), G4_scratch);
   }
diff --git a/hotspot/src/cpu/x86/vm/assembler_x86.cpp b/hotspot/src/cpu/x86/vm/assembler_x86.cpp
index a58455cbf95..aa5cd185420 100644
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -533,6 +533,19 @@ address Assembler::locate_operand(address inst, WhichOperand which) {
 
   case 0x0F: // movx..., etc.
     switch (0xFF & *ip++) {
+    case 0x3A: // pcmpestri
+      tail_size = 1;
+    case 0x38: // ptest, pmovzxbw
+      ip++; // skip opcode
+      debug_only(has_disp32 = true); // has both kinds of operands!
+      break;
+
+    case 0x70: // pshufd r, r/a, #8
+      debug_only(has_disp32 = true); // has both kinds of operands!
+    case 0x73: // psrldq r, #8
+      tail_size = 1;
+      break;
+
     case 0x12: // movlps
     case 0x28: // movaps
     case 0x2E: // ucomiss
@@ -543,9 +556,7 @@ address Assembler::locate_operand(address inst, WhichOperand which) {
     case 0x57: // xorps
     case 0x6E: // movd
     case 0x7E: // movd
-    case 0xAE: // ldmxcsr   a
-      // 64bit side says it these have both operands but that doesn't
-      // appear to be true
+    case 0xAE: // ldmxcsr, stmxcsr, fxrstor, fxsave, clflush
       debug_only(has_disp32 = true);
       break;
 
@@ -565,6 +576,12 @@ address Assembler::locate_operand(address inst, WhichOperand which) {
       // fall out of the switch to decode the address
       break;
 
+    case 0xC4: // pinsrw r, a, #8
+      debug_only(has_disp32 = true);
+    case 0xC5: // pextrw r, r, #8
+      tail_size = 1;  // the imm8
+      break;
+
     case 0xAC: // shrd r, a, #8
       debug_only(has_disp32 = true);
       tail_size = 1;  // the imm8
@@ -625,11 +642,44 @@ address Assembler::locate_operand(address inst, WhichOperand which) {
     tail_size = 1; // the imm8
     break;
 
-  case 0xE8: // call rdisp32
-  case 0xE9: // jmp  rdisp32
-    if (which == end_pc_operand)  return ip + 4;
-    assert(which == call32_operand, "call has no disp32 or imm");
-    return ip;
+  case 0xC4: // VEX_3bytes
+  case 0xC5: // VEX_2bytes
+    assert((UseAVX > 0), "shouldn't have VEX prefix");
+    assert(ip == inst+1, "no prefixes allowed");
+    // C4 and C5 are also used as opcodes for PINSRW and PEXTRW instructions
+    // but they have prefix 0x0F and processed when 0x0F processed above.
+    //
+    // In 32-bit mode the VEX first byte C4 and C5 alias onto LDS and LES
+    // instructions (these instructions are not supported in 64-bit mode).
+    // To distinguish them bits [7:6] are set in the VEX second byte since
+    // ModRM byte can not be of the form 11xxxxxx in 32-bit mode. To set
+    // those VEX bits REX and vvvv bits are inverted.
+    //
+    // Fortunately C2 doesn't generate these instructions so we don't need
+    // to check for them in product version.
+
+    // Check second byte
+    NOT_LP64(assert((0xC0 & *ip) == 0xC0, "shouldn't have LDS and LES instructions"));
+
+    // First byte
+    if ((0xFF & *inst) == VEX_3bytes) {
+      ip++; // third byte
+      is_64bit = ((VEX_W & *ip) == VEX_W);
+    }
+    ip++; // opcode
+    // To find the end of instruction (which == end_pc_operand).
+    switch (0xFF & *ip) {
+    case 0x61: // pcmpestri r, r/a, #8
+    case 0x70: // pshufd r, r/a, #8
+    case 0x73: // psrldq r, #8
+      tail_size = 1;  // the imm8
+      break;
+    default:
+      break;
+    }
+    ip++; // skip opcode
+    debug_only(has_disp32 = true); // has both kinds of operands!
+    break;
 
   case 0xD1: // sal a, 1; sar a, 1; shl a, 1; shr a, 1
   case 0xD3: // sal a, %cl; sar a, %cl; shl a, %cl; shr a, %cl
@@ -643,6 +693,12 @@ address Assembler::locate_operand(address inst, WhichOperand which) {
     debug_only(has_disp32 = true);
     break;
 
+  case 0xE8: // call rdisp32
+  case 0xE9: // jmp  rdisp32
+    if (which == end_pc_operand)  return ip + 4;
+    assert(which == call32_operand, "call has no disp32 or imm");
+    return ip;
+
   case 0xF0:                    // Lock
     assert(os::is_MP(), "only on MP");
     goto again_after_prefix;
@@ -918,9 +974,7 @@ void Assembler::addr_nop_8() {
 
 void Assembler::addsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0xF2);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
   emit_byte(0x58);
   emit_byte(0xC0 | encode);
 }
@@ -928,18 +982,14 @@ void Assembler::addsd(XMMRegister dst, XMMRegister src) {
 void Assembler::addsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  emit_byte(0xF2);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, dst, src, VEX_SIMD_F2);
   emit_byte(0x58);
   emit_operand(dst, src);
 }
 
 void Assembler::addss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_byte(0xF3);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
   emit_byte(0x58);
   emit_byte(0xC0 | encode);
 }
@@ -947,13 +997,19 @@ void Assembler::addss(XMMRegister dst, XMMRegister src) {
 void Assembler::addss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   InstructionMark im(this);
-  emit_byte(0xF3);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, dst, src, VEX_SIMD_F3);
   emit_byte(0x58);
   emit_operand(dst, src);
 }
 
+void Assembler::andl(Address dst, int32_t imm32) {
+  InstructionMark im(this);
+  prefix(dst);
+  emit_byte(0x81);
+  emit_operand(rsp, dst, 4);
+  emit_long(imm32);
+}
+
 void Assembler::andl(Register dst, int32_t imm32) {
   prefix(dst);
   emit_arith(0x81, 0xE0, dst, imm32);
@@ -974,13 +1030,33 @@ void Assembler::andl(Register dst, Register src) {
 void Assembler::andpd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  emit_byte(0x66);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, dst, src, VEX_SIMD_66);
   emit_byte(0x54);
   emit_operand(dst, src);
 }
 
+void Assembler::andpd(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
+  emit_byte(0x54);
+  emit_byte(0xC0 | encode);
+}
+
+void Assembler::andps(XMMRegister dst, Address src) {
+  NOT_LP64(assert(VM_Version::supports_sse(), ""));
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, VEX_SIMD_NONE);
+  emit_byte(0x54);
+  emit_operand(dst, src);
+}
+
+void Assembler::andps(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse(), ""));
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_NONE);
+  emit_byte(0x54);
+  emit_byte(0xC0 | encode);
+}
+
 void Assembler::bsfl(Register dst, Register src) {
   int encode = prefix_and_encode(dst->encoding(), src->encoding());
   emit_byte(0x0F);
@@ -1025,19 +1101,7 @@ void Assembler::call(Label& L, relocInfo::relocType rtype) {
 }
 
 void Assembler::call(Register dst) {
-  // This was originally using a 32bit register encoding
-  // and surely we want 64bit!
-  // this is a 32bit encoding but in 64bit mode the default
-  // operand size is 64bit so there is no need for the
-  // wide prefix. So prefix only happens if we use the
-  // new registers. Much like push/pop.
-  int x = offset();
-  // this may be true but dbx disassembles it as if it
-  // were 32bits...
-  // int encode = prefix_and_encode(dst->encoding());
-  // if (offset() != x) assert(dst->encoding() >= 8, "what?");
-  int encode = prefixq_and_encode(dst->encoding());
-
+  int encode = prefix_and_encode(dst->encoding());
   emit_byte(0xFF);
   emit_byte(0xD0 | encode);
 }
@@ -1157,87 +1221,119 @@ void Assembler::comisd(XMMRegister dst, Address src) {
   // NOTE: dbx seems to decode this as comiss even though the
   // 0x66 is there. Strangly ucomisd comes out correct
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0x66);
-  comiss(dst, src);
-}
-
-void Assembler::comiss(XMMRegister dst, Address src) {
-  NOT_LP64(assert(VM_Version::supports_sse(), ""));
-
   InstructionMark im(this);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, src, VEX_SIMD_66);
   emit_byte(0x2F);
   emit_operand(dst, src);
 }
 
+void Assembler::comisd(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
+  emit_byte(0x2F);
+  emit_byte(0xC0 | encode);
+}
+
+void Assembler::comiss(XMMRegister dst, Address src) {
+  NOT_LP64(assert(VM_Version::supports_sse(), ""));
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_NONE);
+  emit_byte(0x2F);
+  emit_operand(dst, src);
+}
+
+void Assembler::comiss(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse(), ""));
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_NONE);
+  emit_byte(0x2F);
+  emit_byte(0xC0 | encode);
+}
+
 void Assembler::cvtdq2pd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0xF3);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F3);
   emit_byte(0xE6);
   emit_byte(0xC0 | encode);
 }
 
 void Assembler::cvtdq2ps(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_NONE);
   emit_byte(0x5B);
   emit_byte(0xC0 | encode);
 }
 
 void Assembler::cvtsd2ss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0xF2);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
   emit_byte(0x5A);
   emit_byte(0xC0 | encode);
 }
 
+void Assembler::cvtsd2ss(XMMRegister dst, Address src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, VEX_SIMD_F2);
+  emit_byte(0x5A);
+  emit_operand(dst, src);
+}
+
 void Assembler::cvtsi2sdl(XMMRegister dst, Register src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0xF2);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
   emit_byte(0x2A);
   emit_byte(0xC0 | encode);
 }
 
+void Assembler::cvtsi2sdl(XMMRegister dst, Address src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, VEX_SIMD_F2);
+  emit_byte(0x2A);
+  emit_operand(dst, src);
+}
+
 void Assembler::cvtsi2ssl(XMMRegister dst, Register src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_byte(0xF3);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
   emit_byte(0x2A);
   emit_byte(0xC0 | encode);
 }
 
+void Assembler::cvtsi2ssl(XMMRegister dst, Address src) {
+  NOT_LP64(assert(VM_Version::supports_sse(), ""));
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, VEX_SIMD_F3);
+  emit_byte(0x2A);
+  emit_operand(dst, src);
+}
+
 void Assembler::cvtss2sd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0xF3);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
   emit_byte(0x5A);
   emit_byte(0xC0 | encode);
 }
 
+void Assembler::cvtss2sd(XMMRegister dst, Address src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, VEX_SIMD_F3);
+  emit_byte(0x5A);
+  emit_operand(dst, src);
+}
+
+
 void Assembler::cvttsd2sil(Register dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0xF2);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F2);
   emit_byte(0x2C);
   emit_byte(0xC0 | encode);
 }
 
 void Assembler::cvttss2sil(Register dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_byte(0xF3);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F3);
   emit_byte(0x2C);
   emit_byte(0xC0 | encode);
 }
@@ -1253,18 +1349,14 @@ void Assembler::decl(Address dst) {
 void Assembler::divsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  emit_byte(0xF2);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, dst, src, VEX_SIMD_F2);
   emit_byte(0x5E);
   emit_operand(dst, src);
 }
 
 void Assembler::divsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0xF2);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
   emit_byte(0x5E);
   emit_byte(0xC0 | encode);
 }
@@ -1272,18 +1364,14 @@ void Assembler::divsd(XMMRegister dst, XMMRegister src) {
 void Assembler::divss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   InstructionMark im(this);
-  emit_byte(0xF3);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, dst, src, VEX_SIMD_F3);
   emit_byte(0x5E);
   emit_operand(dst, src);
 }
 
 void Assembler::divss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_byte(0xF3);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
   emit_byte(0x5E);
   emit_byte(0xC0 | encode);
 }
@@ -1377,8 +1465,14 @@ void Assembler::jccb(Condition cc, Label& L) {
   if (L.is_bound()) {
     const int short_size = 2;
     address entry = target(L);
-    assert(is8bit((intptr_t)entry - ((intptr_t)_code_pos + short_size)),
-           "Dispacement too large for a short jmp");
+#ifdef ASSERT
+    intptr_t dist = (intptr_t)entry - ((intptr_t)_code_pos + short_size);
+    intptr_t delta = short_branch_delta();
+    if (delta != 0) {
+      dist += (dist < 0 ? (-delta) :delta);
+    }
+    assert(is8bit(dist), "Dispacement too large for a short jmp");
+#endif
     intptr_t offs = (intptr_t)entry - (intptr_t)_code_pos;
     // 0111 tttn #8-bit disp
     emit_byte(0x70 | cc);
@@ -1444,9 +1538,15 @@ void Assembler::jmpb(Label& L) {
   if (L.is_bound()) {
     const int short_size = 2;
     address entry = target(L);
-    assert(is8bit((entry - _code_pos) + short_size),
-           "Dispacement too large for a short jmp");
     assert(entry != NULL, "jmp most probably wrong");
+#ifdef ASSERT
+    intptr_t dist = (intptr_t)entry - ((intptr_t)_code_pos + short_size);
+    intptr_t delta = short_branch_delta();
+    if (delta != 0) {
+      dist += (dist < 0 ? (-delta) :delta);
+    }
+    assert(is8bit(dist), "Dispacement too large for a short jmp");
+#endif
     intptr_t offs = entry - _code_pos;
     emit_byte(0xEB);
     emit_byte((offs - short_size) & 0xFF);
@@ -1509,49 +1609,16 @@ void Assembler::mov(Register dst, Register src) {
 
 void Assembler::movapd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int dstenc = dst->encoding();
-  int srcenc = src->encoding();
-  emit_byte(0x66);
-  if (dstenc < 8) {
-    if (srcenc >= 8) {
-      prefix(REX_B);
-      srcenc -= 8;
-    }
-  } else {
-    if (srcenc < 8) {
-      prefix(REX_R);
-    } else {
-      prefix(REX_RB);
-      srcenc -= 8;
-    }
-    dstenc -= 8;
-  }
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
   emit_byte(0x28);
-  emit_byte(0xC0 | dstenc << 3 | srcenc);
+  emit_byte(0xC0 | encode);
 }
 
 void Assembler::movaps(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int dstenc = dst->encoding();
-  int srcenc = src->encoding();
-  if (dstenc < 8) {
-    if (srcenc >= 8) {
-      prefix(REX_B);
-      srcenc -= 8;
-    }
-  } else {
-    if (srcenc < 8) {
-      prefix(REX_R);
-    } else {
-      prefix(REX_RB);
-      srcenc -= 8;
-    }
-    dstenc -= 8;
-  }
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_NONE);
   emit_byte(0x28);
-  emit_byte(0xC0 | dstenc << 3 | srcenc);
+  emit_byte(0xC0 | encode);
 }
 
 void Assembler::movb(Register dst, Address src) {
@@ -1582,19 +1649,15 @@ void Assembler::movb(Address dst, Register src) {
 
 void Assembler::movdl(XMMRegister dst, Register src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0x66);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
   emit_byte(0x6E);
   emit_byte(0xC0 | encode);
 }
 
 void Assembler::movdl(Register dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0x66);
   // swap src/dst to get correct prefix
-  int encode = prefix_and_encode(src->encoding(), dst->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(src, dst, VEX_SIMD_66);
   emit_byte(0x7E);
   emit_byte(0xC0 | encode);
 }
@@ -1602,58 +1665,29 @@ void Assembler::movdl(Register dst, XMMRegister src) {
 void Assembler::movdl(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  emit_byte(0x66);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, src, VEX_SIMD_66);
   emit_byte(0x6E);
   emit_operand(dst, src);
 }
 
-
-void Assembler::movdqa(XMMRegister dst, Address src) {
-  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  emit_byte(0x66);
-  prefix(src, dst);
-  emit_byte(0x0F);
-  emit_byte(0x6F);
-  emit_operand(dst, src);
-}
-
 void Assembler::movdqa(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0x66);
-  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
   emit_byte(0x6F);
   emit_byte(0xC0 | encode);
 }
 
-void Assembler::movdqa(Address dst, XMMRegister src) {
-  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  emit_byte(0x66);
-  prefix(dst, src);
-  emit_byte(0x0F);
-  emit_byte(0x7F);
-  emit_operand(src, dst);
-}
-
 void Assembler::movdqu(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  emit_byte(0xF3);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, src, VEX_SIMD_F3);
   emit_byte(0x6F);
   emit_operand(dst, src);
 }
 
 void Assembler::movdqu(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0xF3);
-  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F3);
   emit_byte(0x6F);
   emit_byte(0xC0 | encode);
 }
@@ -1661,9 +1695,7 @@ void Assembler::movdqu(XMMRegister dst, XMMRegister src) {
 void Assembler::movdqu(Address dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  emit_byte(0xF3);
-  prefix(dst, src);
-  emit_byte(0x0F);
+  simd_prefix(dst, src, VEX_SIMD_F3);
   emit_byte(0x7F);
   emit_operand(src, dst);
 }
@@ -1710,9 +1742,7 @@ void Assembler::movl(Address dst, Register src) {
 void Assembler::movlpd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  emit_byte(0x66);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, dst, src, VEX_SIMD_66);
   emit_byte(0x12);
   emit_operand(dst, src);
 }
@@ -1740,9 +1770,7 @@ void Assembler::movq( Address dst, MMXRegister src ) {
 void Assembler::movq(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  emit_byte(0xF3);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, src, VEX_SIMD_F3);
   emit_byte(0x7E);
   emit_operand(dst, src);
 }
@@ -1750,9 +1778,7 @@ void Assembler::movq(XMMRegister dst, Address src) {
 void Assembler::movq(Address dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  emit_byte(0x66);
-  prefix(dst, src);
-  emit_byte(0x0F);
+  simd_prefix(dst, src, VEX_SIMD_66);
   emit_byte(0xD6);
   emit_operand(src, dst);
 }
@@ -1775,9 +1801,7 @@ void Assembler::movsbl(Register dst, Register src) { // movsxb
 
 void Assembler::movsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0xF2);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
   emit_byte(0x10);
   emit_byte(0xC0 | encode);
 }
@@ -1785,9 +1809,7 @@ void Assembler::movsd(XMMRegister dst, XMMRegister src) {
 void Assembler::movsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  emit_byte(0xF2);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, src, VEX_SIMD_F2);
   emit_byte(0x10);
   emit_operand(dst, src);
 }
@@ -1795,18 +1817,14 @@ void Assembler::movsd(XMMRegister dst, Address src) {
 void Assembler::movsd(Address dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  emit_byte(0xF2);
-  prefix(dst, src);
-  emit_byte(0x0F);
+  simd_prefix(dst, src, VEX_SIMD_F2);
   emit_byte(0x11);
   emit_operand(src, dst);
 }
 
 void Assembler::movss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_byte(0xF3);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
   emit_byte(0x10);
   emit_byte(0xC0 | encode);
 }
@@ -1814,9 +1832,7 @@ void Assembler::movss(XMMRegister dst, XMMRegister src) {
 void Assembler::movss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   InstructionMark im(this);
-  emit_byte(0xF3);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, src, VEX_SIMD_F3);
   emit_byte(0x10);
   emit_operand(dst, src);
 }
@@ -1824,9 +1840,7 @@ void Assembler::movss(XMMRegister dst, Address src) {
 void Assembler::movss(Address dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   InstructionMark im(this);
-  emit_byte(0xF3);
-  prefix(dst, src);
-  emit_byte(0x0F);
+  simd_prefix(dst, src, VEX_SIMD_F3);
   emit_byte(0x11);
   emit_operand(src, dst);
 }
@@ -1919,18 +1933,14 @@ void Assembler::mull(Register src) {
 void Assembler::mulsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  emit_byte(0xF2);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, dst, src, VEX_SIMD_F2);
   emit_byte(0x59);
   emit_operand(dst, src);
 }
 
 void Assembler::mulsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0xF2);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
   emit_byte(0x59);
   emit_byte(0xC0 | encode);
 }
@@ -1938,18 +1948,14 @@ void Assembler::mulsd(XMMRegister dst, XMMRegister src) {
 void Assembler::mulss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   InstructionMark im(this);
-  emit_byte(0xF3);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, dst, src, VEX_SIMD_F3);
   emit_byte(0x59);
   emit_operand(dst, src);
 }
 
 void Assembler::mulss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_byte(0xF3);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
   emit_byte(0x59);
   emit_byte(0xC0 | encode);
 }
@@ -2237,14 +2243,26 @@ void Assembler::orl(Register dst, Register src) {
   emit_arith(0x0B, 0xC0, dst, src);
 }
 
+void Assembler::packuswb(XMMRegister dst, Address src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, VEX_SIMD_66);
+  emit_byte(0x67);
+  emit_operand(dst, src);
+}
+
+void Assembler::packuswb(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
+  emit_byte(0x67);
+  emit_byte(0xC0 | encode);
+}
+
 void Assembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
   assert(VM_Version::supports_sse4_2(), "");
-
   InstructionMark im(this);
-  emit_byte(0x66);
-  prefix(src, dst);
-  emit_byte(0x0F);
-  emit_byte(0x3A);
+  simd_prefix(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A);
   emit_byte(0x61);
   emit_operand(dst, src);
   emit_byte(imm8);
@@ -2252,16 +2270,27 @@ void Assembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
 
 void Assembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse4_2(), "");
-
-  emit_byte(0x66);
-  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
-  emit_byte(0x3A);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A);
   emit_byte(0x61);
   emit_byte(0xC0 | encode);
   emit_byte(imm8);
 }
 
+void Assembler::pmovzxbw(XMMRegister dst, Address src) {
+  assert(VM_Version::supports_sse4_1(), "");
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  emit_byte(0x30);
+  emit_operand(dst, src);
+}
+
+void Assembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
+  assert(VM_Version::supports_sse4_1(), "");
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  emit_byte(0x30);
+  emit_byte(0xC0 | encode);
+}
+
 // generic
 void Assembler::pop(Register dst) {
   int encode = prefix_and_encode(dst->encoding());
@@ -2360,22 +2389,24 @@ void Assembler::prefix(Prefix p) {
 
 void Assembler::por(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-
-  emit_byte(0x66);
-  int  encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
-
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
   emit_byte(0xEB);
   emit_byte(0xC0 | encode);
 }
 
+void Assembler::por(XMMRegister dst, Address src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, VEX_SIMD_66);
+  emit_byte(0xEB);
+  emit_operand(dst, src);
+}
+
 void Assembler::pshufd(XMMRegister dst, XMMRegister src, int mode) {
   assert(isByte(mode), "invalid value");
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-
-  emit_byte(0x66);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
   emit_byte(0x70);
   emit_byte(0xC0 | encode);
   emit_byte(mode & 0xFF);
@@ -2385,11 +2416,9 @@ void Assembler::pshufd(XMMRegister dst, XMMRegister src, int mode) {
 void Assembler::pshufd(XMMRegister dst, Address src, int mode) {
   assert(isByte(mode), "invalid value");
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-
+  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
   InstructionMark im(this);
-  emit_byte(0x66);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, src, VEX_SIMD_66);
   emit_byte(0x70);
   emit_operand(dst, src);
   emit_byte(mode & 0xFF);
@@ -2398,10 +2427,7 @@ void Assembler::pshufd(XMMRegister dst, Address src, int mode) {
 void Assembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
   assert(isByte(mode), "invalid value");
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-
-  emit_byte(0xF2);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F2);
   emit_byte(0x70);
   emit_byte(0xC0 | encode);
   emit_byte(mode & 0xFF);
@@ -2410,11 +2436,9 @@ void Assembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
 void Assembler::pshuflw(XMMRegister dst, Address src, int mode) {
   assert(isByte(mode), "invalid value");
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-
+  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
   InstructionMark im(this);
-  emit_byte(0xF2);
-  prefix(src, dst); // QQ new
-  emit_byte(0x0F);
+  simd_prefix(dst, src, VEX_SIMD_F2);
   emit_byte(0x70);
   emit_operand(dst, src);
   emit_byte(mode & 0xFF);
@@ -2425,11 +2449,8 @@ void Assembler::psrlq(XMMRegister dst, int shift) {
   // HMM Table D-1 says sse2 or mmx.
   // Do not confuse it with psrldq SSE2 instruction which
   // shifts 128 bit value in xmm register by number of bytes.
-  NOT_LP64(assert(VM_Version::supports_sse(), ""));
-
-  int encode = prefixq_and_encode(xmm2->encoding(), dst->encoding());
-  emit_byte(0x66);
-  emit_byte(0x0F);
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66);
   emit_byte(0x73);
   emit_byte(0xC0 | encode);
   emit_byte(shift);
@@ -2438,10 +2459,7 @@ void Assembler::psrlq(XMMRegister dst, int shift) {
 void Assembler::psrldq(XMMRegister dst, int shift) {
   // Shift 128 bit value in xmm register by number of bytes.
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-
-  int encode = prefixq_and_encode(xmm3->encoding(), dst->encoding());
-  emit_byte(0x66);
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(xmm3, dst, dst, VEX_SIMD_66);
   emit_byte(0x73);
   emit_byte(0xC0 | encode);
   emit_byte(shift);
@@ -2449,36 +2467,52 @@ void Assembler::psrldq(XMMRegister dst, int shift) {
 
 void Assembler::ptest(XMMRegister dst, Address src) {
   assert(VM_Version::supports_sse4_1(), "");
-
+  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
   InstructionMark im(this);
-  emit_byte(0x66);
-  prefix(src, dst);
-  emit_byte(0x0F);
-  emit_byte(0x38);
+  simd_prefix(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
   emit_byte(0x17);
   emit_operand(dst, src);
 }
 
 void Assembler::ptest(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_sse4_1(), "");
-
-  emit_byte(0x66);
-  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
-  emit_byte(0x38);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
   emit_byte(0x17);
   emit_byte(0xC0 | encode);
 }
 
+void Assembler::punpcklbw(XMMRegister dst, Address src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, VEX_SIMD_66);
+  emit_byte(0x60);
+  emit_operand(dst, src);
+}
+
 void Assembler::punpcklbw(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0x66);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
   emit_byte(0x60);
   emit_byte(0xC0 | encode);
 }
 
+void Assembler::punpckldq(XMMRegister dst, Address src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, VEX_SIMD_66);
+  emit_byte(0x62);
+  emit_operand(dst, src);
+}
+
+void Assembler::punpckldq(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
+  emit_byte(0x62);
+  emit_byte(0xC0 | encode);
+}
+
 void Assembler::push(int32_t imm32) {
   // in 64bits we push 64bits onto the stack but only
   // take a 32bit immediate
@@ -2508,20 +2542,16 @@ void Assembler::pushl(Address src) {
 
 void Assembler::pxor(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
   InstructionMark im(this);
-  emit_byte(0x66);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, dst, src, VEX_SIMD_66);
   emit_byte(0xEF);
   emit_operand(dst, src);
 }
 
 void Assembler::pxor(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  emit_byte(0x66);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
   emit_byte(0xEF);
   emit_byte(0xC0 | encode);
 }
@@ -2683,12 +2713,8 @@ void Assembler::smovl() {
 }
 
 void Assembler::sqrtsd(XMMRegister dst, XMMRegister src) {
-  // HMM Table D-1 says sse2
-  // NOT_LP64(assert(VM_Version::supports_sse(), ""));
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0xF2);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
   emit_byte(0x51);
   emit_byte(0xC0 | encode);
 }
@@ -2696,30 +2722,22 @@ void Assembler::sqrtsd(XMMRegister dst, XMMRegister src) {
 void Assembler::sqrtsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  emit_byte(0xF2);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, dst, src, VEX_SIMD_F2);
   emit_byte(0x51);
   emit_operand(dst, src);
 }
 
 void Assembler::sqrtss(XMMRegister dst, XMMRegister src) {
-  // HMM Table D-1 says sse2
-  // NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0xF3);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  NOT_LP64(assert(VM_Version::supports_sse(), ""));
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
   emit_byte(0x51);
   emit_byte(0xC0 | encode);
 }
 
 void Assembler::sqrtss(XMMRegister dst, Address src) {
-  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  NOT_LP64(assert(VM_Version::supports_sse(), ""));
   InstructionMark im(this);
-  emit_byte(0xF3);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, dst, src, VEX_SIMD_F3);
   emit_byte(0x51);
   emit_operand(dst, src);
 }
@@ -2765,9 +2783,7 @@ void Assembler::subl(Register dst, Register src) {
 
 void Assembler::subsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0xF2);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
   emit_byte(0x5C);
   emit_byte(0xC0 | encode);
 }
@@ -2775,18 +2791,14 @@ void Assembler::subsd(XMMRegister dst, XMMRegister src) {
 void Assembler::subsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  emit_byte(0xF2);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, dst, src, VEX_SIMD_F2);
   emit_byte(0x5C);
   emit_operand(dst, src);
 }
 
 void Assembler::subss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_byte(0xF3);
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
   emit_byte(0x5C);
   emit_byte(0xC0 | encode);
 }
@@ -2794,9 +2806,7 @@ void Assembler::subss(XMMRegister dst, XMMRegister src) {
 void Assembler::subss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   InstructionMark im(this);
-  emit_byte(0xF3);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, dst, src, VEX_SIMD_F3);
   emit_byte(0x5C);
   emit_operand(dst, src);
 }
@@ -2836,30 +2846,30 @@ void Assembler::testl(Register dst, Address  src) {
 
 void Assembler::ucomisd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0x66);
-  ucomiss(dst, src);
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_66);
+  emit_byte(0x2E);
+  emit_operand(dst, src);
 }
 
 void Assembler::ucomisd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0x66);
-  ucomiss(dst, src);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
+  emit_byte(0x2E);
+  emit_byte(0xC0 | encode);
 }
 
 void Assembler::ucomiss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-
   InstructionMark im(this);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, src, VEX_SIMD_NONE);
   emit_byte(0x2E);
   emit_operand(dst, src);
 }
 
 void Assembler::ucomiss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_NONE);
   emit_byte(0x2E);
   emit_byte(0xC0 | encode);
 }
@@ -2905,16 +2915,15 @@ void Assembler::xorl(Register dst, Register src) {
 
 void Assembler::xorpd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0x66);
-  xorps(dst, src);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
+  emit_byte(0x57);
+  emit_byte(0xC0 | encode);
 }
 
 void Assembler::xorpd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  emit_byte(0x66);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, dst, src, VEX_SIMD_66);
   emit_byte(0x57);
   emit_operand(dst, src);
 }
@@ -2922,8 +2931,7 @@ void Assembler::xorpd(XMMRegister dst, Address src) {
 
 void Assembler::xorps(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = prefix_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_NONE);
   emit_byte(0x57);
   emit_byte(0xC0 | encode);
 }
@@ -2931,12 +2939,166 @@ void Assembler::xorps(XMMRegister dst, XMMRegister src) {
 void Assembler::xorps(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   InstructionMark im(this);
-  prefix(src, dst);
-  emit_byte(0x0F);
+  simd_prefix(dst, dst, src, VEX_SIMD_NONE);
   emit_byte(0x57);
   emit_operand(dst, src);
 }
 
+// AVX 3-operands non destructive source instructions (encoded with VEX prefix)
+
+void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  InstructionMark im(this);
+  vex_prefix(dst, nds, src, VEX_SIMD_F2);
+  emit_byte(0x58);
+  emit_operand(dst, src);
+}
+
+void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
+  assert(VM_Version::supports_avx(), "");
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F2);
+  emit_byte(0x58);
+  emit_byte(0xC0 | encode);
+}
+
+void Assembler::vaddss(XMMRegister dst, XMMRegister nds, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  InstructionMark im(this);
+  vex_prefix(dst, nds, src, VEX_SIMD_F3);
+  emit_byte(0x58);
+  emit_operand(dst, src);
+}
+
+void Assembler::vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
+  assert(VM_Version::supports_avx(), "");
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F3);
+  emit_byte(0x58);
+  emit_byte(0xC0 | encode);
+}
+
+void Assembler::vandpd(XMMRegister dst, XMMRegister nds, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  InstructionMark im(this);
+  vex_prefix(dst, nds, src, VEX_SIMD_66); // 128-bit vector
+  emit_byte(0x54);
+  emit_operand(dst, src);
+}
+
+void Assembler::vandps(XMMRegister dst, XMMRegister nds, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  InstructionMark im(this);
+  vex_prefix(dst, nds, src, VEX_SIMD_NONE); // 128-bit vector
+  emit_byte(0x54);
+  emit_operand(dst, src);
+}
+
+void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  InstructionMark im(this);
+  vex_prefix(dst, nds, src, VEX_SIMD_F2);
+  emit_byte(0x5E);
+  emit_operand(dst, src);
+}
+
+void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
+  assert(VM_Version::supports_avx(), "");
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F2);
+  emit_byte(0x5E);
+  emit_byte(0xC0 | encode);
+}
+
+void Assembler::vdivss(XMMRegister dst, XMMRegister nds, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  InstructionMark im(this);
+  vex_prefix(dst, nds, src, VEX_SIMD_F3);
+  emit_byte(0x5E);
+  emit_operand(dst, src);
+}
+
+void Assembler::vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
+  assert(VM_Version::supports_avx(), "");
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F3);
+  emit_byte(0x5E);
+  emit_byte(0xC0 | encode);
+}
+
+void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  InstructionMark im(this);
+  vex_prefix(dst, nds, src, VEX_SIMD_F2);
+  emit_byte(0x59);
+  emit_operand(dst, src);
+}
+
+void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
+  assert(VM_Version::supports_avx(), "");
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F2);
+  emit_byte(0x59);
+  emit_byte(0xC0 | encode);
+}
+
+void Assembler::vmulss(XMMRegister dst, XMMRegister nds, Address src) {
+  InstructionMark im(this);
+  vex_prefix(dst, nds, src, VEX_SIMD_F3);
+  emit_byte(0x59);
+  emit_operand(dst, src);
+}
+
+void Assembler::vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
+  assert(VM_Version::supports_avx(), "");
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F3);
+  emit_byte(0x59);
+  emit_byte(0xC0 | encode);
+}
+
+
+void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  InstructionMark im(this);
+  vex_prefix(dst, nds, src, VEX_SIMD_F2);
+  emit_byte(0x5C);
+  emit_operand(dst, src);
+}
+
+void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
+  assert(VM_Version::supports_avx(), "");
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F2);
+  emit_byte(0x5C);
+  emit_byte(0xC0 | encode);
+}
+
+void Assembler::vsubss(XMMRegister dst, XMMRegister nds, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  InstructionMark im(this);
+  vex_prefix(dst, nds, src, VEX_SIMD_F3);
+  emit_byte(0x5C);
+  emit_operand(dst, src);
+}
+
+void Assembler::vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
+  assert(VM_Version::supports_avx(), "");
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F3);
+  emit_byte(0x5C);
+  emit_byte(0xC0 | encode);
+}
+
+void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  InstructionMark im(this);
+  vex_prefix(dst, nds, src, VEX_SIMD_66); // 128-bit vector
+  emit_byte(0x57);
+  emit_operand(dst, src);
+}
+
+void Assembler::vxorps(XMMRegister dst, XMMRegister nds, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  InstructionMark im(this);
+  vex_prefix(dst, nds, src, VEX_SIMD_NONE); // 128-bit vector
+  emit_byte(0x57);
+  emit_operand(dst, src);
+}
+
+
 #ifndef _LP64
 // 32bit only pieces of the assembler
 
@@ -3394,12 +3556,114 @@ void Assembler::fyl2x() {
   emit_byte(0xF1);
 }
 
+// SSE SIMD prefix byte values corresponding to VexSimdPrefix encoding.
+static int simd_pre[4] = { 0, 0x66, 0xF3, 0xF2 };
+// SSE opcode second byte values (first is 0x0F) corresponding to VexOpcode encoding.
+static int simd_opc[4] = { 0,    0, 0x38, 0x3A };
+
+// Generate SSE legacy REX prefix and SIMD opcode based on VEX encoding.
+void Assembler::rex_prefix(Address adr, XMMRegister xreg, VexSimdPrefix pre, VexOpcode opc, bool rex_w) {
+  if (pre > 0) {
+    emit_byte(simd_pre[pre]);
+  }
+  if (rex_w) {
+    prefixq(adr, xreg);
+  } else {
+    prefix(adr, xreg);
+  }
+  if (opc > 0) {
+    emit_byte(0x0F);
+    int opc2 = simd_opc[opc];
+    if (opc2 > 0) {
+      emit_byte(opc2);
+    }
+  }
+}
+
+int Assembler::rex_prefix_and_encode(int dst_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc, bool rex_w) {
+  if (pre > 0) {
+    emit_byte(simd_pre[pre]);
+  }
+  int encode = (rex_w) ? prefixq_and_encode(dst_enc, src_enc) :
+                          prefix_and_encode(dst_enc, src_enc);
+  if (opc > 0) {
+    emit_byte(0x0F);
+    int opc2 = simd_opc[opc];
+    if (opc2 > 0) {
+      emit_byte(opc2);
+    }
+  }
+  return encode;
+}
+
+
+void Assembler::vex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w, int nds_enc, VexSimdPrefix pre, VexOpcode opc, bool vector256) {
+  if (vex_b || vex_x || vex_w || (opc == VEX_OPCODE_0F_38) || (opc == VEX_OPCODE_0F_3A)) {
+    prefix(VEX_3bytes);
+
+    int byte1 = (vex_r ? VEX_R : 0) | (vex_x ? VEX_X : 0) | (vex_b ? VEX_B : 0);
+    byte1 = (~byte1) & 0xE0;
+    byte1 |= opc;
+    a_byte(byte1);
+
+    int byte2 = ((~nds_enc) & 0xf) << 3;
+    byte2 |= (vex_w ? VEX_W : 0) | (vector256 ? 4 : 0) | pre;
+    emit_byte(byte2);
+  } else {
+    prefix(VEX_2bytes);
+
+    int byte1 = vex_r ? VEX_R : 0;
+    byte1 = (~byte1) & 0x80;
+    byte1 |= ((~nds_enc) & 0xf) << 3;
+    byte1 |= (vector256 ? 4 : 0) | pre;
+    emit_byte(byte1);
+  }
+}
+
+void Assembler::vex_prefix(Address adr, int nds_enc, int xreg_enc, VexSimdPrefix pre, VexOpcode opc, bool vex_w, bool vector256){
+  bool vex_r = (xreg_enc >= 8);
+  bool vex_b = adr.base_needs_rex();
+  bool vex_x = adr.index_needs_rex();
+  vex_prefix(vex_r, vex_b, vex_x, vex_w, nds_enc, pre, opc, vector256);
+}
+
+int Assembler::vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc, bool vex_w, bool vector256) {
+  bool vex_r = (dst_enc >= 8);
+  bool vex_b = (src_enc >= 8);
+  bool vex_x = false;
+  vex_prefix(vex_r, vex_b, vex_x, vex_w, nds_enc, pre, opc, vector256);
+  return (((dst_enc & 7) << 3) | (src_enc & 7));
+}
+
+
+void Assembler::simd_prefix(XMMRegister xreg, XMMRegister nds, Address adr, VexSimdPrefix pre, VexOpcode opc, bool rex_w, bool vector256) {
+  if (UseAVX > 0) {
+    int xreg_enc = xreg->encoding();
+    int  nds_enc = nds->is_valid() ? nds->encoding() : 0;
+    vex_prefix(adr, nds_enc, xreg_enc, pre, opc, rex_w, vector256);
+  } else {
+    assert((nds == xreg) || (nds == xnoreg), "wrong sse encoding");
+    rex_prefix(adr, xreg, pre, opc, rex_w);
+  }
+}
+
+int Assembler::simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src, VexSimdPrefix pre, VexOpcode opc, bool rex_w, bool vector256) {
+  int dst_enc = dst->encoding();
+  int src_enc = src->encoding();
+  if (UseAVX > 0) {
+    int nds_enc = nds->is_valid() ? nds->encoding() : 0;
+    return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, rex_w, vector256);
+  } else {
+    assert((nds == dst) || (nds == src) || (nds == xnoreg), "wrong sse encoding");
+    return rex_prefix_and_encode(dst_enc, src_enc, pre, opc, rex_w);
+  }
+}
 
 #ifndef _LP64
 
 void Assembler::incl(Register dst) {
   // Don't use it directly. Use MacroAssembler::incrementl() instead.
- emit_byte(0x40 | dst->encoding());
+  emit_byte(0x40 | dst->encoding());
 }
 
 void Assembler::lea(Register dst, Address src) {
@@ -3756,6 +4020,38 @@ void Assembler::prefix(Address adr, XMMRegister reg) {
   }
 }
 
+void Assembler::prefixq(Address adr, XMMRegister src) {
+  if (src->encoding() < 8) {
+    if (adr.base_needs_rex()) {
+      if (adr.index_needs_rex()) {
+        prefix(REX_WXB);
+      } else {
+        prefix(REX_WB);
+      }
+    } else {
+      if (adr.index_needs_rex()) {
+        prefix(REX_WX);
+      } else {
+        prefix(REX_W);
+      }
+    }
+  } else {
+    if (adr.base_needs_rex()) {
+      if (adr.index_needs_rex()) {
+        prefix(REX_WRXB);
+      } else {
+        prefix(REX_WRB);
+      }
+    } else {
+      if (adr.index_needs_rex()) {
+        prefix(REX_WRX);
+      } else {
+        prefix(REX_WR);
+      }
+    }
+  }
+}
+
 void Assembler::adcq(Register dst, int32_t imm32) {
   (void) prefixq_and_encode(dst->encoding());
   emit_arith(0x81, 0xD0, dst, imm32);
@@ -3918,36 +4214,44 @@ void Assembler::cmpxchgq(Register reg, Address adr) {
 
 void Assembler::cvtsi2sdq(XMMRegister dst, Register src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0xF2);
-  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F2);
   emit_byte(0x2A);
   emit_byte(0xC0 | encode);
 }
 
+void Assembler::cvtsi2sdq(XMMRegister dst, Address src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  InstructionMark im(this);
+  simd_prefix_q(dst, dst, src, VEX_SIMD_F2);
+  emit_byte(0x2A);
+  emit_operand(dst, src);
+}
+
 void Assembler::cvtsi2ssq(XMMRegister dst, Register src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_byte(0xF3);
-  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F3);
   emit_byte(0x2A);
   emit_byte(0xC0 | encode);
 }
 
+void Assembler::cvtsi2ssq(XMMRegister dst, Address src) {
+  NOT_LP64(assert(VM_Version::supports_sse(), ""));
+  InstructionMark im(this);
+  simd_prefix_q(dst, dst, src, VEX_SIMD_F3);
+  emit_byte(0x2A);
+  emit_operand(dst, src);
+}
+
 void Assembler::cvttsd2siq(Register dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_byte(0xF2);
-  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F2);
   emit_byte(0x2C);
   emit_byte(0xC0 | encode);
 }
 
 void Assembler::cvttss2siq(Register dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_byte(0xF3);
-  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F3);
   emit_byte(0x2C);
   emit_byte(0xC0 | encode);
 }
@@ -4107,21 +4411,17 @@ void Assembler::lzcntq(Register dst, Register src) {
 
 void Assembler::movdq(XMMRegister dst, Register src) {
   // table D-1 says MMX/SSE2
-  NOT_LP64(assert(VM_Version::supports_sse2() || VM_Version::supports_mmx(), ""));
-  emit_byte(0x66);
-  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
-  emit_byte(0x0F);
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_66);
   emit_byte(0x6E);
   emit_byte(0xC0 | encode);
 }
 
 void Assembler::movdq(Register dst, XMMRegister src) {
   // table D-1 says MMX/SSE2
-  NOT_LP64(assert(VM_Version::supports_sse2() || VM_Version::supports_mmx(), ""));
-  emit_byte(0x66);
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // swap src/dst to get correct prefix
-  int encode = prefixq_and_encode(src->encoding(), dst->encoding());
-  emit_byte(0x0F);
+  int encode = simd_prefix_and_encode_q(src, dst, VEX_SIMD_66);
   emit_byte(0x7E);
   emit_byte(0xC0 | encode);
 }
@@ -4632,7 +4932,7 @@ int MacroAssembler::biased_locking_enter(Register lock_reg,
     null_check_offset = offset();
   }
   movl(tmp_reg, klass_addr);
-  xorl(swap_reg, Address(tmp_reg, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
+  xorl(swap_reg, Address(tmp_reg, Klass::prototype_header_offset()));
   andl(swap_reg, ~((int) markOopDesc::age_mask_in_place));
   if (need_tmp_reg) {
     pop(tmp_reg);
@@ -4719,7 +5019,7 @@ int MacroAssembler::biased_locking_enter(Register lock_reg,
   }
   get_thread(tmp_reg);
   movl(swap_reg, klass_addr);
-  orl(tmp_reg, Address(swap_reg, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
+  orl(tmp_reg, Address(swap_reg, Klass::prototype_header_offset()));
   movl(swap_reg, saved_mark_addr);
   if (os::is_MP()) {
     lock();
@@ -4757,7 +5057,7 @@ int MacroAssembler::biased_locking_enter(Register lock_reg,
     push(tmp_reg);
   }
   movl(tmp_reg, klass_addr);
-  movl(tmp_reg, Address(tmp_reg, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
+  movl(tmp_reg, Address(tmp_reg, Klass::prototype_header_offset()));
   if (os::is_MP()) {
     lock();
   }
@@ -5680,6 +5980,24 @@ void MacroAssembler::addptr(Address dst, Register src) {
   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
 }
 
+void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
+  if (reachable(src)) {
+    Assembler::addsd(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    Assembler::addsd(dst, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
+  if (reachable(src)) {
+    addss(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    addss(dst, Address(rscratch1, 0));
+  }
+}
+
 void MacroAssembler::align(int modulus) {
   if (offset() % modulus != 0) {
     nop(modulus - (offset() % modulus));
@@ -5687,11 +6005,24 @@ void MacroAssembler::align(int modulus) {
 }
 
 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src) {
+  // Used in sign-masking with aligned address.
+  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
   if (reachable(src)) {
-    andpd(dst, as_Address(src));
+    Assembler::andpd(dst, as_Address(src));
   } else {
     lea(rscratch1, src);
-    andpd(dst, Address(rscratch1, 0));
+    Assembler::andpd(dst, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::andps(XMMRegister dst, AddressLiteral src) {
+  // Used in sign-masking with aligned address.
+  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
+  if (reachable(src)) {
+    Assembler::andps(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    Assembler::andps(dst, Address(rscratch1, 0));
   }
 }
 
@@ -6270,19 +6601,19 @@ void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
 
 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
   if (reachable(src)) {
-    comisd(dst, as_Address(src));
+    Assembler::comisd(dst, as_Address(src));
   } else {
     lea(rscratch1, src);
-    comisd(dst, Address(rscratch1, 0));
+    Assembler::comisd(dst, Address(rscratch1, 0));
   }
 }
 
 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
   if (reachable(src)) {
-    comiss(dst, as_Address(src));
+    Assembler::comiss(dst, as_Address(src));
   } else {
     lea(rscratch1, src);
-    comiss(dst, Address(rscratch1, 0));
+    Assembler::comiss(dst, Address(rscratch1, 0));
   }
 }
 
@@ -6366,6 +6697,24 @@ void MacroAssembler::division_with_shift (Register reg, int shift_value) {
   sarl(reg, shift_value);
 }
 
+void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
+  if (reachable(src)) {
+    Assembler::divsd(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    Assembler::divsd(dst, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
+  if (reachable(src)) {
+    Assembler::divss(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    Assembler::divss(dst, Address(rscratch1, 0));
+  }
+}
+
 // !defined(COMPILER2) is because of stupid core builds
 #if !defined(_LP64) || defined(COMPILER1) || !defined(COMPILER2)
 void MacroAssembler::empty_FPU_stack() {
@@ -6805,12 +7154,39 @@ void MacroAssembler::movptr(Address dst, Register src) {
   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
 }
 
-void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
+void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
   if (reachable(src)) {
-    movss(dst, as_Address(src));
+    Assembler::movsd(dst, as_Address(src));
   } else {
     lea(rscratch1, src);
-    movss(dst, Address(rscratch1, 0));
+    Assembler::movsd(dst, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
+  if (reachable(src)) {
+    Assembler::movss(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    Assembler::movss(dst, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
+  if (reachable(src)) {
+    Assembler::mulsd(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    Assembler::mulsd(dst, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
+  if (reachable(src)) {
+    Assembler::mulss(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    Assembler::mulss(dst, Address(rscratch1, 0));
   }
 }
 
@@ -6992,6 +7368,193 @@ void MacroAssembler::testl(Register dst, AddressLiteral src) {
   testl(dst, as_Address(src));
 }
 
+void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
+  if (reachable(src)) {
+    Assembler::sqrtsd(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    Assembler::sqrtsd(dst, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
+  if (reachable(src)) {
+    Assembler::sqrtss(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    Assembler::sqrtss(dst, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
+  if (reachable(src)) {
+    Assembler::subsd(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    Assembler::subsd(dst, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
+  if (reachable(src)) {
+    Assembler::subss(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    Assembler::subss(dst, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
+  if (reachable(src)) {
+    Assembler::ucomisd(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    Assembler::ucomisd(dst, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
+  if (reachable(src)) {
+    Assembler::ucomiss(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    Assembler::ucomiss(dst, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src) {
+  // Used in sign-bit flipping with aligned address.
+  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
+  if (reachable(src)) {
+    Assembler::xorpd(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    Assembler::xorpd(dst, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) {
+  // Used in sign-bit flipping with aligned address.
+  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
+  if (reachable(src)) {
+    Assembler::xorps(dst, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    Assembler::xorps(dst, Address(rscratch1, 0));
+  }
+}
+
+// AVX 3-operands instructions
+
+void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+  if (reachable(src)) {
+    vaddsd(dst, nds, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    vaddsd(dst, nds, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+  if (reachable(src)) {
+    vaddss(dst, nds, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    vaddss(dst, nds, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+  if (reachable(src)) {
+    vandpd(dst, nds, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    vandpd(dst, nds, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+  if (reachable(src)) {
+    vandps(dst, nds, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    vandps(dst, nds, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+  if (reachable(src)) {
+    vdivsd(dst, nds, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    vdivsd(dst, nds, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+  if (reachable(src)) {
+    vdivss(dst, nds, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    vdivss(dst, nds, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+  if (reachable(src)) {
+    vmulsd(dst, nds, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    vmulsd(dst, nds, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+  if (reachable(src)) {
+    vmulss(dst, nds, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    vmulss(dst, nds, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+  if (reachable(src)) {
+    vsubsd(dst, nds, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    vsubsd(dst, nds, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+  if (reachable(src)) {
+    vsubss(dst, nds, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    vsubss(dst, nds, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+  if (reachable(src)) {
+    vxorpd(dst, nds, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    vxorpd(dst, nds, Address(rscratch1, 0));
+  }
+}
+
+void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
+  if (reachable(src)) {
+    vxorps(dst, nds, as_Address(src));
+  } else {
+    lea(rscratch1, src);
+    vxorps(dst, nds, Address(rscratch1, 0));
+  }
+}
+
+
 //////////////////////////////////////////////////////////////////////////////////
 #ifndef SERIALGC
 
@@ -7430,6 +7993,16 @@ void MacroAssembler::incr_allocated_bytes(Register thread,
                                           Register var_size_in_bytes,
                                           int con_size_in_bytes,
                                           Register t1) {
+  if (!thread->is_valid()) {
+#ifdef _LP64
+    thread = r15_thread;
+#else
+    assert(t1->is_valid(), "need temp reg");
+    thread = t1;
+    get_thread(thread);
+#endif
+  }
+
 #ifdef _LP64
   if (var_size_in_bytes->is_valid()) {
     addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
@@ -7437,12 +8010,6 @@ void MacroAssembler::incr_allocated_bytes(Register thread,
     addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
   }
 #else
-  if (!thread->is_valid()) {
-    assert(t1->is_valid(), "need temp reg");
-    thread = t1;
-    get_thread(thread);
-  }
-
   if (var_size_in_bytes->is_valid()) {
     addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
   } else {
@@ -7685,10 +8252,8 @@ void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
   assert(label_nulls <= 1, "at most one NULL in the batch");
 
-  int sc_offset = (klassOopDesc::header_size() * HeapWordSize +
-                   Klass::secondary_super_cache_offset_in_bytes());
-  int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
-                    Klass::super_check_offset_offset_in_bytes());
+  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
+  int sco_offset = in_bytes(Klass::super_check_offset_offset());
   Address super_check_offset_addr(super_klass, sco_offset);
 
   // Hacked jcc, which "knows" that L_fallthrough, at least, is in
@@ -7786,10 +8351,8 @@ void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
   assert(label_nulls <= 1, "at most one NULL in the batch");
 
   // a couple of useful fields in sub_klass:
-  int ss_offset = (klassOopDesc::header_size() * HeapWordSize +
-                   Klass::secondary_supers_offset_in_bytes());
-  int sc_offset = (klassOopDesc::header_size() * HeapWordSize +
-                   Klass::secondary_super_cache_offset_in_bytes());
+  int ss_offset = in_bytes(Klass::secondary_supers_offset());
+  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
   Address secondary_supers_addr(sub_klass, ss_offset);
   Address super_cache_addr(     sub_klass, sc_offset);
 
@@ -7876,32 +8439,6 @@ void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
 }
 
 
-void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
-  ucomisd(dst, as_Address(src));
-}
-
-void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
-  ucomiss(dst, as_Address(src));
-}
-
-void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src) {
-  if (reachable(src)) {
-    xorpd(dst, as_Address(src));
-  } else {
-    lea(rscratch1, src);
-    xorpd(dst, Address(rscratch1, 0));
-  }
-}
-
-void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) {
-  if (reachable(src)) {
-    xorps(dst, as_Address(src));
-  } else {
-    lea(rscratch1, src);
-    xorps(dst, Address(rscratch1, 0));
-  }
-}
-
 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
   if (VM_Version::supports_cmov()) {
     cmovl(cc, dst, src);
@@ -8487,20 +9024,20 @@ void MacroAssembler::load_prototype_header(Register dst, Register src) {
     if (Universe::narrow_oop_shift() != 0) {
       assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
       if (LogMinObjAlignmentInBytes == Address::times_8) {
-        movq(dst, Address(r12_heapbase, dst, Address::times_8, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
+        movq(dst, Address(r12_heapbase, dst, Address::times_8, Klass::prototype_header_offset()));
       } else {
         // OK to use shift since we don't need to preserve flags.
         shlq(dst, LogMinObjAlignmentInBytes);
-        movq(dst, Address(r12_heapbase, dst, Address::times_1, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
+        movq(dst, Address(r12_heapbase, dst, Address::times_1, Klass::prototype_header_offset()));
       }
     } else {
-      movq(dst, Address(dst, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
+      movq(dst, Address(dst, Klass::prototype_header_offset()));
     }
   } else
 #endif
   {
     movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
-    movptr(dst, Address(dst, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
+    movptr(dst, Address(dst, Klass::prototype_header_offset()));
   }
 }
 
@@ -8761,6 +9298,7 @@ void MacroAssembler::string_indexofC8(Register str1, Register str2,
                                       Register cnt1, Register cnt2,
                                       int int_cnt2,  Register result,
                                       XMMRegister vec, Register tmp) {
+  ShortBranchVerifier sbv(this);
   assert(UseSSE42Intrinsics, "SSE4.2 is required");
 
   // This method uses pcmpestri inxtruction with bound registers
@@ -8890,9 +9428,9 @@ void MacroAssembler::string_indexofC8(Register str1, Register str2,
       pcmpestri(vec, Address(result, tmp, Address::times_2, 0), 0x0d);
     }
     // Need to reload strings pointers if not matched whole vector
-    jccb(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
+    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
     addptr(cnt2, 8);
-    jccb(Assembler::negative, SCAN_SUBSTR);
+    jcc(Assembler::negative, SCAN_SUBSTR);
     // Fall through if found full substring
 
   } // (int_cnt2 > 8)
@@ -8911,6 +9449,7 @@ void MacroAssembler::string_indexof(Register str1, Register str2,
                                     Register cnt1, Register cnt2,
                                     int int_cnt2,  Register result,
                                     XMMRegister vec, Register tmp) {
+  ShortBranchVerifier sbv(this);
   assert(UseSSE42Intrinsics, "SSE4.2 is required");
   //
   // int_cnt2 is length of small (< 8 chars) constant substring
@@ -9172,6 +9711,7 @@ void MacroAssembler::string_indexof(Register str1, Register str2,
 void MacroAssembler::string_compare(Register str1, Register str2,
                                     Register cnt1, Register cnt2, Register result,
                                     XMMRegister vec1) {
+  ShortBranchVerifier sbv(this);
   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
 
   // Compute the minimum of the string lengths and the
@@ -9308,6 +9848,7 @@ void MacroAssembler::string_compare(Register str1, Register str2,
 void MacroAssembler::char_arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                                         Register limit, Register result, Register chr,
                                         XMMRegister vec1, XMMRegister vec2) {
+  ShortBranchVerifier sbv(this);
   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR;
 
   int length_offset  = arrayOopDesc::length_offset_in_bytes();
@@ -9427,6 +9968,7 @@ void MacroAssembler::char_arrays_equals(bool is_array_equ, Register ary1, Regist
 void MacroAssembler::generate_fill(BasicType t, bool aligned,
                                    Register to, Register value, Register count,
                                    Register rtmp, XMMRegister xtmp) {
+  ShortBranchVerifier sbv(this);
   assert_different_registers(to, value, count, rtmp);
   Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
   Label L_fill_2_bytes, L_fill_4_bytes;
diff --git a/hotspot/src/cpu/x86/vm/assembler_x86.hpp b/hotspot/src/cpu/x86/vm/assembler_x86.hpp
index d5c35dfdc06..1a2d4afa6e8 100644
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp
@@ -503,7 +503,31 @@ class Assembler : public AbstractAssembler  {
     REX_WR     = 0x4C,
     REX_WRB    = 0x4D,
     REX_WRX    = 0x4E,
-    REX_WRXB   = 0x4F
+    REX_WRXB   = 0x4F,
+
+    VEX_3bytes = 0xC4,
+    VEX_2bytes = 0xC5
+  };
+
+  enum VexPrefix {
+    VEX_B = 0x20,
+    VEX_X = 0x40,
+    VEX_R = 0x80,
+    VEX_W = 0x80
+  };
+
+  enum VexSimdPrefix {
+    VEX_SIMD_NONE = 0x0,
+    VEX_SIMD_66   = 0x1,
+    VEX_SIMD_F3   = 0x2,
+    VEX_SIMD_F2   = 0x3
+  };
+
+  enum VexOpcode {
+    VEX_OPCODE_NONE  = 0x0,
+    VEX_OPCODE_0F    = 0x1,
+    VEX_OPCODE_0F_38 = 0x2,
+    VEX_OPCODE_0F_3A = 0x3
   };
 
   enum WhichOperand {
@@ -546,12 +570,99 @@ private:
   void prefixq(Address adr);
 
   void prefix(Address adr, Register reg,  bool byteinst = false);
-  void prefixq(Address adr, Register reg);
-
   void prefix(Address adr, XMMRegister reg);
+  void prefixq(Address adr, Register reg);
+  void prefixq(Address adr, XMMRegister reg);
 
   void prefetch_prefix(Address src);
 
+  void rex_prefix(Address adr, XMMRegister xreg,
+                  VexSimdPrefix pre, VexOpcode opc, bool rex_w);
+  int  rex_prefix_and_encode(int dst_enc, int src_enc,
+                             VexSimdPrefix pre, VexOpcode opc, bool rex_w);
+
+  void vex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w,
+                  int nds_enc, VexSimdPrefix pre, VexOpcode opc,
+                  bool vector256);
+
+  void vex_prefix(Address adr, int nds_enc, int xreg_enc,
+                  VexSimdPrefix pre, VexOpcode opc,
+                  bool vex_w, bool vector256);
+
+  void vex_prefix(XMMRegister dst, XMMRegister nds, Address src,
+                  VexSimdPrefix pre, bool vector256 = false) {
+     vex_prefix(src, nds->encoding(), dst->encoding(),
+                pre, VEX_OPCODE_0F, false, vector256);
+  }
+
+  int  vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc,
+                             VexSimdPrefix pre, VexOpcode opc,
+                             bool vex_w, bool vector256);
+
+  int  vex_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src,
+                             VexSimdPrefix pre, bool vector256 = false) {
+     return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
+                                  pre, VEX_OPCODE_0F, false, vector256);
+  }
+
+  void simd_prefix(XMMRegister xreg, XMMRegister nds, Address adr,
+                   VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F,
+                   bool rex_w = false, bool vector256 = false);
+
+  void simd_prefix(XMMRegister dst, Address src,
+                   VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) {
+    simd_prefix(dst, xnoreg, src, pre, opc);
+  }
+  void simd_prefix(Address dst, XMMRegister src, VexSimdPrefix pre) {
+    simd_prefix(src, dst, pre);
+  }
+  void simd_prefix_q(XMMRegister dst, XMMRegister nds, Address src,
+                     VexSimdPrefix pre) {
+    bool rex_w = true;
+    simd_prefix(dst, nds, src, pre, VEX_OPCODE_0F, rex_w);
+  }
+
+
+  int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src,
+                             VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F,
+                             bool rex_w = false, bool vector256 = false);
+
+  int simd_prefix_and_encode(XMMRegister dst, XMMRegister src,
+                             VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) {
+    return simd_prefix_and_encode(dst, xnoreg, src, pre, opc);
+  }
+
+  // Move/convert 32-bit integer value.
+  int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, Register src,
+                             VexSimdPrefix pre) {
+    // It is OK to cast from Register to XMMRegister to pass argument here
+    // since only encoding is used in simd_prefix_and_encode() and number of
+    // Gen and Xmm registers are the same.
+    return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre);
+  }
+  int simd_prefix_and_encode(XMMRegister dst, Register src, VexSimdPrefix pre) {
+    return simd_prefix_and_encode(dst, xnoreg, src, pre);
+  }
+  int simd_prefix_and_encode(Register dst, XMMRegister src,
+                             VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) {
+    return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, opc);
+  }
+
+  // Move/convert 64-bit integer value.
+  int simd_prefix_and_encode_q(XMMRegister dst, XMMRegister nds, Register src,
+                               VexSimdPrefix pre) {
+    bool rex_w = true;
+    return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre, VEX_OPCODE_0F, rex_w);
+  }
+  int simd_prefix_and_encode_q(XMMRegister dst, Register src, VexSimdPrefix pre) {
+    return simd_prefix_and_encode_q(dst, xnoreg, src, pre);
+  }
+  int simd_prefix_and_encode_q(Register dst, XMMRegister src,
+                             VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) {
+    bool rex_w = true;
+    return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, opc, rex_w);
+  }
+
   // Helper functions for groups of instructions
   void emit_arith_b(int op1, int op2, Register dst, int imm8);
 
@@ -764,6 +875,7 @@ private:
   void addss(XMMRegister dst, Address src);
   void addss(XMMRegister dst, XMMRegister src);
 
+  void andl(Address  dst, int32_t imm32);
   void andl(Register dst, int32_t imm32);
   void andl(Register dst, Address src);
   void andl(Register dst, Register src);
@@ -774,9 +886,11 @@ private:
   void andq(Register dst, Register src);
 
   // Bitwise Logical AND of Packed Double-Precision Floating-Point Values
-  void andpd(XMMRegister dst, Address src);
   void andpd(XMMRegister dst, XMMRegister src);
 
+  // Bitwise Logical AND of Packed Single-Precision Floating-Point Values
+  void andps(XMMRegister dst, XMMRegister src);
+
   void bsfl(Register dst, Register src);
   void bsrl(Register dst, Register src);
 
@@ -837,9 +951,11 @@ private:
 
   // Ordered Compare Scalar Double-Precision Floating-Point Values and set EFLAGS
   void comisd(XMMRegister dst, Address src);
+  void comisd(XMMRegister dst, XMMRegister src);
 
   // Ordered Compare Scalar Single-Precision Floating-Point Values and set EFLAGS
   void comiss(XMMRegister dst, Address src);
+  void comiss(XMMRegister dst, XMMRegister src);
 
   // Identify processor type and features
   void cpuid() {
@@ -849,14 +965,19 @@ private:
 
   // Convert Scalar Double-Precision Floating-Point Value to Scalar Single-Precision Floating-Point Value
   void cvtsd2ss(XMMRegister dst, XMMRegister src);
+  void cvtsd2ss(XMMRegister dst, Address src);
 
   // Convert Doubleword Integer to Scalar Double-Precision Floating-Point Value
   void cvtsi2sdl(XMMRegister dst, Register src);
+  void cvtsi2sdl(XMMRegister dst, Address src);
   void cvtsi2sdq(XMMRegister dst, Register src);
+  void cvtsi2sdq(XMMRegister dst, Address src);
 
   // Convert Doubleword Integer to Scalar Single-Precision Floating-Point Value
   void cvtsi2ssl(XMMRegister dst, Register src);
+  void cvtsi2ssl(XMMRegister dst, Address src);
   void cvtsi2ssq(XMMRegister dst, Register src);
+  void cvtsi2ssq(XMMRegister dst, Address src);
 
   // Convert Packed Signed Doubleword Integers to Packed Double-Precision Floating-Point Value
   void cvtdq2pd(XMMRegister dst, XMMRegister src);
@@ -866,6 +987,7 @@ private:
 
   // Convert Scalar Single-Precision Floating-Point Value to Scalar Double-Precision Floating-Point Value
   void cvtss2sd(XMMRegister dst, XMMRegister src);
+  void cvtss2sd(XMMRegister dst, Address src);
 
   // Convert with Truncation Scalar Double-Precision Floating-Point Value to Doubleword Integer
   void cvttsd2sil(Register dst, Address src);
@@ -1140,8 +1262,6 @@ private:
   void movdq(Register dst, XMMRegister src);
 
   // Move Aligned Double Quadword
-  void movdqa(Address     dst, XMMRegister src);
-  void movdqa(XMMRegister dst, Address src);
   void movdqa(XMMRegister dst, XMMRegister src);
 
   // Move Unaligned Double Quadword
@@ -1261,10 +1381,18 @@ private:
   void orq(Register dst, Address src);
   void orq(Register dst, Register src);
 
+  // Pack with unsigned saturation
+  void packuswb(XMMRegister dst, XMMRegister src);
+  void packuswb(XMMRegister dst, Address src);
+
   // SSE4.2 string instructions
   void pcmpestri(XMMRegister xmm1, XMMRegister xmm2, int imm8);
   void pcmpestri(XMMRegister xmm1, Address src, int imm8);
 
+  // SSE4.1 packed move
+  void pmovzxbw(XMMRegister dst, XMMRegister src);
+  void pmovzxbw(XMMRegister dst, Address src);
+
 #ifndef _LP64 // no 32bit push/pop on amd64
   void popl(Address dst);
 #endif
@@ -1292,6 +1420,7 @@ private:
 
   // POR - Bitwise logical OR
   void por(XMMRegister dst, XMMRegister src);
+  void por(XMMRegister dst, Address src);
 
   // Shuffle Packed Doublewords
   void pshufd(XMMRegister dst, XMMRegister src, int mode);
@@ -1313,6 +1442,11 @@ private:
 
   // Interleave Low Bytes
   void punpcklbw(XMMRegister dst, XMMRegister src);
+  void punpcklbw(XMMRegister dst, Address src);
+
+  // Interleave Low Doublewords
+  void punpckldq(XMMRegister dst, XMMRegister src);
+  void punpckldq(XMMRegister dst, Address src);
 
 #ifndef _LP64 // no 32bit push/pop on amd64
   void pushl(Address src);
@@ -1429,6 +1563,13 @@ private:
   void xchgq(Register reg, Address adr);
   void xchgq(Register dst, Register src);
 
+  // Get Value of Extended Control Register
+  void xgetbv() {
+    emit_byte(0x0F);
+    emit_byte(0x01);
+    emit_byte(0xD0);
+  }
+
   void xorl(Register dst, int32_t imm32);
   void xorl(Register dst, Address src);
   void xorl(Register dst, Register src);
@@ -1437,14 +1578,44 @@ private:
   void xorq(Register dst, Register src);
 
   // Bitwise Logical XOR of Packed Double-Precision Floating-Point Values
-  void xorpd(XMMRegister dst, Address src);
   void xorpd(XMMRegister dst, XMMRegister src);
 
   // Bitwise Logical XOR of Packed Single-Precision Floating-Point Values
-  void xorps(XMMRegister dst, Address src);
   void xorps(XMMRegister dst, XMMRegister src);
 
   void set_byte_if_not_zero(Register dst); // sets reg to 1 if not zero, otherwise 0
+
+  // AVX 3-operands instructions (encoded with VEX prefix)
+  void vaddsd(XMMRegister dst, XMMRegister nds, Address src);
+  void vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
+  void vaddss(XMMRegister dst, XMMRegister nds, Address src);
+  void vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src);
+  void vandpd(XMMRegister dst, XMMRegister nds, Address src);
+  void vandps(XMMRegister dst, XMMRegister nds, Address src);
+  void vdivsd(XMMRegister dst, XMMRegister nds, Address src);
+  void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
+  void vdivss(XMMRegister dst, XMMRegister nds, Address src);
+  void vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src);
+  void vmulsd(XMMRegister dst, XMMRegister nds, Address src);
+  void vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
+  void vmulss(XMMRegister dst, XMMRegister nds, Address src);
+  void vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src);
+  void vsubsd(XMMRegister dst, XMMRegister nds, Address src);
+  void vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
+  void vsubss(XMMRegister dst, XMMRegister nds, Address src);
+  void vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src);
+  void vxorpd(XMMRegister dst, XMMRegister nds, Address src);
+  void vxorps(XMMRegister dst, XMMRegister nds, Address src);
+
+
+ protected:
+  // Next instructions require address alignment 16 bytes SSE mode.
+  // They should be called only from corresponding MacroAssembler instructions.
+  void andpd(XMMRegister dst, Address src);
+  void andps(XMMRegister dst, Address src);
+  void xorpd(XMMRegister dst, Address src);
+  void xorps(XMMRegister dst, Address src);
+
 };
 
 
@@ -2175,9 +2346,15 @@ class MacroAssembler: public Assembler {
   void andpd(XMMRegister dst, Address src) { Assembler::andpd(dst, src); }
   void andpd(XMMRegister dst, AddressLiteral src);
 
+  void andps(XMMRegister dst, XMMRegister src) { Assembler::andps(dst, src); }
+  void andps(XMMRegister dst, Address src) { Assembler::andps(dst, src); }
+  void andps(XMMRegister dst, AddressLiteral src);
+
+  void comiss(XMMRegister dst, XMMRegister src) { Assembler::comiss(dst, src); }
   void comiss(XMMRegister dst, Address src) { Assembler::comiss(dst, src); }
   void comiss(XMMRegister dst, AddressLiteral src);
 
+  void comisd(XMMRegister dst, XMMRegister src) { Assembler::comisd(dst, src); }
   void comisd(XMMRegister dst, Address src) { Assembler::comisd(dst, src); }
   void comisd(XMMRegister dst, AddressLiteral src);
 
@@ -2211,62 +2388,62 @@ private:
   void movss(XMMRegister dst, Address src)     { Assembler::movss(dst, src); }
   void movss(XMMRegister dst, AddressLiteral src);
 
-  void movlpd(XMMRegister dst, Address src)      {Assembler::movlpd(dst, src); }
+  void movlpd(XMMRegister dst, Address src)    {Assembler::movlpd(dst, src); }
   void movlpd(XMMRegister dst, AddressLiteral src);
 
 public:
 
   void addsd(XMMRegister dst, XMMRegister src)    { Assembler::addsd(dst, src); }
   void addsd(XMMRegister dst, Address src)        { Assembler::addsd(dst, src); }
-  void addsd(XMMRegister dst, AddressLiteral src) { Assembler::addsd(dst, as_Address(src)); }
+  void addsd(XMMRegister dst, AddressLiteral src);
 
   void addss(XMMRegister dst, XMMRegister src)    { Assembler::addss(dst, src); }
   void addss(XMMRegister dst, Address src)        { Assembler::addss(dst, src); }
-  void addss(XMMRegister dst, AddressLiteral src) { Assembler::addss(dst, as_Address(src)); }
+  void addss(XMMRegister dst, AddressLiteral src);
 
   void divsd(XMMRegister dst, XMMRegister src)    { Assembler::divsd(dst, src); }
   void divsd(XMMRegister dst, Address src)        { Assembler::divsd(dst, src); }
-  void divsd(XMMRegister dst, AddressLiteral src) { Assembler::divsd(dst, as_Address(src)); }
+  void divsd(XMMRegister dst, AddressLiteral src);
 
   void divss(XMMRegister dst, XMMRegister src)    { Assembler::divss(dst, src); }
   void divss(XMMRegister dst, Address src)        { Assembler::divss(dst, src); }
-  void divss(XMMRegister dst, AddressLiteral src) { Assembler::divss(dst, as_Address(src)); }
+  void divss(XMMRegister dst, AddressLiteral src);
 
   void movsd(XMMRegister dst, XMMRegister src) { Assembler::movsd(dst, src); }
   void movsd(Address dst, XMMRegister src)     { Assembler::movsd(dst, src); }
   void movsd(XMMRegister dst, Address src)     { Assembler::movsd(dst, src); }
-  void movsd(XMMRegister dst, AddressLiteral src) { Assembler::movsd(dst, as_Address(src)); }
+  void movsd(XMMRegister dst, AddressLiteral src);
 
   void mulsd(XMMRegister dst, XMMRegister src)    { Assembler::mulsd(dst, src); }
   void mulsd(XMMRegister dst, Address src)        { Assembler::mulsd(dst, src); }
-  void mulsd(XMMRegister dst, AddressLiteral src) { Assembler::mulsd(dst, as_Address(src)); }
+  void mulsd(XMMRegister dst, AddressLiteral src);
 
   void mulss(XMMRegister dst, XMMRegister src)    { Assembler::mulss(dst, src); }
   void mulss(XMMRegister dst, Address src)        { Assembler::mulss(dst, src); }
-  void mulss(XMMRegister dst, AddressLiteral src) { Assembler::mulss(dst, as_Address(src)); }
+  void mulss(XMMRegister dst, AddressLiteral src);
 
   void sqrtsd(XMMRegister dst, XMMRegister src)    { Assembler::sqrtsd(dst, src); }
   void sqrtsd(XMMRegister dst, Address src)        { Assembler::sqrtsd(dst, src); }
-  void sqrtsd(XMMRegister dst, AddressLiteral src) { Assembler::sqrtsd(dst, as_Address(src)); }
+  void sqrtsd(XMMRegister dst, AddressLiteral src);
 
   void sqrtss(XMMRegister dst, XMMRegister src)    { Assembler::sqrtss(dst, src); }
   void sqrtss(XMMRegister dst, Address src)        { Assembler::sqrtss(dst, src); }
-  void sqrtss(XMMRegister dst, AddressLiteral src) { Assembler::sqrtss(dst, as_Address(src)); }
+  void sqrtss(XMMRegister dst, AddressLiteral src);
 
   void subsd(XMMRegister dst, XMMRegister src)    { Assembler::subsd(dst, src); }
   void subsd(XMMRegister dst, Address src)        { Assembler::subsd(dst, src); }
-  void subsd(XMMRegister dst, AddressLiteral src) { Assembler::subsd(dst, as_Address(src)); }
+  void subsd(XMMRegister dst, AddressLiteral src);
 
   void subss(XMMRegister dst, XMMRegister src)    { Assembler::subss(dst, src); }
   void subss(XMMRegister dst, Address src)        { Assembler::subss(dst, src); }
-  void subss(XMMRegister dst, AddressLiteral src) { Assembler::subss(dst, as_Address(src)); }
+  void subss(XMMRegister dst, AddressLiteral src);
 
   void ucomiss(XMMRegister dst, XMMRegister src) { Assembler::ucomiss(dst, src); }
-  void ucomiss(XMMRegister dst, Address src) { Assembler::ucomiss(dst, src); }
+  void ucomiss(XMMRegister dst, Address src)     { Assembler::ucomiss(dst, src); }
   void ucomiss(XMMRegister dst, AddressLiteral src);
 
   void ucomisd(XMMRegister dst, XMMRegister src) { Assembler::ucomisd(dst, src); }
-  void ucomisd(XMMRegister dst, Address src) { Assembler::ucomisd(dst, src); }
+  void ucomisd(XMMRegister dst, Address src)     { Assembler::ucomisd(dst, src); }
   void ucomisd(XMMRegister dst, AddressLiteral src);
 
   // Bitwise Logical XOR of Packed Double-Precision Floating-Point Values
@@ -2279,6 +2456,53 @@ public:
   void xorps(XMMRegister dst, Address src)     { Assembler::xorps(dst, src); }
   void xorps(XMMRegister dst, AddressLiteral src);
 
+  // AVX 3-operands instructions
+
+  void vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vaddsd(dst, nds, src); }
+  void vaddsd(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vaddsd(dst, nds, src); }
+  void vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
+  void vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vaddss(dst, nds, src); }
+  void vaddss(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vaddss(dst, nds, src); }
+  void vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
+  void vandpd(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vandpd(dst, nds, src); }
+  void vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
+  void vandps(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vandps(dst, nds, src); }
+  void vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
+  void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vdivsd(dst, nds, src); }
+  void vdivsd(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vdivsd(dst, nds, src); }
+  void vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
+  void vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vdivss(dst, nds, src); }
+  void vdivss(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vdivss(dst, nds, src); }
+  void vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
+  void vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vmulsd(dst, nds, src); }
+  void vmulsd(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vmulsd(dst, nds, src); }
+  void vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
+  void vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vmulss(dst, nds, src); }
+  void vmulss(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vmulss(dst, nds, src); }
+  void vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
+  void vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vsubsd(dst, nds, src); }
+  void vsubsd(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vsubsd(dst, nds, src); }
+  void vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
+  void vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vsubss(dst, nds, src); }
+  void vsubss(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vsubss(dst, nds, src); }
+  void vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
+  void vxorpd(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vxorpd(dst, nds, src); }
+  void vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
+  void vxorps(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vxorps(dst, nds, src); }
+  void vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
+
   // Data
 
   void cmov32( Condition cc, Register dst, Address  src);
diff --git a/hotspot/src/cpu/x86/vm/assembler_x86.inline.hpp b/hotspot/src/cpu/x86/vm/assembler_x86.inline.hpp
index 125bf3ffff0..bf299c6da8c 100644
--- a/hotspot/src/cpu/x86/vm/assembler_x86.inline.hpp
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.inline.hpp
@@ -86,6 +86,7 @@ inline void Assembler::prefix(Address adr, Register reg,  bool byteinst) {}
 inline void Assembler::prefixq(Address adr, Register reg) {}
 
 inline void Assembler::prefix(Address adr, XMMRegister reg) {}
+inline void Assembler::prefixq(Address adr, XMMRegister reg) {}
 #else
 inline void Assembler::emit_long64(jlong x) {
   *(jlong*) _code_pos = x;
diff --git a/hotspot/src/cpu/x86/vm/c1_CodeStubs_x86.cpp b/hotspot/src/cpu/x86/vm/c1_CodeStubs_x86.cpp
index f276df9e537..fe5495dda94 100644
--- a/hotspot/src/cpu/x86/vm/c1_CodeStubs_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/c1_CodeStubs_x86.cpp
@@ -320,7 +320,7 @@ void PatchingStub::emit_code(LIR_Assembler* ce) {
     // begin_initialized_entry_offset has to fit in a byte. Also, we know it's not null.
     __ load_heap_oop_not_null(tmp2, Address(_obj, java_lang_Class::klass_offset_in_bytes()));
     __ get_thread(tmp);
-    __ cmpptr(tmp, Address(tmp2, instanceKlass::init_thread_offset_in_bytes() + sizeof(klassOopDesc)));
+    __ cmpptr(tmp, Address(tmp2, instanceKlass::init_thread_offset()));
     __ pop(tmp2);
     __ pop(tmp);
     __ jcc(Assembler::notEqual, call_patch);
@@ -519,7 +519,7 @@ void G1UnsafeGetObjSATBBarrierStub::emit_code(LIR_Assembler* ce) {
 
   __ load_klass(tmp_reg, src_reg);
 
-  Address ref_type_adr(tmp_reg, instanceKlass::reference_type_offset_in_bytes() + sizeof(oopDesc));
+  Address ref_type_adr(tmp_reg, instanceKlass::reference_type_offset());
   __ cmpl(ref_type_adr, REF_NONE);
   __ jcc(Assembler::equal, _continuation);
 
diff --git a/hotspot/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp b/hotspot/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp
index dbdd7087fa7..3c24feb724b 100644
--- a/hotspot/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp
@@ -1557,8 +1557,8 @@ void LIR_Assembler::emit_opConvert(LIR_OpConvert* op) {
 
 void LIR_Assembler::emit_alloc_obj(LIR_OpAllocObj* op) {
   if (op->init_check()) {
-    __ cmpl(Address(op->klass()->as_register(),
-                    instanceKlass::init_state_offset_in_bytes() + sizeof(oopDesc)),
+    __ cmpb(Address(op->klass()->as_register(),
+                    instanceKlass::init_state_offset()),
             instanceKlass::fully_initialized);
     add_debug_info_for_null_check_here(op->stub()->info());
     __ jcc(Assembler::notEqual, *op->stub()->entry());
@@ -1730,7 +1730,7 @@ void LIR_Assembler::emit_typecheck_helper(LIR_OpTypeCheck *op, Label* success, L
 #else
       __ cmpoop(Address(klass_RInfo, k->super_check_offset()), k->constant_encoding());
 #endif // _LP64
-      if (sizeof(oopDesc) + Klass::secondary_super_cache_offset_in_bytes() != k->super_check_offset()) {
+      if ((juint)in_bytes(Klass::secondary_super_cache_offset()) != k->super_check_offset()) {
         __ jcc(Assembler::notEqual, *failure_target);
         // successful cast, fall through to profile or jump
       } else {
@@ -1842,7 +1842,7 @@ void LIR_Assembler::emit_opTypeCheck(LIR_OpTypeCheck* op) {
     __ load_klass(klass_RInfo, value);
 
     // get instance klass (it's already uncompressed)
-    __ movptr(k_RInfo, Address(k_RInfo, objArrayKlass::element_klass_offset_in_bytes() + sizeof(oopDesc)));
+    __ movptr(k_RInfo, Address(k_RInfo, objArrayKlass::element_klass_offset()));
     // perform the fast part of the checking logic
     __ check_klass_subtype_fast_path(klass_RInfo, k_RInfo, Rtmp1, success_target, failure_target, NULL);
     // call out-of-line instance of __ check_klass_subtype_slow_path(...):
@@ -3289,8 +3289,7 @@ void LIR_Assembler::emit_arraycopy(LIR_OpArrayCopy* op) {
           } else if (!(flags & LIR_OpArrayCopy::dst_objarray)) {
             __ load_klass(tmp, dst);
           }
-          int lh_offset = klassOopDesc::header_size() * HeapWordSize +
-            Klass::layout_helper_offset_in_bytes();
+          int lh_offset = in_bytes(Klass::layout_helper_offset());
           Address klass_lh_addr(tmp, lh_offset);
           jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
           __ cmpl(klass_lh_addr, objArray_lh);
@@ -3307,9 +3306,9 @@ void LIR_Assembler::emit_arraycopy(LIR_OpArrayCopy* op) {
 
 #ifndef _LP64
         __ movptr(tmp, dst_klass_addr);
-        __ movptr(tmp, Address(tmp, objArrayKlass::element_klass_offset_in_bytes() + sizeof(oopDesc)));
+        __ movptr(tmp, Address(tmp, objArrayKlass::element_klass_offset()));
         __ push(tmp);
-        __ movl(tmp, Address(tmp, Klass::super_check_offset_offset_in_bytes() + sizeof(oopDesc)));
+        __ movl(tmp, Address(tmp, Klass::super_check_offset_offset()));
         __ push(tmp);
         __ push(length);
         __ lea(tmp, Address(dst, dst_pos, scale, arrayOopDesc::base_offset_in_bytes(basic_type)));
@@ -3333,15 +3332,15 @@ void LIR_Assembler::emit_arraycopy(LIR_OpArrayCopy* op) {
         // Allocate abi space for args but be sure to keep stack aligned
         __ subptr(rsp, 6*wordSize);
         __ load_klass(c_rarg3, dst);
-        __ movptr(c_rarg3, Address(c_rarg3, objArrayKlass::element_klass_offset_in_bytes() + sizeof(oopDesc)));
+        __ movptr(c_rarg3, Address(c_rarg3, objArrayKlass::element_klass_offset()));
         store_parameter(c_rarg3, 4);
-        __ movl(c_rarg3, Address(c_rarg3, Klass::super_check_offset_offset_in_bytes() + sizeof(oopDesc)));
+        __ movl(c_rarg3, Address(c_rarg3, Klass::super_check_offset_offset()));
         __ call(RuntimeAddress(copyfunc_addr));
         __ addptr(rsp, 6*wordSize);
 #else
         __ load_klass(c_rarg4, dst);
-        __ movptr(c_rarg4, Address(c_rarg4, objArrayKlass::element_klass_offset_in_bytes() + sizeof(oopDesc)));
-        __ movl(c_rarg3, Address(c_rarg4, Klass::super_check_offset_offset_in_bytes() + sizeof(oopDesc)));
+        __ movptr(c_rarg4, Address(c_rarg4, objArrayKlass::element_klass_offset()));
+        __ movl(c_rarg3, Address(c_rarg4, Klass::super_check_offset_offset()));
         __ call(RuntimeAddress(copyfunc_addr));
 #endif
 
diff --git a/hotspot/src/cpu/x86/vm/c1_MacroAssembler_x86.cpp b/hotspot/src/cpu/x86/vm/c1_MacroAssembler_x86.cpp
index d386a99a096..ad507571479 100644
--- a/hotspot/src/cpu/x86/vm/c1_MacroAssembler_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/c1_MacroAssembler_x86.cpp
@@ -150,7 +150,7 @@ void C1_MacroAssembler::initialize_header(Register obj, Register klass, Register
   assert_different_registers(obj, klass, len);
   if (UseBiasedLocking && !len->is_valid()) {
     assert_different_registers(obj, klass, len, t1, t2);
-    movptr(t1, Address(klass, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
+    movptr(t1, Address(klass, Klass::prototype_header_offset()));
     movptr(Address(obj, oopDesc::mark_offset_in_bytes()), t1);
   } else {
     // This assumes that all prototype bits fit in an int32_t
diff --git a/hotspot/src/cpu/x86/vm/c1_Runtime1_x86.cpp b/hotspot/src/cpu/x86/vm/c1_Runtime1_x86.cpp
index 9d132217058..5f2cf3886ca 100644
--- a/hotspot/src/cpu/x86/vm/c1_Runtime1_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/c1_Runtime1_x86.cpp
@@ -1011,7 +1011,7 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {
 
           if (id == fast_new_instance_init_check_id) {
             // make sure the klass is initialized
-            __ cmpl(Address(klass, instanceKlass::init_state_offset_in_bytes() + sizeof(oopDesc)), instanceKlass::fully_initialized);
+            __ cmpb(Address(klass, instanceKlass::init_state_offset()), instanceKlass::fully_initialized);
             __ jcc(Assembler::notEqual, slow_path);
           }
 
@@ -1019,7 +1019,7 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {
           // assert object can be fast path allocated
           {
             Label ok, not_ok;
-            __ movl(obj_size, Address(klass, Klass::layout_helper_offset_in_bytes() + sizeof(oopDesc)));
+            __ movl(obj_size, Address(klass, Klass::layout_helper_offset()));
             __ cmpl(obj_size, 0);  // make sure it's an instance (LH > 0)
             __ jcc(Assembler::lessEqual, not_ok);
             __ testl(obj_size, Klass::_lh_instance_slow_path_bit);
@@ -1040,7 +1040,7 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {
           __ bind(retry_tlab);
 
           // get the instance size (size is postive so movl is fine for 64bit)
-          __ movl(obj_size, Address(klass, klassOopDesc::header_size() * HeapWordSize + Klass::layout_helper_offset_in_bytes()));
+          __ movl(obj_size, Address(klass, Klass::layout_helper_offset()));
 
           __ tlab_allocate(obj, obj_size, 0, t1, t2, slow_path);
 
@@ -1052,7 +1052,7 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {
 
           __ bind(try_eden);
           // get the instance size (size is postive so movl is fine for 64bit)
-          __ movl(obj_size, Address(klass, klassOopDesc::header_size() * HeapWordSize + Klass::layout_helper_offset_in_bytes()));
+          __ movl(obj_size, Address(klass, Klass::layout_helper_offset()));
 
           __ eden_allocate(obj, obj_size, 0, t1, slow_path);
           __ incr_allocated_bytes(thread, obj_size, 0);
@@ -1119,7 +1119,7 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {
         {
           Label ok;
           Register t0 = obj;
-          __ movl(t0, Address(klass, Klass::layout_helper_offset_in_bytes() + sizeof(oopDesc)));
+          __ movl(t0, Address(klass, Klass::layout_helper_offset()));
           __ sarl(t0, Klass::_lh_array_tag_shift);
           int tag = ((id == new_type_array_id)
                      ? Klass::_lh_array_tag_type_value
@@ -1153,7 +1153,7 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {
 
           // get the allocation size: round_up(hdr + length << (layout_helper & 0x1F))
           // since size is positive movl does right thing on 64bit
-          __ movl(t1, Address(klass, klassOopDesc::header_size() * HeapWordSize + Klass::layout_helper_offset_in_bytes()));
+          __ movl(t1, Address(klass, Klass::layout_helper_offset()));
           // since size is postive movl does right thing on 64bit
           __ movl(arr_size, length);
           assert(t1 == rcx, "fixed register usage");
@@ -1167,7 +1167,7 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {
           __ tlab_allocate(obj, arr_size, 0, t1, t2, slow_path);  // preserves arr_size
 
           __ initialize_header(obj, klass, length, t1, t2);
-          __ movb(t1, Address(klass, klassOopDesc::header_size() * HeapWordSize + Klass::layout_helper_offset_in_bytes() + (Klass::_lh_header_size_shift / BitsPerByte)));
+          __ movb(t1, Address(klass, in_bytes(Klass::layout_helper_offset()) + (Klass::_lh_header_size_shift / BitsPerByte)));
           assert(Klass::_lh_header_size_shift % BitsPerByte == 0, "bytewise");
           assert(Klass::_lh_header_size_mask <= 0xFF, "bytewise");
           __ andptr(t1, Klass::_lh_header_size_mask);
@@ -1180,7 +1180,7 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {
           __ bind(try_eden);
           // get the allocation size: round_up(hdr + length << (layout_helper & 0x1F))
           // since size is positive movl does right thing on 64bit
-          __ movl(t1, Address(klass, klassOopDesc::header_size() * HeapWordSize + Klass::layout_helper_offset_in_bytes()));
+          __ movl(t1, Address(klass, Klass::layout_helper_offset()));
           // since size is postive movl does right thing on 64bit
           __ movl(arr_size, length);
           assert(t1 == rcx, "fixed register usage");
@@ -1195,7 +1195,7 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {
           __ incr_allocated_bytes(thread, arr_size, 0);
 
           __ initialize_header(obj, klass, length, t1, t2);
-          __ movb(t1, Address(klass, klassOopDesc::header_size() * HeapWordSize + Klass::layout_helper_offset_in_bytes() + (Klass::_lh_header_size_shift / BitsPerByte)));
+          __ movb(t1, Address(klass, in_bytes(Klass::layout_helper_offset()) + (Klass::_lh_header_size_shift / BitsPerByte)));
           assert(Klass::_lh_header_size_shift % BitsPerByte == 0, "bytewise");
           assert(Klass::_lh_header_size_mask <= 0xFF, "bytewise");
           __ andptr(t1, Klass::_lh_header_size_mask);
@@ -1267,7 +1267,7 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {
         Label register_finalizer;
         Register t = rsi;
         __ load_klass(t, rax);
-        __ movl(t, Address(t, Klass::access_flags_offset_in_bytes() + sizeof(oopDesc)));
+        __ movl(t, Address(t, Klass::access_flags_offset()));
         __ testl(t, JVM_ACC_HAS_FINALIZER);
         __ jcc(Assembler::notZero, register_finalizer);
         __ ret(0);
diff --git a/hotspot/src/cpu/x86/vm/cppInterpreter_x86.cpp b/hotspot/src/cpu/x86/vm/cppInterpreter_x86.cpp
index 226c6cbc6e6..b9a5c22934f 100644
--- a/hotspot/src/cpu/x86/vm/cppInterpreter_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/cppInterpreter_x86.cpp
@@ -511,7 +511,7 @@ void CppInterpreterGenerator::generate_compute_interpreter_state(const Register
     // get synchronization object
 
     Label done;
-    const int mirror_offset = klassOopDesc::klass_part_offset_in_bytes() + Klass::java_mirror_offset_in_bytes();
+    const int mirror_offset = in_bytes(Klass::java_mirror_offset());
     __ movl(rax, access_flags);
     __ testl(rax, JVM_ACC_STATIC);
     __ movptr(rax, Address(locals, 0));                   // get receiver (assume this is frequent case)
@@ -763,7 +763,7 @@ void InterpreterGenerator::lock_method(void) {
 #endif // ASSERT
   // get synchronization object
   { Label done;
-    const int mirror_offset = klassOopDesc::klass_part_offset_in_bytes() + Klass::java_mirror_offset_in_bytes();
+    const int mirror_offset = in_bytes(Klass::java_mirror_offset());
     __ movl(rax, access_flags);
     __ movptr(rdi, STATE(_locals));                                     // prepare to get receiver (assume common case)
     __ testl(rax, JVM_ACC_STATIC);
@@ -1180,7 +1180,7 @@ address InterpreterGenerator::generate_native_entry(bool synchronized) {
 
   // pass mirror handle if static call
   { Label L;
-    const int mirror_offset = klassOopDesc::klass_part_offset_in_bytes() + Klass::java_mirror_offset_in_bytes();
+    const int mirror_offset = in_bytes(Klass::java_mirror_offset());
     __ movl(t, Address(method, methodOopDesc::access_flags_offset()));
     __ testl(t, JVM_ACC_STATIC);
     __ jcc(Assembler::zero, L);
diff --git a/hotspot/src/cpu/x86/vm/methodHandles_x86.cpp b/hotspot/src/cpu/x86/vm/methodHandles_x86.cpp
index 254d087df96..7d987e58829 100644
--- a/hotspot/src/cpu/x86/vm/methodHandles_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/methodHandles_x86.cpp
@@ -1160,7 +1160,7 @@ void MethodHandles::generate_method_handle_stub(MacroAssembler* _masm, MethodHan
   Address rcx_amh_conversion( rcx_recv, java_lang_invoke_AdapterMethodHandle::conversion_offset_in_bytes() );
   Address vmarg;                // __ argument_address(vmargslot)
 
-  const int java_mirror_offset = klassOopDesc::klass_part_offset_in_bytes() + Klass::java_mirror_offset_in_bytes();
+  const int java_mirror_offset = in_bytes(Klass::java_mirror_offset());
 
   if (have_entry(ek)) {
     __ nop();                   // empty stubs make SG sick
diff --git a/hotspot/src/cpu/x86/vm/nativeInst_x86.cpp b/hotspot/src/cpu/x86/vm/nativeInst_x86.cpp
index 7ec07737f61..1cf509992ca 100644
--- a/hotspot/src/cpu/x86/vm/nativeInst_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/nativeInst_x86.cpp
@@ -237,9 +237,21 @@ int NativeMovRegMem::instruction_start() const {
   int off = 0;
   u_char instr_0 = ubyte_at(off);
 
+  // See comment in Assembler::locate_operand() about VEX prefixes.
+  if (instr_0 == instruction_VEX_prefix_2bytes) {
+    assert((UseAVX > 0), "shouldn't have VEX prefix");
+    NOT_LP64(assert((0xC0 & ubyte_at(1)) == 0xC0, "shouldn't have LDS and LES instructions"));
+    return 2;
+  }
+  if (instr_0 == instruction_VEX_prefix_3bytes) {
+    assert((UseAVX > 0), "shouldn't have VEX prefix");
+    NOT_LP64(assert((0xC0 & ubyte_at(1)) == 0xC0, "shouldn't have LDS and LES instructions"));
+    return 3;
+  }
+
   // First check to see if we have a (prefixed or not) xor
-  if ( instr_0 >= instruction_prefix_wide_lo &&      // 0x40
-       instr_0 <= instruction_prefix_wide_hi) { // 0x4f
+  if (instr_0 >= instruction_prefix_wide_lo && // 0x40
+      instr_0 <= instruction_prefix_wide_hi) { // 0x4f
     off++;
     instr_0 = ubyte_at(off);
   }
@@ -256,13 +268,13 @@ int NativeMovRegMem::instruction_start() const {
     instr_0 = ubyte_at(off);
   }
 
-  if ( instr_0 == instruction_code_xmm_ss_prefix ||      // 0xf3
+  if ( instr_0 == instruction_code_xmm_ss_prefix || // 0xf3
        instr_0 == instruction_code_xmm_sd_prefix) { // 0xf2
     off++;
     instr_0 = ubyte_at(off);
   }
 
-  if ( instr_0 >= instruction_prefix_wide_lo &&      // 0x40
+  if ( instr_0 >= instruction_prefix_wide_lo && // 0x40
        instr_0 <= instruction_prefix_wide_hi) { // 0x4f
     off++;
     instr_0 = ubyte_at(off);
diff --git a/hotspot/src/cpu/x86/vm/nativeInst_x86.hpp b/hotspot/src/cpu/x86/vm/nativeInst_x86.hpp
index fc7a1ab0753..470e971fe48 100644
--- a/hotspot/src/cpu/x86/vm/nativeInst_x86.hpp
+++ b/hotspot/src/cpu/x86/vm/nativeInst_x86.hpp
@@ -287,6 +287,9 @@ class NativeMovRegMem: public NativeInstruction {
     instruction_code_xmm_store          = 0x11,
     instruction_code_xmm_lpd            = 0x12,
 
+    instruction_VEX_prefix_2bytes       = Assembler::VEX_2bytes,
+    instruction_VEX_prefix_3bytes       = Assembler::VEX_3bytes,
+
     instruction_size                    = 4,
     instruction_offset                  = 0,
     data_offset                         = 2,
diff --git a/hotspot/src/cpu/x86/vm/register_definitions_x86.cpp b/hotspot/src/cpu/x86/vm/register_definitions_x86.cpp
index f1fd229b96b..7165872c239 100644
--- a/hotspot/src/cpu/x86/vm/register_definitions_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/register_definitions_x86.cpp
@@ -53,6 +53,7 @@ REGISTER_DEFINITION(Register, r14);
 REGISTER_DEFINITION(Register, r15);
 #endif // AMD64
 
+REGISTER_DEFINITION(XMMRegister, xnoreg);
 REGISTER_DEFINITION(XMMRegister, xmm0 );
 REGISTER_DEFINITION(XMMRegister, xmm1 );
 REGISTER_DEFINITION(XMMRegister, xmm2 );
@@ -115,6 +116,7 @@ REGISTER_DEFINITION(Register, r12_heapbase);
 REGISTER_DEFINITION(Register, r15_thread);
 #endif // AMD64
 
+REGISTER_DEFINITION(MMXRegister, mnoreg );
 REGISTER_DEFINITION(MMXRegister, mmx0 );
 REGISTER_DEFINITION(MMXRegister, mmx1 );
 REGISTER_DEFINITION(MMXRegister, mmx2 );
diff --git a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp
index fb85fffd77b..4d4e66f600b 100644
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp
@@ -1374,8 +1374,7 @@ class StubGenerator: public StubCodeGenerator {
     //                                  L_success, L_failure, NULL);
     assert_different_registers(sub_klass, temp);
 
-    int sc_offset = (klassOopDesc::header_size() * HeapWordSize +
-                     Klass::secondary_super_cache_offset_in_bytes());
+    int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
 
     // if the pointers are equal, we are done (e.g., String[] elements)
     __ cmpptr(sub_klass, super_klass_addr);
@@ -1787,8 +1786,7 @@ class StubGenerator: public StubCodeGenerator {
     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
     //
 
-    int lh_offset = klassOopDesc::header_size() * HeapWordSize +
-                    Klass::layout_helper_offset_in_bytes();
+    int lh_offset = in_bytes(Klass::layout_helper_offset());
     Address src_klass_lh_addr(rcx_src_klass, lh_offset);
 
     // Handle objArrays completely differently...
@@ -1914,10 +1912,8 @@ class StubGenerator: public StubCodeGenerator {
     // live at this point:  rcx_src_klass, dst[_pos], src[_pos]
     {
       // Handy offsets:
-      int  ek_offset = (klassOopDesc::header_size() * HeapWordSize +
-                        objArrayKlass::element_klass_offset_in_bytes());
-      int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
-                        Klass::super_check_offset_offset_in_bytes());
+      int  ek_offset = in_bytes(objArrayKlass::element_klass_offset());
+      int sco_offset = in_bytes(Klass::super_check_offset_offset());
 
       Register rsi_dst_klass = rsi;
       Register rdi_temp      = rdi;
@@ -2323,6 +2319,9 @@ class StubGenerator: public StubCodeGenerator {
       generate_throw_exception("WrongMethodTypeException throw_exception",
                                CAST_FROM_FN_PTR(address, SharedRuntime::throw_WrongMethodTypeException),
                                rax, rcx);
+
+    // Build this early so it's available for the interpreter
+    StubRoutines::_throw_StackOverflowError_entry          = generate_throw_exception("StackOverflowError throw_exception",           CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
   }
 
 
@@ -2334,7 +2333,6 @@ class StubGenerator: public StubCodeGenerator {
     StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
     StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
     StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));
-    StubRoutines::_throw_StackOverflowError_entry          = generate_throw_exception("StackOverflowError throw_exception",           CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
 
     //------------------------------------------------------------------------------------------------------------------------
     // entry points that are platform specific
diff --git a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp
index 86104223fd4..a6fcc782094 100644
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp
@@ -2261,8 +2261,7 @@ class StubGenerator: public StubCodeGenerator {
     // The ckoff and ckval must be mutually consistent,
     // even though caller generates both.
     { Label L;
-      int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
-                        Klass::super_check_offset_offset_in_bytes());
+      int sco_offset = in_bytes(Klass::super_check_offset_offset());
       __ cmpl(ckoff, Address(ckval, sco_offset));
       __ jcc(Assembler::equal, L);
       __ stop("super_check_offset inconsistent");
@@ -2572,8 +2571,7 @@ class StubGenerator: public StubCodeGenerator {
     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
     //
 
-    const int lh_offset = klassOopDesc::header_size() * HeapWordSize +
-                          Klass::layout_helper_offset_in_bytes();
+    const int lh_offset = in_bytes(Klass::layout_helper_offset());
 
     // Handle objArrays completely differently...
     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
@@ -2722,15 +2720,13 @@ class StubGenerator: public StubCodeGenerator {
       assert_clean_int(count, sco_temp);
 
       // Generate the type check.
-      const int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
-                              Klass::super_check_offset_offset_in_bytes());
+      const int sco_offset = in_bytes(Klass::super_check_offset_offset());
       __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
       assert_clean_int(sco_temp, rax);
       generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
 
       // Fetch destination element klass from the objArrayKlass header.
-      int ek_offset = (klassOopDesc::header_size() * HeapWordSize +
-                       objArrayKlass::element_klass_offset_in_bytes());
+      int ek_offset = in_bytes(objArrayKlass::element_klass_offset());
       __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
       __ movl(  sco_temp,      Address(r11_dst_klass, sco_offset));
       assert_clean_int(sco_temp, rax);
@@ -3072,6 +3068,13 @@ class StubGenerator: public StubCodeGenerator {
       generate_throw_exception("WrongMethodTypeException throw_exception",
                                CAST_FROM_FN_PTR(address, SharedRuntime::throw_WrongMethodTypeException),
                                rax, rcx);
+
+    // Build this early so it's available for the interpreter.
+    StubRoutines::_throw_StackOverflowError_entry =
+      generate_throw_exception("StackOverflowError throw_exception",
+                               CAST_FROM_FN_PTR(address,
+                                                SharedRuntime::
+                                                throw_StackOverflowError));
   }
 
   void generate_all() {
@@ -3098,12 +3101,6 @@ class StubGenerator: public StubCodeGenerator {
                                                 SharedRuntime::
                                                 throw_NullPointerException_at_call));
 
-    StubRoutines::_throw_StackOverflowError_entry =
-      generate_throw_exception("StackOverflowError throw_exception",
-                               CAST_FROM_FN_PTR(address,
-                                                SharedRuntime::
-                                                throw_StackOverflowError));
-
     // entry points that are platform specific
     StubRoutines::x86::_f2i_fixup = generate_f2i_fixup();
     StubRoutines::x86::_f2l_fixup = generate_f2l_fixup();
diff --git a/hotspot/src/cpu/x86/vm/templateInterpreter_x86_32.cpp b/hotspot/src/cpu/x86/vm/templateInterpreter_x86_32.cpp
index 6f8e35afdf4..29533832ef6 100644
--- a/hotspot/src/cpu/x86/vm/templateInterpreter_x86_32.cpp
+++ b/hotspot/src/cpu/x86/vm/templateInterpreter_x86_32.cpp
@@ -522,9 +522,18 @@ void InterpreterGenerator::generate_stack_overflow_check(void) {
 
   __ pop(rsi);  // get saved bcp / (c++ prev state ).
 
-  __ pop(rax);  // get return address
-  __ jump(ExternalAddress(Interpreter::throw_StackOverflowError_entry()));
+  // Restore sender's sp as SP. This is necessary if the sender's
+  // frame is an extended compiled frame (see gen_c2i_adapter())
+  // and safer anyway in case of JSR292 adaptations.
 
+  __ pop(rax); // return address must be moved if SP is changed
+  __ mov(rsp, rsi);
+  __ push(rax);
+
+  // Note: the restored frame is not necessarily interpreted.
+  // Use the shared runtime version of the StackOverflowError.
+  assert(StubRoutines::throw_StackOverflowError_entry() != NULL, "stub not yet generated");
+  __ jump(ExternalAddress(StubRoutines::throw_StackOverflowError_entry()));
   // all done with frame size check
   __ bind(after_frame_check_pop);
   __ pop(rsi);
@@ -552,7 +561,7 @@ void InterpreterGenerator::lock_method(void) {
   #endif // ASSERT
   // get synchronization object
   { Label done;
-    const int mirror_offset = klassOopDesc::klass_part_offset_in_bytes() + Klass::java_mirror_offset_in_bytes();
+    const int mirror_offset = in_bytes(Klass::java_mirror_offset());
     __ movl(rax, access_flags);
     __ testl(rax, JVM_ACC_STATIC);
     __ movptr(rax, Address(rdi, Interpreter::local_offset_in_bytes(0)));  // get receiver (assume this is frequent case)
@@ -1012,7 +1021,7 @@ address InterpreterGenerator::generate_native_entry(bool synchronized) {
 
   // pass mirror handle if static call
   { Label L;
-    const int mirror_offset = klassOopDesc::klass_part_offset_in_bytes() + Klass::java_mirror_offset_in_bytes();
+    const int mirror_offset = in_bytes(Klass::java_mirror_offset());
     __ movl(t, Address(method, methodOopDesc::access_flags_offset()));
     __ testl(t, JVM_ACC_STATIC);
     __ jcc(Assembler::zero, L);
diff --git a/hotspot/src/cpu/x86/vm/templateInterpreter_x86_64.cpp b/hotspot/src/cpu/x86/vm/templateInterpreter_x86_64.cpp
index 40c7d63e165..110d8ebdf3c 100644
--- a/hotspot/src/cpu/x86/vm/templateInterpreter_x86_64.cpp
+++ b/hotspot/src/cpu/x86/vm/templateInterpreter_x86_64.cpp
@@ -467,8 +467,18 @@ void InterpreterGenerator::generate_stack_overflow_check(void) {
   __ cmpptr(rsp, rax);
   __ jcc(Assembler::above, after_frame_check);
 
-  __ pop(rax); // get return address
-  __ jump(ExternalAddress(Interpreter::throw_StackOverflowError_entry()));
+  // Restore sender's sp as SP. This is necessary if the sender's
+  // frame is an extended compiled frame (see gen_c2i_adapter())
+  // and safer anyway in case of JSR292 adaptations.
+
+  __ pop(rax); // return address must be moved if SP is changed
+  __ mov(rsp, r13);
+  __ push(rax);
+
+  // Note: the restored frame is not necessarily interpreted.
+  // Use the shared runtime version of the StackOverflowError.
+  assert(StubRoutines::throw_StackOverflowError_entry() != NULL, "stub not yet generated");
+  __ jump(ExternalAddress(StubRoutines::throw_StackOverflowError_entry()));
 
   // all done with frame size check
   __ bind(after_frame_check);
@@ -505,8 +515,7 @@ void InterpreterGenerator::lock_method(void) {
 
   // get synchronization object
   {
-    const int mirror_offset = klassOopDesc::klass_part_offset_in_bytes() +
-                              Klass::java_mirror_offset_in_bytes();
+    const int mirror_offset = in_bytes(Klass::java_mirror_offset());
     Label done;
     __ movl(rax, access_flags);
     __ testl(rax, JVM_ACC_STATIC);
@@ -1006,8 +1015,7 @@ address InterpreterGenerator::generate_native_entry(bool synchronized) {
   // pass mirror handle if static call
   {
     Label L;
-    const int mirror_offset = klassOopDesc::klass_part_offset_in_bytes() +
-                              Klass::java_mirror_offset_in_bytes();
+    const int mirror_offset = in_bytes(Klass::java_mirror_offset());
     __ movl(t, Address(method, methodOopDesc::access_flags_offset()));
     __ testl(t, JVM_ACC_STATIC);
     __ jcc(Assembler::zero, L);
diff --git a/hotspot/src/cpu/x86/vm/templateTable_x86_32.cpp b/hotspot/src/cpu/x86/vm/templateTable_x86_32.cpp
index 65e65ef5b51..1cbc67e6060 100644
--- a/hotspot/src/cpu/x86/vm/templateTable_x86_32.cpp
+++ b/hotspot/src/cpu/x86/vm/templateTable_x86_32.cpp
@@ -980,7 +980,7 @@ void TemplateTable::aastore() {
   __ load_klass(rbx, rax);
   // Move superklass into EAX
   __ load_klass(rax, rdx);
-  __ movptr(rax, Address(rax, sizeof(oopDesc) + objArrayKlass::element_klass_offset_in_bytes()));
+  __ movptr(rax, Address(rax, objArrayKlass::element_klass_offset()));
   // Compress array+index*wordSize+12 into a single register.  Frees ECX.
   __ lea(rdx, element_address);
 
@@ -2033,7 +2033,7 @@ void TemplateTable::_return(TosState state) {
     assert(state == vtos, "only valid state");
     __ movptr(rax, aaddress(0));
     __ load_klass(rdi, rax);
-    __ movl(rdi, Address(rdi, Klass::access_flags_offset_in_bytes() + sizeof(oopDesc)));
+    __ movl(rdi, Address(rdi, Klass::access_flags_offset()));
     __ testl(rdi, JVM_ACC_HAS_FINALIZER);
     Label skip_register_finalizer;
     __ jcc(Assembler::zero, skip_register_finalizer);
@@ -3188,11 +3188,11 @@ void TemplateTable::_new() {
 
   // make sure klass is initialized & doesn't have finalizer
   // make sure klass is fully initialized
-  __ cmpl(Address(rcx, instanceKlass::init_state_offset_in_bytes() + sizeof(oopDesc)), instanceKlass::fully_initialized);
+  __ cmpb(Address(rcx, instanceKlass::init_state_offset()), instanceKlass::fully_initialized);
   __ jcc(Assembler::notEqual, slow_case);
 
   // get instance_size in instanceKlass (scaled to a count of bytes)
-  __ movl(rdx, Address(rcx, Klass::layout_helper_offset_in_bytes() + sizeof(oopDesc)));
+  __ movl(rdx, Address(rcx, Klass::layout_helper_offset()));
   // test to see if it has a finalizer or is malformed in some way
   __ testl(rdx, Klass::_lh_instance_slow_path_bit);
   __ jcc(Assembler::notZero, slow_case);
@@ -3293,7 +3293,7 @@ void TemplateTable::_new() {
     __ bind(initialize_header);
     if (UseBiasedLocking) {
       __ pop(rcx);   // get saved klass back in the register.
-      __ movptr(rbx, Address(rcx, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
+      __ movptr(rbx, Address(rcx, Klass::prototype_header_offset()));
       __ movptr(Address(rax, oopDesc::mark_offset_in_bytes ()), rbx);
     } else {
       __ movptr(Address(rax, oopDesc::mark_offset_in_bytes ()),
diff --git a/hotspot/src/cpu/x86/vm/templateTable_x86_64.cpp b/hotspot/src/cpu/x86/vm/templateTable_x86_64.cpp
index 818fb44e0a5..0e5ac274f36 100644
--- a/hotspot/src/cpu/x86/vm/templateTable_x86_64.cpp
+++ b/hotspot/src/cpu/x86/vm/templateTable_x86_64.cpp
@@ -1004,8 +1004,7 @@ void TemplateTable::aastore() {
   // Move superklass into rax
   __ load_klass(rax, rdx);
   __ movptr(rax, Address(rax,
-                         sizeof(oopDesc) +
-                         objArrayKlass::element_klass_offset_in_bytes()));
+                         objArrayKlass::element_klass_offset()));
   // Compress array + index*oopSize + 12 into a single register.  Frees rcx.
   __ lea(rdx, element_address);
 
@@ -2067,7 +2066,7 @@ void TemplateTable::_return(TosState state) {
     assert(state == vtos, "only valid state");
     __ movptr(c_rarg1, aaddress(0));
     __ load_klass(rdi, c_rarg1);
-    __ movl(rdi, Address(rdi, Klass::access_flags_offset_in_bytes() + sizeof(oopDesc)));
+    __ movl(rdi, Address(rdi, Klass::access_flags_offset()));
     __ testl(rdi, JVM_ACC_HAS_FINALIZER);
     Label skip_register_finalizer;
     __ jcc(Assembler::zero, skip_register_finalizer);
@@ -3235,16 +3234,15 @@ void TemplateTable::_new() {
 
   // make sure klass is initialized & doesn't have finalizer
   // make sure klass is fully initialized
-  __ cmpl(Address(rsi,
-                  instanceKlass::init_state_offset_in_bytes() +
-                  sizeof(oopDesc)),
+  __ cmpb(Address(rsi,
+                  instanceKlass::init_state_offset()),
           instanceKlass::fully_initialized);
   __ jcc(Assembler::notEqual, slow_case);
 
   // get instance_size in instanceKlass (scaled to a count of bytes)
   __ movl(rdx,
           Address(rsi,
-                  Klass::layout_helper_offset_in_bytes() + sizeof(oopDesc)));
+                  Klass::layout_helper_offset()));
   // test to see if it has a finalizer or is malformed in some way
   __ testl(rdx, Klass::_lh_instance_slow_path_bit);
   __ jcc(Assembler::notZero, slow_case);
@@ -3337,7 +3335,7 @@ void TemplateTable::_new() {
     // initialize object header only.
     __ bind(initialize_header);
     if (UseBiasedLocking) {
-      __ movptr(rscratch1, Address(rsi, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
+      __ movptr(rscratch1, Address(rsi, Klass::prototype_header_offset()));
       __ movptr(Address(rax, oopDesc::mark_offset_in_bytes()), rscratch1);
     } else {
       __ movptr(Address(rax, oopDesc::mark_offset_in_bytes()),
diff --git a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp
index 7a8ee727b85..2155d767980 100644
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp
@@ -50,7 +50,7 @@ const char*           VM_Version::_features_str = "";
 VM_Version::CpuidInfo VM_Version::_cpuid_info   = { 0, };
 
 static BufferBlob* stub_blob;
-static const int stub_size = 400;
+static const int stub_size = 550;
 
 extern "C" {
   typedef void (*getPsrInfo_stub_t)(void*);
@@ -73,7 +73,7 @@ class VM_Version_StubGenerator: public StubCodeGenerator {
     const uint32_t CPU_FAMILY_486   = (4 << CPU_FAMILY_SHIFT);
 
     Label detect_486, cpu486, detect_586, std_cpuid1, std_cpuid4;
-    Label ext_cpuid1, ext_cpuid5, done;
+    Label sef_cpuid, ext_cpuid, ext_cpuid1, ext_cpuid5, ext_cpuid7, done;
 
     StubCodeMark mark(this, "VM_Version", "getPsrInfo_stub");
 #   define __ _masm->
@@ -229,14 +229,51 @@ class VM_Version_StubGenerator: public StubCodeGenerator {
     __ movl(Address(rsi, 8), rcx);
     __ movl(Address(rsi,12), rdx);
 
+    //
+    // Check if OS has enabled XGETBV instruction to access XCR0
+    // (OSXSAVE feature flag) and CPU supports AVX
+    //
+    __ andl(rcx, 0x18000000);
+    __ cmpl(rcx, 0x18000000);
+    __ jccb(Assembler::notEqual, sef_cpuid);
+
+    //
+    // XCR0, XFEATURE_ENABLED_MASK register
+    //
+    __ xorl(rcx, rcx);   // zero for XCR0 register
+    __ xgetbv();
+    __ lea(rsi, Address(rbp, in_bytes(VM_Version::xem_xcr0_offset())));
+    __ movl(Address(rsi, 0), rax);
+    __ movl(Address(rsi, 4), rdx);
+
+    //
+    // cpuid(0x7) Structured Extended Features
+    //
+    __ bind(sef_cpuid);
+    __ movl(rax, 7);
+    __ cmpl(rax, Address(rbp, in_bytes(VM_Version::std_cpuid0_offset()))); // Is cpuid(0x7) supported?
+    __ jccb(Assembler::greater, ext_cpuid);
+
+    __ xorl(rcx, rcx);
+    __ cpuid();
+    __ lea(rsi, Address(rbp, in_bytes(VM_Version::sef_cpuid7_offset())));
+    __ movl(Address(rsi, 0), rax);
+    __ movl(Address(rsi, 4), rbx);
+
+    //
+    // Extended cpuid(0x80000000)
+    //
+    __ bind(ext_cpuid);
     __ movl(rax, 0x80000000);
     __ cpuid();
     __ cmpl(rax, 0x80000000);     // Is cpuid(0x80000001) supported?
     __ jcc(Assembler::belowEqual, done);
     __ cmpl(rax, 0x80000004);     // Is cpuid(0x80000005) supported?
     __ jccb(Assembler::belowEqual, ext_cpuid1);
-    __ cmpl(rax, 0x80000007);     // Is cpuid(0x80000008) supported?
+    __ cmpl(rax, 0x80000006);     // Is cpuid(0x80000007) supported?
     __ jccb(Assembler::belowEqual, ext_cpuid5);
+    __ cmpl(rax, 0x80000007);     // Is cpuid(0x80000008) supported?
+    __ jccb(Assembler::belowEqual, ext_cpuid7);
     //
     // Extended cpuid(0x80000008)
     //
@@ -248,6 +285,18 @@ class VM_Version_StubGenerator: public StubCodeGenerator {
     __ movl(Address(rsi, 8), rcx);
     __ movl(Address(rsi,12), rdx);
 
+    //
+    // Extended cpuid(0x80000007)
+    //
+    __ bind(ext_cpuid7);
+    __ movl(rax, 0x80000007);
+    __ cpuid();
+    __ lea(rsi, Address(rbp, in_bytes(VM_Version::ext_cpuid7_offset())));
+    __ movl(Address(rsi, 0), rax);
+    __ movl(Address(rsi, 4), rbx);
+    __ movl(Address(rsi, 8), rcx);
+    __ movl(Address(rsi,12), rdx);
+
     //
     // Extended cpuid(0x80000005)
     //
@@ -359,13 +408,19 @@ void VM_Version::get_processor_features() {
   if (UseSSE < 1)
     _cpuFeatures &= ~CPU_SSE;
 
+  if (UseAVX < 2)
+    _cpuFeatures &= ~CPU_AVX2;
+
+  if (UseAVX < 1)
+    _cpuFeatures &= ~CPU_AVX;
+
   if (logical_processors_per_package() == 1) {
     // HT processor could be installed on a system which doesn't support HT.
     _cpuFeatures &= ~CPU_HT;
   }
 
   char buf[256];
-  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
+  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
                cores_per_cpu(), threads_per_core(),
                cpu_family(), _model, _stepping,
                (supports_cmov() ? ", cmov" : ""),
@@ -379,27 +434,39 @@ void VM_Version::get_processor_features() {
                (supports_sse4_1() ? ", sse4.1" : ""),
                (supports_sse4_2() ? ", sse4.2" : ""),
                (supports_popcnt() ? ", popcnt" : ""),
+               (supports_avx()    ? ", avx" : ""),
+               (supports_avx2()   ? ", avx2" : ""),
                (supports_mmx_ext() ? ", mmxext" : ""),
                (supports_3dnow_prefetch() ? ", 3dnowpref" : ""),
                (supports_lzcnt()   ? ", lzcnt": ""),
                (supports_sse4a()   ? ", sse4a": ""),
-               (supports_ht() ? ", ht": ""));
+               (supports_ht() ? ", ht": ""),
+               (supports_tsc() ? ", tsc": ""),
+               (supports_tscinv_bit() ? ", tscinvbit": ""),
+               (supports_tscinv() ? ", tscinv": ""));
   _features_str = strdup(buf);
 
   // UseSSE is set to the smaller of what hardware supports and what
   // the command line requires.  I.e., you cannot set UseSSE to 2 on
   // older Pentiums which do not support it.
-  if( UseSSE > 4 ) UseSSE=4;
-  if( UseSSE < 0 ) UseSSE=0;
-  if( !supports_sse4_1() ) // Drop to 3 if no SSE4 support
+  if (UseSSE > 4) UseSSE=4;
+  if (UseSSE < 0) UseSSE=0;
+  if (!supports_sse4_1()) // Drop to 3 if no SSE4 support
     UseSSE = MIN2((intx)3,UseSSE);
-  if( !supports_sse3() ) // Drop to 2 if no SSE3 support
+  if (!supports_sse3()) // Drop to 2 if no SSE3 support
     UseSSE = MIN2((intx)2,UseSSE);
-  if( !supports_sse2() ) // Drop to 1 if no SSE2 support
+  if (!supports_sse2()) // Drop to 1 if no SSE2 support
     UseSSE = MIN2((intx)1,UseSSE);
-  if( !supports_sse () ) // Drop to 0 if no SSE  support
+  if (!supports_sse ()) // Drop to 0 if no SSE  support
     UseSSE = 0;
 
+  if (UseAVX > 2) UseAVX=2;
+  if (UseAVX < 0) UseAVX=0;
+  if (!supports_avx2()) // Drop to 1 if no AVX2 support
+    UseAVX = MIN2((intx)1,UseAVX);
+  if (!supports_avx ()) // Drop to 0 if no AVX  support
+    UseAVX = 0;
+
   // On new cpus instructions which update whole XMM register should be used
   // to prevent partial register stall due to dependencies on high half.
   //
@@ -534,6 +601,9 @@ void VM_Version::get_processor_features() {
     if (FLAG_IS_DEFAULT(UsePopCountInstruction)) {
       UsePopCountInstruction = true;
     }
+  } else if (UsePopCountInstruction) {
+    warning("POPCNT instruction is not available on this CPU");
+    FLAG_SET_DEFAULT(UsePopCountInstruction, false);
   }
 
 #ifdef COMPILER2
@@ -605,7 +675,11 @@ void VM_Version::get_processor_features() {
   if (PrintMiscellaneous && Verbose) {
     tty->print_cr("Logical CPUs per core: %u",
                   logical_processors_per_package());
-    tty->print_cr("UseSSE=%d",UseSSE);
+    tty->print("UseSSE=%d",UseSSE);
+    if (UseAVX > 0) {
+      tty->print("  UseAVX=%d",UseAVX);
+    }
+    tty->cr();
     tty->print("Allocation");
     if (AllocatePrefetchStyle <= 0 || UseSSE == 0 && !supports_3dnow_prefetch()) {
       tty->print_cr(": no prefetching");
diff --git a/hotspot/src/cpu/x86/vm/vm_version_x86.hpp b/hotspot/src/cpu/x86/vm/vm_version_x86.hpp
index 47d81e01be6..27f3bde57d5 100644
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.hpp
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -78,7 +78,10 @@ public:
                sse4_2   : 1,
                         : 2,
                popcnt   : 1,
-                        : 8;
+                        : 3,
+               osxsave  : 1,
+               avx      : 1,
+                        : 3;
     } bits;
   };
 
@@ -168,6 +171,15 @@ public:
     } bits;
   };
 
+  union ExtCpuid7Edx {
+    uint32_t value;
+    struct {
+      uint32_t               : 8,
+              tsc_invariance : 1,
+                             : 23;
+    } bits;
+  };
+
   union ExtCpuid8Ecx {
     uint32_t value;
     struct {
@@ -176,32 +188,75 @@ public:
     } bits;
   };
 
-protected:
-   static int _cpu;
-   static int _model;
-   static int _stepping;
-   static int _cpuFeatures;     // features returned by the "cpuid" instruction
-                                // 0 if this instruction is not available
-   static const char* _features_str;
+  union SefCpuid7Eax {
+    uint32_t value;
+  };
 
-   enum {
-     CPU_CX8    = (1 << 0), // next bits are from cpuid 1 (EDX)
-     CPU_CMOV   = (1 << 1),
-     CPU_FXSR   = (1 << 2),
-     CPU_HT     = (1 << 3),
-     CPU_MMX    = (1 << 4),
-     CPU_3DNOW_PREFETCH  = (1 << 5), // Processor supports 3dnow prefetch and prefetchw instructions
-                                     // may not necessarily support other 3dnow instructions
-     CPU_SSE    = (1 << 6),
-     CPU_SSE2   = (1 << 7),
-     CPU_SSE3   = (1 << 8), // SSE3 comes from cpuid 1 (ECX)
-     CPU_SSSE3  = (1 << 9),
-     CPU_SSE4A  = (1 << 10),
-     CPU_SSE4_1 = (1 << 11),
-     CPU_SSE4_2 = (1 << 12),
-     CPU_POPCNT = (1 << 13),
-     CPU_LZCNT  = (1 << 14)
-   } cpuFeatureFlags;
+  union SefCpuid7Ebx {
+    uint32_t value;
+    struct {
+      uint32_t fsgsbase : 1,
+                        : 2,
+                   bmi1 : 1,
+                        : 1,
+                   avx2 : 1,
+                        : 2,
+                   bmi2 : 1,
+                        : 23;
+    } bits;
+  };
+
+  union XemXcr0Eax {
+    uint32_t value;
+    struct {
+      uint32_t x87 : 1,
+               sse : 1,
+               ymm : 1,
+                   : 29;
+    } bits;
+  };
+
+protected:
+  static int _cpu;
+  static int _model;
+  static int _stepping;
+  static int _cpuFeatures;     // features returned by the "cpuid" instruction
+                               // 0 if this instruction is not available
+  static const char* _features_str;
+
+  enum {
+    CPU_CX8    = (1 << 0), // next bits are from cpuid 1 (EDX)
+    CPU_CMOV   = (1 << 1),
+    CPU_FXSR   = (1 << 2),
+    CPU_HT     = (1 << 3),
+    CPU_MMX    = (1 << 4),
+    CPU_3DNOW_PREFETCH  = (1 << 5), // Processor supports 3dnow prefetch and prefetchw instructions
+                                    // may not necessarily support other 3dnow instructions
+    CPU_SSE    = (1 << 6),
+    CPU_SSE2   = (1 << 7),
+    CPU_SSE3   = (1 << 8), // SSE3 comes from cpuid 1 (ECX)
+    CPU_SSSE3  = (1 << 9),
+    CPU_SSE4A  = (1 << 10),
+    CPU_SSE4_1 = (1 << 11),
+    CPU_SSE4_2 = (1 << 12),
+    CPU_POPCNT = (1 << 13),
+    CPU_LZCNT  = (1 << 14),
+    CPU_TSC    = (1 << 15),
+    CPU_TSCINV = (1 << 16),
+    CPU_AVX    = (1 << 17),
+    CPU_AVX2   = (1 << 18)
+  } cpuFeatureFlags;
+
+  enum {
+    // AMD
+    CPU_FAMILY_AMD_11H       = 17,
+    // Intel
+    CPU_FAMILY_INTEL_CORE    = 6,
+    CPU_MODEL_NEHALEM_EP     = 26,
+    CPU_MODEL_WESTMERE_EP    = 44,
+//  CPU_MODEL_IVYBRIDGE_EP   = ??, TODO - get real value
+    CPU_MODEL_SANDYBRIDGE_EP = 45
+  } cpuExtendedFamily;
 
   // cpuid information block.  All info derived from executing cpuid with
   // various function numbers is stored here.  Intel and AMD info is
@@ -228,6 +283,12 @@ protected:
     uint32_t     dcp_cpuid4_ecx; // unused currently
     uint32_t     dcp_cpuid4_edx; // unused currently
 
+    // cpuid function 7 (structured extended features)
+    SefCpuid7Eax sef_cpuid7_eax;
+    SefCpuid7Ebx sef_cpuid7_ebx;
+    uint32_t     sef_cpuid7_ecx; // unused currently
+    uint32_t     sef_cpuid7_edx; // unused currently
+
     // cpuid function 0xB (processor topology)
     // ecx = 0
     uint32_t     tpl_cpuidB0_eax;
@@ -270,11 +331,21 @@ protected:
     ExtCpuid5Ex  ext_cpuid5_ecx; // L1 data cache info (AMD)
     ExtCpuid5Ex  ext_cpuid5_edx; // L1 instruction cache info (AMD)
 
+    // cpuid function 0x80000007
+    uint32_t     ext_cpuid7_eax; // reserved
+    uint32_t     ext_cpuid7_ebx; // reserved
+    uint32_t     ext_cpuid7_ecx; // reserved
+    ExtCpuid7Edx ext_cpuid7_edx; // tscinv
+
     // cpuid function 0x80000008
     uint32_t     ext_cpuid8_eax; // unused currently
     uint32_t     ext_cpuid8_ebx; // reserved
     ExtCpuid8Ecx ext_cpuid8_ecx;
     uint32_t     ext_cpuid8_edx; // reserved
+
+    // extended control register XCR0 (the XFEATURE_ENABLED_MASK register)
+    XemXcr0Eax   xem_xcr0_eax;
+    uint32_t     xem_xcr0_edx; // reserved
   };
 
   // The actual cpuid info block
@@ -286,19 +357,23 @@ protected:
     result += _cpuid_info.std_cpuid1_eax.bits.ext_family;
     return result;
   }
+
   static uint32_t extended_cpu_model() {
     uint32_t result = _cpuid_info.std_cpuid1_eax.bits.model;
     result |= _cpuid_info.std_cpuid1_eax.bits.ext_model << 4;
     return result;
   }
+
   static uint32_t cpu_stepping() {
     uint32_t result = _cpuid_info.std_cpuid1_eax.bits.stepping;
     return result;
   }
+
   static uint logical_processor_count() {
     uint result = threads_per_core();
     return result;
   }
+
   static uint32_t feature_flags() {
     uint32_t result = 0;
     if (_cpuid_info.std_cpuid1_edx.bits.cmpxchg8 != 0)
@@ -328,6 +403,18 @@ protected:
       result |= CPU_SSE4_2;
     if (_cpuid_info.std_cpuid1_ecx.bits.popcnt != 0)
       result |= CPU_POPCNT;
+    if (_cpuid_info.std_cpuid1_ecx.bits.avx != 0 &&
+        _cpuid_info.std_cpuid1_ecx.bits.osxsave != 0 &&
+        _cpuid_info.xem_xcr0_eax.bits.sse != 0 &&
+        _cpuid_info.xem_xcr0_eax.bits.ymm != 0) {
+      result |= CPU_AVX;
+      if (_cpuid_info.sef_cpuid7_ebx.bits.avx2 != 0)
+        result |= CPU_AVX2;
+    }
+    if (_cpuid_info.std_cpuid1_edx.bits.tsc != 0)
+      result |= CPU_TSC;
+    if (_cpuid_info.ext_cpuid7_edx.bits.tsc_invariance != 0)
+      result |= CPU_TSCINV;
 
     // AMD features.
     if (is_amd()) {
@@ -350,12 +437,15 @@ public:
   static ByteSize std_cpuid0_offset() { return byte_offset_of(CpuidInfo, std_max_function); }
   static ByteSize std_cpuid1_offset() { return byte_offset_of(CpuidInfo, std_cpuid1_eax); }
   static ByteSize dcp_cpuid4_offset() { return byte_offset_of(CpuidInfo, dcp_cpuid4_eax); }
+  static ByteSize sef_cpuid7_offset() { return byte_offset_of(CpuidInfo, sef_cpuid7_eax); }
   static ByteSize ext_cpuid1_offset() { return byte_offset_of(CpuidInfo, ext_cpuid1_eax); }
   static ByteSize ext_cpuid5_offset() { return byte_offset_of(CpuidInfo, ext_cpuid5_eax); }
+  static ByteSize ext_cpuid7_offset() { return byte_offset_of(CpuidInfo, ext_cpuid7_eax); }
   static ByteSize ext_cpuid8_offset() { return byte_offset_of(CpuidInfo, ext_cpuid8_eax); }
   static ByteSize tpl_cpuidB0_offset() { return byte_offset_of(CpuidInfo, tpl_cpuidB0_eax); }
   static ByteSize tpl_cpuidB1_offset() { return byte_offset_of(CpuidInfo, tpl_cpuidB1_eax); }
   static ByteSize tpl_cpuidB2_offset() { return byte_offset_of(CpuidInfo, tpl_cpuidB2_eax); }
+  static ByteSize xem_xcr0_offset() { return byte_offset_of(CpuidInfo, xem_xcr0_eax); }
 
   // Initialization
   static void initialize();
@@ -382,7 +472,6 @@ public:
   //
   static int  cpu_family()        { return _cpu;}
   static bool is_P6()             { return cpu_family() >= 6; }
-
   static bool is_amd()            { assert_is_initialized(); return _cpuid_info.std_vendor_name_0 == 0x68747541; } // 'htuA'
   static bool is_intel()          { assert_is_initialized(); return _cpuid_info.std_vendor_name_0 == 0x756e6547; } // 'uneG'
 
@@ -447,14 +536,51 @@ public:
   static bool supports_sse4_1()   { return (_cpuFeatures & CPU_SSE4_1) != 0; }
   static bool supports_sse4_2()   { return (_cpuFeatures & CPU_SSE4_2) != 0; }
   static bool supports_popcnt()   { return (_cpuFeatures & CPU_POPCNT) != 0; }
-  //
+  static bool supports_avx()      { return (_cpuFeatures & CPU_AVX) != 0; }
+  static bool supports_avx2()     { return (_cpuFeatures & CPU_AVX2) != 0; }
+  static bool supports_tsc()      { return (_cpuFeatures & CPU_TSC)    != 0; }
+
+  // Intel features
+  static bool is_intel_family_core() { return is_intel() &&
+                                       extended_cpu_family() == CPU_FAMILY_INTEL_CORE; }
+
+  static bool is_intel_tsc_synched_at_init()  {
+    if (is_intel_family_core()) {
+      uint32_t ext_model = extended_cpu_model();
+      if (ext_model == CPU_MODEL_NEHALEM_EP   ||
+          ext_model == CPU_MODEL_WESTMERE_EP  ||
+// TODO   ext_model == CPU_MODEL_IVYBRIDGE_EP ||
+          ext_model == CPU_MODEL_SANDYBRIDGE_EP) {
+        // 2-socket invtsc support. EX versions with 4 sockets are not
+        // guaranteed to synchronize tscs at initialization via a double
+        // handshake. The tscs can be explicitly set in software.  Code
+        // that uses tsc values must be prepared for them to arbitrarily
+        // jump backward or forward.
+        return true;
+      }
+    }
+    return false;
+  }
+
   // AMD features
-  //
   static bool supports_3dnow_prefetch()    { return (_cpuFeatures & CPU_3DNOW_PREFETCH) != 0; }
   static bool supports_mmx_ext()  { return is_amd() && _cpuid_info.ext_cpuid1_edx.bits.mmx_amd != 0; }
   static bool supports_lzcnt()    { return (_cpuFeatures & CPU_LZCNT) != 0; }
   static bool supports_sse4a()    { return (_cpuFeatures & CPU_SSE4A) != 0; }
 
+  static bool is_amd_Barcelona()  { return is_amd() &&
+                                           extended_cpu_family() == CPU_FAMILY_AMD_11H; }
+
+  // Intel and AMD newer cores support fast timestamps well
+  static bool supports_tscinv_bit() {
+    return (_cpuFeatures & CPU_TSCINV) != 0;
+  }
+  static bool supports_tscinv() {
+    return supports_tscinv_bit() &&
+           ( (is_amd() && !is_amd_Barcelona()) ||
+             is_intel_tsc_synched_at_init() );
+  }
+
   // Intel Core and newer cpus have fast IDIV instruction (excluding Atom).
   static bool has_fast_idiv()     { return is_intel() && cpu_family() == 6 &&
                                            supports_sse3() && _model != 0x1C; }
diff --git a/hotspot/src/cpu/x86/vm/x86.ad b/hotspot/src/cpu/x86/vm/x86.ad
new file mode 100644
index 00000000000..5f165a9ff12
--- /dev/null
+++ b/hotspot/src/cpu/x86/vm/x86.ad
@@ -0,0 +1,777 @@
+//
+// Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+//
+// This code is free software; you can redistribute it and/or modify it
+// under the terms of the GNU General Public License version 2 only, as
+// published by the Free Software Foundation.
+//
+// This code is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+// version 2 for more details (a copy is included in the LICENSE file that
+// accompanied this code).
+//
+// You should have received a copy of the GNU General Public License version
+// 2 along with this work; if not, write to the Free Software Foundation,
+// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+//
+// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+// or visit www.oracle.com if you need additional information or have any
+// questions.
+//
+//
+
+// X86 Common Architecture Description File
+
+source %{
+  // Float masks come from different places depending on platform.
+#ifdef _LP64
+  static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
+  static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
+  static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
+  static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
+#else
+  static address float_signmask()  { return (address)float_signmask_pool; }
+  static address float_signflip()  { return (address)float_signflip_pool; }
+  static address double_signmask() { return (address)double_signmask_pool; }
+  static address double_signflip() { return (address)double_signflip_pool; }
+#endif
+%}
+
+// INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
+
+instruct addF_reg(regF dst, regF src) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (AddF dst src));
+
+  format %{ "addss   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ addss($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct addF_mem(regF dst, memory src) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (AddF dst (LoadF src)));
+
+  format %{ "addss   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ addss($dst$$XMMRegister, $src$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct addF_imm(regF dst, immF con) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (AddF dst con));
+  format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ addss($dst$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vaddF_reg(regF dst, regF src1, regF src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (AddF src1 src2));
+
+  format %{ "vaddss  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vaddF_mem(regF dst, regF src1, memory src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (AddF src1 (LoadF src2)));
+
+  format %{ "vaddss  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vaddF_imm(regF dst, regF src, immF con) %{
+  predicate(UseAVX > 0);
+  match(Set dst (AddF src con));
+
+  format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct addD_reg(regD dst, regD src) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (AddD dst src));
+
+  format %{ "addsd   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ addsd($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct addD_mem(regD dst, memory src) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (AddD dst (LoadD src)));
+
+  format %{ "addsd   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ addsd($dst$$XMMRegister, $src$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct addD_imm(regD dst, immD con) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (AddD dst con));
+  format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ addsd($dst$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vaddD_reg(regD dst, regD src1, regD src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (AddD src1 src2));
+
+  format %{ "vaddsd  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vaddD_mem(regD dst, regD src1, memory src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (AddD src1 (LoadD src2)));
+
+  format %{ "vaddsd  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vaddD_imm(regD dst, regD src, immD con) %{
+  predicate(UseAVX > 0);
+  match(Set dst (AddD src con));
+
+  format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct subF_reg(regF dst, regF src) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (SubF dst src));
+
+  format %{ "subss   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ subss($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct subF_mem(regF dst, memory src) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (SubF dst (LoadF src)));
+
+  format %{ "subss   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ subss($dst$$XMMRegister, $src$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct subF_imm(regF dst, immF con) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (SubF dst con));
+  format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ subss($dst$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vsubF_reg(regF dst, regF src1, regF src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (SubF src1 src2));
+
+  format %{ "vsubss  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vsubF_mem(regF dst, regF src1, memory src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (SubF src1 (LoadF src2)));
+
+  format %{ "vsubss  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vsubF_imm(regF dst, regF src, immF con) %{
+  predicate(UseAVX > 0);
+  match(Set dst (SubF src con));
+
+  format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct subD_reg(regD dst, regD src) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (SubD dst src));
+
+  format %{ "subsd   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ subsd($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct subD_mem(regD dst, memory src) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (SubD dst (LoadD src)));
+
+  format %{ "subsd   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ subsd($dst$$XMMRegister, $src$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct subD_imm(regD dst, immD con) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (SubD dst con));
+  format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ subsd($dst$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vsubD_reg(regD dst, regD src1, regD src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (SubD src1 src2));
+
+  format %{ "vsubsd  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vsubD_mem(regD dst, regD src1, memory src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (SubD src1 (LoadD src2)));
+
+  format %{ "vsubsd  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vsubD_imm(regD dst, regD src, immD con) %{
+  predicate(UseAVX > 0);
+  match(Set dst (SubD src con));
+
+  format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct mulF_reg(regF dst, regF src) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (MulF dst src));
+
+  format %{ "mulss   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ mulss($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct mulF_mem(regF dst, memory src) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (MulF dst (LoadF src)));
+
+  format %{ "mulss   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ mulss($dst$$XMMRegister, $src$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct mulF_imm(regF dst, immF con) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (MulF dst con));
+  format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ mulss($dst$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vmulF_reg(regF dst, regF src1, regF src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (MulF src1 src2));
+
+  format %{ "vmulss  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vmulF_mem(regF dst, regF src1, memory src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (MulF src1 (LoadF src2)));
+
+  format %{ "vmulss  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vmulF_imm(regF dst, regF src, immF con) %{
+  predicate(UseAVX > 0);
+  match(Set dst (MulF src con));
+
+  format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct mulD_reg(regD dst, regD src) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (MulD dst src));
+
+  format %{ "mulsd   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct mulD_mem(regD dst, memory src) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (MulD dst (LoadD src)));
+
+  format %{ "mulsd   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ mulsd($dst$$XMMRegister, $src$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct mulD_imm(regD dst, immD con) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (MulD dst con));
+  format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ mulsd($dst$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vmulD_reg(regD dst, regD src1, regD src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (MulD src1 src2));
+
+  format %{ "vmulsd  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vmulD_mem(regD dst, regD src1, memory src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (MulD src1 (LoadD src2)));
+
+  format %{ "vmulsd  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vmulD_imm(regD dst, regD src, immD con) %{
+  predicate(UseAVX > 0);
+  match(Set dst (MulD src con));
+
+  format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct divF_reg(regF dst, regF src) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (DivF dst src));
+
+  format %{ "divss   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ divss($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct divF_mem(regF dst, memory src) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (DivF dst (LoadF src)));
+
+  format %{ "divss   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ divss($dst$$XMMRegister, $src$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct divF_imm(regF dst, immF con) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (DivF dst con));
+  format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ divss($dst$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vdivF_reg(regF dst, regF src1, regF src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (DivF src1 src2));
+
+  format %{ "vdivss  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vdivF_mem(regF dst, regF src1, memory src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (DivF src1 (LoadF src2)));
+
+  format %{ "vdivss  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vdivF_imm(regF dst, regF src, immF con) %{
+  predicate(UseAVX > 0);
+  match(Set dst (DivF src con));
+
+  format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct divD_reg(regD dst, regD src) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (DivD dst src));
+
+  format %{ "divsd   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ divsd($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct divD_mem(regD dst, memory src) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (DivD dst (LoadD src)));
+
+  format %{ "divsd   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ divsd($dst$$XMMRegister, $src$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct divD_imm(regD dst, immD con) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (DivD dst con));
+  format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ divsd($dst$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vdivD_reg(regD dst, regD src1, regD src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (DivD src1 src2));
+
+  format %{ "vdivsd  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vdivD_mem(regD dst, regD src1, memory src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (DivD src1 (LoadD src2)));
+
+  format %{ "vdivsd  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vdivD_imm(regD dst, regD src, immD con) %{
+  predicate(UseAVX > 0);
+  match(Set dst (DivD src con));
+
+  format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct absF_reg(regF dst) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (AbsF dst));
+  ins_cost(150);
+  format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
+  ins_encode %{
+    __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vabsF_reg(regF dst, regF src) %{
+  predicate(UseAVX > 0);
+  match(Set dst (AbsF src));
+  ins_cost(150);
+  format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
+  ins_encode %{
+    __ vandps($dst$$XMMRegister, $src$$XMMRegister,
+              ExternalAddress(float_signmask()));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct absD_reg(regD dst) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (AbsD dst));
+  ins_cost(150);
+  format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
+            "# abs double by sign masking" %}
+  ins_encode %{
+    __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vabsD_reg(regD dst, regD src) %{
+  predicate(UseAVX > 0);
+  match(Set dst (AbsD src));
+  ins_cost(150);
+  format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
+            "# abs double by sign masking" %}
+  ins_encode %{
+    __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
+              ExternalAddress(double_signmask()));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct negF_reg(regF dst) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (NegF dst));
+  ins_cost(150);
+  format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
+  ins_encode %{
+    __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vnegF_reg(regF dst, regF src) %{
+  predicate(UseAVX > 0);
+  match(Set dst (NegF src));
+  ins_cost(150);
+  format %{ "vxorps  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
+  ins_encode %{
+    __ vxorps($dst$$XMMRegister, $src$$XMMRegister,
+              ExternalAddress(float_signflip()));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct negD_reg(regD dst) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (NegD dst));
+  ins_cost(150);
+  format %{ "xorpd   $dst, [0x8000000000000000]\t"
+            "# neg double by sign flipping" %}
+  ins_encode %{
+    __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vnegD_reg(regD dst, regD src) %{
+  predicate(UseAVX > 0);
+  match(Set dst (NegD src));
+  ins_cost(150);
+  format %{ "vxorpd  $dst, $src, [0x8000000000000000]\t"
+            "# neg double by sign flipping" %}
+  ins_encode %{
+    __ vxorpd($dst$$XMMRegister, $src$$XMMRegister,
+              ExternalAddress(double_signflip()));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct sqrtF_reg(regF dst, regF src) %{
+  predicate(UseSSE>=1);
+  match(Set dst (ConvD2F (SqrtD (ConvF2D src))));
+
+  format %{ "sqrtss  $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ sqrtss($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct sqrtF_mem(regF dst, memory src) %{
+  predicate(UseSSE>=1);
+  match(Set dst (ConvD2F (SqrtD (ConvF2D (LoadF src)))));
+
+  format %{ "sqrtss  $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ sqrtss($dst$$XMMRegister, $src$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct sqrtF_imm(regF dst, immF con) %{
+  predicate(UseSSE>=1);
+  match(Set dst (ConvD2F (SqrtD (ConvF2D con))));
+  format %{ "sqrtss  $dst, [$constantaddress]\t# load from constant table: float=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ sqrtss($dst$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct sqrtD_reg(regD dst, regD src) %{
+  predicate(UseSSE>=2);
+  match(Set dst (SqrtD src));
+
+  format %{ "sqrtsd  $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ sqrtsd($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct sqrtD_mem(regD dst, memory src) %{
+  predicate(UseSSE>=2);
+  match(Set dst (SqrtD (LoadD src)));
+
+  format %{ "sqrtsd  $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ sqrtsd($dst$$XMMRegister, $src$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct sqrtD_imm(regD dst, immD con) %{
+  predicate(UseSSE>=2);
+  match(Set dst (SqrtD con));
+  format %{ "sqrtsd  $dst, [$constantaddress]\t# load from constant table: double=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ sqrtsd($dst$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
diff --git a/hotspot/src/cpu/x86/vm/x86_32.ad b/hotspot/src/cpu/x86/vm/x86_32.ad
index 84d6bbac73b..ca85f1596fd 100644
--- a/hotspot/src/cpu/x86/vm/x86_32.ad
+++ b/hotspot/src/cpu/x86/vm/x86_32.ad
@@ -281,7 +281,7 @@ static int pre_call_FPU_size() {
 }
 
 static int preserve_SP_size() {
-  return LP64_ONLY(1 +) 2;  // [rex,] op, rm(reg/reg)
+  return 2;  // op, rm(reg/reg)
 }
 
 // !!!!! Special hack to get all type of calls to specify the byte offset
@@ -495,14 +495,34 @@ void encode_Copy( CodeBuffer &cbuf, int dst_encoding, int src_encoding ) {
   }
 }
 
-void encode_CopyXD( CodeBuffer &cbuf, int dst_encoding, int src_encoding ) {
-  if( dst_encoding == src_encoding ) {
-    // reg-reg copy, use an empty encoding
-  } else {
-    MacroAssembler _masm(&cbuf);
+void emit_cmpfp_fixup(MacroAssembler& _masm) {
+  Label exit;
+  __ jccb(Assembler::noParity, exit);
+  __ pushf();
+  //
+  // comiss/ucomiss instructions set ZF,PF,CF flags and
+  // zero OF,AF,SF for NaN values.
+  // Fixup flags by zeroing ZF,PF so that compare of NaN
+  // values returns 'less than' result (CF is set).
+  // Leave the rest of flags unchanged.
+  //
+  //    7 6 5 4 3 2 1 0
+  //   |S|Z|r|A|r|P|r|C|  (r - reserved bit)
+  //    0 0 1 0 1 0 1 1   (0x2B)
+  //
+  __ andl(Address(rsp, 0), 0xffffff2b);
+  __ popf();
+  __ bind(exit);
+}
 
-    __ movdqa(as_XMMRegister(dst_encoding), as_XMMRegister(src_encoding));
-  }
+void emit_cmpfp3(MacroAssembler& _masm, Register dst) {
+  Label done;
+  __ movl(dst, -1);
+  __ jcc(Assembler::parity, done);
+  __ jcc(Assembler::below, done);
+  __ setb(Assembler::notEqual, dst);
+  __ movzbl(dst, dst);
+  __ bind(done);
 }
 
 
@@ -792,92 +812,88 @@ static int impl_helper( CodeBuffer *cbuf, bool do_size, bool is_load, int offset
 // Helper for XMM registers.  Extra opcode bits, limited syntax.
 static int impl_x_helper( CodeBuffer *cbuf, bool do_size, bool is_load,
                          int offset, int reg_lo, int reg_hi, int size, outputStream* st ) {
-  if( cbuf ) {
-    if( reg_lo+1 == reg_hi ) { // double move?
-      if( is_load && !UseXmmLoadAndClearUpper )
-        emit_opcode(*cbuf, 0x66 ); // use 'movlpd' for load
-      else
-        emit_opcode(*cbuf, 0xF2 ); // use 'movsd' otherwise
+  if (cbuf) {
+    MacroAssembler _masm(cbuf);
+    if (reg_lo+1 == reg_hi) { // double move?
+      if (is_load) {
+        __ movdbl(as_XMMRegister(Matcher::_regEncode[reg_lo]), Address(rsp, offset));
+      } else {
+        __ movdbl(Address(rsp, offset), as_XMMRegister(Matcher::_regEncode[reg_lo]));
+      }
     } else {
-      emit_opcode(*cbuf, 0xF3 );
+      if (is_load) {
+        __ movflt(as_XMMRegister(Matcher::_regEncode[reg_lo]), Address(rsp, offset));
+      } else {
+        __ movflt(Address(rsp, offset), as_XMMRegister(Matcher::_regEncode[reg_lo]));
+      }
     }
-    emit_opcode(*cbuf, 0x0F );
-    if( reg_lo+1 == reg_hi && is_load && !UseXmmLoadAndClearUpper )
-      emit_opcode(*cbuf, 0x12 );   // use 'movlpd' for load
-    else
-      emit_opcode(*cbuf, is_load ? 0x10 : 0x11 );
-    encode_RegMem(*cbuf, Matcher::_regEncode[reg_lo], ESP_enc, 0x4, 0, offset, false);
 #ifndef PRODUCT
-  } else if( !do_size ) {
-    if( size != 0 ) st->print("\n\t");
-    if( reg_lo+1 == reg_hi ) { // double move?
-      if( is_load ) st->print("%s %s,[ESP + #%d]",
-                               UseXmmLoadAndClearUpper ? "MOVSD " : "MOVLPD",
-                               Matcher::regName[reg_lo], offset);
-      else          st->print("MOVSD  [ESP + #%d],%s",
-                               offset, Matcher::regName[reg_lo]);
+  } else if (!do_size) {
+    if (size != 0) st->print("\n\t");
+    if (reg_lo+1 == reg_hi) { // double move?
+      if (is_load) st->print("%s %s,[ESP + #%d]",
+                              UseXmmLoadAndClearUpper ? "MOVSD " : "MOVLPD",
+                              Matcher::regName[reg_lo], offset);
+      else         st->print("MOVSD  [ESP + #%d],%s",
+                              offset, Matcher::regName[reg_lo]);
     } else {
-      if( is_load ) st->print("MOVSS  %s,[ESP + #%d]",
-                               Matcher::regName[reg_lo], offset);
-      else          st->print("MOVSS  [ESP + #%d],%s",
-                               offset, Matcher::regName[reg_lo]);
+      if (is_load) st->print("MOVSS  %s,[ESP + #%d]",
+                              Matcher::regName[reg_lo], offset);
+      else         st->print("MOVSS  [ESP + #%d],%s",
+                              offset, Matcher::regName[reg_lo]);
     }
 #endif
   }
   int offset_size = (offset == 0) ? 0 : ((offset <= 127) ? 1 : 4);
+  // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes.
   return size+5+offset_size;
 }
 
 
 static int impl_movx_helper( CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
                             int src_hi, int dst_hi, int size, outputStream* st ) {
-  if( UseXmmRegToRegMoveAll ) {//Use movaps,movapd to move between xmm registers
-    if( cbuf ) {
-      if( (src_lo+1 == src_hi && dst_lo+1 == dst_hi) ) {
-        emit_opcode(*cbuf, 0x66 );
-      }
-      emit_opcode(*cbuf, 0x0F );
-      emit_opcode(*cbuf, 0x28 );
-      emit_rm    (*cbuf, 0x3, Matcher::_regEncode[dst_lo], Matcher::_regEncode[src_lo] );
+  if (cbuf) {
+    MacroAssembler _masm(cbuf);
+    if (src_lo+1 == src_hi && dst_lo+1 == dst_hi) { // double move?
+      __ movdbl(as_XMMRegister(Matcher::_regEncode[dst_lo]),
+                as_XMMRegister(Matcher::_regEncode[src_lo]));
+    } else {
+      __ movflt(as_XMMRegister(Matcher::_regEncode[dst_lo]),
+                as_XMMRegister(Matcher::_regEncode[src_lo]));
+    }
 #ifndef PRODUCT
-    } else if( !do_size ) {
-      if( size != 0 ) st->print("\n\t");
-      if( src_lo+1 == src_hi && dst_lo+1 == dst_hi ) { // double move?
+  } else if (!do_size) {
+    if (size != 0) st->print("\n\t");
+    if (UseXmmRegToRegMoveAll) {//Use movaps,movapd to move between xmm registers
+      if (src_lo+1 == src_hi && dst_lo+1 == dst_hi) { // double move?
         st->print("MOVAPD %s,%s",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
       } else {
         st->print("MOVAPS %s,%s",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
       }
-#endif
-    }
-    return size + ((src_lo+1 == src_hi && dst_lo+1 == dst_hi) ? 4 : 3);
-  } else {
-    if( cbuf ) {
-      emit_opcode(*cbuf, (src_lo+1 == src_hi && dst_lo+1 == dst_hi) ? 0xF2 : 0xF3 );
-      emit_opcode(*cbuf, 0x0F );
-      emit_opcode(*cbuf, 0x10 );
-      emit_rm    (*cbuf, 0x3, Matcher::_regEncode[dst_lo], Matcher::_regEncode[src_lo] );
-#ifndef PRODUCT
-    } else if( !do_size ) {
-      if( size != 0 ) st->print("\n\t");
+    } else {
       if( src_lo+1 == src_hi && dst_lo+1 == dst_hi ) { // double move?
         st->print("MOVSD  %s,%s",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
       } else {
         st->print("MOVSS  %s,%s",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
       }
-#endif
     }
-    return size+4;
+#endif
   }
+  // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes.
+  // Only MOVAPS SSE prefix uses 1 byte.
+  int sz = 4;
+  if (!(src_lo+1 == src_hi && dst_lo+1 == dst_hi) &&
+      UseXmmRegToRegMoveAll && (UseAVX == 0)) sz = 3;
+  return size + sz;
 }
 
 static int impl_movgpr2x_helper( CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
                             int src_hi, int dst_hi, int size, outputStream* st ) {
   // 32-bit
   if (cbuf) {
-    emit_opcode(*cbuf, 0x66);
-    emit_opcode(*cbuf, 0x0F);
-    emit_opcode(*cbuf, 0x6E);
-    emit_rm(*cbuf, 0x3, Matcher::_regEncode[dst_lo] & 7, Matcher::_regEncode[src_lo] & 7);
+    MacroAssembler _masm(cbuf);
+    __ movdl(as_XMMRegister(Matcher::_regEncode[dst_lo]),
+             as_Register(Matcher::_regEncode[src_lo]));
 #ifndef PRODUCT
   } else if (!do_size) {
     st->print("movdl   %s, %s\t# spill", Matcher::regName[dst_lo], Matcher::regName[src_lo]);
@@ -891,10 +907,9 @@ static int impl_movx2gpr_helper( CodeBuffer *cbuf, bool do_size, int src_lo, int
                                  int src_hi, int dst_hi, int size, outputStream* st ) {
   // 32-bit
   if (cbuf) {
-    emit_opcode(*cbuf, 0x66);
-    emit_opcode(*cbuf, 0x0F);
-    emit_opcode(*cbuf, 0x7E);
-    emit_rm(*cbuf, 0x3, Matcher::_regEncode[src_lo] & 7, Matcher::_regEncode[dst_lo] & 7);
+    MacroAssembler _masm(cbuf);
+    __ movdl(as_Register(Matcher::_regEncode[dst_lo]),
+             as_XMMRegister(Matcher::_regEncode[src_lo]));
 #ifndef PRODUCT
   } else if (!do_size) {
     st->print("movdl   %s, %s\t# spill", Matcher::regName[dst_lo], Matcher::regName[src_lo]);
@@ -1760,7 +1775,7 @@ encode %{
     emit_cc(cbuf, $secondary, $cop$$cmpcode);
   %}
 
-  enc_class enc_cmov_d(cmpOp cop, regD src ) %{ // CMOV
+  enc_class enc_cmov_dpr(cmpOp cop, regDPR src ) %{ // CMOV
     int op = 0xDA00 + $cop$$cmpcode + ($src$$reg-1);
     emit_d8(cbuf, op >> 8 );
     emit_d8(cbuf, op & 255);
@@ -1931,11 +1946,6 @@ encode %{
 
   %}
 
-  enc_class Xor_Reg (eRegI dst) %{
-    emit_opcode(cbuf, 0x33);
-    emit_rm(cbuf, 0x3, $dst$$reg, $dst$$reg);
-  %}
-
 //   Following encoding is no longer used, but may be restored if calling
 //   convention changes significantly.
 //   Became: Xor_Reg(EBP), Java_To_Runtime( labl )
@@ -2013,64 +2023,6 @@ encode %{
   %}
 
 
-  enc_class MovI2X_reg(regX dst, eRegI src) %{
-    emit_opcode(cbuf, 0x66 );     // MOVD dst,src
-    emit_opcode(cbuf, 0x0F );
-    emit_opcode(cbuf, 0x6E );
-    emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
-  %}
-
-  enc_class MovX2I_reg(eRegI dst, regX src) %{
-    emit_opcode(cbuf, 0x66 );     // MOVD dst,src
-    emit_opcode(cbuf, 0x0F );
-    emit_opcode(cbuf, 0x7E );
-    emit_rm(cbuf, 0x3, $src$$reg, $dst$$reg);
-  %}
-
-  enc_class MovL2XD_reg(regXD dst, eRegL src, regXD tmp) %{
-    { // MOVD $dst,$src.lo
-      emit_opcode(cbuf,0x66);
-      emit_opcode(cbuf,0x0F);
-      emit_opcode(cbuf,0x6E);
-      emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
-    }
-    { // MOVD $tmp,$src.hi
-      emit_opcode(cbuf,0x66);
-      emit_opcode(cbuf,0x0F);
-      emit_opcode(cbuf,0x6E);
-      emit_rm(cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($src$$reg));
-    }
-    { // PUNPCKLDQ $dst,$tmp
-      emit_opcode(cbuf,0x66);
-      emit_opcode(cbuf,0x0F);
-      emit_opcode(cbuf,0x62);
-      emit_rm(cbuf, 0x3, $dst$$reg, $tmp$$reg);
-     }
-  %}
-
-  enc_class MovXD2L_reg(eRegL dst, regXD src, regXD tmp) %{
-    { // MOVD $dst.lo,$src
-      emit_opcode(cbuf,0x66);
-      emit_opcode(cbuf,0x0F);
-      emit_opcode(cbuf,0x7E);
-      emit_rm(cbuf, 0x3, $src$$reg, $dst$$reg);
-    }
-    { // PSHUFLW $tmp,$src,0x4E  (01001110b)
-      emit_opcode(cbuf,0xF2);
-      emit_opcode(cbuf,0x0F);
-      emit_opcode(cbuf,0x70);
-      emit_rm(cbuf, 0x3, $tmp$$reg, $src$$reg);
-      emit_d8(cbuf, 0x4E);
-    }
-    { // MOVD $dst.hi,$tmp
-      emit_opcode(cbuf,0x66);
-      emit_opcode(cbuf,0x0F);
-      emit_opcode(cbuf,0x7E);
-      emit_rm(cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($dst$$reg));
-    }
-  %}
-
-
   // Encode a reg-reg copy.  If it is useless, then empty encoding.
   enc_class enc_Copy( eRegI dst, eRegI src ) %{
     encode_Copy( cbuf, $dst$$reg, $src$$reg );
@@ -2080,11 +2032,6 @@ encode %{
     encode_Copy( cbuf, $dst$$reg, $src$$reg );
   %}
 
-  // Encode xmm reg-reg copy.  If it is useless, then empty encoding.
-  enc_class enc_CopyXD( RegXD dst, RegXD src ) %{
-    encode_CopyXD( cbuf, $dst$$reg, $src$$reg );
-  %}
-
   enc_class RegReg (eRegI dst, eRegI src) %{    // RegReg(Many)
     emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
   %}
@@ -2116,14 +2063,14 @@ encode %{
     $$$emit32$src$$constant;
   %}
 
-  enc_class Con32F_as_bits(immF src) %{        // storeF_imm
+  enc_class Con32FPR_as_bits(immFPR src) %{        // storeF_imm
     // Output Float immediate bits
     jfloat jf = $src$$constant;
     int    jf_as_bits = jint_cast( jf );
     emit_d32(cbuf, jf_as_bits);
   %}
 
-  enc_class Con32XF_as_bits(immXF src) %{      // storeX_imm
+  enc_class Con32F_as_bits(immF src) %{      // storeX_imm
     // Output Float immediate bits
     jfloat jf = $src$$constant;
     int    jf_as_bits = jint_cast( jf );
@@ -2336,7 +2283,7 @@ encode %{
     emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
   %}
 
-  enc_class enc_FP_store(memory mem, regD src) %{
+  enc_class enc_FPR_store(memory mem, regDPR src) %{
     // If src is FPR1, we can just FST to store it.
     // Else we need to FLD it to FPR1, then FSTP to store/pop it.
     int reg_encoding = 0x2; // Just store
@@ -2485,7 +2432,7 @@ encode %{
 
   // ----------------- Encodings for floating point unit -----------------
   // May leave result in FPU-TOS or FPU reg depending on opcodes
-  enc_class OpcReg_F (regF src) %{    // FMUL, FDIV
+  enc_class OpcReg_FPR(regFPR src) %{    // FMUL, FDIV
     $$$emit8$primary;
     emit_rm(cbuf, 0x3, $secondary, $src$$reg );
   %}
@@ -2497,17 +2444,17 @@ encode %{
   %}
 
   // !!!!! equivalent to Pop_Reg_F
-  enc_class Pop_Reg_D( regD dst ) %{
+  enc_class Pop_Reg_DPR( regDPR dst ) %{
     emit_opcode( cbuf, 0xDD );           // FSTP   ST(i)
     emit_d8( cbuf, 0xD8+$dst$$reg );
   %}
 
-  enc_class Push_Reg_D( regD dst ) %{
+  enc_class Push_Reg_DPR( regDPR dst ) %{
     emit_opcode( cbuf, 0xD9 );
     emit_d8( cbuf, 0xC0-1+$dst$$reg );   // FLD ST(i-1)
   %}
 
-  enc_class strictfp_bias1( regD dst ) %{
+  enc_class strictfp_bias1( regDPR dst ) %{
     emit_opcode( cbuf, 0xDB );           // FLD m80real
     emit_opcode( cbuf, 0x2D );
     emit_d32( cbuf, (int)StubRoutines::addr_fpu_subnormal_bias1() );
@@ -2515,7 +2462,7 @@ encode %{
     emit_opcode( cbuf, 0xC8+$dst$$reg );
   %}
 
-  enc_class strictfp_bias2( regD dst ) %{
+  enc_class strictfp_bias2( regDPR dst ) %{
     emit_opcode( cbuf, 0xDB );           // FLD m80real
     emit_opcode( cbuf, 0x2D );
     emit_d32( cbuf, (int)StubRoutines::addr_fpu_subnormal_bias2() );
@@ -2541,39 +2488,29 @@ encode %{
     store_to_stackslot( cbuf, $primary, $secondary, $src$$disp );
   %}
 
-  // Push the float in stackSlot 'src' onto FP-stack
-  enc_class Push_Mem_F( memory src ) %{    // FLD_S   [ESP+src]
-    store_to_stackslot( cbuf, 0xD9, 0x00, $src$$disp );
-  %}
-
-  // Push the double in stackSlot 'src' onto FP-stack
-  enc_class Push_Mem_D( memory src ) %{    // FLD_D   [ESP+src]
-    store_to_stackslot( cbuf, 0xDD, 0x00, $src$$disp );
-  %}
-
   // Push FPU's TOS float to a stack-slot, and pop FPU-stack
-  enc_class Pop_Mem_F( stackSlotF dst ) %{ // FSTP_S [ESP+dst]
+  enc_class Pop_Mem_FPR( stackSlotF dst ) %{ // FSTP_S [ESP+dst]
     store_to_stackslot( cbuf, 0xD9, 0x03, $dst$$disp );
   %}
 
   // Same as Pop_Mem_F except for opcode
   // Push FPU's TOS double to a stack-slot, and pop FPU-stack
-  enc_class Pop_Mem_D( stackSlotD dst ) %{ // FSTP_D [ESP+dst]
+  enc_class Pop_Mem_DPR( stackSlotD dst ) %{ // FSTP_D [ESP+dst]
     store_to_stackslot( cbuf, 0xDD, 0x03, $dst$$disp );
   %}
 
-  enc_class Pop_Reg_F( regF dst ) %{
+  enc_class Pop_Reg_FPR( regFPR dst ) %{
     emit_opcode( cbuf, 0xDD );           // FSTP   ST(i)
     emit_d8( cbuf, 0xD8+$dst$$reg );
   %}
 
-  enc_class Push_Reg_F( regF dst ) %{
+  enc_class Push_Reg_FPR( regFPR dst ) %{
     emit_opcode( cbuf, 0xD9 );           // FLD    ST(i-1)
     emit_d8( cbuf, 0xC0-1+$dst$$reg );
   %}
 
   // Push FPU's float to a stack-slot, and pop FPU-stack
-  enc_class Pop_Mem_Reg_F( stackSlotF dst, regF src ) %{
+  enc_class Pop_Mem_Reg_FPR( stackSlotF dst, regFPR src ) %{
     int pop = 0x02;
     if ($src$$reg != FPR1L_enc) {
       emit_opcode( cbuf, 0xD9 );         // FLD    ST(i-1)
@@ -2584,7 +2521,7 @@ encode %{
   %}
 
   // Push FPU's double to a stack-slot, and pop FPU-stack
-  enc_class Pop_Mem_Reg_D( stackSlotD dst, regD src ) %{
+  enc_class Pop_Mem_Reg_DPR( stackSlotD dst, regDPR src ) %{
     int pop = 0x02;
     if ($src$$reg != FPR1L_enc) {
       emit_opcode( cbuf, 0xD9 );         // FLD    ST(i-1)
@@ -2595,7 +2532,7 @@ encode %{
   %}
 
   // Push FPU's double to a FPU-stack-slot, and pop FPU-stack
-  enc_class Pop_Reg_Reg_D( regD dst, regF src ) %{
+  enc_class Pop_Reg_Reg_DPR( regDPR dst, regFPR src ) %{
     int pop = 0xD0 - 1; // -1 since we skip FLD
     if ($src$$reg != FPR1L_enc) {
       emit_opcode( cbuf, 0xD9 );         // FLD    ST(src-1)
@@ -2607,16 +2544,7 @@ encode %{
   %}
 
 
-  enc_class Mul_Add_F( regF dst, regF src, regF src1, regF src2 ) %{
-    MacroAssembler masm(&cbuf);
-    masm.fld_s(  $src1$$reg-1);   // nothing at TOS, load TOS from src1.reg
-    masm.fmul(   $src2$$reg+0);   // value at TOS
-    masm.fadd(   $src$$reg+0);    // value at TOS
-    masm.fstp_d( $dst$$reg+0);    // value at TOS, popped off after store
-  %}
-
-
-  enc_class Push_Reg_Mod_D( regD dst, regD src) %{
+  enc_class Push_Reg_Mod_DPR( regDPR dst, regDPR src) %{
     // load dst in FPR0
     emit_opcode( cbuf, 0xD9 );
     emit_d8( cbuf, 0xC0-1+$dst$$reg );
@@ -2634,116 +2562,59 @@ encode %{
     }
   %}
 
-  enc_class Push_ModD_encoding( regXD src0, regXD src1) %{
-    // Allocate a word
-    emit_opcode(cbuf,0x83);            // SUB ESP,8
-    emit_opcode(cbuf,0xEC);
-    emit_d8(cbuf,0x08);
-
-    emit_opcode  (cbuf, 0xF2 );     // MOVSD [ESP], src1
-    emit_opcode  (cbuf, 0x0F );
-    emit_opcode  (cbuf, 0x11 );
-    encode_RegMem(cbuf, $src1$$reg, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0xDD );      // FLD_D [ESP]
-    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode  (cbuf, 0xF2 );     // MOVSD [ESP], src0
-    emit_opcode  (cbuf, 0x0F );
-    emit_opcode  (cbuf, 0x11 );
-    encode_RegMem(cbuf, $src0$$reg, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0xDD );      // FLD_D [ESP]
-    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
-
+  enc_class Push_ModD_encoding(regD src0, regD src1) %{
+    MacroAssembler _masm(&cbuf);
+    __ subptr(rsp, 8);
+    __ movdbl(Address(rsp, 0), $src1$$XMMRegister);
+    __ fld_d(Address(rsp, 0));
+    __ movdbl(Address(rsp, 0), $src0$$XMMRegister);
+    __ fld_d(Address(rsp, 0));
   %}
 
-  enc_class Push_ModX_encoding( regX src0, regX src1) %{
-    // Allocate a word
-    emit_opcode(cbuf,0x83);            // SUB ESP,4
-    emit_opcode(cbuf,0xEC);
-    emit_d8(cbuf,0x04);
-
-    emit_opcode  (cbuf, 0xF3 );     // MOVSS [ESP], src1
-    emit_opcode  (cbuf, 0x0F );
-    emit_opcode  (cbuf, 0x11 );
-    encode_RegMem(cbuf, $src1$$reg, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0xD9 );      // FLD [ESP]
-    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode  (cbuf, 0xF3 );     // MOVSS [ESP], src0
-    emit_opcode  (cbuf, 0x0F );
-    emit_opcode  (cbuf, 0x11 );
-    encode_RegMem(cbuf, $src0$$reg, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0xD9 );      // FLD [ESP]
-    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
-
+  enc_class Push_ModF_encoding(regF src0, regF src1) %{
+    MacroAssembler _masm(&cbuf);
+    __ subptr(rsp, 4);
+    __ movflt(Address(rsp, 0), $src1$$XMMRegister);
+    __ fld_s(Address(rsp, 0));
+    __ movflt(Address(rsp, 0), $src0$$XMMRegister);
+    __ fld_s(Address(rsp, 0));
   %}
 
-  enc_class Push_ResultXD(regXD dst) %{
-    store_to_stackslot( cbuf, 0xDD, 0x03, 0 ); //FSTP [ESP]
-
-    // UseXmmLoadAndClearUpper ? movsd dst,[esp] : movlpd dst,[esp]
-    emit_opcode  (cbuf, UseXmmLoadAndClearUpper ? 0xF2 : 0x66);
-    emit_opcode  (cbuf, 0x0F );
-    emit_opcode  (cbuf, UseXmmLoadAndClearUpper ? 0x10 : 0x12);
-    encode_RegMem(cbuf, $dst$$reg, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0x83);    // ADD ESP,8
-    emit_opcode(cbuf,0xC4);
-    emit_d8(cbuf,0x08);
+  enc_class Push_ResultD(regD dst) %{
+    MacroAssembler _masm(&cbuf);
+    __ fstp_d(Address(rsp, 0));
+    __ movdbl($dst$$XMMRegister, Address(rsp, 0));
+    __ addptr(rsp, 8);
   %}
 
-  enc_class Push_ResultX(regX dst, immI d8) %{
-    store_to_stackslot( cbuf, 0xD9, 0x03, 0 ); //FSTP_S [ESP]
-
-    emit_opcode  (cbuf, 0xF3 );     // MOVSS dst(xmm), [ESP]
-    emit_opcode  (cbuf, 0x0F );
-    emit_opcode  (cbuf, 0x10 );
-    encode_RegMem(cbuf, $dst$$reg, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0x83);    // ADD ESP,d8 (4 or 8)
-    emit_opcode(cbuf,0xC4);
-    emit_d8(cbuf,$d8$$constant);
+  enc_class Push_ResultF(regF dst, immI d8) %{
+    MacroAssembler _masm(&cbuf);
+    __ fstp_s(Address(rsp, 0));
+    __ movflt($dst$$XMMRegister, Address(rsp, 0));
+    __ addptr(rsp, $d8$$constant);
   %}
 
-  enc_class Push_SrcXD(regXD src) %{
-    // Allocate a word
-    emit_opcode(cbuf,0x83);            // SUB ESP,8
-    emit_opcode(cbuf,0xEC);
-    emit_d8(cbuf,0x08);
-
-    emit_opcode  (cbuf, 0xF2 );     // MOVSD [ESP], src
-    emit_opcode  (cbuf, 0x0F );
-    emit_opcode  (cbuf, 0x11 );
-    encode_RegMem(cbuf, $src$$reg, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0xDD );      // FLD_D [ESP]
-    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
+  enc_class Push_SrcD(regD src) %{
+    MacroAssembler _masm(&cbuf);
+    __ subptr(rsp, 8);
+    __ movdbl(Address(rsp, 0), $src$$XMMRegister);
+    __ fld_d(Address(rsp, 0));
   %}
 
   enc_class push_stack_temp_qword() %{
-    emit_opcode(cbuf,0x83);     // SUB ESP,8
-    emit_opcode(cbuf,0xEC);
-    emit_d8    (cbuf,0x08);
+    MacroAssembler _masm(&cbuf);
+    __ subptr(rsp, 8);
   %}
 
   enc_class pop_stack_temp_qword() %{
-    emit_opcode(cbuf,0x83);     // ADD ESP,8
-    emit_opcode(cbuf,0xC4);
-    emit_d8    (cbuf,0x08);
+    MacroAssembler _masm(&cbuf);
+    __ addptr(rsp, 8);
   %}
 
-  enc_class push_xmm_to_fpr1( regXD xmm_src ) %{
-    emit_opcode  (cbuf, 0xF2 );     // MOVSD [ESP], xmm_src
-    emit_opcode  (cbuf, 0x0F );
-    emit_opcode  (cbuf, 0x11 );
-    encode_RegMem(cbuf, $xmm_src$$reg, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0xDD );      // FLD_D [ESP]
-    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
+  enc_class push_xmm_to_fpr1(regD src) %{
+    MacroAssembler _masm(&cbuf);
+    __ movdbl(Address(rsp, 0), $src$$XMMRegister);
+    __ fld_d(Address(rsp, 0));
   %}
 
   // Compute X^Y using Intel's fast hardware instructions, if possible.
@@ -2785,10 +2656,7 @@ encode %{
     encode_RegMem(cbuf, 0x1, ESP_enc, 0x4, 0, 0, false);
   %}
 
-//   enc_class Pop_Reg_Mod_D( regD dst, regD src)
-//   was replaced by Push_Result_Mod_D followed by Pop_Reg_X() or Pop_Mem_X()
-
-  enc_class Push_Result_Mod_D( regD src) %{
+  enc_class Push_Result_Mod_DPR( regDPR src) %{
     if ($src$$reg != FPR1L_enc) {
       // fincstp
       emit_opcode (cbuf, 0xD9);
@@ -2817,7 +2685,7 @@ encode %{
     emit_opcode( cbuf, 0x05 );
   %}
 
-  enc_class emitModD() %{
+  enc_class emitModDPR() %{
     // fprem must be iterative
     // :: loop
     // fprem
@@ -2922,24 +2790,6 @@ encode %{
   %}
 
 
-  // XMM version of CmpF_Result. Because the XMM compare
-  // instructions set the EFLAGS directly. It becomes simpler than
-  // the float version above.
-  enc_class CmpX_Result(eRegI dst) %{
-    MacroAssembler _masm(&cbuf);
-    Label nan, inc, done;
-
-    __ jccb(Assembler::parity, nan);
-    __ jccb(Assembler::equal,  done);
-    __ jccb(Assembler::above,  inc);
-    __ bind(nan);
-    __ decrement(as_Register($dst$$reg)); // NO L qqq
-    __ jmpb(done);
-    __ bind(inc);
-    __ increment(as_Register($dst$$reg)); // NO L qqq
-    __ bind(done);
-  %}
-
   // Compare the longs and set flags
   // BROKEN!  Do Not use as-is
   enc_class cmpl_test( eRegL src1, eRegL src2 ) %{
@@ -3162,48 +3012,6 @@ encode %{
     emit_d8    (cbuf,0 );
   %}
 
-  enc_class movq_ld(regXD dst, memory mem) %{
-    MacroAssembler _masm(&cbuf);
-    __ movq($dst$$XMMRegister, $mem$$Address);
-  %}
-
-  enc_class movq_st(memory mem, regXD src) %{
-    MacroAssembler _masm(&cbuf);
-    __ movq($mem$$Address, $src$$XMMRegister);
-  %}
-
-  enc_class pshufd_8x8(regX dst, regX src) %{
-    MacroAssembler _masm(&cbuf);
-
-    encode_CopyXD(cbuf, $dst$$reg, $src$$reg);
-    __ punpcklbw(as_XMMRegister($dst$$reg), as_XMMRegister($dst$$reg));
-    __ pshuflw(as_XMMRegister($dst$$reg), as_XMMRegister($dst$$reg), 0x00);
-  %}
-
-  enc_class pshufd_4x16(regX dst, regX src) %{
-    MacroAssembler _masm(&cbuf);
-
-    __ pshuflw(as_XMMRegister($dst$$reg), as_XMMRegister($src$$reg), 0x00);
-  %}
-
-  enc_class pshufd(regXD dst, regXD src, int mode) %{
-    MacroAssembler _masm(&cbuf);
-
-    __ pshufd(as_XMMRegister($dst$$reg), as_XMMRegister($src$$reg), $mode);
-  %}
-
-  enc_class pxor(regXD dst, regXD src) %{
-    MacroAssembler _masm(&cbuf);
-
-    __ pxor(as_XMMRegister($dst$$reg), as_XMMRegister($src$$reg));
-  %}
-
-  enc_class mov_i2x(regXD dst, eRegI src) %{
-    MacroAssembler _masm(&cbuf);
-
-    __ movdl(as_XMMRegister($dst$$reg), as_Register($src$$reg));
-  %}
-
 
   // Because the transitions from emitted code to the runtime
   // monitorenter/exit helper stubs are so slow it's critical that
@@ -3757,7 +3565,7 @@ encode %{
   // 'zero', store the darned double down as an int, and reset the
   // rounding mode to 'nearest'.  The hardware throws an exception which
   // patches up the correct value directly to the stack.
-  enc_class D2I_encoding( regD src ) %{
+  enc_class DPR2I_encoding( regDPR src ) %{
     // Flip to round-to-zero mode.  We attempted to allow invalid-op
     // exceptions here, so that a NAN or other corner-case value will
     // thrown an exception (but normal values get converted at full speed).
@@ -3800,7 +3608,7 @@ encode %{
     // Carry on here...
   %}
 
-  enc_class D2L_encoding( regD src ) %{
+  enc_class DPR2L_encoding( regDPR src ) %{
     emit_opcode(cbuf,0xD9);            // FLDCW  trunc
     emit_opcode(cbuf,0x2D);
     emit_d32(cbuf,(int)StubRoutines::addr_fpu_cntrl_wrd_trunc());
@@ -3842,294 +3650,27 @@ encode %{
     // Carry on here...
   %}
 
-  enc_class X2L_encoding( regX src ) %{
-    // Allocate a word
-    emit_opcode(cbuf,0x83);      // SUB ESP,8
-    emit_opcode(cbuf,0xEC);
-    emit_d8(cbuf,0x08);
-
-    emit_opcode  (cbuf, 0xF3 );  // MOVSS [ESP], src
-    emit_opcode  (cbuf, 0x0F );
-    emit_opcode  (cbuf, 0x11 );
-    encode_RegMem(cbuf, $src$$reg, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0xD9 );     // FLD_S [ESP]
-    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0xD9);      // FLDCW  trunc
-    emit_opcode(cbuf,0x2D);
-    emit_d32(cbuf,(int)StubRoutines::addr_fpu_cntrl_wrd_trunc());
-
-    // Encoding assumes a double has been pushed into FPR0.
-    // Store down the double as a long, popping the FPU stack
-    emit_opcode(cbuf,0xDF);      // FISTP [ESP]
-    emit_opcode(cbuf,0x3C);
-    emit_d8(cbuf,0x24);
-
-    // Restore the rounding mode; mask the exception
-    emit_opcode(cbuf,0xD9);      // FLDCW   std/24-bit mode
-    emit_opcode(cbuf,0x2D);
-    emit_d32( cbuf, Compile::current()->in_24_bit_fp_mode()
-      ? (int)StubRoutines::addr_fpu_cntrl_wrd_24()
-      : (int)StubRoutines::addr_fpu_cntrl_wrd_std());
-
-    // Load the converted int; adjust CPU stack
-    emit_opcode(cbuf,0x58);      // POP EAX
-
-    emit_opcode(cbuf,0x5A);      // POP EDX
-
-    emit_opcode(cbuf,0x81);      // CMP EDX,imm
-    emit_d8    (cbuf,0xFA);      // rdx
-    emit_d32   (cbuf,0x80000000);//         0x80000000
-
-    emit_opcode(cbuf,0x75);      // JNE around_slow_call
-    emit_d8    (cbuf,0x13+4);    // Size of slow_call
-
-    emit_opcode(cbuf,0x85);      // TEST EAX,EAX
-    emit_opcode(cbuf,0xC0);      // 2/rax,/rax,
-
-    emit_opcode(cbuf,0x75);      // JNE around_slow_call
-    emit_d8    (cbuf,0x13);      // Size of slow_call
-
-    // Allocate a word
-    emit_opcode(cbuf,0x83);      // SUB ESP,4
-    emit_opcode(cbuf,0xEC);
-    emit_d8(cbuf,0x04);
-
-    emit_opcode  (cbuf, 0xF3 );  // MOVSS [ESP], src
-    emit_opcode  (cbuf, 0x0F );
-    emit_opcode  (cbuf, 0x11 );
-    encode_RegMem(cbuf, $src$$reg, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0xD9 );     // FLD_S [ESP]
-    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0x83);      // ADD ESP,4
-    emit_opcode(cbuf,0xC4);
-    emit_d8(cbuf,0x04);
-
-    // CALL directly to the runtime
-    cbuf.set_insts_mark();
-    emit_opcode(cbuf,0xE8);       // Call into runtime
-    emit_d32_reloc(cbuf, (StubRoutines::d2l_wrapper() - cbuf.insts_end()) - 4, runtime_call_Relocation::spec(), RELOC_IMM32 );
-    // Carry on here...
-  %}
-
-  enc_class XD2L_encoding( regXD src ) %{
-    // Allocate a word
-    emit_opcode(cbuf,0x83);      // SUB ESP,8
-    emit_opcode(cbuf,0xEC);
-    emit_d8(cbuf,0x08);
-
-    emit_opcode  (cbuf, 0xF2 );  // MOVSD [ESP], src
-    emit_opcode  (cbuf, 0x0F );
-    emit_opcode  (cbuf, 0x11 );
-    encode_RegMem(cbuf, $src$$reg, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0xDD );     // FLD_D [ESP]
-    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0xD9);      // FLDCW  trunc
-    emit_opcode(cbuf,0x2D);
-    emit_d32(cbuf,(int)StubRoutines::addr_fpu_cntrl_wrd_trunc());
-
-    // Encoding assumes a double has been pushed into FPR0.
-    // Store down the double as a long, popping the FPU stack
-    emit_opcode(cbuf,0xDF);      // FISTP [ESP]
-    emit_opcode(cbuf,0x3C);
-    emit_d8(cbuf,0x24);
-
-    // Restore the rounding mode; mask the exception
-    emit_opcode(cbuf,0xD9);      // FLDCW   std/24-bit mode
-    emit_opcode(cbuf,0x2D);
-    emit_d32( cbuf, Compile::current()->in_24_bit_fp_mode()
-      ? (int)StubRoutines::addr_fpu_cntrl_wrd_24()
-      : (int)StubRoutines::addr_fpu_cntrl_wrd_std());
-
-    // Load the converted int; adjust CPU stack
-    emit_opcode(cbuf,0x58);      // POP EAX
-
-    emit_opcode(cbuf,0x5A);      // POP EDX
-
-    emit_opcode(cbuf,0x81);      // CMP EDX,imm
-    emit_d8    (cbuf,0xFA);      // rdx
-    emit_d32   (cbuf,0x80000000); //         0x80000000
-
-    emit_opcode(cbuf,0x75);      // JNE around_slow_call
-    emit_d8    (cbuf,0x13+4);    // Size of slow_call
-
-    emit_opcode(cbuf,0x85);      // TEST EAX,EAX
-    emit_opcode(cbuf,0xC0);      // 2/rax,/rax,
-
-    emit_opcode(cbuf,0x75);      // JNE around_slow_call
-    emit_d8    (cbuf,0x13);      // Size of slow_call
-
-    // Push src onto stack slow-path
-    // Allocate a word
-    emit_opcode(cbuf,0x83);      // SUB ESP,8
-    emit_opcode(cbuf,0xEC);
-    emit_d8(cbuf,0x08);
-
-    emit_opcode  (cbuf, 0xF2 );  // MOVSD [ESP], src
-    emit_opcode  (cbuf, 0x0F );
-    emit_opcode  (cbuf, 0x11 );
-    encode_RegMem(cbuf, $src$$reg, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0xDD );     // FLD_D [ESP]
-    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0x83);      // ADD ESP,8
-    emit_opcode(cbuf,0xC4);
-    emit_d8(cbuf,0x08);
-
-    // CALL directly to the runtime
-    cbuf.set_insts_mark();
-    emit_opcode(cbuf,0xE8);      // Call into runtime
-    emit_d32_reloc(cbuf, (StubRoutines::d2l_wrapper() - cbuf.insts_end()) - 4, runtime_call_Relocation::spec(), RELOC_IMM32 );
-    // Carry on here...
-  %}
-
-  enc_class D2X_encoding( regX dst, regD src ) %{
-    // Allocate a word
-    emit_opcode(cbuf,0x83);            // SUB ESP,4
-    emit_opcode(cbuf,0xEC);
-    emit_d8(cbuf,0x04);
-    int pop = 0x02;
-    if ($src$$reg != FPR1L_enc) {
-      emit_opcode( cbuf, 0xD9 );       // FLD    ST(i-1)
-      emit_d8( cbuf, 0xC0-1+$src$$reg );
-      pop = 0x03;
-    }
-    store_to_stackslot( cbuf, 0xD9, pop, 0 ); // FST<P>_S  [ESP]
-
-    emit_opcode  (cbuf, 0xF3 );        // MOVSS dst(xmm), [ESP]
-    emit_opcode  (cbuf, 0x0F );
-    emit_opcode  (cbuf, 0x10 );
-    encode_RegMem(cbuf, $dst$$reg, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0x83);            // ADD ESP,4
-    emit_opcode(cbuf,0xC4);
-    emit_d8(cbuf,0x04);
-    // Carry on here...
-  %}
-
-  enc_class FX2I_encoding( regX src, eRegI dst ) %{
-    emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
-
-    // Compare the result to see if we need to go to the slow path
-    emit_opcode(cbuf,0x81);       // CMP dst,imm
-    emit_rm    (cbuf,0x3,0x7,$dst$$reg);
-    emit_d32   (cbuf,0x80000000); //         0x80000000
-
-    emit_opcode(cbuf,0x75);       // JNE around_slow_call
-    emit_d8    (cbuf,0x13);       // Size of slow_call
-    // Store xmm to a temp memory
-    // location and push it onto stack.
-
-    emit_opcode(cbuf,0x83);  // SUB ESP,4
-    emit_opcode(cbuf,0xEC);
-    emit_d8(cbuf, $primary ? 0x8 : 0x4);
-
-    emit_opcode  (cbuf, $primary ? 0xF2 : 0xF3 );   // MOVSS [ESP], xmm
-    emit_opcode  (cbuf, 0x0F );
-    emit_opcode  (cbuf, 0x11 );
-    encode_RegMem(cbuf, $src$$reg, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf, $primary ? 0xDD : 0xD9 );      // FLD [ESP]
-    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0x83);    // ADD ESP,4
-    emit_opcode(cbuf,0xC4);
-    emit_d8(cbuf, $primary ? 0x8 : 0x4);
-
-    // CALL directly to the runtime
-    cbuf.set_insts_mark();
-    emit_opcode(cbuf,0xE8);       // Call into runtime
-    emit_d32_reloc(cbuf, (StubRoutines::d2i_wrapper() - cbuf.insts_end()) - 4, runtime_call_Relocation::spec(), RELOC_IMM32 );
-
-    // Carry on here...
-  %}
-
-  enc_class X2D_encoding( regD dst, regX src ) %{
-    // Allocate a word
-    emit_opcode(cbuf,0x83);     // SUB ESP,4
-    emit_opcode(cbuf,0xEC);
-    emit_d8(cbuf,0x04);
-
-    emit_opcode  (cbuf, 0xF3 ); // MOVSS [ESP], xmm
-    emit_opcode  (cbuf, 0x0F );
-    emit_opcode  (cbuf, 0x11 );
-    encode_RegMem(cbuf, $src$$reg, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0xD9 );    // FLD_S [ESP]
-    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
-
-    emit_opcode(cbuf,0x83);     // ADD ESP,4
-    emit_opcode(cbuf,0xC4);
-    emit_d8(cbuf,0x04);
-
-    // Carry on here...
-  %}
-
-  enc_class AbsXF_encoding(regX dst) %{
-    address signmask_address=(address)float_signmask_pool;
-    // andpd:\tANDPS  $dst,[signconst]
-    emit_opcode(cbuf, 0x0F);
-    emit_opcode(cbuf, 0x54);
-    emit_rm(cbuf, 0x0, $dst$$reg, 0x5);
-    emit_d32(cbuf, (int)signmask_address);
-  %}
-
-  enc_class AbsXD_encoding(regXD dst) %{
-    address signmask_address=(address)double_signmask_pool;
-    // andpd:\tANDPD  $dst,[signconst]
-    emit_opcode(cbuf, 0x66);
-    emit_opcode(cbuf, 0x0F);
-    emit_opcode(cbuf, 0x54);
-    emit_rm(cbuf, 0x0, $dst$$reg, 0x5);
-    emit_d32(cbuf, (int)signmask_address);
-  %}
-
-  enc_class NegXF_encoding(regX dst) %{
-    address signmask_address=(address)float_signflip_pool;
-    // andpd:\tXORPS  $dst,[signconst]
-    emit_opcode(cbuf, 0x0F);
-    emit_opcode(cbuf, 0x57);
-    emit_rm(cbuf, 0x0, $dst$$reg, 0x5);
-    emit_d32(cbuf, (int)signmask_address);
-  %}
-
-  enc_class NegXD_encoding(regXD dst) %{
-    address signmask_address=(address)double_signflip_pool;
-    // andpd:\tXORPD  $dst,[signconst]
-    emit_opcode(cbuf, 0x66);
-    emit_opcode(cbuf, 0x0F);
-    emit_opcode(cbuf, 0x57);
-    emit_rm(cbuf, 0x0, $dst$$reg, 0x5);
-    emit_d32(cbuf, (int)signmask_address);
-  %}
-
-  enc_class FMul_ST_reg( eRegF src1 ) %{
+  enc_class FMul_ST_reg( eRegFPR src1 ) %{
     // Operand was loaded from memory into fp ST (stack top)
     // FMUL   ST,$src  /* D8 C8+i */
     emit_opcode(cbuf, 0xD8);
     emit_opcode(cbuf, 0xC8 + $src1$$reg);
   %}
 
-  enc_class FAdd_ST_reg( eRegF src2 ) %{
+  enc_class FAdd_ST_reg( eRegFPR src2 ) %{
     // FADDP  ST,src2  /* D8 C0+i */
     emit_opcode(cbuf, 0xD8);
     emit_opcode(cbuf, 0xC0 + $src2$$reg);
     //could use FADDP  src2,fpST  /* DE C0+i */
   %}
 
-  enc_class FAddP_reg_ST( eRegF src2 ) %{
+  enc_class FAddP_reg_ST( eRegFPR src2 ) %{
     // FADDP  src2,ST  /* DE C0+i */
     emit_opcode(cbuf, 0xDE);
     emit_opcode(cbuf, 0xC0 + $src2$$reg);
   %}
 
-  enc_class subF_divF_encode( eRegF src1, eRegF src2) %{
+  enc_class subFPR_divFPR_encode( eRegFPR src1, eRegFPR src2) %{
     // Operand has been loaded into fp ST (stack top)
       // FSUB   ST,$src1
       emit_opcode(cbuf, 0xD8);
@@ -4140,7 +3681,7 @@ encode %{
       emit_opcode(cbuf, 0xF0 + $src2$$reg);
   %}
 
-  enc_class MulFAddF (eRegF src1, eRegF src2) %{
+  enc_class MulFAddF (eRegFPR src1, eRegFPR src2) %{
     // Operand was loaded from memory into fp ST (stack top)
     // FADD   ST,$src  /* D8 C0+i */
     emit_opcode(cbuf, 0xD8);
@@ -4152,7 +3693,7 @@ encode %{
   %}
 
 
-  enc_class MulFAddFreverse (eRegF src1, eRegF src2) %{
+  enc_class MulFAddFreverse (eRegFPR src1, eRegFPR src2) %{
     // Operand was loaded from memory into fp ST (stack top)
     // FADD   ST,$src  /* D8 C0+i */
     emit_opcode(cbuf, 0xD8);
@@ -4176,66 +3717,6 @@ encode %{
     store_to_stackslot( cbuf, 0x0DF, 0x07, $dst$$disp );
   %}
 
-  enc_class enc_loadLX_volatile( memory mem, stackSlotL dst, regXD tmp ) %{
-    { // Atomic long load
-      // UseXmmLoadAndClearUpper ? movsd $tmp,$mem : movlpd $tmp,$mem
-      emit_opcode(cbuf,UseXmmLoadAndClearUpper ? 0xF2 : 0x66);
-      emit_opcode(cbuf,0x0F);
-      emit_opcode(cbuf,UseXmmLoadAndClearUpper ? 0x10 : 0x12);
-      int base     = $mem$$base;
-      int index    = $mem$$index;
-      int scale    = $mem$$scale;
-      int displace = $mem$$disp;
-      bool disp_is_oop = $mem->disp_is_oop(); // disp-as-oop when working with static globals
-      encode_RegMem(cbuf, $tmp$$reg, base, index, scale, displace, disp_is_oop);
-    }
-    { // MOVSD $dst,$tmp ! atomic long store
-      emit_opcode(cbuf,0xF2);
-      emit_opcode(cbuf,0x0F);
-      emit_opcode(cbuf,0x11);
-      int base     = $dst$$base;
-      int index    = $dst$$index;
-      int scale    = $dst$$scale;
-      int displace = $dst$$disp;
-      bool disp_is_oop = $dst->disp_is_oop(); // disp-as-oop when working with static globals
-      encode_RegMem(cbuf, $tmp$$reg, base, index, scale, displace, disp_is_oop);
-    }
-  %}
-
-  enc_class enc_loadLX_reg_volatile( memory mem, eRegL dst, regXD tmp ) %{
-    { // Atomic long load
-      // UseXmmLoadAndClearUpper ? movsd $tmp,$mem : movlpd $tmp,$mem
-      emit_opcode(cbuf,UseXmmLoadAndClearUpper ? 0xF2 : 0x66);
-      emit_opcode(cbuf,0x0F);
-      emit_opcode(cbuf,UseXmmLoadAndClearUpper ? 0x10 : 0x12);
-      int base     = $mem$$base;
-      int index    = $mem$$index;
-      int scale    = $mem$$scale;
-      int displace = $mem$$disp;
-      bool disp_is_oop = $mem->disp_is_oop(); // disp-as-oop when working with static globals
-      encode_RegMem(cbuf, $tmp$$reg, base, index, scale, displace, disp_is_oop);
-    }
-    { // MOVD $dst.lo,$tmp
-      emit_opcode(cbuf,0x66);
-      emit_opcode(cbuf,0x0F);
-      emit_opcode(cbuf,0x7E);
-      emit_rm(cbuf, 0x3, $tmp$$reg, $dst$$reg);
-    }
-    { // PSRLQ $tmp,32
-      emit_opcode(cbuf,0x66);
-      emit_opcode(cbuf,0x0F);
-      emit_opcode(cbuf,0x73);
-      emit_rm(cbuf, 0x3, 0x02, $tmp$$reg);
-      emit_d8(cbuf, 0x20);
-    }
-    { // MOVD $dst.hi,$tmp
-      emit_opcode(cbuf,0x66);
-      emit_opcode(cbuf,0x0F);
-      emit_opcode(cbuf,0x7E);
-      emit_rm(cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($dst$$reg));
-    }
-  %}
-
   // Volatile Store Long.  Must be atomic, so move it into
   // the FP TOS and then do a 64-bit FIST.  Has to probe the
   // target address before the store (for null-ptr checks)
@@ -4253,66 +3734,6 @@ encode %{
     encode_RegMem(cbuf, rm_byte_opcode, base, index, scale, displace, disp_is_oop);
   %}
 
-  enc_class enc_storeLX_volatile( memory mem, stackSlotL src, regXD tmp) %{
-    { // Atomic long load
-      // UseXmmLoadAndClearUpper ? movsd $tmp,[$src] : movlpd $tmp,[$src]
-      emit_opcode(cbuf,UseXmmLoadAndClearUpper ? 0xF2 : 0x66);
-      emit_opcode(cbuf,0x0F);
-      emit_opcode(cbuf,UseXmmLoadAndClearUpper ? 0x10 : 0x12);
-      int base     = $src$$base;
-      int index    = $src$$index;
-      int scale    = $src$$scale;
-      int displace = $src$$disp;
-      bool disp_is_oop = $src->disp_is_oop(); // disp-as-oop when working with static globals
-      encode_RegMem(cbuf, $tmp$$reg, base, index, scale, displace, disp_is_oop);
-    }
-    cbuf.set_insts_mark();            // Mark start of MOVSD in case $mem has an oop
-    { // MOVSD $mem,$tmp ! atomic long store
-      emit_opcode(cbuf,0xF2);
-      emit_opcode(cbuf,0x0F);
-      emit_opcode(cbuf,0x11);
-      int base     = $mem$$base;
-      int index    = $mem$$index;
-      int scale    = $mem$$scale;
-      int displace = $mem$$disp;
-      bool disp_is_oop = $mem->disp_is_oop(); // disp-as-oop when working with static globals
-      encode_RegMem(cbuf, $tmp$$reg, base, index, scale, displace, disp_is_oop);
-    }
-  %}
-
-  enc_class enc_storeLX_reg_volatile( memory mem, eRegL src, regXD tmp, regXD tmp2) %{
-    { // MOVD $tmp,$src.lo
-      emit_opcode(cbuf,0x66);
-      emit_opcode(cbuf,0x0F);
-      emit_opcode(cbuf,0x6E);
-      emit_rm(cbuf, 0x3, $tmp$$reg, $src$$reg);
-    }
-    { // MOVD $tmp2,$src.hi
-      emit_opcode(cbuf,0x66);
-      emit_opcode(cbuf,0x0F);
-      emit_opcode(cbuf,0x6E);
-      emit_rm(cbuf, 0x3, $tmp2$$reg, HIGH_FROM_LOW($src$$reg));
-    }
-    { // PUNPCKLDQ $tmp,$tmp2
-      emit_opcode(cbuf,0x66);
-      emit_opcode(cbuf,0x0F);
-      emit_opcode(cbuf,0x62);
-      emit_rm(cbuf, 0x3, $tmp$$reg, $tmp2$$reg);
-    }
-    cbuf.set_insts_mark();            // Mark start of MOVSD in case $mem has an oop
-    { // MOVSD $mem,$tmp ! atomic long store
-      emit_opcode(cbuf,0xF2);
-      emit_opcode(cbuf,0x0F);
-      emit_opcode(cbuf,0x11);
-      int base     = $mem$$base;
-      int index    = $mem$$index;
-      int scale    = $mem$$scale;
-      int displace = $mem$$disp;
-      bool disp_is_oop = $mem->disp_is_oop(); // disp-as-oop when working with static globals
-      encode_RegMem(cbuf, $tmp$$reg, base, index, scale, displace, disp_is_oop);
-    }
-  %}
-
   // Safepoint Poll.  This polls the safepoint page, and causes an
   // exception if it is not readable. Unfortunately, it kills the condition code
   // in the process
@@ -4705,7 +4126,7 @@ operand immL32() %{
 %}
 
 //Double Immediate zero
-operand immD0() %{
+operand immDPR0() %{
   // Do additional (and counter-intuitive) test against NaN to work around VC++
   // bug that generates code such that NaNs compare equal to 0.0
   predicate( UseSSE<=1 && n->getd() == 0.0 && !g_isnan(n->getd()) );
@@ -4717,7 +4138,7 @@ operand immD0() %{
 %}
 
 // Double Immediate one
-operand immD1() %{
+operand immDPR1() %{
   predicate( UseSSE<=1 && n->getd() == 1.0 );
   match(ConD);
 
@@ -4727,7 +4148,7 @@ operand immD1() %{
 %}
 
 // Double Immediate
-operand immD() %{
+operand immDPR() %{
   predicate(UseSSE<=1);
   match(ConD);
 
@@ -4736,7 +4157,7 @@ operand immD() %{
   interface(CONST_INTER);
 %}
 
-operand immXD() %{
+operand immD() %{
   predicate(UseSSE>=2);
   match(ConD);
 
@@ -4746,7 +4167,7 @@ operand immXD() %{
 %}
 
 // Double Immediate zero
-operand immXD0() %{
+operand immD0() %{
   // Do additional (and counter-intuitive) test against NaN to work around VC++
   // bug that generates code such that NaNs compare equal to 0.0 AND do not
   // compare equal to -0.0.
@@ -4758,7 +4179,7 @@ operand immXD0() %{
 %}
 
 // Float Immediate zero
-operand immF0() %{
+operand immFPR0() %{
   predicate(UseSSE == 0 && n->getf() == 0.0F);
   match(ConF);
 
@@ -4768,7 +4189,7 @@ operand immF0() %{
 %}
 
 // Float Immediate one
-operand immF1() %{
+operand immFPR1() %{
   predicate(UseSSE == 0 && n->getf() == 1.0F);
   match(ConF);
 
@@ -4778,7 +4199,7 @@ operand immF1() %{
 %}
 
 // Float Immediate
-operand immF() %{
+operand immFPR() %{
   predicate( UseSSE == 0 );
   match(ConF);
 
@@ -4788,7 +4209,7 @@ operand immF() %{
 %}
 
 // Float Immediate
-operand immXF() %{
+operand immF() %{
   predicate(UseSSE >= 1);
   match(ConF);
 
@@ -4798,7 +4219,7 @@ operand immXF() %{
 %}
 
 // Float Immediate zero.  Zero and not -0.0
-operand immXF0() %{
+operand immF0() %{
   predicate( UseSSE >= 1 && jint_cast(n->getf()) == 0 );
   match(ConF);
 
@@ -5174,7 +4595,7 @@ operand flagsReg_long_LEGT() %{
 %}
 
 // Float register operands
-operand regD() %{
+operand regDPR() %{
   predicate( UseSSE < 2 );
   constraint(ALLOC_IN_RC(dbl_reg));
   match(RegD);
@@ -5184,7 +4605,7 @@ operand regD() %{
   interface(REG_INTER);
 %}
 
-operand regDPR1(regD reg) %{
+operand regDPR1(regDPR reg) %{
   predicate( UseSSE < 2 );
   constraint(ALLOC_IN_RC(dbl_reg0));
   match(reg);
@@ -5192,7 +4613,7 @@ operand regDPR1(regD reg) %{
   interface(REG_INTER);
 %}
 
-operand regDPR2(regD reg) %{
+operand regDPR2(regDPR reg) %{
   predicate( UseSSE < 2 );
   constraint(ALLOC_IN_RC(dbl_reg1));
   match(reg);
@@ -5200,7 +4621,7 @@ operand regDPR2(regD reg) %{
   interface(REG_INTER);
 %}
 
-operand regnotDPR1(regD reg) %{
+operand regnotDPR1(regDPR reg) %{
   predicate( UseSSE < 2 );
   constraint(ALLOC_IN_RC(dbl_notreg0));
   match(reg);
@@ -5209,18 +4630,18 @@ operand regnotDPR1(regD reg) %{
 %}
 
 // XMM Double register operands
-operand regXD() %{
+operand regD() %{
   predicate( UseSSE>=2 );
   constraint(ALLOC_IN_RC(xdb_reg));
   match(RegD);
-  match(regXD6);
-  match(regXD7);
+  match(regD6);
+  match(regD7);
   format %{ %}
   interface(REG_INTER);
 %}
 
 // XMM6 double register operands
-operand regXD6(regXD reg) %{
+operand regD6(regD reg) %{
   predicate( UseSSE>=2 );
   constraint(ALLOC_IN_RC(xdb_reg6));
   match(reg);
@@ -5229,7 +4650,7 @@ operand regXD6(regXD reg) %{
 %}
 
 // XMM7 double register operands
-operand regXD7(regXD reg) %{
+operand regD7(regD reg) %{
   predicate( UseSSE>=2 );
   constraint(ALLOC_IN_RC(xdb_reg7));
   match(reg);
@@ -5238,7 +4659,7 @@ operand regXD7(regXD reg) %{
 %}
 
 // Float register operands
-operand regF() %{
+operand regFPR() %{
   predicate( UseSSE < 2 );
   constraint(ALLOC_IN_RC(flt_reg));
   match(RegF);
@@ -5248,7 +4669,7 @@ operand regF() %{
 %}
 
 // Float register operands
-operand regFPR1(regF reg) %{
+operand regFPR1(regFPR reg) %{
   predicate( UseSSE < 2 );
   constraint(ALLOC_IN_RC(flt_reg0));
   match(reg);
@@ -5257,7 +4678,7 @@ operand regFPR1(regF reg) %{
 %}
 
 // XMM register operands
-operand regX() %{
+operand regF() %{
   predicate( UseSSE>=1 );
   constraint(ALLOC_IN_RC(xmm_reg));
   match(RegF);
@@ -6001,7 +5422,7 @@ pipe_class pipe_cmov_reg_long( eFlagsReg cr, eRegL dst, eRegL src) %{
 %}
 
 // Conditional move double reg-reg
-pipe_class pipe_cmovD_reg( eFlagsReg cr, regDPR1 dst, regD src) %{
+pipe_class pipe_cmovDPR_reg( eFlagsReg cr, regDPR1 dst, regDPR src) %{
     single_instruction;
     dst    : S4(write);
     src    : S3(read);
@@ -6010,7 +5431,7 @@ pipe_class pipe_cmovD_reg( eFlagsReg cr, regDPR1 dst, regD src) %{
 %}
 
 // Float reg-reg operation
-pipe_class fpu_reg(regD dst) %{
+pipe_class fpu_reg(regDPR dst) %{
     instruction_count(2);
     dst    : S3(read);
     DECODE : S0(2);     // any 2 decoders
@@ -6018,7 +5439,7 @@ pipe_class fpu_reg(regD dst) %{
 %}
 
 // Float reg-reg operation
-pipe_class fpu_reg_reg(regD dst, regD src) %{
+pipe_class fpu_reg_reg(regDPR dst, regDPR src) %{
     instruction_count(2);
     dst    : S4(write);
     src    : S3(read);
@@ -6027,7 +5448,7 @@ pipe_class fpu_reg_reg(regD dst, regD src) %{
 %}
 
 // Float reg-reg operation
-pipe_class fpu_reg_reg_reg(regD dst, regD src1, regD src2) %{
+pipe_class fpu_reg_reg_reg(regDPR dst, regDPR src1, regDPR src2) %{
     instruction_count(3);
     dst    : S4(write);
     src1   : S3(read);
@@ -6037,7 +5458,7 @@ pipe_class fpu_reg_reg_reg(regD dst, regD src1, regD src2) %{
 %}
 
 // Float reg-reg operation
-pipe_class fpu_reg_reg_reg_reg(regD dst, regD src1, regD src2, regD src3) %{
+pipe_class fpu_reg_reg_reg_reg(regDPR dst, regDPR src1, regDPR src2, regDPR src3) %{
     instruction_count(4);
     dst    : S4(write);
     src1   : S3(read);
@@ -6048,7 +5469,7 @@ pipe_class fpu_reg_reg_reg_reg(regD dst, regD src1, regD src2, regD src3) %{
 %}
 
 // Float reg-reg operation
-pipe_class fpu_reg_mem_reg_reg(regD dst, memory src1, regD src2, regD src3) %{
+pipe_class fpu_reg_mem_reg_reg(regDPR dst, memory src1, regDPR src2, regDPR src3) %{
     instruction_count(4);
     dst    : S4(write);
     src1   : S3(read);
@@ -6061,7 +5482,7 @@ pipe_class fpu_reg_mem_reg_reg(regD dst, memory src1, regD src2, regD src3) %{
 %}
 
 // Float reg-mem operation
-pipe_class fpu_reg_mem(regD dst, memory mem) %{
+pipe_class fpu_reg_mem(regDPR dst, memory mem) %{
     instruction_count(2);
     dst    : S5(write);
     mem    : S3(read);
@@ -6072,7 +5493,7 @@ pipe_class fpu_reg_mem(regD dst, memory mem) %{
 %}
 
 // Float reg-mem operation
-pipe_class fpu_reg_reg_mem(regD dst, regD src1, memory mem) %{
+pipe_class fpu_reg_reg_mem(regDPR dst, regDPR src1, memory mem) %{
     instruction_count(3);
     dst    : S5(write);
     src1   : S3(read);
@@ -6084,7 +5505,7 @@ pipe_class fpu_reg_reg_mem(regD dst, regD src1, memory mem) %{
 %}
 
 // Float mem-reg operation
-pipe_class fpu_mem_reg(memory mem, regD src) %{
+pipe_class fpu_mem_reg(memory mem, regDPR src) %{
     instruction_count(2);
     src    : S5(read);
     mem    : S3(read);
@@ -6094,7 +5515,7 @@ pipe_class fpu_mem_reg(memory mem, regD src) %{
     MEM    : S3;        // any mem
 %}
 
-pipe_class fpu_mem_reg_reg(memory mem, regD src1, regD src2) %{
+pipe_class fpu_mem_reg_reg(memory mem, regDPR src1, regDPR src2) %{
     instruction_count(3);
     src1   : S3(read);
     src2   : S3(read);
@@ -6105,7 +5526,7 @@ pipe_class fpu_mem_reg_reg(memory mem, regD src1, regD src2) %{
     MEM    : S3;        // any mem
 %}
 
-pipe_class fpu_mem_reg_mem(memory mem, regD src1, memory src2) %{
+pipe_class fpu_mem_reg_mem(memory mem, regDPR src1, memory src2) %{
     instruction_count(3);
     src1   : S3(read);
     src2   : S3(read);
@@ -6134,7 +5555,7 @@ pipe_class fpu_mem_mem_mem(memory dst, memory src1, memory src2) %{
     MEM    : S3(3);     // any mem
 %}
 
-pipe_class fpu_mem_reg_con(memory mem, regD src1) %{
+pipe_class fpu_mem_reg_con(memory mem, regDPR src1) %{
     instruction_count(3);
     src1   : S4(read);
     mem    : S4(read);
@@ -6145,7 +5566,7 @@ pipe_class fpu_mem_reg_con(memory mem, regD src1) %{
 %}
 
 // Float load constant
-pipe_class fpu_reg_con(regD dst) %{
+pipe_class fpu_reg_con(regDPR dst) %{
     instruction_count(2);
     dst    : S5(write);
     D0     : S0;        // big decoder only for the load
@@ -6155,7 +5576,7 @@ pipe_class fpu_reg_con(regD dst) %{
 %}
 
 // Float load constant
-pipe_class fpu_reg_reg_con(regD dst, regD src) %{
+pipe_class fpu_reg_reg_con(regDPR dst, regDPR src) %{
     instruction_count(3);
     dst    : S5(write);
     src    : S3(read);
@@ -6870,18 +6291,21 @@ instruct loadL_volatile(stackSlotL dst, memory mem) %{
   ins_pipe( fpu_reg_mem );
 %}
 
-instruct loadLX_volatile(stackSlotL dst, memory mem, regXD tmp) %{
+instruct loadLX_volatile(stackSlotL dst, memory mem, regD tmp) %{
   predicate(UseSSE>=2 && ((LoadLNode*)n)->require_atomic_access());
   match(Set dst (LoadL mem));
   effect(TEMP tmp);
   ins_cost(180);
   format %{ "MOVSD  $tmp,$mem\t# Atomic volatile long load\n\t"
             "MOVSD  $dst,$tmp" %}
-  ins_encode(enc_loadLX_volatile(mem, dst, tmp));
+  ins_encode %{
+    __ movdbl($tmp$$XMMRegister, $mem$$Address);
+    __ movdbl(Address(rsp, $dst$$disp), $tmp$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
-instruct loadLX_reg_volatile(eRegL dst, memory mem, regXD tmp) %{
+instruct loadLX_reg_volatile(eRegL dst, memory mem, regD tmp) %{
   predicate(UseSSE>=2 && ((LoadLNode*)n)->require_atomic_access());
   match(Set dst (LoadL mem));
   effect(TEMP tmp);
@@ -6890,7 +6314,12 @@ instruct loadLX_reg_volatile(eRegL dst, memory mem, regXD tmp) %{
             "MOVD   $dst.lo,$tmp\n\t"
             "PSRLQ  $tmp,32\n\t"
             "MOVD   $dst.hi,$tmp" %}
-  ins_encode(enc_loadLX_reg_volatile(mem, dst, tmp));
+  ins_encode %{
+    __ movdbl($tmp$$XMMRegister, $mem$$Address);
+    __ movdl($dst$$Register, $tmp$$XMMRegister);
+    __ psrlq($tmp$$XMMRegister, 32);
+    __ movdl(HIGH_FROM_LOW($dst$$Register), $tmp$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
@@ -6929,7 +6358,7 @@ instruct loadKlass(eRegP dst, memory mem) %{
 %}
 
 // Load Double
-instruct loadD(regD dst, memory mem) %{
+instruct loadDPR(regDPR dst, memory mem) %{
   predicate(UseSSE<=1);
   match(Set dst (LoadD mem));
 
@@ -6938,42 +6367,48 @@ instruct loadD(regD dst, memory mem) %{
             "FSTP   $dst" %}
   opcode(0xDD);               /* DD /0 */
   ins_encode( OpcP, RMopc_Mem(0x00,mem),
-              Pop_Reg_D(dst) );
+              Pop_Reg_DPR(dst) );
   ins_pipe( fpu_reg_mem );
 %}
 
 // Load Double to XMM
-instruct loadXD(regXD dst, memory mem) %{
+instruct loadD(regD dst, memory mem) %{
   predicate(UseSSE>=2 && UseXmmLoadAndClearUpper);
   match(Set dst (LoadD mem));
   ins_cost(145);
   format %{ "MOVSD  $dst,$mem" %}
-  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x10), RegMem(dst,mem));
+  ins_encode %{
+    __ movdbl ($dst$$XMMRegister, $mem$$Address);
+  %}
   ins_pipe( pipe_slow );
 %}
 
-instruct loadXD_partial(regXD dst, memory mem) %{
+instruct loadD_partial(regD dst, memory mem) %{
   predicate(UseSSE>=2 && !UseXmmLoadAndClearUpper);
   match(Set dst (LoadD mem));
   ins_cost(145);
   format %{ "MOVLPD $dst,$mem" %}
-  ins_encode( Opcode(0x66), Opcode(0x0F), Opcode(0x12), RegMem(dst,mem));
+  ins_encode %{
+    __ movdbl ($dst$$XMMRegister, $mem$$Address);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Load to XMM register (single-precision floating point)
 // MOVSS instruction
-instruct loadX(regX dst, memory mem) %{
+instruct loadF(regF dst, memory mem) %{
   predicate(UseSSE>=1);
   match(Set dst (LoadF mem));
   ins_cost(145);
   format %{ "MOVSS  $dst,$mem" %}
-  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x10), RegMem(dst,mem));
+  ins_encode %{
+    __ movflt ($dst$$XMMRegister, $mem$$Address);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Load Float
-instruct loadF(regF dst, memory mem) %{
+instruct loadFPR(regFPR dst, memory mem) %{
   predicate(UseSSE==0);
   match(Set dst (LoadF mem));
 
@@ -6982,57 +6417,67 @@ instruct loadF(regF dst, memory mem) %{
             "FSTP   $dst" %}
   opcode(0xD9);               /* D9 /0 */
   ins_encode( OpcP, RMopc_Mem(0x00,mem),
-              Pop_Reg_F(dst) );
+              Pop_Reg_FPR(dst) );
   ins_pipe( fpu_reg_mem );
 %}
 
 // Load Aligned Packed Byte to XMM register
-instruct loadA8B(regXD dst, memory mem) %{
+instruct loadA8B(regD dst, memory mem) %{
   predicate(UseSSE>=1);
   match(Set dst (Load8B mem));
   ins_cost(125);
   format %{ "MOVQ  $dst,$mem\t! packed8B" %}
-  ins_encode( movq_ld(dst, mem));
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $mem$$Address);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Load Aligned Packed Short to XMM register
-instruct loadA4S(regXD dst, memory mem) %{
+instruct loadA4S(regD dst, memory mem) %{
   predicate(UseSSE>=1);
   match(Set dst (Load4S mem));
   ins_cost(125);
   format %{ "MOVQ  $dst,$mem\t! packed4S" %}
-  ins_encode( movq_ld(dst, mem));
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $mem$$Address);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Load Aligned Packed Char to XMM register
-instruct loadA4C(regXD dst, memory mem) %{
+instruct loadA4C(regD dst, memory mem) %{
   predicate(UseSSE>=1);
   match(Set dst (Load4C mem));
   ins_cost(125);
   format %{ "MOVQ  $dst,$mem\t! packed4C" %}
-  ins_encode( movq_ld(dst, mem));
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $mem$$Address);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Load Aligned Packed Integer to XMM register
-instruct load2IU(regXD dst, memory mem) %{
+instruct load2IU(regD dst, memory mem) %{
   predicate(UseSSE>=1);
   match(Set dst (Load2I mem));
   ins_cost(125);
   format %{ "MOVQ  $dst,$mem\t! packed2I" %}
-  ins_encode( movq_ld(dst, mem));
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $mem$$Address);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Load Aligned Packed Single to XMM
-instruct loadA2F(regXD dst, memory mem) %{
+instruct loadA2F(regD dst, memory mem) %{
   predicate(UseSSE>=1);
   match(Set dst (Load2F mem));
   ins_cost(145);
   format %{ "MOVQ  $dst,$mem\t! packed2F" %}
-  ins_encode( movq_ld(dst, mem));
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $mem$$Address);
+  %}
   ins_pipe( pipe_slow );
 %}
 
@@ -7139,8 +6584,8 @@ instruct loadConL0(eRegL dst, immL0 src, eFlagsReg cr) %{
   ins_pipe( ialu_reg_long );
 %}
 
-// The instruction usage is guarded by predicate in operand immF().
-instruct loadConF(regF dst, immF con) %{
+// The instruction usage is guarded by predicate in operand immFPR().
+instruct loadConFPR(regFPR dst, immFPR con) %{
   match(Set dst con);
   ins_cost(125);
   format %{ "FLD_S  ST,[$constantaddress]\t# load from constant table: float=$con\n\t"
@@ -7152,8 +6597,8 @@ instruct loadConF(regF dst, immF con) %{
   ins_pipe(fpu_reg_con);
 %}
 
-// The instruction usage is guarded by predicate in operand immF0().
-instruct loadConF0(regF dst, immF0 con) %{
+// The instruction usage is guarded by predicate in operand immFPR0().
+instruct loadConFPR0(regFPR dst, immFPR0 con) %{
   match(Set dst con);
   ins_cost(125);
   format %{ "FLDZ   ST\n\t"
@@ -7165,8 +6610,8 @@ instruct loadConF0(regF dst, immF0 con) %{
   ins_pipe(fpu_reg_con);
 %}
 
-// The instruction usage is guarded by predicate in operand immF1().
-instruct loadConF1(regF dst, immF1 con) %{
+// The instruction usage is guarded by predicate in operand immFPR1().
+instruct loadConFPR1(regFPR dst, immFPR1 con) %{
   match(Set dst con);
   ins_cost(125);
   format %{ "FLD1   ST\n\t"
@@ -7178,8 +6623,8 @@ instruct loadConF1(regF dst, immF1 con) %{
   ins_pipe(fpu_reg_con);
 %}
 
-// The instruction usage is guarded by predicate in operand immXF().
-instruct loadConX(regX dst, immXF con) %{
+// The instruction usage is guarded by predicate in operand immF().
+instruct loadConF(regF dst, immF con) %{
   match(Set dst con);
   ins_cost(125);
   format %{ "MOVSS  $dst,[$constantaddress]\t# load from constant table: float=$con" %}
@@ -7189,8 +6634,8 @@ instruct loadConX(regX dst, immXF con) %{
   ins_pipe(pipe_slow);
 %}
 
-// The instruction usage is guarded by predicate in operand immXF0().
-instruct loadConX0(regX dst, immXF0 src) %{
+// The instruction usage is guarded by predicate in operand immF0().
+instruct loadConF0(regF dst, immF0 src) %{
   match(Set dst src);
   ins_cost(100);
   format %{ "XORPS  $dst,$dst\t# float 0.0" %}
@@ -7200,8 +6645,8 @@ instruct loadConX0(regX dst, immXF0 src) %{
   ins_pipe(pipe_slow);
 %}
 
-// The instruction usage is guarded by predicate in operand immD().
-instruct loadConD(regD dst, immD con) %{
+// The instruction usage is guarded by predicate in operand immDPR().
+instruct loadConDPR(regDPR dst, immDPR con) %{
   match(Set dst con);
   ins_cost(125);
 
@@ -7214,8 +6659,8 @@ instruct loadConD(regD dst, immD con) %{
   ins_pipe(fpu_reg_con);
 %}
 
-// The instruction usage is guarded by predicate in operand immD0().
-instruct loadConD0(regD dst, immD0 con) %{
+// The instruction usage is guarded by predicate in operand immDPR0().
+instruct loadConDPR0(regDPR dst, immDPR0 con) %{
   match(Set dst con);
   ins_cost(125);
 
@@ -7228,8 +6673,8 @@ instruct loadConD0(regD dst, immD0 con) %{
   ins_pipe(fpu_reg_con);
 %}
 
-// The instruction usage is guarded by predicate in operand immD1().
-instruct loadConD1(regD dst, immD1 con) %{
+// The instruction usage is guarded by predicate in operand immDPR1().
+instruct loadConDPR1(regDPR dst, immDPR1 con) %{
   match(Set dst con);
   ins_cost(125);
 
@@ -7242,8 +6687,8 @@ instruct loadConD1(regD dst, immD1 con) %{
   ins_pipe(fpu_reg_con);
 %}
 
-// The instruction usage is guarded by predicate in operand immXD().
-instruct loadConXD(regXD dst, immXD con) %{
+// The instruction usage is guarded by predicate in operand immD().
+instruct loadConD(regD dst, immD con) %{
   match(Set dst con);
   ins_cost(125);
   format %{ "MOVSD  $dst,[$constantaddress]\t# load from constant table: double=$con" %}
@@ -7253,12 +6698,14 @@ instruct loadConXD(regXD dst, immXD con) %{
   ins_pipe(pipe_slow);
 %}
 
-// The instruction usage is guarded by predicate in operand immXD0().
-instruct loadConXD0(regXD dst, immXD0 src) %{
+// The instruction usage is guarded by predicate in operand immD0().
+instruct loadConD0(regD dst, immD0 src) %{
   match(Set dst src);
   ins_cost(100);
   format %{ "XORPD  $dst,$dst\t# double 0.0" %}
-  ins_encode( Opcode(0x66), Opcode(0x0F), Opcode(0x57), RegReg(dst,dst));
+  ins_encode %{
+    __ xorpd ($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
@@ -7296,7 +6743,7 @@ instruct loadSSP(eRegP dst, stackSlotP src) %{
 %}
 
 // Load Stack Slot
-instruct loadSSF(regF dst, stackSlotF src) %{
+instruct loadSSF(regFPR dst, stackSlotF src) %{
   match(Set dst src);
   ins_cost(125);
 
@@ -7304,12 +6751,12 @@ instruct loadSSF(regF dst, stackSlotF src) %{
             "FSTP   $dst" %}
   opcode(0xD9);               /* D9 /0, FLD m32real */
   ins_encode( OpcP, RMopc_Mem_no_oop(0x00,src),
-              Pop_Reg_F(dst) );
+              Pop_Reg_FPR(dst) );
   ins_pipe( fpu_reg_mem );
 %}
 
 // Load Stack Slot
-instruct loadSSD(regD dst, stackSlotD src) %{
+instruct loadSSD(regDPR dst, stackSlotD src) %{
   match(Set dst src);
   ins_cost(125);
 
@@ -7317,7 +6764,7 @@ instruct loadSSD(regD dst, stackSlotD src) %{
             "FSTP   $dst" %}
   opcode(0xDD);               /* DD /0, FLD m64real */
   ins_encode( OpcP, RMopc_Mem_no_oop(0x00,src),
-              Pop_Reg_D(dst) );
+              Pop_Reg_DPR(dst) );
   ins_pipe( fpu_reg_mem );
 %}
 
@@ -7552,7 +6999,7 @@ instruct storeL_volatile(memory mem, stackSlotL src, eFlagsReg cr ) %{
   ins_pipe( fpu_reg_mem );
 %}
 
-instruct storeLX_volatile(memory mem, stackSlotL src, regXD tmp, eFlagsReg cr) %{
+instruct storeLX_volatile(memory mem, stackSlotL src, regD tmp, eFlagsReg cr) %{
   predicate(UseSSE>=2 && ((StoreLNode*)n)->require_atomic_access());
   match(Set mem (StoreL mem src));
   effect( TEMP tmp, KILL cr );
@@ -7560,12 +7007,15 @@ instruct storeLX_volatile(memory mem, stackSlotL src, regXD tmp, eFlagsReg cr) %
   format %{ "CMP    $mem,EAX\t# Probe address for implicit null check\n\t"
             "MOVSD  $tmp,$src\n\t"
             "MOVSD  $mem,$tmp\t # 64-bit atomic volatile long store" %}
-  opcode(0x3B);
-  ins_encode( OpcP, RegMem( EAX, mem ), enc_storeLX_volatile(mem, src, tmp));
+  ins_encode %{
+    __ cmpl(rax, $mem$$Address);
+    __ movdbl($tmp$$XMMRegister, Address(rsp, $src$$disp));
+    __ movdbl($mem$$Address, $tmp$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
-instruct storeLX_reg_volatile(memory mem, eRegL src, regXD tmp2, regXD tmp, eFlagsReg cr) %{
+instruct storeLX_reg_volatile(memory mem, eRegL src, regD tmp2, regD tmp, eFlagsReg cr) %{
   predicate(UseSSE>=2 && ((StoreLNode*)n)->require_atomic_access());
   match(Set mem (StoreL mem src));
   effect( TEMP tmp2 , TEMP tmp, KILL cr );
@@ -7575,8 +7025,13 @@ instruct storeLX_reg_volatile(memory mem, eRegL src, regXD tmp2, regXD tmp, eFla
             "MOVD   $tmp2,$src.hi\n\t"
             "PUNPCKLDQ $tmp,$tmp2\n\t"
             "MOVSD  $mem,$tmp\t # 64-bit atomic volatile long store" %}
-  opcode(0x3B);
-  ins_encode( OpcP, RegMem( EAX, mem ), enc_storeLX_reg_volatile(mem, src, tmp, tmp2));
+  ins_encode %{
+    __ cmpl(rax, $mem$$Address);
+    __ movdl($tmp$$XMMRegister, $src$$Register);
+    __ movdl($tmp2$$XMMRegister, HIGH_FROM_LOW($src$$Register));
+    __ punpckldq($tmp$$XMMRegister, $tmp2$$XMMRegister);
+    __ movdbl($mem$$Address, $tmp$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
@@ -7638,32 +7093,38 @@ instruct storeImmB(memory mem, immI8 src) %{
 %}
 
 // Store Aligned Packed Byte XMM register to memory
-instruct storeA8B(memory mem, regXD src) %{
+instruct storeA8B(memory mem, regD src) %{
   predicate(UseSSE>=1);
   match(Set mem (Store8B mem src));
   ins_cost(145);
   format %{ "MOVQ  $mem,$src\t! packed8B" %}
-  ins_encode( movq_st(mem, src));
+  ins_encode %{
+    __ movq($mem$$Address, $src$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Store Aligned Packed Char/Short XMM register to memory
-instruct storeA4C(memory mem, regXD src) %{
+instruct storeA4C(memory mem, regD src) %{
   predicate(UseSSE>=1);
   match(Set mem (Store4C mem src));
   ins_cost(145);
   format %{ "MOVQ  $mem,$src\t! packed4C" %}
-  ins_encode( movq_st(mem, src));
+  ins_encode %{
+    __ movq($mem$$Address, $src$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Store Aligned Packed Integer XMM register to memory
-instruct storeA2I(memory mem, regXD src) %{
+instruct storeA2I(memory mem, regD src) %{
   predicate(UseSSE>=1);
   match(Set mem (Store2I mem src));
   ins_cost(145);
   format %{ "MOVQ  $mem,$src\t! packed2I" %}
-  ins_encode( movq_st(mem, src));
+  ins_encode %{
+    __ movq($mem$$Address, $src$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
@@ -7679,98 +7140,116 @@ instruct storeImmCM(memory mem, immI8 src) %{
 %}
 
 // Store Double
-instruct storeD( memory mem, regDPR1 src) %{
+instruct storeDPR( memory mem, regDPR1 src) %{
   predicate(UseSSE<=1);
   match(Set mem (StoreD mem src));
 
   ins_cost(100);
   format %{ "FST_D  $mem,$src" %}
   opcode(0xDD);       /* DD /2 */
-  ins_encode( enc_FP_store(mem,src) );
+  ins_encode( enc_FPR_store(mem,src) );
   ins_pipe( fpu_mem_reg );
 %}
 
 // Store double does rounding on x86
-instruct storeD_rounded( memory mem, regDPR1 src) %{
+instruct storeDPR_rounded( memory mem, regDPR1 src) %{
   predicate(UseSSE<=1);
   match(Set mem (StoreD mem (RoundDouble src)));
 
   ins_cost(100);
   format %{ "FST_D  $mem,$src\t# round" %}
   opcode(0xDD);       /* DD /2 */
-  ins_encode( enc_FP_store(mem,src) );
+  ins_encode( enc_FPR_store(mem,src) );
   ins_pipe( fpu_mem_reg );
 %}
 
 // Store XMM register to memory (double-precision floating points)
 // MOVSD instruction
-instruct storeXD(memory mem, regXD src) %{
+instruct storeD(memory mem, regD src) %{
   predicate(UseSSE>=2);
   match(Set mem (StoreD mem src));
   ins_cost(95);
   format %{ "MOVSD  $mem,$src" %}
-  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x11), RegMem(src, mem));
+  ins_encode %{
+    __ movdbl($mem$$Address, $src$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Store XMM register to memory (single-precision floating point)
 // MOVSS instruction
-instruct storeX(memory mem, regX src) %{
+instruct storeF(memory mem, regF src) %{
   predicate(UseSSE>=1);
   match(Set mem (StoreF mem src));
   ins_cost(95);
   format %{ "MOVSS  $mem,$src" %}
-  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x11), RegMem(src, mem));
+  ins_encode %{
+    __ movflt($mem$$Address, $src$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Store Aligned Packed Single Float XMM register to memory
-instruct storeA2F(memory mem, regXD src) %{
+instruct storeA2F(memory mem, regD src) %{
   predicate(UseSSE>=1);
   match(Set mem (Store2F mem src));
   ins_cost(145);
   format %{ "MOVQ  $mem,$src\t! packed2F" %}
-  ins_encode( movq_st(mem, src));
+  ins_encode %{
+    __ movq($mem$$Address, $src$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Store Float
-instruct storeF( memory mem, regFPR1 src) %{
+instruct storeFPR( memory mem, regFPR1 src) %{
   predicate(UseSSE==0);
   match(Set mem (StoreF mem src));
 
   ins_cost(100);
   format %{ "FST_S  $mem,$src" %}
   opcode(0xD9);       /* D9 /2 */
-  ins_encode( enc_FP_store(mem,src) );
+  ins_encode( enc_FPR_store(mem,src) );
   ins_pipe( fpu_mem_reg );
 %}
 
 // Store Float does rounding on x86
-instruct storeF_rounded( memory mem, regFPR1 src) %{
+instruct storeFPR_rounded( memory mem, regFPR1 src) %{
   predicate(UseSSE==0);
   match(Set mem (StoreF mem (RoundFloat src)));
 
   ins_cost(100);
   format %{ "FST_S  $mem,$src\t# round" %}
   opcode(0xD9);       /* D9 /2 */
-  ins_encode( enc_FP_store(mem,src) );
+  ins_encode( enc_FPR_store(mem,src) );
   ins_pipe( fpu_mem_reg );
 %}
 
 // Store Float does rounding on x86
-instruct storeF_Drounded( memory mem, regDPR1 src) %{
+instruct storeFPR_Drounded( memory mem, regDPR1 src) %{
   predicate(UseSSE<=1);
   match(Set mem (StoreF mem (ConvD2F src)));
 
   ins_cost(100);
   format %{ "FST_S  $mem,$src\t# D-round" %}
   opcode(0xD9);       /* D9 /2 */
-  ins_encode( enc_FP_store(mem,src) );
+  ins_encode( enc_FPR_store(mem,src) );
   ins_pipe( fpu_mem_reg );
 %}
 
 // Store immediate Float value (it is faster than store from FPU register)
+// The instruction usage is guarded by predicate in operand immFPR().
+instruct storeFPR_imm( memory mem, immFPR src) %{
+  match(Set mem (StoreF mem src));
+
+  ins_cost(50);
+  format %{ "MOV    $mem,$src\t# store float" %}
+  opcode(0xC7);               /* C7 /0 */
+  ins_encode( OpcP, RMopc_Mem(0x00,mem),  Con32FPR_as_bits( src ));
+  ins_pipe( ialu_mem_imm );
+%}
+
+// Store immediate Float value (it is faster than store from XMM register)
 // The instruction usage is guarded by predicate in operand immF().
 instruct storeF_imm( memory mem, immF src) %{
   match(Set mem (StoreF mem src));
@@ -7782,18 +7261,6 @@ instruct storeF_imm( memory mem, immF src) %{
   ins_pipe( ialu_mem_imm );
 %}
 
-// Store immediate Float value (it is faster than store from XMM register)
-// The instruction usage is guarded by predicate in operand immXF().
-instruct storeX_imm( memory mem, immXF src) %{
-  match(Set mem (StoreF mem src));
-
-  ins_cost(50);
-  format %{ "MOV    $mem,$src\t# store float" %}
-  opcode(0xC7);               /* C7 /0 */
-  ins_encode( OpcP, RMopc_Mem(0x00,mem),  Con32XF_as_bits( src ));
-  ins_pipe( ialu_mem_imm );
-%}
-
 // Store Integer to stack slot
 instruct storeSSI(stackSlotI dst, eRegI src) %{
   match(Set dst src);
@@ -7901,6 +7368,16 @@ instruct unnecessary_membar_volatile() %{
   ins_pipe(empty);
 %}
 
+instruct membar_storestore() %{
+  match(MemBarStoreStore);
+  ins_cost(0);
+
+  size(0);
+  format %{ "MEMBAR-storestore (empty encoding)" %}
+  ins_encode( );
+  ins_pipe(empty);
+%}
+
 //----------Move Instructions--------------------------------------------------
 instruct castX2P(eAXRegP dst, eAXRegI src) %{
   match(Set dst (CastX2P src));
@@ -8088,29 +7565,29 @@ instruct cmovP_regUCF(cmpOpUCF cop, eFlagsRegUCF cr, eRegP dst, eRegP src ) %{
 //%}
 
 // Conditional move
-instruct fcmovD_regU(cmpOp_fcmov cop, eFlagsRegU cr, regDPR1 dst, regD src) %{
+instruct fcmovDPR_regU(cmpOp_fcmov cop, eFlagsRegU cr, regDPR1 dst, regDPR src) %{
   predicate(UseSSE<=1);
   match(Set dst (CMoveD (Binary cop cr) (Binary dst src)));
   ins_cost(200);
   format %{ "FCMOV$cop $dst,$src\t# double" %}
   opcode(0xDA);
-  ins_encode( enc_cmov_d(cop,src) );
-  ins_pipe( pipe_cmovD_reg );
+  ins_encode( enc_cmov_dpr(cop,src) );
+  ins_pipe( pipe_cmovDPR_reg );
 %}
 
 // Conditional move
-instruct fcmovF_regU(cmpOp_fcmov cop, eFlagsRegU cr, regFPR1 dst, regF src) %{
+instruct fcmovFPR_regU(cmpOp_fcmov cop, eFlagsRegU cr, regFPR1 dst, regFPR src) %{
   predicate(UseSSE==0);
   match(Set dst (CMoveF (Binary cop cr) (Binary dst src)));
   ins_cost(200);
   format %{ "FCMOV$cop $dst,$src\t# float" %}
   opcode(0xDA);
-  ins_encode( enc_cmov_d(cop,src) );
-  ins_pipe( pipe_cmovD_reg );
+  ins_encode( enc_cmov_dpr(cop,src) );
+  ins_pipe( pipe_cmovDPR_reg );
 %}
 
 // Float CMOV on Intel doesn't handle *signed* compares, only unsigned.
-instruct fcmovD_regS(cmpOp cop, eFlagsReg cr, regD dst, regD src) %{
+instruct fcmovDPR_regS(cmpOp cop, eFlagsReg cr, regDPR dst, regDPR src) %{
   predicate(UseSSE<=1);
   match(Set dst (CMoveD (Binary cop cr) (Binary dst src)));
   ins_cost(200);
@@ -8118,12 +7595,12 @@ instruct fcmovD_regS(cmpOp cop, eFlagsReg cr, regD dst, regD src) %{
             "MOV    $dst,$src\t# double\n"
       "skip:" %}
   opcode (0xdd, 0x3);     /* DD D8+i or DD /3 */
-  ins_encode( enc_cmov_branch( cop, 0x4 ), Push_Reg_D(src), OpcP, RegOpc(dst) );
-  ins_pipe( pipe_cmovD_reg );
+  ins_encode( enc_cmov_branch( cop, 0x4 ), Push_Reg_DPR(src), OpcP, RegOpc(dst) );
+  ins_pipe( pipe_cmovDPR_reg );
 %}
 
 // Float CMOV on Intel doesn't handle *signed* compares, only unsigned.
-instruct fcmovF_regS(cmpOp cop, eFlagsReg cr, regF dst, regF src) %{
+instruct fcmovFPR_regS(cmpOp cop, eFlagsReg cr, regFPR dst, regFPR src) %{
   predicate(UseSSE==0);
   match(Set dst (CMoveF (Binary cop cr) (Binary dst src)));
   ins_cost(200);
@@ -8131,12 +7608,12 @@ instruct fcmovF_regS(cmpOp cop, eFlagsReg cr, regF dst, regF src) %{
             "MOV    $dst,$src\t# float\n"
       "skip:" %}
   opcode (0xdd, 0x3);     /* DD D8+i or DD /3 */
-  ins_encode( enc_cmov_branch( cop, 0x4 ), Push_Reg_F(src), OpcP, RegOpc(dst) );
-  ins_pipe( pipe_cmovD_reg );
+  ins_encode( enc_cmov_branch( cop, 0x4 ), Push_Reg_FPR(src), OpcP, RegOpc(dst) );
+  ins_pipe( pipe_cmovDPR_reg );
 %}
 
 // No CMOVE with SSE/SSE2
-instruct fcmovX_regS(cmpOp cop, eFlagsReg cr, regX dst, regX src) %{
+instruct fcmovF_regS(cmpOp cop, eFlagsReg cr, regF dst, regF src) %{
   predicate (UseSSE>=1);
   match(Set dst (CMoveF (Binary cop cr) (Binary dst src)));
   ins_cost(200);
@@ -8154,7 +7631,7 @@ instruct fcmovX_regS(cmpOp cop, eFlagsReg cr, regX dst, regX src) %{
 %}
 
 // No CMOVE with SSE/SSE2
-instruct fcmovXD_regS(cmpOp cop, eFlagsReg cr, regXD dst, regXD src) %{
+instruct fcmovD_regS(cmpOp cop, eFlagsReg cr, regD dst, regD src) %{
   predicate (UseSSE>=2);
   match(Set dst (CMoveD (Binary cop cr) (Binary dst src)));
   ins_cost(200);
@@ -8172,7 +7649,7 @@ instruct fcmovXD_regS(cmpOp cop, eFlagsReg cr, regXD dst, regXD src) %{
 %}
 
 // unsigned version
-instruct fcmovX_regU(cmpOpU cop, eFlagsRegU cr, regX dst, regX src) %{
+instruct fcmovF_regU(cmpOpU cop, eFlagsRegU cr, regF dst, regF src) %{
   predicate (UseSSE>=1);
   match(Set dst (CMoveF (Binary cop cr) (Binary dst src)));
   ins_cost(200);
@@ -8189,17 +7666,17 @@ instruct fcmovX_regU(cmpOpU cop, eFlagsRegU cr, regX dst, regX src) %{
   ins_pipe( pipe_slow );
 %}
 
-instruct fcmovX_regUCF(cmpOpUCF cop, eFlagsRegUCF cr, regX dst, regX src) %{
+instruct fcmovF_regUCF(cmpOpUCF cop, eFlagsRegUCF cr, regF dst, regF src) %{
   predicate (UseSSE>=1);
   match(Set dst (CMoveF (Binary cop cr) (Binary dst src)));
   ins_cost(200);
   expand %{
-    fcmovX_regU(cop, cr, dst, src);
+    fcmovF_regU(cop, cr, dst, src);
   %}
 %}
 
 // unsigned version
-instruct fcmovXD_regU(cmpOpU cop, eFlagsRegU cr, regXD dst, regXD src) %{
+instruct fcmovD_regU(cmpOpU cop, eFlagsRegU cr, regD dst, regD src) %{
   predicate (UseSSE>=2);
   match(Set dst (CMoveD (Binary cop cr) (Binary dst src)));
   ins_cost(200);
@@ -8216,12 +7693,12 @@ instruct fcmovXD_regU(cmpOpU cop, eFlagsRegU cr, regXD dst, regXD src) %{
   ins_pipe( pipe_slow );
 %}
 
-instruct fcmovXD_regUCF(cmpOpUCF cop, eFlagsRegUCF cr, regXD dst, regXD src) %{
+instruct fcmovD_regUCF(cmpOpUCF cop, eFlagsRegUCF cr, regD dst, regD src) %{
   predicate (UseSSE>=2);
   match(Set dst (CMoveD (Binary cop cr) (Binary dst src)));
   ins_cost(200);
   expand %{
-    fcmovXD_regU(cop, cr, dst, src);
+    fcmovD_regU(cop, cr, dst, src);
   %}
 %}
 
@@ -8440,7 +7917,7 @@ instruct loadPLocked(eRegP dst, memory mem) %{
 %}
 
 // LoadLong-locked - same as a volatile long load when used with compare-swap
-instruct loadLLocked(stackSlotL dst, load_long_memory mem) %{
+instruct loadLLocked(stackSlotL dst, memory mem) %{
   predicate(UseSSE<=1);
   match(Set dst (LoadLLocked mem));
 
@@ -8451,18 +7928,21 @@ instruct loadLLocked(stackSlotL dst, load_long_memory mem) %{
   ins_pipe( fpu_reg_mem );
 %}
 
-instruct loadLX_Locked(stackSlotL dst, load_long_memory mem, regXD tmp) %{
+instruct loadLX_Locked(stackSlotL dst, memory mem, regD tmp) %{
   predicate(UseSSE>=2);
   match(Set dst (LoadLLocked mem));
   effect(TEMP tmp);
   ins_cost(180);
   format %{ "MOVSD  $tmp,$mem\t# Atomic volatile long load\n\t"
             "MOVSD  $dst,$tmp" %}
-  ins_encode(enc_loadLX_volatile(mem, dst, tmp));
+  ins_encode %{
+    __ movdbl($tmp$$XMMRegister, $mem$$Address);
+    __ movdbl(Address(rsp, $dst$$disp), $tmp$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
-instruct loadLX_reg_Locked(eRegL dst, load_long_memory mem, regXD tmp) %{
+instruct loadLX_reg_Locked(eRegL dst, memory mem, regD tmp) %{
   predicate(UseSSE>=2);
   match(Set dst (LoadLLocked mem));
   effect(TEMP tmp);
@@ -8471,7 +7951,12 @@ instruct loadLX_reg_Locked(eRegL dst, load_long_memory mem, regXD tmp) %{
             "MOVD   $dst.lo,$tmp\n\t"
             "PSRLQ  $tmp,32\n\t"
             "MOVD   $dst.hi,$tmp" %}
-  ins_encode(enc_loadLX_reg_volatile(mem, dst, tmp));
+  ins_encode %{
+    __ movdbl($tmp$$XMMRegister, $mem$$Address);
+    __ movdl($dst$$Register, $tmp$$XMMRegister);
+    __ psrlq($tmp$$XMMRegister, 32);
+    __ movdl(HIGH_FROM_LOW($dst$$Register), $tmp$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
@@ -10054,7 +9539,7 @@ instruct sarL_eReg_CL(eRegL dst, eCXRegI shift, eFlagsReg cr) %{
 // Compare & branch
 
 // P6 version of float compare, sets condition codes in EFLAGS
-instruct cmpD_cc_P6(eFlagsRegU cr, regD src1, regD src2, eAXRegI rax) %{
+instruct cmpDPR_cc_P6(eFlagsRegU cr, regDPR src1, regDPR src2, eAXRegI rax) %{
   predicate(VM_Version::supports_cmov() && UseSSE <=1);
   match(Set cr (CmpD src1 src2));
   effect(KILL rax);
@@ -10066,26 +9551,26 @@ instruct cmpD_cc_P6(eFlagsRegU cr, regD src1, regD src2, eAXRegI rax) %{
             "SAHF\n"
      "exit:\tNOP               // avoid branch to branch" %}
   opcode(0xDF, 0x05); /* DF E8+i or DF /5 */
-  ins_encode( Push_Reg_D(src1),
+  ins_encode( Push_Reg_DPR(src1),
               OpcP, RegOpc(src2),
               cmpF_P6_fixup );
   ins_pipe( pipe_slow );
 %}
 
-instruct cmpD_cc_P6CF(eFlagsRegUCF cr, regD src1, regD src2) %{
+instruct cmpDPR_cc_P6CF(eFlagsRegUCF cr, regDPR src1, regDPR src2) %{
   predicate(VM_Version::supports_cmov() && UseSSE <=1);
   match(Set cr (CmpD src1 src2));
   ins_cost(150);
   format %{ "FLD    $src1\n\t"
             "FUCOMIP ST,$src2  // P6 instruction" %}
   opcode(0xDF, 0x05); /* DF E8+i or DF /5 */
-  ins_encode( Push_Reg_D(src1),
+  ins_encode( Push_Reg_DPR(src1),
               OpcP, RegOpc(src2));
   ins_pipe( pipe_slow );
 %}
 
 // Compare & branch
-instruct cmpD_cc(eFlagsRegU cr, regD src1, regD src2, eAXRegI rax) %{
+instruct cmpDPR_cc(eFlagsRegU cr, regDPR src1, regDPR src2, eAXRegI rax) %{
   predicate(UseSSE<=1);
   match(Set cr (CmpD src1 src2));
   effect(KILL rax);
@@ -10098,138 +9583,140 @@ instruct cmpD_cc(eFlagsRegU cr, regD src1, regD src2, eAXRegI rax) %{
             "MOV    AH,1\t# unordered treat as LT\n"
     "flags:\tSAHF" %}
   opcode(0xD8, 0x3); /* D8 D8+i or D8 /3 */
-  ins_encode( Push_Reg_D(src1),
+  ins_encode( Push_Reg_DPR(src1),
               OpcP, RegOpc(src2),
               fpu_flags);
   ins_pipe( pipe_slow );
 %}
 
 // Compare vs zero into -1,0,1
-instruct cmpD_0(eRegI dst, regD src1, immD0 zero, eAXRegI rax, eFlagsReg cr) %{
+instruct cmpDPR_0(eRegI dst, regDPR src1, immDPR0 zero, eAXRegI rax, eFlagsReg cr) %{
   predicate(UseSSE<=1);
   match(Set dst (CmpD3 src1 zero));
   effect(KILL cr, KILL rax);
   ins_cost(280);
   format %{ "FTSTD  $dst,$src1" %}
   opcode(0xE4, 0xD9);
-  ins_encode( Push_Reg_D(src1),
+  ins_encode( Push_Reg_DPR(src1),
               OpcS, OpcP, PopFPU,
               CmpF_Result(dst));
   ins_pipe( pipe_slow );
 %}
 
 // Compare into -1,0,1
-instruct cmpD_reg(eRegI dst, regD src1, regD src2, eAXRegI rax, eFlagsReg cr) %{
+instruct cmpDPR_reg(eRegI dst, regDPR src1, regDPR src2, eAXRegI rax, eFlagsReg cr) %{
   predicate(UseSSE<=1);
   match(Set dst (CmpD3 src1 src2));
   effect(KILL cr, KILL rax);
   ins_cost(300);
   format %{ "FCMPD  $dst,$src1,$src2" %}
   opcode(0xD8, 0x3); /* D8 D8+i or D8 /3 */
-  ins_encode( Push_Reg_D(src1),
+  ins_encode( Push_Reg_DPR(src1),
               OpcP, RegOpc(src2),
               CmpF_Result(dst));
   ins_pipe( pipe_slow );
 %}
 
 // float compare and set condition codes in EFLAGS by XMM regs
-instruct cmpXD_cc(eFlagsRegU cr, regXD dst, regXD src, eAXRegI rax) %{
+instruct cmpD_cc(eFlagsRegU cr, regD src1, regD src2) %{
   predicate(UseSSE>=2);
-  match(Set cr (CmpD dst src));
-  effect(KILL rax);
-  ins_cost(125);
-  format %{ "COMISD $dst,$src\n"
-          "\tJNP    exit\n"
-          "\tMOV    ah,1       // saw a NaN, set CF\n"
-          "\tSAHF\n"
-     "exit:\tNOP               // avoid branch to branch" %}
-  opcode(0x66, 0x0F, 0x2F);
-  ins_encode(OpcP, OpcS, Opcode(tertiary), RegReg(dst, src), cmpF_P6_fixup);
+  match(Set cr (CmpD src1 src2));
+  ins_cost(145);
+  format %{ "UCOMISD $src1,$src2\n\t"
+            "JNP,s   exit\n\t"
+            "PUSHF\t# saw NaN, set CF\n\t"
+            "AND     [rsp], #0xffffff2b\n\t"
+            "POPF\n"
+    "exit:" %}
+  ins_encode %{
+    __ ucomisd($src1$$XMMRegister, $src2$$XMMRegister);
+    emit_cmpfp_fixup(_masm);
+  %}
   ins_pipe( pipe_slow );
 %}
 
-instruct cmpXD_ccCF(eFlagsRegUCF cr, regXD dst, regXD src) %{
+instruct cmpD_ccCF(eFlagsRegUCF cr, regD src1, regD src2) %{
   predicate(UseSSE>=2);
-  match(Set cr (CmpD dst src));
+  match(Set cr (CmpD src1 src2));
   ins_cost(100);
-  format %{ "COMISD $dst,$src" %}
-  opcode(0x66, 0x0F, 0x2F);
-  ins_encode(OpcP, OpcS, Opcode(tertiary), RegReg(dst, src));
+  format %{ "UCOMISD $src1,$src2" %}
+  ins_encode %{
+    __ ucomisd($src1$$XMMRegister, $src2$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // float compare and set condition codes in EFLAGS by XMM regs
-instruct cmpXD_ccmem(eFlagsRegU cr, regXD dst, memory src, eAXRegI rax) %{
+instruct cmpD_ccmem(eFlagsRegU cr, regD src1, memory src2) %{
   predicate(UseSSE>=2);
-  match(Set cr (CmpD dst (LoadD src)));
-  effect(KILL rax);
+  match(Set cr (CmpD src1 (LoadD src2)));
   ins_cost(145);
-  format %{ "COMISD $dst,$src\n"
-          "\tJNP    exit\n"
-          "\tMOV    ah,1       // saw a NaN, set CF\n"
-          "\tSAHF\n"
-     "exit:\tNOP               // avoid branch to branch" %}
-  opcode(0x66, 0x0F, 0x2F);
-  ins_encode(OpcP, OpcS, Opcode(tertiary), RegMem(dst, src), cmpF_P6_fixup);
+  format %{ "UCOMISD $src1,$src2\n\t"
+            "JNP,s   exit\n\t"
+            "PUSHF\t# saw NaN, set CF\n\t"
+            "AND     [rsp], #0xffffff2b\n\t"
+            "POPF\n"
+    "exit:" %}
+  ins_encode %{
+    __ ucomisd($src1$$XMMRegister, $src2$$Address);
+    emit_cmpfp_fixup(_masm);
+  %}
   ins_pipe( pipe_slow );
 %}
 
-instruct cmpXD_ccmemCF(eFlagsRegUCF cr, regXD dst, memory src) %{
+instruct cmpD_ccmemCF(eFlagsRegUCF cr, regD src1, memory src2) %{
   predicate(UseSSE>=2);
-  match(Set cr (CmpD dst (LoadD src)));
+  match(Set cr (CmpD src1 (LoadD src2)));
   ins_cost(100);
-  format %{ "COMISD $dst,$src" %}
-  opcode(0x66, 0x0F, 0x2F);
-  ins_encode(OpcP, OpcS, Opcode(tertiary), RegMem(dst, src));
+  format %{ "UCOMISD $src1,$src2" %}
+  ins_encode %{
+    __ ucomisd($src1$$XMMRegister, $src2$$Address);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Compare into -1,0,1 in XMM
-instruct cmpXD_reg(eRegI dst, regXD src1, regXD src2, eFlagsReg cr) %{
+instruct cmpD_reg(xRegI dst, regD src1, regD src2, eFlagsReg cr) %{
   predicate(UseSSE>=2);
   match(Set dst (CmpD3 src1 src2));
   effect(KILL cr);
   ins_cost(255);
-  format %{ "XOR    $dst,$dst\n"
-          "\tCOMISD $src1,$src2\n"
-          "\tJP,s   nan\n"
-          "\tJEQ,s  exit\n"
-          "\tJA,s   inc\n"
-      "nan:\tDEC    $dst\n"
-          "\tJMP,s  exit\n"
-      "inc:\tINC    $dst\n"
-      "exit:"
-                %}
-  opcode(0x66, 0x0F, 0x2F);
-  ins_encode(Xor_Reg(dst), OpcP, OpcS, Opcode(tertiary), RegReg(src1, src2),
-             CmpX_Result(dst));
+  format %{ "UCOMISD $src1, $src2\n\t"
+            "MOV     $dst, #-1\n\t"
+            "JP,s    done\n\t"
+            "JB,s    done\n\t"
+            "SETNE   $dst\n\t"
+            "MOVZB   $dst, $dst\n"
+    "done:" %}
+  ins_encode %{
+    __ ucomisd($src1$$XMMRegister, $src2$$XMMRegister);
+    emit_cmpfp3(_masm, $dst$$Register);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Compare into -1,0,1 in XMM and memory
-instruct cmpXD_regmem(eRegI dst, regXD src1, memory mem, eFlagsReg cr) %{
+instruct cmpD_regmem(xRegI dst, regD src1, memory src2, eFlagsReg cr) %{
   predicate(UseSSE>=2);
-  match(Set dst (CmpD3 src1 (LoadD mem)));
+  match(Set dst (CmpD3 src1 (LoadD src2)));
   effect(KILL cr);
   ins_cost(275);
-  format %{ "COMISD $src1,$mem\n"
-          "\tMOV    $dst,0\t\t# do not blow flags\n"
-          "\tJP,s   nan\n"
-          "\tJEQ,s  exit\n"
-          "\tJA,s   inc\n"
-      "nan:\tDEC    $dst\n"
-          "\tJMP,s  exit\n"
-      "inc:\tINC    $dst\n"
-      "exit:"
-                %}
-  opcode(0x66, 0x0F, 0x2F);
-  ins_encode(OpcP, OpcS, Opcode(tertiary), RegMem(src1, mem),
-             LdImmI(dst,0x0), CmpX_Result(dst));
+  format %{ "UCOMISD $src1, $src2\n\t"
+            "MOV     $dst, #-1\n\t"
+            "JP,s    done\n\t"
+            "JB,s    done\n\t"
+            "SETNE   $dst\n\t"
+            "MOVZB   $dst, $dst\n"
+    "done:" %}
+  ins_encode %{
+    __ ucomisd($src1$$XMMRegister, $src2$$Address);
+    emit_cmpfp3(_masm, $dst$$Register);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 
-instruct subD_reg(regD dst, regD src) %{
+instruct subDPR_reg(regDPR dst, regDPR src) %{
   predicate (UseSSE <=1);
   match(Set dst (SubD dst src));
 
@@ -10237,12 +9724,12 @@ instruct subD_reg(regD dst, regD src) %{
             "DSUBp  $dst,ST" %}
   opcode(0xDE, 0x5); /* DE E8+i  or DE /5 */
   ins_cost(150);
-  ins_encode( Push_Reg_D(src),
+  ins_encode( Push_Reg_DPR(src),
               OpcP, RegOpc(dst) );
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct subD_reg_round(stackSlotD dst, regD src1, regD src2) %{
+instruct subDPR_reg_round(stackSlotD dst, regDPR src1, regDPR src2) %{
   predicate (UseSSE <=1);
   match(Set dst (RoundDouble (SubD src1 src2)));
   ins_cost(250);
@@ -10251,13 +9738,13 @@ instruct subD_reg_round(stackSlotD dst, regD src1, regD src2) %{
             "DSUB   ST,$src1\n\t"
             "FSTP_D $dst\t# D-round" %}
   opcode(0xD8, 0x5);
-  ins_encode( Push_Reg_D(src2),
-              OpcP, RegOpc(src1), Pop_Mem_D(dst) );
+  ins_encode( Push_Reg_DPR(src2),
+              OpcP, RegOpc(src1), Pop_Mem_DPR(dst) );
   ins_pipe( fpu_mem_reg_reg );
 %}
 
 
-instruct subD_reg_mem(regD dst, memory src) %{
+instruct subDPR_reg_mem(regDPR dst, memory src) %{
   predicate (UseSSE <=1);
   match(Set dst (SubD dst (LoadD src)));
   ins_cost(150);
@@ -10270,7 +9757,7 @@ instruct subD_reg_mem(regD dst, memory src) %{
   ins_pipe( fpu_reg_mem );
 %}
 
-instruct absD_reg(regDPR1 dst, regDPR1 src) %{
+instruct absDPR_reg(regDPR1 dst, regDPR1 src) %{
   predicate (UseSSE<=1);
   match(Set dst (AbsD src));
   ins_cost(100);
@@ -10280,15 +9767,7 @@ instruct absD_reg(regDPR1 dst, regDPR1 src) %{
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct absXD_reg( regXD dst ) %{
-  predicate(UseSSE>=2);
-  match(Set dst (AbsD dst));
-  format %{ "ANDPD  $dst,[0x7FFFFFFFFFFFFFFF]\t# ABS D by sign masking" %}
-  ins_encode( AbsXD_encoding(dst));
-  ins_pipe( pipe_slow );
-%}
-
-instruct negD_reg(regDPR1 dst, regDPR1 src) %{
+instruct negDPR_reg(regDPR1 dst, regDPR1 src) %{
   predicate(UseSSE<=1);
   match(Set dst (NegD src));
   ins_cost(100);
@@ -10298,18 +9777,7 @@ instruct negD_reg(regDPR1 dst, regDPR1 src) %{
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct negXD_reg( regXD dst ) %{
-  predicate(UseSSE>=2);
-  match(Set dst (NegD dst));
-  format %{ "XORPD  $dst,[0x8000000000000000]\t# CHS D by sign flipping" %}
-  ins_encode %{
-     __ xorpd($dst$$XMMRegister,
-              ExternalAddress((address)double_signflip_pool));
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct addD_reg(regD dst, regD src) %{
+instruct addDPR_reg(regDPR dst, regDPR src) %{
   predicate(UseSSE<=1);
   match(Set dst (AddD dst src));
   format %{ "FLD    $src\n\t"
@@ -10317,13 +9785,13 @@ instruct addD_reg(regD dst, regD src) %{
   size(4);
   ins_cost(150);
   opcode(0xDE, 0x0); /* DE C0+i or DE /0*/
-  ins_encode( Push_Reg_D(src),
+  ins_encode( Push_Reg_DPR(src),
               OpcP, RegOpc(dst) );
   ins_pipe( fpu_reg_reg );
 %}
 
 
-instruct addD_reg_round(stackSlotD dst, regD src1, regD src2) %{
+instruct addDPR_reg_round(stackSlotD dst, regDPR src1, regDPR src2) %{
   predicate(UseSSE<=1);
   match(Set dst (RoundDouble (AddD src1 src2)));
   ins_cost(250);
@@ -10332,13 +9800,13 @@ instruct addD_reg_round(stackSlotD dst, regD src1, regD src2) %{
             "DADD   ST,$src1\n\t"
             "FSTP_D $dst\t# D-round" %}
   opcode(0xD8, 0x0); /* D8 C0+i or D8 /0*/
-  ins_encode( Push_Reg_D(src2),
-              OpcP, RegOpc(src1), Pop_Mem_D(dst) );
+  ins_encode( Push_Reg_DPR(src2),
+              OpcP, RegOpc(src1), Pop_Mem_DPR(dst) );
   ins_pipe( fpu_mem_reg_reg );
 %}
 
 
-instruct addD_reg_mem(regD dst, memory src) %{
+instruct addDPR_reg_mem(regDPR dst, memory src) %{
   predicate(UseSSE<=1);
   match(Set dst (AddD dst (LoadD src)));
   ins_cost(150);
@@ -10352,7 +9820,7 @@ instruct addD_reg_mem(regD dst, memory src) %{
 %}
 
 // add-to-memory
-instruct addD_mem_reg(memory dst, regD src) %{
+instruct addDPR_mem_reg(memory dst, regDPR src) %{
   predicate(UseSSE<=1);
   match(Set dst (StoreD dst (RoundDouble (AddD (LoadD dst) src))));
   ins_cost(150);
@@ -10368,7 +9836,7 @@ instruct addD_mem_reg(memory dst, regD src) %{
   ins_pipe( fpu_reg_mem );
 %}
 
-instruct addD_reg_imm1(regD dst, immD1 con) %{
+instruct addDPR_reg_imm1(regDPR dst, immDPR1 con) %{
   predicate(UseSSE<=1);
   match(Set dst (AddD dst con));
   ins_cost(125);
@@ -10381,7 +9849,7 @@ instruct addD_reg_imm1(regD dst, immD1 con) %{
   ins_pipe(fpu_reg);
 %}
 
-instruct addD_reg_imm(regD dst, immD con) %{
+instruct addDPR_reg_imm(regDPR dst, immDPR con) %{
   predicate(UseSSE<=1 && _kids[1]->_leaf->getd() != 0.0 && _kids[1]->_leaf->getd() != 1.0 );
   match(Set dst (AddD dst con));
   ins_cost(200);
@@ -10394,7 +9862,7 @@ instruct addD_reg_imm(regD dst, immD con) %{
   ins_pipe(fpu_reg_mem);
 %}
 
-instruct addD_reg_imm_round(stackSlotD dst, regD src, immD con) %{
+instruct addDPR_reg_imm_round(stackSlotD dst, regDPR src, immDPR con) %{
   predicate(UseSSE<=1 && _kids[0]->_kids[1]->_leaf->getd() != 0.0 && _kids[0]->_kids[1]->_leaf->getd() != 1.0 );
   match(Set dst (RoundDouble (AddD src con)));
   ins_cost(200);
@@ -10409,124 +9877,14 @@ instruct addD_reg_imm_round(stackSlotD dst, regD src, immD con) %{
   ins_pipe(fpu_mem_reg_con);
 %}
 
-// Add two double precision floating point values in xmm
-instruct addXD_reg(regXD dst, regXD src) %{
-  predicate(UseSSE>=2);
-  match(Set dst (AddD dst src));
-  format %{ "ADDSD  $dst,$src" %}
-  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x58), RegReg(dst, src));
-  ins_pipe( pipe_slow );
-%}
-
-instruct addXD_imm(regXD dst, immXD con) %{
-  predicate(UseSSE>=2);
-  match(Set dst (AddD dst con));
-  format %{ "ADDSD  $dst,[$constantaddress]\t# load from constant table: double=$con" %}
-  ins_encode %{
-    __ addsd($dst$$XMMRegister, $constantaddress($con));
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-instruct addXD_mem(regXD dst, memory mem) %{
-  predicate(UseSSE>=2);
-  match(Set dst (AddD dst (LoadD mem)));
-  format %{ "ADDSD  $dst,$mem" %}
-  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x58), RegMem(dst,mem));
-  ins_pipe( pipe_slow );
-%}
-
-// Sub two double precision floating point values in xmm
-instruct subXD_reg(regXD dst, regXD src) %{
-  predicate(UseSSE>=2);
-  match(Set dst (SubD dst src));
-  format %{ "SUBSD  $dst,$src" %}
-  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x5C), RegReg(dst, src));
-  ins_pipe( pipe_slow );
-%}
-
-instruct subXD_imm(regXD dst, immXD con) %{
-  predicate(UseSSE>=2);
-  match(Set dst (SubD dst con));
-  format %{ "SUBSD  $dst,[$constantaddress]\t# load from constant table: double=$con" %}
-  ins_encode %{
-    __ subsd($dst$$XMMRegister, $constantaddress($con));
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-instruct subXD_mem(regXD dst, memory mem) %{
-  predicate(UseSSE>=2);
-  match(Set dst (SubD dst (LoadD mem)));
-  format %{ "SUBSD  $dst,$mem" %}
-  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x5C), RegMem(dst,mem));
-  ins_pipe( pipe_slow );
-%}
-
-// Mul two double precision floating point values in xmm
-instruct mulXD_reg(regXD dst, regXD src) %{
-  predicate(UseSSE>=2);
-  match(Set dst (MulD dst src));
-  format %{ "MULSD  $dst,$src" %}
-  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x59), RegReg(dst, src));
-  ins_pipe( pipe_slow );
-%}
-
-instruct mulXD_imm(regXD dst, immXD con) %{
-  predicate(UseSSE>=2);
-  match(Set dst (MulD dst con));
-  format %{ "MULSD  $dst,[$constantaddress]\t# load from constant table: double=$con" %}
-  ins_encode %{
-    __ mulsd($dst$$XMMRegister, $constantaddress($con));
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-instruct mulXD_mem(regXD dst, memory mem) %{
-  predicate(UseSSE>=2);
-  match(Set dst (MulD dst (LoadD mem)));
-  format %{ "MULSD  $dst,$mem" %}
-  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x59), RegMem(dst,mem));
-  ins_pipe( pipe_slow );
-%}
-
-// Div two double precision floating point values in xmm
-instruct divXD_reg(regXD dst, regXD src) %{
-  predicate(UseSSE>=2);
-  match(Set dst (DivD dst src));
-  format %{ "DIVSD  $dst,$src" %}
-  opcode(0xF2, 0x0F, 0x5E);
-  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x5E), RegReg(dst, src));
-  ins_pipe( pipe_slow );
-%}
-
-instruct divXD_imm(regXD dst, immXD con) %{
-  predicate(UseSSE>=2);
-  match(Set dst (DivD dst con));
-  format %{ "DIVSD  $dst,[$constantaddress]\t# load from constant table: double=$con" %}
-  ins_encode %{
-    __ divsd($dst$$XMMRegister, $constantaddress($con));
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-instruct divXD_mem(regXD dst, memory mem) %{
-  predicate(UseSSE>=2);
-  match(Set dst (DivD dst (LoadD mem)));
-  format %{ "DIVSD  $dst,$mem" %}
-  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x5E), RegMem(dst,mem));
-  ins_pipe( pipe_slow );
-%}
-
-
-instruct mulD_reg(regD dst, regD src) %{
+instruct mulDPR_reg(regDPR dst, regDPR src) %{
   predicate(UseSSE<=1);
   match(Set dst (MulD dst src));
   format %{ "FLD    $src\n\t"
             "DMULp  $dst,ST" %}
   opcode(0xDE, 0x1); /* DE C8+i or DE /1*/
   ins_cost(150);
-  ins_encode( Push_Reg_D(src),
+  ins_encode( Push_Reg_DPR(src),
               OpcP, RegOpc(dst) );
   ins_pipe( fpu_reg_reg );
 %}
@@ -10539,7 +9897,7 @@ instruct mulD_reg(regD dst, regD src) %{
 // multiply scaled arg1 by arg2
 // rescale product by 2^(15360)
 //
-instruct strictfp_mulD_reg(regDPR1 dst, regnotDPR1 src) %{
+instruct strictfp_mulDPR_reg(regDPR1 dst, regnotDPR1 src) %{
   predicate( UseSSE<=1 && Compile::current()->has_method() && Compile::current()->method()->is_strict() );
   match(Set dst (MulD dst src));
   ins_cost(1);   // Select this instruction for all strict FP double multiplies
@@ -10552,13 +9910,13 @@ instruct strictfp_mulD_reg(regDPR1 dst, regnotDPR1 src) %{
             "DMULp  $dst,ST\n\t" %}
   opcode(0xDE, 0x1); /* DE C8+i or DE /1*/
   ins_encode( strictfp_bias1(dst),
-              Push_Reg_D(src),
+              Push_Reg_DPR(src),
               OpcP, RegOpc(dst),
               strictfp_bias2(dst) );
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct mulD_reg_imm(regD dst, immD con) %{
+instruct mulDPR_reg_imm(regDPR dst, immDPR con) %{
   predicate( UseSSE<=1 && _kids[1]->_leaf->getd() != 0.0 && _kids[1]->_leaf->getd() != 1.0 );
   match(Set dst (MulD dst con));
   ins_cost(200);
@@ -10572,7 +9930,7 @@ instruct mulD_reg_imm(regD dst, immD con) %{
 %}
 
 
-instruct mulD_reg_mem(regD dst, memory src) %{
+instruct mulDPR_reg_mem(regDPR dst, memory src) %{
   predicate( UseSSE<=1 );
   match(Set dst (MulD dst (LoadD src)));
   ins_cost(200);
@@ -10586,7 +9944,7 @@ instruct mulD_reg_mem(regD dst, memory src) %{
 
 //
 // Cisc-alternate to reg-reg multiply
-instruct mulD_reg_mem_cisc(regD dst, regD src, memory mem) %{
+instruct mulDPR_reg_mem_cisc(regDPR dst, regDPR src, memory mem) %{
   predicate( UseSSE<=1 );
   match(Set dst (MulD src (LoadD mem)));
   ins_cost(250);
@@ -10595,17 +9953,17 @@ instruct mulD_reg_mem_cisc(regD dst, regD src, memory mem) %{
             "FSTP_D $dst" %}
   opcode(0xD8, 0x1, 0xD9); /* D8 C8+i */  /* LoadD D9 /0 */
   ins_encode( Opcode(tertiary), RMopc_Mem(0x00,mem),
-              OpcReg_F(src),
-              Pop_Reg_D(dst) );
+              OpcReg_FPR(src),
+              Pop_Reg_DPR(dst) );
   ins_pipe( fpu_reg_reg_mem );
 %}
 
 
-// MACRO3 -- addD a mulD
+// MACRO3 -- addDPR a mulDPR
 // This instruction is a '2-address' instruction in that the result goes
 // back to src2.  This eliminates a move from the macro; possibly the
 // register allocator will have to add it back (and maybe not).
-instruct addD_mulD_reg(regD src2, regD src1, regD src0) %{
+instruct addDPR_mulDPR_reg(regDPR src2, regDPR src1, regDPR src0) %{
   predicate( UseSSE<=1 );
   match(Set src2 (AddD (MulD src0 src1) src2));
   format %{ "FLD    $src0\t# ===MACRO3d===\n\t"
@@ -10613,29 +9971,29 @@ instruct addD_mulD_reg(regD src2, regD src1, regD src0) %{
             "DADDp  $src2,ST" %}
   ins_cost(250);
   opcode(0xDD); /* LoadD DD /0 */
-  ins_encode( Push_Reg_F(src0),
+  ins_encode( Push_Reg_FPR(src0),
               FMul_ST_reg(src1),
               FAddP_reg_ST(src2) );
   ins_pipe( fpu_reg_reg_reg );
 %}
 
 
-// MACRO3 -- subD a mulD
-instruct subD_mulD_reg(regD src2, regD src1, regD src0) %{
+// MACRO3 -- subDPR a mulDPR
+instruct subDPR_mulDPR_reg(regDPR src2, regDPR src1, regDPR src0) %{
   predicate( UseSSE<=1 );
   match(Set src2 (SubD (MulD src0 src1) src2));
   format %{ "FLD    $src0\t# ===MACRO3d===\n\t"
             "DMUL   ST,$src1\n\t"
             "DSUBRp $src2,ST" %}
   ins_cost(250);
-  ins_encode( Push_Reg_F(src0),
+  ins_encode( Push_Reg_FPR(src0),
               FMul_ST_reg(src1),
               Opcode(0xDE), Opc_plus(0xE0,src2));
   ins_pipe( fpu_reg_reg_reg );
 %}
 
 
-instruct divD_reg(regD dst, regD src) %{
+instruct divDPR_reg(regDPR dst, regDPR src) %{
   predicate( UseSSE<=1 );
   match(Set dst (DivD dst src));
 
@@ -10643,7 +10001,7 @@ instruct divD_reg(regD dst, regD src) %{
             "FDIVp  $dst,ST" %}
   opcode(0xDE, 0x7); /* DE F8+i or DE /7*/
   ins_cost(150);
-  ins_encode( Push_Reg_D(src),
+  ins_encode( Push_Reg_DPR(src),
               OpcP, RegOpc(dst) );
   ins_pipe( fpu_reg_reg );
 %}
@@ -10656,7 +10014,7 @@ instruct divD_reg(regD dst, regD src) %{
 // divide scaled dividend by divisor
 // rescale quotient by 2^(15360)
 //
-instruct strictfp_divD_reg(regDPR1 dst, regnotDPR1 src) %{
+instruct strictfp_divDPR_reg(regDPR1 dst, regnotDPR1 src) %{
   predicate (UseSSE<=1);
   match(Set dst (DivD dst src));
   predicate( UseSSE<=1 && Compile::current()->has_method() && Compile::current()->method()->is_strict() );
@@ -10670,13 +10028,13 @@ instruct strictfp_divD_reg(regDPR1 dst, regnotDPR1 src) %{
             "DMULp  $dst,ST\n\t" %}
   opcode(0xDE, 0x7); /* DE F8+i or DE /7*/
   ins_encode( strictfp_bias1(dst),
-              Push_Reg_D(src),
+              Push_Reg_DPR(src),
               OpcP, RegOpc(dst),
               strictfp_bias2(dst) );
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct divD_reg_round(stackSlotD dst, regD src1, regD src2) %{
+instruct divDPR_reg_round(stackSlotD dst, regDPR src1, regDPR src2) %{
   predicate( UseSSE<=1 && !(Compile::current()->has_method() && Compile::current()->method()->is_strict()) );
   match(Set dst (RoundDouble (DivD src1 src2)));
 
@@ -10684,27 +10042,27 @@ instruct divD_reg_round(stackSlotD dst, regD src1, regD src2) %{
             "FDIV   ST,$src2\n\t"
             "FSTP_D $dst\t# D-round" %}
   opcode(0xD8, 0x6); /* D8 F0+i or D8 /6 */
-  ins_encode( Push_Reg_D(src1),
-              OpcP, RegOpc(src2), Pop_Mem_D(dst) );
+  ins_encode( Push_Reg_DPR(src1),
+              OpcP, RegOpc(src2), Pop_Mem_DPR(dst) );
   ins_pipe( fpu_mem_reg_reg );
 %}
 
 
-instruct modD_reg(regD dst, regD src, eAXRegI rax, eFlagsReg cr) %{
+instruct modDPR_reg(regDPR dst, regDPR src, eAXRegI rax, eFlagsReg cr) %{
   predicate(UseSSE<=1);
   match(Set dst (ModD dst src));
-  effect(KILL rax, KILL cr); // emitModD() uses EAX and EFLAGS
+  effect(KILL rax, KILL cr); // emitModDPR() uses EAX and EFLAGS
 
   format %{ "DMOD   $dst,$src" %}
   ins_cost(250);
-  ins_encode(Push_Reg_Mod_D(dst, src),
-              emitModD(),
-              Push_Result_Mod_D(src),
-              Pop_Reg_D(dst));
+  ins_encode(Push_Reg_Mod_DPR(dst, src),
+              emitModDPR(),
+              Push_Result_Mod_DPR(src),
+              Pop_Reg_DPR(dst));
   ins_pipe( pipe_slow );
 %}
 
-instruct modXD_reg(regXD dst, regXD src0, regXD src1, eAXRegI rax, eFlagsReg cr) %{
+instruct modD_reg(regD dst, regD src0, regD src1, eAXRegI rax, eFlagsReg cr) %{
   predicate(UseSSE>=2);
   match(Set dst (ModD src0 src1));
   effect(KILL rax, KILL cr);
@@ -10725,11 +10083,11 @@ instruct modXD_reg(regXD dst, regXD src0, regXD src1, eAXRegI rax, eFlagsReg cr)
           "\tFSTP   ST0\t # Restore FPU Stack"
     %}
   ins_cost(250);
-  ins_encode( Push_ModD_encoding(src0, src1), emitModD(), Push_ResultXD(dst), PopFPU);
+  ins_encode( Push_ModD_encoding(src0, src1), emitModDPR(), Push_ResultD(dst), PopFPU);
   ins_pipe( pipe_slow );
 %}
 
-instruct sinD_reg(regDPR1 dst, regDPR1 src) %{
+instruct sinDPR_reg(regDPR1 dst, regDPR1 src) %{
   predicate (UseSSE<=1);
   match(Set dst (SinD src));
   ins_cost(1800);
@@ -10739,18 +10097,18 @@ instruct sinD_reg(regDPR1 dst, regDPR1 src) %{
   ins_pipe( pipe_slow );
 %}
 
-instruct sinXD_reg(regXD dst, eFlagsReg cr) %{
+instruct sinD_reg(regD dst, eFlagsReg cr) %{
   predicate (UseSSE>=2);
   match(Set dst (SinD dst));
-  effect(KILL cr); // Push_{Src|Result}XD() uses "{SUB|ADD} ESP,8"
+  effect(KILL cr); // Push_{Src|Result}D() uses "{SUB|ADD} ESP,8"
   ins_cost(1800);
   format %{ "DSIN   $dst" %}
   opcode(0xD9, 0xFE);
-  ins_encode( Push_SrcXD(dst), OpcP, OpcS, Push_ResultXD(dst) );
+  ins_encode( Push_SrcD(dst), OpcP, OpcS, Push_ResultD(dst) );
   ins_pipe( pipe_slow );
 %}
 
-instruct cosD_reg(regDPR1 dst, regDPR1 src) %{
+instruct cosDPR_reg(regDPR1 dst, regDPR1 src) %{
   predicate (UseSSE<=1);
   match(Set dst (CosD src));
   ins_cost(1800);
@@ -10760,18 +10118,18 @@ instruct cosD_reg(regDPR1 dst, regDPR1 src) %{
   ins_pipe( pipe_slow );
 %}
 
-instruct cosXD_reg(regXD dst, eFlagsReg cr) %{
+instruct cosD_reg(regD dst, eFlagsReg cr) %{
   predicate (UseSSE>=2);
   match(Set dst (CosD dst));
-  effect(KILL cr); // Push_{Src|Result}XD() uses "{SUB|ADD} ESP,8"
+  effect(KILL cr); // Push_{Src|Result}D() uses "{SUB|ADD} ESP,8"
   ins_cost(1800);
   format %{ "DCOS   $dst" %}
   opcode(0xD9, 0xFF);
-  ins_encode( Push_SrcXD(dst), OpcP, OpcS, Push_ResultXD(dst) );
+  ins_encode( Push_SrcD(dst), OpcP, OpcS, Push_ResultD(dst) );
   ins_pipe( pipe_slow );
 %}
 
-instruct tanD_reg(regDPR1 dst, regDPR1 src) %{
+instruct tanDPR_reg(regDPR1 dst, regDPR1 src) %{
   predicate (UseSSE<=1);
   match(Set dst(TanD src));
   format %{ "DTAN   $dst" %}
@@ -10780,50 +10138,50 @@ instruct tanD_reg(regDPR1 dst, regDPR1 src) %{
   ins_pipe( pipe_slow );
 %}
 
-instruct tanXD_reg(regXD dst, eFlagsReg cr) %{
+instruct tanD_reg(regD dst, eFlagsReg cr) %{
   predicate (UseSSE>=2);
   match(Set dst(TanD dst));
-  effect(KILL cr); // Push_{Src|Result}XD() uses "{SUB|ADD} ESP,8"
+  effect(KILL cr); // Push_{Src|Result}D() uses "{SUB|ADD} ESP,8"
   format %{ "DTAN   $dst" %}
-  ins_encode( Push_SrcXD(dst),
+  ins_encode( Push_SrcD(dst),
               Opcode(0xD9), Opcode(0xF2),    // fptan
               Opcode(0xDD), Opcode(0xD8),   // fstp st
-              Push_ResultXD(dst) );
+              Push_ResultD(dst) );
   ins_pipe( pipe_slow );
 %}
 
-instruct atanD_reg(regD dst, regD src) %{
+instruct atanDPR_reg(regDPR dst, regDPR src) %{
   predicate (UseSSE<=1);
   match(Set dst(AtanD dst src));
   format %{ "DATA   $dst,$src" %}
   opcode(0xD9, 0xF3);
-  ins_encode( Push_Reg_D(src),
+  ins_encode( Push_Reg_DPR(src),
               OpcP, OpcS, RegOpc(dst) );
   ins_pipe( pipe_slow );
 %}
 
-instruct atanXD_reg(regXD dst, regXD src, eFlagsReg cr) %{
+instruct atanD_reg(regD dst, regD src, eFlagsReg cr) %{
   predicate (UseSSE>=2);
   match(Set dst(AtanD dst src));
-  effect(KILL cr); // Push_{Src|Result}XD() uses "{SUB|ADD} ESP,8"
+  effect(KILL cr); // Push_{Src|Result}D() uses "{SUB|ADD} ESP,8"
   format %{ "DATA   $dst,$src" %}
   opcode(0xD9, 0xF3);
-  ins_encode( Push_SrcXD(src),
-              OpcP, OpcS, Push_ResultXD(dst) );
+  ins_encode( Push_SrcD(src),
+              OpcP, OpcS, Push_ResultD(dst) );
   ins_pipe( pipe_slow );
 %}
 
-instruct sqrtD_reg(regD dst, regD src) %{
+instruct sqrtDPR_reg(regDPR dst, regDPR src) %{
   predicate (UseSSE<=1);
   match(Set dst (SqrtD src));
   format %{ "DSQRT  $dst,$src" %}
   opcode(0xFA, 0xD9);
-  ins_encode( Push_Reg_D(src),
-              OpcS, OpcP, Pop_Reg_D(dst) );
+  ins_encode( Push_Reg_DPR(src),
+              OpcS, OpcP, Pop_Reg_DPR(dst) );
   ins_pipe( pipe_slow );
 %}
 
-instruct powD_reg(regD X, regDPR1 Y, eAXRegI rax, eBXRegI rbx, eCXRegI rcx) %{
+instruct powDPR_reg(regDPR X, regDPR1 Y, eAXRegI rax, eBXRegI rbx, eCXRegI rcx) %{
   predicate (UseSSE<=1);
   match(Set Y (PowD X Y));  // Raise X to the Yth power
   effect(KILL rax, KILL rbx, KILL rcx);
@@ -10852,14 +10210,14 @@ instruct powD_reg(regD X, regDPR1 Y, eAXRegI rax, eBXRegI rbx, eCXRegI rcx) %{
             "ADD    ESP,8"
              %}
   ins_encode( push_stack_temp_qword,
-              Push_Reg_D(X),
+              Push_Reg_DPR(X),
               Opcode(0xD9), Opcode(0xF1),   // fyl2x
               pow_exp_core_encoding,
               pop_stack_temp_qword);
   ins_pipe( pipe_slow );
 %}
 
-instruct powXD_reg(regXD dst, regXD src0, regXD src1, regDPR1 tmp1, eAXRegI rax, eBXRegI rbx, eCXRegI rcx ) %{
+instruct powD_reg(regD dst, regD src0, regD src1, regDPR1 tmp1, eAXRegI rax, eBXRegI rbx, eCXRegI rcx ) %{
   predicate (UseSSE>=2);
   match(Set dst (PowD src0 src1));  // Raise src0 to the src1'th power
   effect(KILL tmp1, KILL rax, KILL rbx, KILL rcx );
@@ -10897,12 +10255,12 @@ instruct powXD_reg(regXD dst, regXD src0, regXD src1, regDPR1 tmp1, eAXRegI rax,
               push_xmm_to_fpr1(src0),
               Opcode(0xD9), Opcode(0xF1),   // fyl2x
               pow_exp_core_encoding,
-              Push_ResultXD(dst) );
+              Push_ResultD(dst) );
   ins_pipe( pipe_slow );
 %}
 
 
-instruct expD_reg(regDPR1 dpr1, eAXRegI rax, eBXRegI rbx, eCXRegI rcx) %{
+instruct expDPR_reg(regDPR1 dpr1, eAXRegI rax, eBXRegI rbx, eCXRegI rcx) %{
   predicate (UseSSE<=1);
   match(Set dpr1 (ExpD dpr1));
   effect(KILL rax, KILL rbx, KILL rcx);
@@ -10938,7 +10296,7 @@ instruct expD_reg(regDPR1 dpr1, eAXRegI rax, eBXRegI rbx, eCXRegI rcx) %{
   ins_pipe( pipe_slow );
 %}
 
-instruct expXD_reg(regXD dst, regXD src, regDPR1 tmp1, eAXRegI rax, eBXRegI rbx, eCXRegI rcx) %{
+instruct expD_reg(regD dst, regD src, regDPR1 tmp1, eAXRegI rax, eBXRegI rbx, eCXRegI rcx) %{
   predicate (UseSSE>=2);
   match(Set dst (ExpD src));
   effect(KILL tmp1, KILL rax, KILL rbx, KILL rcx);
@@ -10969,17 +10327,17 @@ instruct expXD_reg(regXD dst, regXD src, regDPR1 tmp1, eAXRegI rax, eBXRegI rbx,
             "MOVSD  $dst,[ESP]\n\t"
             "ADD    ESP,8"
              %}
-  ins_encode( Push_SrcXD(src),
+  ins_encode( Push_SrcD(src),
               Opcode(0xD9), Opcode(0xEA),   // fldl2e
               Opcode(0xDE), Opcode(0xC9),   // fmulp
               pow_exp_core_encoding,
-              Push_ResultXD(dst) );
+              Push_ResultD(dst) );
   ins_pipe( pipe_slow );
 %}
 
 
 
-instruct log10D_reg(regDPR1 dst, regDPR1 src) %{
+instruct log10DPR_reg(regDPR1 dst, regDPR1 src) %{
   predicate (UseSSE<=1);
   // The source Double operand on FPU stack
   match(Set dst (Log10D src));
@@ -10997,7 +10355,7 @@ instruct log10D_reg(regDPR1 dst, regDPR1 src) %{
   ins_pipe( pipe_slow );
 %}
 
-instruct log10XD_reg(regXD dst, regXD src, eFlagsReg cr) %{
+instruct log10D_reg(regD dst, regD src, eFlagsReg cr) %{
   predicate (UseSSE>=2);
   effect(KILL cr);
   match(Set dst (Log10D src));
@@ -11007,14 +10365,14 @@ instruct log10XD_reg(regXD dst, regXD src, eFlagsReg cr) %{
             "FYL2X  \t\t\t# Q=Log10*Log_2(x)"
          %}
   ins_encode( Opcode(0xD9), Opcode(0xEC),   // fldlg2
-              Push_SrcXD(src),
+              Push_SrcD(src),
               Opcode(0xD9), Opcode(0xF1),   // fyl2x
-              Push_ResultXD(dst));
+              Push_ResultD(dst));
 
   ins_pipe( pipe_slow );
 %}
 
-instruct logD_reg(regDPR1 dst, regDPR1 src) %{
+instruct logDPR_reg(regDPR1 dst, regDPR1 src) %{
   predicate (UseSSE<=1);
   // The source Double operand on FPU stack
   match(Set dst (LogD src));
@@ -11032,7 +10390,7 @@ instruct logD_reg(regDPR1 dst, regDPR1 src) %{
   ins_pipe( pipe_slow );
 %}
 
-instruct logXD_reg(regXD dst, regXD src, eFlagsReg cr) %{
+instruct logD_reg(regD dst, regD src, eFlagsReg cr) %{
   predicate (UseSSE>=2);
   effect(KILL cr);
   // The source and result Double operands in XMM registers
@@ -11043,9 +10401,9 @@ instruct logXD_reg(regXD dst, regXD src, eFlagsReg cr) %{
             "FYL2X  \t\t\t# Q=Log_e*Log_2(x)"
          %}
   ins_encode( Opcode(0xD9), Opcode(0xED),   // fldln2
-              Push_SrcXD(src),
+              Push_SrcD(src),
               Opcode(0xD9), Opcode(0xF1),   // fyl2x
-              Push_ResultXD(dst));
+              Push_ResultD(dst));
   ins_pipe( pipe_slow );
 %}
 
@@ -11066,7 +10424,7 @@ instruct logXD_reg(regXD dst, regXD src, eFlagsReg cr) %{
 //   exit:
 
 // P6 version of float compare, sets condition codes in EFLAGS
-instruct cmpF_cc_P6(eFlagsRegU cr, regF src1, regF src2, eAXRegI rax) %{
+instruct cmpFPR_cc_P6(eFlagsRegU cr, regFPR src1, regFPR src2, eAXRegI rax) %{
   predicate(VM_Version::supports_cmov() && UseSSE == 0);
   match(Set cr (CmpF src1 src2));
   effect(KILL rax);
@@ -11078,27 +10436,27 @@ instruct cmpF_cc_P6(eFlagsRegU cr, regF src1, regF src2, eAXRegI rax) %{
             "SAHF\n"
      "exit:\tNOP               // avoid branch to branch" %}
   opcode(0xDF, 0x05); /* DF E8+i or DF /5 */
-  ins_encode( Push_Reg_D(src1),
+  ins_encode( Push_Reg_DPR(src1),
               OpcP, RegOpc(src2),
               cmpF_P6_fixup );
   ins_pipe( pipe_slow );
 %}
 
-instruct cmpF_cc_P6CF(eFlagsRegUCF cr, regF src1, regF src2) %{
+instruct cmpFPR_cc_P6CF(eFlagsRegUCF cr, regFPR src1, regFPR src2) %{
   predicate(VM_Version::supports_cmov() && UseSSE == 0);
   match(Set cr (CmpF src1 src2));
   ins_cost(100);
   format %{ "FLD    $src1\n\t"
             "FUCOMIP ST,$src2  // P6 instruction" %}
   opcode(0xDF, 0x05); /* DF E8+i or DF /5 */
-  ins_encode( Push_Reg_D(src1),
+  ins_encode( Push_Reg_DPR(src1),
               OpcP, RegOpc(src2));
   ins_pipe( pipe_slow );
 %}
 
 
 // Compare & branch
-instruct cmpF_cc(eFlagsRegU cr, regF src1, regF src2, eAXRegI rax) %{
+instruct cmpFPR_cc(eFlagsRegU cr, regFPR src1, regFPR src2, eAXRegI rax) %{
   predicate(UseSSE == 0);
   match(Set cr (CmpF src1 src2));
   effect(KILL rax);
@@ -11111,328 +10469,190 @@ instruct cmpF_cc(eFlagsRegU cr, regF src1, regF src2, eAXRegI rax) %{
             "MOV    AH,1\t# unordered treat as LT\n"
     "flags:\tSAHF" %}
   opcode(0xD8, 0x3); /* D8 D8+i or D8 /3 */
-  ins_encode( Push_Reg_D(src1),
+  ins_encode( Push_Reg_DPR(src1),
               OpcP, RegOpc(src2),
               fpu_flags);
   ins_pipe( pipe_slow );
 %}
 
 // Compare vs zero into -1,0,1
-instruct cmpF_0(eRegI dst, regF src1, immF0 zero, eAXRegI rax, eFlagsReg cr) %{
+instruct cmpFPR_0(eRegI dst, regFPR src1, immFPR0 zero, eAXRegI rax, eFlagsReg cr) %{
   predicate(UseSSE == 0);
   match(Set dst (CmpF3 src1 zero));
   effect(KILL cr, KILL rax);
   ins_cost(280);
   format %{ "FTSTF  $dst,$src1" %}
   opcode(0xE4, 0xD9);
-  ins_encode( Push_Reg_D(src1),
+  ins_encode( Push_Reg_DPR(src1),
               OpcS, OpcP, PopFPU,
               CmpF_Result(dst));
   ins_pipe( pipe_slow );
 %}
 
 // Compare into -1,0,1
-instruct cmpF_reg(eRegI dst, regF src1, regF src2, eAXRegI rax, eFlagsReg cr) %{
+instruct cmpFPR_reg(eRegI dst, regFPR src1, regFPR src2, eAXRegI rax, eFlagsReg cr) %{
   predicate(UseSSE == 0);
   match(Set dst (CmpF3 src1 src2));
   effect(KILL cr, KILL rax);
   ins_cost(300);
   format %{ "FCMPF  $dst,$src1,$src2" %}
   opcode(0xD8, 0x3); /* D8 D8+i or D8 /3 */
-  ins_encode( Push_Reg_D(src1),
+  ins_encode( Push_Reg_DPR(src1),
               OpcP, RegOpc(src2),
               CmpF_Result(dst));
   ins_pipe( pipe_slow );
 %}
 
 // float compare and set condition codes in EFLAGS by XMM regs
-instruct cmpX_cc(eFlagsRegU cr, regX dst, regX src, eAXRegI rax) %{
+instruct cmpF_cc(eFlagsRegU cr, regF src1, regF src2) %{
   predicate(UseSSE>=1);
-  match(Set cr (CmpF dst src));
-  effect(KILL rax);
+  match(Set cr (CmpF src1 src2));
   ins_cost(145);
-  format %{ "COMISS $dst,$src\n"
-          "\tJNP    exit\n"
-          "\tMOV    ah,1       // saw a NaN, set CF\n"
-          "\tSAHF\n"
-     "exit:\tNOP               // avoid branch to branch" %}
-  opcode(0x0F, 0x2F);
-  ins_encode(OpcP, OpcS, RegReg(dst, src), cmpF_P6_fixup);
+  format %{ "UCOMISS $src1,$src2\n\t"
+            "JNP,s   exit\n\t"
+            "PUSHF\t# saw NaN, set CF\n\t"
+            "AND     [rsp], #0xffffff2b\n\t"
+            "POPF\n"
+    "exit:" %}
+  ins_encode %{
+    __ ucomiss($src1$$XMMRegister, $src2$$XMMRegister);
+    emit_cmpfp_fixup(_masm);
+  %}
   ins_pipe( pipe_slow );
 %}
 
-instruct cmpX_ccCF(eFlagsRegUCF cr, regX dst, regX src) %{
+instruct cmpF_ccCF(eFlagsRegUCF cr, regF src1, regF src2) %{
   predicate(UseSSE>=1);
-  match(Set cr (CmpF dst src));
+  match(Set cr (CmpF src1 src2));
   ins_cost(100);
-  format %{ "COMISS $dst,$src" %}
-  opcode(0x0F, 0x2F);
-  ins_encode(OpcP, OpcS, RegReg(dst, src));
+  format %{ "UCOMISS $src1,$src2" %}
+  ins_encode %{
+    __ ucomiss($src1$$XMMRegister, $src2$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // float compare and set condition codes in EFLAGS by XMM regs
-instruct cmpX_ccmem(eFlagsRegU cr, regX dst, memory src, eAXRegI rax) %{
+instruct cmpF_ccmem(eFlagsRegU cr, regF src1, memory src2) %{
   predicate(UseSSE>=1);
-  match(Set cr (CmpF dst (LoadF src)));
-  effect(KILL rax);
+  match(Set cr (CmpF src1 (LoadF src2)));
   ins_cost(165);
-  format %{ "COMISS $dst,$src\n"
-          "\tJNP    exit\n"
-          "\tMOV    ah,1       // saw a NaN, set CF\n"
-          "\tSAHF\n"
-     "exit:\tNOP               // avoid branch to branch" %}
-  opcode(0x0F, 0x2F);
-  ins_encode(OpcP, OpcS, RegMem(dst, src), cmpF_P6_fixup);
+  format %{ "UCOMISS $src1,$src2\n\t"
+            "JNP,s   exit\n\t"
+            "PUSHF\t# saw NaN, set CF\n\t"
+            "AND     [rsp], #0xffffff2b\n\t"
+            "POPF\n"
+    "exit:" %}
+  ins_encode %{
+    __ ucomiss($src1$$XMMRegister, $src2$$Address);
+    emit_cmpfp_fixup(_masm);
+  %}
   ins_pipe( pipe_slow );
 %}
 
-instruct cmpX_ccmemCF(eFlagsRegUCF cr, regX dst, memory src) %{
+instruct cmpF_ccmemCF(eFlagsRegUCF cr, regF src1, memory src2) %{
   predicate(UseSSE>=1);
-  match(Set cr (CmpF dst (LoadF src)));
+  match(Set cr (CmpF src1 (LoadF src2)));
   ins_cost(100);
-  format %{ "COMISS $dst,$src" %}
-  opcode(0x0F, 0x2F);
-  ins_encode(OpcP, OpcS, RegMem(dst, src));
+  format %{ "UCOMISS $src1,$src2" %}
+  ins_encode %{
+    __ ucomiss($src1$$XMMRegister, $src2$$Address);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Compare into -1,0,1 in XMM
-instruct cmpX_reg(eRegI dst, regX src1, regX src2, eFlagsReg cr) %{
+instruct cmpF_reg(xRegI dst, regF src1, regF src2, eFlagsReg cr) %{
   predicate(UseSSE>=1);
   match(Set dst (CmpF3 src1 src2));
   effect(KILL cr);
   ins_cost(255);
-  format %{ "XOR    $dst,$dst\n"
-          "\tCOMISS $src1,$src2\n"
-          "\tJP,s   nan\n"
-          "\tJEQ,s  exit\n"
-          "\tJA,s   inc\n"
-      "nan:\tDEC    $dst\n"
-          "\tJMP,s  exit\n"
-      "inc:\tINC    $dst\n"
-      "exit:"
-                %}
-  opcode(0x0F, 0x2F);
-  ins_encode(Xor_Reg(dst), OpcP, OpcS, RegReg(src1, src2), CmpX_Result(dst));
+  format %{ "UCOMISS $src1, $src2\n\t"
+            "MOV     $dst, #-1\n\t"
+            "JP,s    done\n\t"
+            "JB,s    done\n\t"
+            "SETNE   $dst\n\t"
+            "MOVZB   $dst, $dst\n"
+    "done:" %}
+  ins_encode %{
+    __ ucomiss($src1$$XMMRegister, $src2$$XMMRegister);
+    emit_cmpfp3(_masm, $dst$$Register);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Compare into -1,0,1 in XMM and memory
-instruct cmpX_regmem(eRegI dst, regX src1, memory mem, eFlagsReg cr) %{
+instruct cmpF_regmem(xRegI dst, regF src1, memory src2, eFlagsReg cr) %{
   predicate(UseSSE>=1);
-  match(Set dst (CmpF3 src1 (LoadF mem)));
+  match(Set dst (CmpF3 src1 (LoadF src2)));
   effect(KILL cr);
   ins_cost(275);
-  format %{ "COMISS $src1,$mem\n"
-          "\tMOV    $dst,0\t\t# do not blow flags\n"
-          "\tJP,s   nan\n"
-          "\tJEQ,s  exit\n"
-          "\tJA,s   inc\n"
-      "nan:\tDEC    $dst\n"
-          "\tJMP,s  exit\n"
-      "inc:\tINC    $dst\n"
-      "exit:"
-                %}
-  opcode(0x0F, 0x2F);
-  ins_encode(OpcP, OpcS, RegMem(src1, mem), LdImmI(dst,0x0), CmpX_Result(dst));
+  format %{ "UCOMISS $src1, $src2\n\t"
+            "MOV     $dst, #-1\n\t"
+            "JP,s    done\n\t"
+            "JB,s    done\n\t"
+            "SETNE   $dst\n\t"
+            "MOVZB   $dst, $dst\n"
+    "done:" %}
+  ins_encode %{
+    __ ucomiss($src1$$XMMRegister, $src2$$Address);
+    emit_cmpfp3(_masm, $dst$$Register);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Spill to obtain 24-bit precision
-instruct subF24_reg(stackSlotF dst, regF src1, regF src2) %{
+instruct subFPR24_reg(stackSlotF dst, regFPR src1, regFPR src2) %{
   predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
   match(Set dst (SubF src1 src2));
 
   format %{ "FSUB   $dst,$src1 - $src2" %}
   opcode(0xD8, 0x4); /* D8 E0+i or D8 /4 mod==0x3 ;; result in TOS */
-  ins_encode( Push_Reg_F(src1),
-              OpcReg_F(src2),
-              Pop_Mem_F(dst) );
+  ins_encode( Push_Reg_FPR(src1),
+              OpcReg_FPR(src2),
+              Pop_Mem_FPR(dst) );
   ins_pipe( fpu_mem_reg_reg );
 %}
 //
 // This instruction does not round to 24-bits
-instruct subF_reg(regF dst, regF src) %{
+instruct subFPR_reg(regFPR dst, regFPR src) %{
   predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
   match(Set dst (SubF dst src));
 
   format %{ "FSUB   $dst,$src" %}
   opcode(0xDE, 0x5); /* DE E8+i  or DE /5 */
-  ins_encode( Push_Reg_F(src),
+  ins_encode( Push_Reg_FPR(src),
               OpcP, RegOpc(dst) );
   ins_pipe( fpu_reg_reg );
 %}
 
 // Spill to obtain 24-bit precision
-instruct addF24_reg(stackSlotF dst, regF src1, regF src2) %{
+instruct addFPR24_reg(stackSlotF dst, regFPR src1, regFPR src2) %{
   predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
   match(Set dst (AddF src1 src2));
 
   format %{ "FADD   $dst,$src1,$src2" %}
   opcode(0xD8, 0x0); /* D8 C0+i */
-  ins_encode( Push_Reg_F(src2),
-              OpcReg_F(src1),
-              Pop_Mem_F(dst) );
+  ins_encode( Push_Reg_FPR(src2),
+              OpcReg_FPR(src1),
+              Pop_Mem_FPR(dst) );
   ins_pipe( fpu_mem_reg_reg );
 %}
 //
 // This instruction does not round to 24-bits
-instruct addF_reg(regF dst, regF src) %{
+instruct addFPR_reg(regFPR dst, regFPR src) %{
   predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
   match(Set dst (AddF dst src));
 
   format %{ "FLD    $src\n\t"
             "FADDp  $dst,ST" %}
   opcode(0xDE, 0x0); /* DE C0+i or DE /0*/
-  ins_encode( Push_Reg_F(src),
+  ins_encode( Push_Reg_FPR(src),
               OpcP, RegOpc(dst) );
   ins_pipe( fpu_reg_reg );
 %}
 
-// Add two single precision floating point values in xmm
-instruct addX_reg(regX dst, regX src) %{
-  predicate(UseSSE>=1);
-  match(Set dst (AddF dst src));
-  format %{ "ADDSS  $dst,$src" %}
-  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x58), RegReg(dst, src));
-  ins_pipe( pipe_slow );
-%}
-
-instruct addX_imm(regX dst, immXF con) %{
-  predicate(UseSSE>=1);
-  match(Set dst (AddF dst con));
-  format %{ "ADDSS  $dst,[$constantaddress]\t# load from constant table: float=$con" %}
-  ins_encode %{
-    __ addss($dst$$XMMRegister, $constantaddress($con));
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-instruct addX_mem(regX dst, memory mem) %{
-  predicate(UseSSE>=1);
-  match(Set dst (AddF dst (LoadF mem)));
-  format %{ "ADDSS  $dst,$mem" %}
-  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x58), RegMem(dst, mem));
-  ins_pipe( pipe_slow );
-%}
-
-// Subtract two single precision floating point values in xmm
-instruct subX_reg(regX dst, regX src) %{
-  predicate(UseSSE>=1);
-  match(Set dst (SubF dst src));
-  format %{ "SUBSS  $dst,$src" %}
-  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x5C), RegReg(dst, src));
-  ins_pipe( pipe_slow );
-%}
-
-instruct subX_imm(regX dst, immXF con) %{
-  predicate(UseSSE>=1);
-  match(Set dst (SubF dst con));
-  format %{ "SUBSS  $dst,[$constantaddress]\t# load from constant table: float=$con" %}
-  ins_encode %{
-    __ subss($dst$$XMMRegister, $constantaddress($con));
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-instruct subX_mem(regX dst, memory mem) %{
-  predicate(UseSSE>=1);
-  match(Set dst (SubF dst (LoadF mem)));
-  format %{ "SUBSS  $dst,$mem" %}
-  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x5C), RegMem(dst,mem));
-  ins_pipe( pipe_slow );
-%}
-
-// Multiply two single precision floating point values in xmm
-instruct mulX_reg(regX dst, regX src) %{
-  predicate(UseSSE>=1);
-  match(Set dst (MulF dst src));
-  format %{ "MULSS  $dst,$src" %}
-  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x59), RegReg(dst, src));
-  ins_pipe( pipe_slow );
-%}
-
-instruct mulX_imm(regX dst, immXF con) %{
-  predicate(UseSSE>=1);
-  match(Set dst (MulF dst con));
-  format %{ "MULSS  $dst,[$constantaddress]\t# load from constant table: float=$con" %}
-  ins_encode %{
-    __ mulss($dst$$XMMRegister, $constantaddress($con));
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-instruct mulX_mem(regX dst, memory mem) %{
-  predicate(UseSSE>=1);
-  match(Set dst (MulF dst (LoadF mem)));
-  format %{ "MULSS  $dst,$mem" %}
-  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x59), RegMem(dst,mem));
-  ins_pipe( pipe_slow );
-%}
-
-// Divide two single precision floating point values in xmm
-instruct divX_reg(regX dst, regX src) %{
-  predicate(UseSSE>=1);
-  match(Set dst (DivF dst src));
-  format %{ "DIVSS  $dst,$src" %}
-  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x5E), RegReg(dst, src));
-  ins_pipe( pipe_slow );
-%}
-
-instruct divX_imm(regX dst, immXF con) %{
-  predicate(UseSSE>=1);
-  match(Set dst (DivF dst con));
-  format %{ "DIVSS  $dst,[$constantaddress]\t# load from constant table: float=$con" %}
-  ins_encode %{
-    __ divss($dst$$XMMRegister, $constantaddress($con));
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-instruct divX_mem(regX dst, memory mem) %{
-  predicate(UseSSE>=1);
-  match(Set dst (DivF dst (LoadF mem)));
-  format %{ "DIVSS  $dst,$mem" %}
-  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x5E), RegMem(dst,mem));
-  ins_pipe( pipe_slow );
-%}
-
-// Get the square root of a single precision floating point values in xmm
-instruct sqrtX_reg(regX dst, regX src) %{
-  predicate(UseSSE>=1);
-  match(Set dst (ConvD2F (SqrtD (ConvF2D src))));
-  format %{ "SQRTSS $dst,$src" %}
-  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x51), RegReg(dst, src));
-  ins_pipe( pipe_slow );
-%}
-
-instruct sqrtX_mem(regX dst, memory mem) %{
-  predicate(UseSSE>=1);
-  match(Set dst (ConvD2F (SqrtD (ConvF2D (LoadF mem)))));
-  format %{ "SQRTSS $dst,$mem" %}
-  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x51), RegMem(dst, mem));
-  ins_pipe( pipe_slow );
-%}
-
-// Get the square root of a double precision floating point values in xmm
-instruct sqrtXD_reg(regXD dst, regXD src) %{
-  predicate(UseSSE>=2);
-  match(Set dst (SqrtD src));
-  format %{ "SQRTSD $dst,$src" %}
-  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x51), RegReg(dst, src));
-  ins_pipe( pipe_slow );
-%}
-
-instruct sqrtXD_mem(regXD dst, memory mem) %{
-  predicate(UseSSE>=2);
-  match(Set dst (SqrtD (LoadD mem)));
-  format %{ "SQRTSD $dst,$mem" %}
-  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x51), RegMem(dst, mem));
-  ins_pipe( pipe_slow );
-%}
-
-instruct absF_reg(regFPR1 dst, regFPR1 src) %{
+instruct absFPR_reg(regFPR1 dst, regFPR1 src) %{
   predicate(UseSSE==0);
   match(Set dst (AbsF src));
   ins_cost(100);
@@ -11442,15 +10662,7 @@ instruct absF_reg(regFPR1 dst, regFPR1 src) %{
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct absX_reg(regX dst ) %{
-  predicate(UseSSE>=1);
-  match(Set dst (AbsF dst));
-  format %{ "ANDPS  $dst,[0x7FFFFFFF]\t# ABS F by sign masking" %}
-  ins_encode( AbsXF_encoding(dst));
-  ins_pipe( pipe_slow );
-%}
-
-instruct negF_reg(regFPR1 dst, regFPR1 src) %{
+instruct negFPR_reg(regFPR1 dst, regFPR1 src) %{
   predicate(UseSSE==0);
   match(Set dst (NegF src));
   ins_cost(100);
@@ -11460,17 +10672,9 @@ instruct negF_reg(regFPR1 dst, regFPR1 src) %{
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct negX_reg( regX dst ) %{
-  predicate(UseSSE>=1);
-  match(Set dst (NegF dst));
-  format %{ "XORPS  $dst,[0x80000000]\t# CHS F by sign flipping" %}
-  ins_encode( NegXF_encoding(dst));
-  ins_pipe( pipe_slow );
-%}
-
-// Cisc-alternate to addF_reg
+// Cisc-alternate to addFPR_reg
 // Spill to obtain 24-bit precision
-instruct addF24_reg_mem(stackSlotF dst, regF src1, memory src2) %{
+instruct addFPR24_reg_mem(stackSlotF dst, regFPR src1, memory src2) %{
   predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
   match(Set dst (AddF src1 (LoadF src2)));
 
@@ -11479,14 +10683,14 @@ instruct addF24_reg_mem(stackSlotF dst, regF src1, memory src2) %{
             "FSTP_S $dst" %}
   opcode(0xD8, 0x0, 0xD9); /* D8 C0+i */  /* LoadF  D9 /0 */
   ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src2),
-              OpcReg_F(src1),
-              Pop_Mem_F(dst) );
+              OpcReg_FPR(src1),
+              Pop_Mem_FPR(dst) );
   ins_pipe( fpu_mem_reg_mem );
 %}
 //
-// Cisc-alternate to addF_reg
+// Cisc-alternate to addFPR_reg
 // This instruction does not round to 24-bits
-instruct addF_reg_mem(regF dst, memory src) %{
+instruct addFPR_reg_mem(regFPR dst, memory src) %{
   predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
   match(Set dst (AddF dst (LoadF src)));
 
@@ -11499,21 +10703,21 @@ instruct addF_reg_mem(regF dst, memory src) %{
 
 // // Following two instructions for _222_mpegaudio
 // Spill to obtain 24-bit precision
-instruct addF24_mem_reg(stackSlotF dst, regF src2, memory src1 ) %{
+instruct addFPR24_mem_reg(stackSlotF dst, regFPR src2, memory src1 ) %{
   predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
   match(Set dst (AddF src1 src2));
 
   format %{ "FADD   $dst,$src1,$src2" %}
   opcode(0xD8, 0x0, 0xD9); /* D8 C0+i */  /* LoadF  D9 /0 */
   ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src1),
-              OpcReg_F(src2),
-              Pop_Mem_F(dst) );
+              OpcReg_FPR(src2),
+              Pop_Mem_FPR(dst) );
   ins_pipe( fpu_mem_reg_mem );
 %}
 
 // Cisc-spill variant
 // Spill to obtain 24-bit precision
-instruct addF24_mem_cisc(stackSlotF dst, memory src1, memory src2) %{
+instruct addFPR24_mem_cisc(stackSlotF dst, memory src1, memory src2) %{
   predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
   match(Set dst (AddF src1 (LoadF src2)));
 
@@ -11522,12 +10726,12 @@ instruct addF24_mem_cisc(stackSlotF dst, memory src1, memory src2) %{
   ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src2),
               set_instruction_start,
               OpcP, RMopc_Mem(secondary,src1),
-              Pop_Mem_F(dst) );
+              Pop_Mem_FPR(dst) );
   ins_pipe( fpu_mem_mem_mem );
 %}
 
 // Spill to obtain 24-bit precision
-instruct addF24_mem_mem(stackSlotF dst, memory src1, memory src2) %{
+instruct addFPR24_mem_mem(stackSlotF dst, memory src1, memory src2) %{
   predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
   match(Set dst (AddF src1 src2));
 
@@ -11536,13 +10740,13 @@ instruct addF24_mem_mem(stackSlotF dst, memory src1, memory src2) %{
   ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src2),
               set_instruction_start,
               OpcP, RMopc_Mem(secondary,src1),
-              Pop_Mem_F(dst) );
+              Pop_Mem_FPR(dst) );
   ins_pipe( fpu_mem_mem_mem );
 %}
 
 
 // Spill to obtain 24-bit precision
-instruct addF24_reg_imm(stackSlotF dst, regF src, immF con) %{
+instruct addFPR24_reg_imm(stackSlotF dst, regFPR src, immFPR con) %{
   predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
   match(Set dst (AddF src con));
   format %{ "FLD    $src\n\t"
@@ -11557,7 +10761,7 @@ instruct addF24_reg_imm(stackSlotF dst, regF src, immF con) %{
 %}
 //
 // This instruction does not round to 24-bits
-instruct addF_reg_imm(regF dst, regF src, immF con) %{
+instruct addFPR_reg_imm(regFPR dst, regFPR src, immFPR con) %{
   predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
   match(Set dst (AddF src con));
   format %{ "FLD    $src\n\t"
@@ -11572,7 +10776,7 @@ instruct addF_reg_imm(regF dst, regF src, immF con) %{
 %}
 
 // Spill to obtain 24-bit precision
-instruct mulF24_reg(stackSlotF dst, regF src1, regF src2) %{
+instruct mulFPR24_reg(stackSlotF dst, regFPR src1, regFPR src2) %{
   predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
   match(Set dst (MulF src1 src2));
 
@@ -11580,14 +10784,14 @@ instruct mulF24_reg(stackSlotF dst, regF src1, regF src2) %{
             "FMUL   $src2\n\t"
             "FSTP_S $dst"  %}
   opcode(0xD8, 0x1); /* D8 C8+i or D8 /1 ;; result in TOS */
-  ins_encode( Push_Reg_F(src1),
-              OpcReg_F(src2),
-              Pop_Mem_F(dst) );
+  ins_encode( Push_Reg_FPR(src1),
+              OpcReg_FPR(src2),
+              Pop_Mem_FPR(dst) );
   ins_pipe( fpu_mem_reg_reg );
 %}
 //
 // This instruction does not round to 24-bits
-instruct mulF_reg(regF dst, regF src1, regF src2) %{
+instruct mulFPR_reg(regFPR dst, regFPR src1, regFPR src2) %{
   predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
   match(Set dst (MulF src1 src2));
 
@@ -11595,16 +10799,16 @@ instruct mulF_reg(regF dst, regF src1, regF src2) %{
             "FMUL   $src2\n\t"
             "FSTP_S $dst"  %}
   opcode(0xD8, 0x1); /* D8 C8+i */
-  ins_encode( Push_Reg_F(src2),
-              OpcReg_F(src1),
-              Pop_Reg_F(dst) );
+  ins_encode( Push_Reg_FPR(src2),
+              OpcReg_FPR(src1),
+              Pop_Reg_FPR(dst) );
   ins_pipe( fpu_reg_reg_reg );
 %}
 
 
 // Spill to obtain 24-bit precision
 // Cisc-alternate to reg-reg multiply
-instruct mulF24_reg_mem(stackSlotF dst, regF src1, memory src2) %{
+instruct mulFPR24_reg_mem(stackSlotF dst, regFPR src1, memory src2) %{
   predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
   match(Set dst (MulF src1 (LoadF src2)));
 
@@ -11613,27 +10817,27 @@ instruct mulF24_reg_mem(stackSlotF dst, regF src1, memory src2) %{
             "FSTP_S $dst"  %}
   opcode(0xD8, 0x1, 0xD9); /* D8 C8+i or DE /1*/  /* LoadF D9 /0 */
   ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src2),
-              OpcReg_F(src1),
-              Pop_Mem_F(dst) );
+              OpcReg_FPR(src1),
+              Pop_Mem_FPR(dst) );
   ins_pipe( fpu_mem_reg_mem );
 %}
 //
 // This instruction does not round to 24-bits
 // Cisc-alternate to reg-reg multiply
-instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
+instruct mulFPR_reg_mem(regFPR dst, regFPR src1, memory src2) %{
   predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
   match(Set dst (MulF src1 (LoadF src2)));
 
   format %{ "FMUL   $dst,$src1,$src2" %}
   opcode(0xD8, 0x1, 0xD9); /* D8 C8+i */  /* LoadF D9 /0 */
   ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src2),
-              OpcReg_F(src1),
-              Pop_Reg_F(dst) );
+              OpcReg_FPR(src1),
+              Pop_Reg_FPR(dst) );
   ins_pipe( fpu_reg_reg_mem );
 %}
 
 // Spill to obtain 24-bit precision
-instruct mulF24_mem_mem(stackSlotF dst, memory src1, memory src2) %{
+instruct mulFPR24_mem_mem(stackSlotF dst, memory src1, memory src2) %{
   predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
   match(Set dst (MulF src1 src2));
 
@@ -11642,12 +10846,12 @@ instruct mulF24_mem_mem(stackSlotF dst, memory src1, memory src2) %{
   ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src2),
               set_instruction_start,
               OpcP, RMopc_Mem(secondary,src1),
-              Pop_Mem_F(dst) );
+              Pop_Mem_FPR(dst) );
   ins_pipe( fpu_mem_mem_mem );
 %}
 
 // Spill to obtain 24-bit precision
-instruct mulF24_reg_imm(stackSlotF dst, regF src, immF con) %{
+instruct mulFPR24_reg_imm(stackSlotF dst, regFPR src, immFPR con) %{
   predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
   match(Set dst (MulF src con));
 
@@ -11663,7 +10867,7 @@ instruct mulF24_reg_imm(stackSlotF dst, regF src, immF con) %{
 %}
 //
 // This instruction does not round to 24-bits
-instruct mulF_reg_imm(regF dst, regF src, immF con) %{
+instruct mulFPR_reg_imm(regFPR dst, regFPR src, immFPR con) %{
   predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
   match(Set dst (MulF src con));
 
@@ -11680,9 +10884,9 @@ instruct mulF_reg_imm(regF dst, regF src, immF con) %{
 
 
 //
-// MACRO1 -- subsume unshared load into mulF
+// MACRO1 -- subsume unshared load into mulFPR
 // This instruction does not round to 24-bits
-instruct mulF_reg_load1(regF dst, regF src, memory mem1 ) %{
+instruct mulFPR_reg_load1(regFPR dst, regFPR src, memory mem1 ) %{
   predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
   match(Set dst (MulF (LoadF mem1) src));
 
@@ -11691,36 +10895,36 @@ instruct mulF_reg_load1(regF dst, regF src, memory mem1 ) %{
             "FSTP   $dst" %}
   opcode(0xD8, 0x1, 0xD9); /* D8 C8+i or D8 /1 */  /* LoadF D9 /0 */
   ins_encode( Opcode(tertiary), RMopc_Mem(0x00,mem1),
-              OpcReg_F(src),
-              Pop_Reg_F(dst) );
+              OpcReg_FPR(src),
+              Pop_Reg_FPR(dst) );
   ins_pipe( fpu_reg_reg_mem );
 %}
 //
-// MACRO2 -- addF a mulF which subsumed an unshared load
+// MACRO2 -- addFPR a mulFPR which subsumed an unshared load
 // This instruction does not round to 24-bits
-instruct addF_mulF_reg_load1(regF dst, memory mem1, regF src1, regF src2) %{
+instruct addFPR_mulFPR_reg_load1(regFPR dst, memory mem1, regFPR src1, regFPR src2) %{
   predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
   match(Set dst (AddF (MulF (LoadF mem1) src1) src2));
   ins_cost(95);
 
   format %{ "FLD    $mem1     ===MACRO2===\n\t"
-            "FMUL   ST,$src1  subsume mulF left load\n\t"
+            "FMUL   ST,$src1  subsume mulFPR left load\n\t"
             "FADD   ST,$src2\n\t"
             "FSTP   $dst" %}
   opcode(0xD9); /* LoadF D9 /0 */
   ins_encode( OpcP, RMopc_Mem(0x00,mem1),
               FMul_ST_reg(src1),
               FAdd_ST_reg(src2),
-              Pop_Reg_F(dst) );
+              Pop_Reg_FPR(dst) );
   ins_pipe( fpu_reg_mem_reg_reg );
 %}
 
-// MACRO3 -- addF a mulF
+// MACRO3 -- addFPR a mulFPR
 // This instruction does not round to 24-bits.  It is a '2-address'
 // instruction in that the result goes back to src2.  This eliminates
 // a move from the macro; possibly the register allocator will have
 // to add it back (and maybe not).
-instruct addF_mulF_reg(regF src2, regF src1, regF src0) %{
+instruct addFPR_mulFPR_reg(regFPR src2, regFPR src1, regFPR src0) %{
   predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
   match(Set src2 (AddF (MulF src0 src1) src2));
 
@@ -11728,15 +10932,15 @@ instruct addF_mulF_reg(regF src2, regF src1, regF src0) %{
             "FMUL   ST,$src1\n\t"
             "FADDP  $src2,ST" %}
   opcode(0xD9); /* LoadF D9 /0 */
-  ins_encode( Push_Reg_F(src0),
+  ins_encode( Push_Reg_FPR(src0),
               FMul_ST_reg(src1),
               FAddP_reg_ST(src2) );
   ins_pipe( fpu_reg_reg_reg );
 %}
 
-// MACRO4 -- divF subF
+// MACRO4 -- divFPR subFPR
 // This instruction does not round to 24-bits
-instruct subF_divF_reg(regF dst, regF src1, regF src2, regF src3) %{
+instruct subFPR_divFPR_reg(regFPR dst, regFPR src1, regFPR src2, regFPR src3) %{
   predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
   match(Set dst (DivF (SubF src2 src1) src3));
 
@@ -11745,67 +10949,67 @@ instruct subF_divF_reg(regF dst, regF src1, regF src2, regF src3) %{
             "FDIV   ST,$src3\n\t"
             "FSTP  $dst" %}
   opcode(0xDE, 0x7); /* DE F8+i or DE /7*/
-  ins_encode( Push_Reg_F(src2),
-              subF_divF_encode(src1,src3),
-              Pop_Reg_F(dst) );
+  ins_encode( Push_Reg_FPR(src2),
+              subFPR_divFPR_encode(src1,src3),
+              Pop_Reg_FPR(dst) );
   ins_pipe( fpu_reg_reg_reg_reg );
 %}
 
 // Spill to obtain 24-bit precision
-instruct divF24_reg(stackSlotF dst, regF src1, regF src2) %{
+instruct divFPR24_reg(stackSlotF dst, regFPR src1, regFPR src2) %{
   predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
   match(Set dst (DivF src1 src2));
 
   format %{ "FDIV   $dst,$src1,$src2" %}
   opcode(0xD8, 0x6); /* D8 F0+i or DE /6*/
-  ins_encode( Push_Reg_F(src1),
-              OpcReg_F(src2),
-              Pop_Mem_F(dst) );
+  ins_encode( Push_Reg_FPR(src1),
+              OpcReg_FPR(src2),
+              Pop_Mem_FPR(dst) );
   ins_pipe( fpu_mem_reg_reg );
 %}
 //
 // This instruction does not round to 24-bits
-instruct divF_reg(regF dst, regF src) %{
+instruct divFPR_reg(regFPR dst, regFPR src) %{
   predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
   match(Set dst (DivF dst src));
 
   format %{ "FDIV   $dst,$src" %}
   opcode(0xDE, 0x7); /* DE F8+i or DE /7*/
-  ins_encode( Push_Reg_F(src),
+  ins_encode( Push_Reg_FPR(src),
               OpcP, RegOpc(dst) );
   ins_pipe( fpu_reg_reg );
 %}
 
 
 // Spill to obtain 24-bit precision
-instruct modF24_reg(stackSlotF dst, regF src1, regF src2, eAXRegI rax, eFlagsReg cr) %{
+instruct modFPR24_reg(stackSlotF dst, regFPR src1, regFPR src2, eAXRegI rax, eFlagsReg cr) %{
   predicate( UseSSE==0 && Compile::current()->select_24_bit_instr());
   match(Set dst (ModF src1 src2));
-  effect(KILL rax, KILL cr); // emitModD() uses EAX and EFLAGS
+  effect(KILL rax, KILL cr); // emitModDPR() uses EAX and EFLAGS
 
   format %{ "FMOD   $dst,$src1,$src2" %}
-  ins_encode( Push_Reg_Mod_D(src1, src2),
-              emitModD(),
-              Push_Result_Mod_D(src2),
-              Pop_Mem_F(dst));
+  ins_encode( Push_Reg_Mod_DPR(src1, src2),
+              emitModDPR(),
+              Push_Result_Mod_DPR(src2),
+              Pop_Mem_FPR(dst));
   ins_pipe( pipe_slow );
 %}
 //
 // This instruction does not round to 24-bits
-instruct modF_reg(regF dst, regF src, eAXRegI rax, eFlagsReg cr) %{
+instruct modFPR_reg(regFPR dst, regFPR src, eAXRegI rax, eFlagsReg cr) %{
   predicate( UseSSE==0 && !Compile::current()->select_24_bit_instr());
   match(Set dst (ModF dst src));
-  effect(KILL rax, KILL cr); // emitModD() uses EAX and EFLAGS
+  effect(KILL rax, KILL cr); // emitModDPR() uses EAX and EFLAGS
 
   format %{ "FMOD   $dst,$src" %}
-  ins_encode(Push_Reg_Mod_D(dst, src),
-              emitModD(),
-              Push_Result_Mod_D(src),
-              Pop_Reg_F(dst));
+  ins_encode(Push_Reg_Mod_DPR(dst, src),
+              emitModDPR(),
+              Push_Result_Mod_DPR(src),
+              Pop_Reg_FPR(dst));
   ins_pipe( pipe_slow );
 %}
 
-instruct modX_reg(regX dst, regX src0, regX src1, eAXRegI rax, eFlagsReg cr) %{
+instruct modF_reg(regF dst, regF src0, regF src1, eAXRegI rax, eFlagsReg cr) %{
   predicate(UseSSE>=1);
   match(Set dst (ModF src0 src1));
   effect(KILL rax, KILL cr);
@@ -11825,7 +11029,7 @@ instruct modX_reg(regX dst, regX src0, regX src1, eAXRegI rax, eFlagsReg cr) %{
           "\tFSTP   ST0\t # Restore FPU Stack"
     %}
   ins_cost(250);
-  ins_encode( Push_ModX_encoding(src0, src1), emitModD(), Push_ResultX(dst,0x4), PopFPU);
+  ins_encode( Push_ModF_encoding(src0, src1), emitModDPR(), Push_ResultF(dst,0x4), PopFPU);
   ins_pipe( pipe_slow );
 %}
 
@@ -11833,26 +11037,26 @@ instruct modX_reg(regX dst, regX src0, regX src1, eAXRegI rax, eFlagsReg cr) %{
 //----------Arithmetic Conversion Instructions---------------------------------
 // The conversions operations are all Alpha sorted.  Please keep it that way!
 
-instruct roundFloat_mem_reg(stackSlotF dst, regF src) %{
+instruct roundFloat_mem_reg(stackSlotF dst, regFPR src) %{
   predicate(UseSSE==0);
   match(Set dst (RoundFloat src));
   ins_cost(125);
   format %{ "FST_S  $dst,$src\t# F-round" %}
-  ins_encode( Pop_Mem_Reg_F(dst, src) );
+  ins_encode( Pop_Mem_Reg_FPR(dst, src) );
   ins_pipe( fpu_mem_reg );
 %}
 
-instruct roundDouble_mem_reg(stackSlotD dst, regD src) %{
+instruct roundDouble_mem_reg(stackSlotD dst, regDPR src) %{
   predicate(UseSSE<=1);
   match(Set dst (RoundDouble src));
   ins_cost(125);
   format %{ "FST_D  $dst,$src\t# D-round" %}
-  ins_encode( Pop_Mem_Reg_D(dst, src) );
+  ins_encode( Pop_Mem_Reg_DPR(dst, src) );
   ins_pipe( fpu_mem_reg );
 %}
 
 // Force rounding to 24-bit precision and 6-bit exponent
-instruct convD2F_reg(stackSlotF dst, regD src) %{
+instruct convDPR2FPR_reg(stackSlotF dst, regDPR src) %{
   predicate(UseSSE==0);
   match(Set dst (ConvD2F src));
   format %{ "FST_S  $dst,$src\t# F-round" %}
@@ -11862,7 +11066,7 @@ instruct convD2F_reg(stackSlotF dst, regD src) %{
 %}
 
 // Force rounding to 24-bit precision and 6-bit exponent
-instruct convD2X_reg(regX dst, regD src, eFlagsReg cr) %{
+instruct convDPR2F_reg(regF dst, regDPR src, eFlagsReg cr) %{
   predicate(UseSSE==1);
   match(Set dst (ConvD2F src));
   effect( KILL cr );
@@ -11870,29 +11074,40 @@ instruct convD2X_reg(regX dst, regD src, eFlagsReg cr) %{
             "FST_S  [ESP],$src\t# F-round\n\t"
             "MOVSS  $dst,[ESP]\n\t"
             "ADD ESP,4" %}
-  ins_encode( D2X_encoding(dst, src) );
+  ins_encode %{
+    __ subptr(rsp, 4);
+    if ($src$$reg != FPR1L_enc) {
+      __ fld_s($src$$reg-1);
+      __ fstp_s(Address(rsp, 0));
+    } else {
+      __ fst_s(Address(rsp, 0));
+    }
+    __ movflt($dst$$XMMRegister, Address(rsp, 0));
+    __ addptr(rsp, 4);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Force rounding double precision to single precision
-instruct convXD2X_reg(regX dst, regXD src) %{
+instruct convD2F_reg(regF dst, regD src) %{
   predicate(UseSSE>=2);
   match(Set dst (ConvD2F src));
   format %{ "CVTSD2SS $dst,$src\t# F-round" %}
-  opcode(0xF2, 0x0F, 0x5A);
-  ins_encode( OpcP, OpcS, Opcode(tertiary), RegReg(dst, src));
+  ins_encode %{
+    __ cvtsd2ss ($dst$$XMMRegister, $src$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
-instruct convF2D_reg_reg(regD dst, regF src) %{
+instruct convFPR2DPR_reg_reg(regDPR dst, regFPR src) %{
   predicate(UseSSE==0);
   match(Set dst (ConvF2D src));
   format %{ "FST_S  $dst,$src\t# D-round" %}
-  ins_encode( Pop_Reg_Reg_D(dst, src));
+  ins_encode( Pop_Reg_Reg_DPR(dst, src));
   ins_pipe( fpu_reg_reg );
 %}
 
-instruct convF2D_reg(stackSlotD dst, regF src) %{
+instruct convFPR2D_reg(stackSlotD dst, regFPR src) %{
   predicate(UseSSE==1);
   match(Set dst (ConvF2D src));
   format %{ "FST_D  $dst,$src\t# D-round" %}
@@ -11901,7 +11116,7 @@ instruct convF2D_reg(stackSlotD dst, regF src) %{
   %}
 %}
 
-instruct convX2D_reg(regD dst, regX src, eFlagsReg cr) %{
+instruct convF2DPR_reg(regDPR dst, regF src, eFlagsReg cr) %{
   predicate(UseSSE==1);
   match(Set dst (ConvF2D src));
   effect( KILL cr );
@@ -11910,21 +11125,28 @@ instruct convX2D_reg(regD dst, regX src, eFlagsReg cr) %{
             "FLD_S  [ESP]\n\t"
             "ADD    ESP,4\n\t"
             "FSTP   $dst\t# D-round" %}
-  ins_encode( X2D_encoding(dst, src), Pop_Reg_D(dst));
+  ins_encode %{
+    __ subptr(rsp, 4);
+    __ movflt(Address(rsp, 0), $src$$XMMRegister);
+    __ fld_s(Address(rsp, 0));
+    __ addptr(rsp, 4);
+    __ fstp_d($dst$$reg);
+  %}
   ins_pipe( pipe_slow );
 %}
 
-instruct convX2XD_reg(regXD dst, regX src) %{
+instruct convF2D_reg(regD dst, regF src) %{
   predicate(UseSSE>=2);
   match(Set dst (ConvF2D src));
   format %{ "CVTSS2SD $dst,$src\t# D-round" %}
-  opcode(0xF3, 0x0F, 0x5A);
-  ins_encode( OpcP, OpcS, Opcode(tertiary), RegReg(dst, src));
+  ins_encode %{
+    __ cvtss2sd ($dst$$XMMRegister, $src$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Convert a double to an int.  If the double is a NAN, stuff a zero in instead.
-instruct convD2I_reg_reg( eAXRegI dst, eDXRegI tmp, regD src, eFlagsReg cr ) %{
+instruct convDPR2I_reg_reg( eAXRegI dst, eDXRegI tmp, regDPR src, eFlagsReg cr ) %{
   predicate(UseSSE<=1);
   match(Set dst (ConvD2I src));
   effect( KILL tmp, KILL cr );
@@ -11939,12 +11161,12 @@ instruct convD2I_reg_reg( eAXRegI dst, eDXRegI tmp, regD src, eFlagsReg cr ) %{
             "FLD_D  $src\n\t"
             "CALL   d2i_wrapper\n"
       "fast:" %}
-  ins_encode( Push_Reg_D(src), D2I_encoding(src) );
+  ins_encode( Push_Reg_DPR(src), DPR2I_encoding(src) );
   ins_pipe( pipe_slow );
 %}
 
 // Convert a double to an int.  If the double is a NAN, stuff a zero in instead.
-instruct convXD2I_reg_reg( eAXRegI dst, eDXRegI tmp, regXD src, eFlagsReg cr ) %{
+instruct convD2I_reg_reg( eAXRegI dst, eDXRegI tmp, regD src, eFlagsReg cr ) %{
   predicate(UseSSE>=2);
   match(Set dst (ConvD2I src));
   effect( KILL tmp, KILL cr );
@@ -11957,12 +11179,22 @@ instruct convXD2I_reg_reg( eAXRegI dst, eDXRegI tmp, regXD src, eFlagsReg cr ) %
             "ADD    ESP, 8\n\t"
             "CALL   d2i_wrapper\n"
       "fast:" %}
-  opcode(0x1); // double-precision conversion
-  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x2C), FX2I_encoding(src,dst));
+  ins_encode %{
+    Label fast;
+    __ cvttsd2sil($dst$$Register, $src$$XMMRegister);
+    __ cmpl($dst$$Register, 0x80000000);
+    __ jccb(Assembler::notEqual, fast);
+    __ subptr(rsp, 8);
+    __ movdbl(Address(rsp, 0), $src$$XMMRegister);
+    __ fld_d(Address(rsp, 0));
+    __ addptr(rsp, 8);
+    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::d2i_wrapper())));
+    __ bind(fast);
+  %}
   ins_pipe( pipe_slow );
 %}
 
-instruct convD2L_reg_reg( eADXRegL dst, regD src, eFlagsReg cr ) %{
+instruct convDPR2L_reg_reg( eADXRegL dst, regDPR src, eFlagsReg cr ) %{
   predicate(UseSSE<=1);
   match(Set dst (ConvD2L src));
   effect( KILL cr );
@@ -11980,12 +11212,12 @@ instruct convD2L_reg_reg( eADXRegL dst, regD src, eFlagsReg cr ) %{
             "FLD    $src\n\t"
             "CALL   d2l_wrapper\n"
       "fast:" %}
-  ins_encode( Push_Reg_D(src),  D2L_encoding(src) );
+  ins_encode( Push_Reg_DPR(src),  DPR2L_encoding(src) );
   ins_pipe( pipe_slow );
 %}
 
 // XMM lacks a float/double->long conversion, so use the old FPU stack.
-instruct convXD2L_reg_reg( eADXRegL dst, regXD src, eFlagsReg cr ) %{
+instruct convD2L_reg_reg( eADXRegL dst, regD src, eFlagsReg cr ) %{
   predicate (UseSSE>=2);
   match(Set dst (ConvD2L src));
   effect( KILL cr );
@@ -12004,9 +11236,36 @@ instruct convXD2L_reg_reg( eADXRegL dst, regXD src, eFlagsReg cr ) %{
             "SUB    ESP,8\n\t"
             "MOVSD  [ESP],$src\n\t"
             "FLD_D  [ESP]\n\t"
+            "ADD    ESP,8\n\t"
             "CALL   d2l_wrapper\n"
       "fast:" %}
-  ins_encode( XD2L_encoding(src) );
+  ins_encode %{
+    Label fast;
+    __ subptr(rsp, 8);
+    __ movdbl(Address(rsp, 0), $src$$XMMRegister);
+    __ fld_d(Address(rsp, 0));
+    __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_trunc()));
+    __ fistp_d(Address(rsp, 0));
+    // Restore the rounding mode, mask the exception
+    if (Compile::current()->in_24_bit_fp_mode()) {
+      __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
+    } else {
+      __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
+    }
+    // Load the converted long, adjust CPU stack
+    __ pop(rax);
+    __ pop(rdx);
+    __ cmpl(rdx, 0x80000000);
+    __ jccb(Assembler::notEqual, fast);
+    __ testl(rax, rax);
+    __ jccb(Assembler::notEqual, fast);
+    __ subptr(rsp, 8);
+    __ movdbl(Address(rsp, 0), $src$$XMMRegister);
+    __ fld_d(Address(rsp, 0));
+    __ addptr(rsp, 8);
+    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::d2l_wrapper())));
+    __ bind(fast);
+  %}
   ins_pipe( pipe_slow );
 %}
 
@@ -12016,7 +11275,7 @@ instruct convXD2L_reg_reg( eADXRegL dst, regXD src, eFlagsReg cr ) %{
 // rounding mode to 'nearest'.  The hardware stores a flag value down
 // if we would overflow or converted a NAN; we check for this and
 // and go the slow path if needed.
-instruct convF2I_reg_reg(eAXRegI dst, eDXRegI tmp, regF src, eFlagsReg cr ) %{
+instruct convFPR2I_reg_reg(eAXRegI dst, eDXRegI tmp, regFPR src, eFlagsReg cr ) %{
   predicate(UseSSE==0);
   match(Set dst (ConvF2I src));
   effect( KILL tmp, KILL cr );
@@ -12031,13 +11290,13 @@ instruct convF2I_reg_reg(eAXRegI dst, eDXRegI tmp, regF src, eFlagsReg cr ) %{
             "FLD    $src\n\t"
             "CALL   d2i_wrapper\n"
       "fast:" %}
-  // D2I_encoding works for F2I
-  ins_encode( Push_Reg_F(src), D2I_encoding(src) );
+  // DPR2I_encoding works for FPR2I
+  ins_encode( Push_Reg_FPR(src), DPR2I_encoding(src) );
   ins_pipe( pipe_slow );
 %}
 
 // Convert a float in xmm to an int reg.
-instruct convX2I_reg(eAXRegI dst, eDXRegI tmp, regX src, eFlagsReg cr ) %{
+instruct convF2I_reg(eAXRegI dst, eDXRegI tmp, regF src, eFlagsReg cr ) %{
   predicate(UseSSE>=1);
   match(Set dst (ConvF2I src));
   effect( KILL tmp, KILL cr );
@@ -12050,12 +11309,22 @@ instruct convX2I_reg(eAXRegI dst, eDXRegI tmp, regX src, eFlagsReg cr ) %{
             "ADD    ESP, 4\n\t"
             "CALL   d2i_wrapper\n"
       "fast:" %}
-  opcode(0x0); // single-precision conversion
-  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x2C), FX2I_encoding(src,dst));
+  ins_encode %{
+    Label fast;
+    __ cvttss2sil($dst$$Register, $src$$XMMRegister);
+    __ cmpl($dst$$Register, 0x80000000);
+    __ jccb(Assembler::notEqual, fast);
+    __ subptr(rsp, 4);
+    __ movflt(Address(rsp, 0), $src$$XMMRegister);
+    __ fld_s(Address(rsp, 0));
+    __ addptr(rsp, 4);
+    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::d2i_wrapper())));
+    __ bind(fast);
+  %}
   ins_pipe( pipe_slow );
 %}
 
-instruct convF2L_reg_reg( eADXRegL dst, regF src, eFlagsReg cr ) %{
+instruct convFPR2L_reg_reg( eADXRegL dst, regFPR src, eFlagsReg cr ) %{
   predicate(UseSSE==0);
   match(Set dst (ConvF2L src));
   effect( KILL cr );
@@ -12073,13 +11342,13 @@ instruct convF2L_reg_reg( eADXRegL dst, regF src, eFlagsReg cr ) %{
             "FLD    $src\n\t"
             "CALL   d2l_wrapper\n"
       "fast:" %}
-  // D2L_encoding works for F2L
-  ins_encode( Push_Reg_F(src), D2L_encoding(src) );
+  // DPR2L_encoding works for FPR2L
+  ins_encode( Push_Reg_FPR(src), DPR2L_encoding(src) );
   ins_pipe( pipe_slow );
 %}
 
 // XMM lacks a float/double->long conversion, so use the old FPU stack.
-instruct convX2L_reg_reg( eADXRegL dst, regX src, eFlagsReg cr ) %{
+instruct convF2L_reg_reg( eADXRegL dst, regF src, eFlagsReg cr ) %{
   predicate (UseSSE>=1);
   match(Set dst (ConvF2L src));
   effect( KILL cr );
@@ -12101,39 +11370,67 @@ instruct convX2L_reg_reg( eADXRegL dst, regX src, eFlagsReg cr ) %{
             "ADD    ESP,4\n\t"
             "CALL   d2l_wrapper\n"
       "fast:" %}
-  ins_encode( X2L_encoding(src) );
+  ins_encode %{
+    Label fast;
+    __ subptr(rsp, 8);
+    __ movflt(Address(rsp, 0), $src$$XMMRegister);
+    __ fld_s(Address(rsp, 0));
+    __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_trunc()));
+    __ fistp_d(Address(rsp, 0));
+    // Restore the rounding mode, mask the exception
+    if (Compile::current()->in_24_bit_fp_mode()) {
+      __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
+    } else {
+      __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
+    }
+    // Load the converted long, adjust CPU stack
+    __ pop(rax);
+    __ pop(rdx);
+    __ cmpl(rdx, 0x80000000);
+    __ jccb(Assembler::notEqual, fast);
+    __ testl(rax, rax);
+    __ jccb(Assembler::notEqual, fast);
+    __ subptr(rsp, 4);
+    __ movflt(Address(rsp, 0), $src$$XMMRegister);
+    __ fld_s(Address(rsp, 0));
+    __ addptr(rsp, 4);
+    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::d2l_wrapper())));
+    __ bind(fast);
+  %}
   ins_pipe( pipe_slow );
 %}
 
-instruct convI2D_reg(regD dst, stackSlotI src) %{
+instruct convI2DPR_reg(regDPR dst, stackSlotI src) %{
   predicate( UseSSE<=1 );
   match(Set dst (ConvI2D src));
   format %{ "FILD   $src\n\t"
             "FSTP   $dst" %}
   opcode(0xDB, 0x0);  /* DB /0 */
-  ins_encode(Push_Mem_I(src), Pop_Reg_D(dst));
+  ins_encode(Push_Mem_I(src), Pop_Reg_DPR(dst));
   ins_pipe( fpu_reg_mem );
 %}
 
-instruct convI2XD_reg(regXD dst, eRegI src) %{
+instruct convI2D_reg(regD dst, eRegI src) %{
   predicate( UseSSE>=2 && !UseXmmI2D );
   match(Set dst (ConvI2D src));
   format %{ "CVTSI2SD $dst,$src" %}
-  opcode(0xF2, 0x0F, 0x2A);
-  ins_encode( OpcP, OpcS, Opcode(tertiary), RegReg(dst, src));
+  ins_encode %{
+    __ cvtsi2sdl ($dst$$XMMRegister, $src$$Register);
+  %}
   ins_pipe( pipe_slow );
 %}
 
-instruct convI2XD_mem(regXD dst, memory mem) %{
+instruct convI2D_mem(regD dst, memory mem) %{
   predicate( UseSSE>=2 );
   match(Set dst (ConvI2D (LoadI mem)));
   format %{ "CVTSI2SD $dst,$mem" %}
-  opcode(0xF2, 0x0F, 0x2A);
-  ins_encode( OpcP, OpcS, Opcode(tertiary), RegMem(dst, mem));
+  ins_encode %{
+    __ cvtsi2sdl ($dst$$XMMRegister, $mem$$Address);
+  %}
   ins_pipe( pipe_slow );
 %}
 
-instruct convXI2XD_reg(regXD dst, eRegI src)
+instruct convXI2D_reg(regD dst, eRegI src)
 %{
   predicate( UseSSE>=2 && UseXmmI2D );
   match(Set dst (ConvI2D src));
@@ -12147,31 +11444,31 @@ instruct convXI2XD_reg(regXD dst, eRegI src)
   ins_pipe(pipe_slow); // XXX
 %}
 
-instruct convI2D_mem(regD dst, memory mem) %{
+instruct convI2DPR_mem(regDPR dst, memory mem) %{
   predicate( UseSSE<=1 && !Compile::current()->select_24_bit_instr());
   match(Set dst (ConvI2D (LoadI mem)));
   format %{ "FILD   $mem\n\t"
             "FSTP   $dst" %}
   opcode(0xDB);      /* DB /0 */
   ins_encode( OpcP, RMopc_Mem(0x00,mem),
-              Pop_Reg_D(dst));
+              Pop_Reg_DPR(dst));
   ins_pipe( fpu_reg_mem );
 %}
 
 // Convert a byte to a float; no rounding step needed.
-instruct conv24I2F_reg(regF dst, stackSlotI src) %{
+instruct conv24I2FPR_reg(regFPR dst, stackSlotI src) %{
   predicate( UseSSE==0 && n->in(1)->Opcode() == Op_AndI && n->in(1)->in(2)->is_Con() && n->in(1)->in(2)->get_int() == 255 );
   match(Set dst (ConvI2F src));
   format %{ "FILD   $src\n\t"
             "FSTP   $dst" %}
 
   opcode(0xDB, 0x0);  /* DB /0 */
-  ins_encode(Push_Mem_I(src), Pop_Reg_F(dst));
+  ins_encode(Push_Mem_I(src), Pop_Reg_FPR(dst));
   ins_pipe( fpu_reg_mem );
 %}
 
 // In 24-bit mode, force exponent rounding by storing back out
-instruct convI2F_SSF(stackSlotF dst, stackSlotI src) %{
+instruct convI2FPR_SSF(stackSlotF dst, stackSlotI src) %{
   predicate( UseSSE==0 && Compile::current()->select_24_bit_instr());
   match(Set dst (ConvI2F src));
   ins_cost(200);
@@ -12179,12 +11476,12 @@ instruct convI2F_SSF(stackSlotF dst, stackSlotI src) %{
             "FSTP_S $dst" %}
   opcode(0xDB, 0x0);  /* DB /0 */
   ins_encode( Push_Mem_I(src),
-              Pop_Mem_F(dst));
+              Pop_Mem_FPR(dst));
   ins_pipe( fpu_mem_mem );
 %}
 
 // In 24-bit mode, force exponent rounding by storing back out
-instruct convI2F_SSF_mem(stackSlotF dst, memory mem) %{
+instruct convI2FPR_SSF_mem(stackSlotF dst, memory mem) %{
   predicate( UseSSE==0 && Compile::current()->select_24_bit_instr());
   match(Set dst (ConvI2F (LoadI mem)));
   ins_cost(200);
@@ -12192,46 +11489,46 @@ instruct convI2F_SSF_mem(stackSlotF dst, memory mem) %{
             "FSTP_S $dst" %}
   opcode(0xDB);  /* DB /0 */
   ins_encode( OpcP, RMopc_Mem(0x00,mem),
-              Pop_Mem_F(dst));
+              Pop_Mem_FPR(dst));
   ins_pipe( fpu_mem_mem );
 %}
 
 // This instruction does not round to 24-bits
-instruct convI2F_reg(regF dst, stackSlotI src) %{
+instruct convI2FPR_reg(regFPR dst, stackSlotI src) %{
   predicate( UseSSE==0 && !Compile::current()->select_24_bit_instr());
   match(Set dst (ConvI2F src));
   format %{ "FILD   $src\n\t"
             "FSTP   $dst" %}
   opcode(0xDB, 0x0);  /* DB /0 */
   ins_encode( Push_Mem_I(src),
-              Pop_Reg_F(dst));
+              Pop_Reg_FPR(dst));
   ins_pipe( fpu_reg_mem );
 %}
 
 // This instruction does not round to 24-bits
-instruct convI2F_mem(regF dst, memory mem) %{
+instruct convI2FPR_mem(regFPR dst, memory mem) %{
   predicate( UseSSE==0 && !Compile::current()->select_24_bit_instr());
   match(Set dst (ConvI2F (LoadI mem)));
   format %{ "FILD   $mem\n\t"
             "FSTP   $dst" %}
   opcode(0xDB);      /* DB /0 */
   ins_encode( OpcP, RMopc_Mem(0x00,mem),
-              Pop_Reg_F(dst));
+              Pop_Reg_FPR(dst));
   ins_pipe( fpu_reg_mem );
 %}
 
 // Convert an int to a float in xmm; no rounding step needed.
-instruct convI2X_reg(regX dst, eRegI src) %{
+instruct convI2F_reg(regF dst, eRegI src) %{
   predicate( UseSSE==1 || UseSSE>=2 && !UseXmmI2F );
   match(Set dst (ConvI2F src));
   format %{ "CVTSI2SS $dst, $src" %}
-
-  opcode(0xF3, 0x0F, 0x2A);  /* F3 0F 2A /r */
-  ins_encode( OpcP, OpcS, Opcode(tertiary), RegReg(dst, src));
+  ins_encode %{
+    __ cvtsi2ssl ($dst$$XMMRegister, $src$$Register);
+  %}
   ins_pipe( pipe_slow );
 %}
 
- instruct convXI2X_reg(regX dst, eRegI src)
+ instruct convXI2F_reg(regF dst, eRegI src)
 %{
   predicate( UseSSE>=2 && UseXmmI2F );
   match(Set dst (ConvI2F src));
@@ -12280,7 +11577,7 @@ instruct zerox_long(eRegL dst, eRegL src, immL_32bits mask, eFlagsReg flags ) %{
   ins_pipe( ialu_reg_reg_long );
 %}
 
-instruct convL2D_reg( stackSlotD dst, eRegL src, eFlagsReg cr) %{
+instruct convL2DPR_reg( stackSlotD dst, eRegL src, eFlagsReg cr) %{
   predicate (UseSSE<=1);
   match(Set dst (ConvL2D src));
   effect( KILL cr );
@@ -12290,11 +11587,11 @@ instruct convL2D_reg( stackSlotD dst, eRegL src, eFlagsReg cr) %{
             "ADD    ESP,8\n\t"
             "FSTP_D $dst\t# D-round" %}
   opcode(0xDF, 0x5);  /* DF /5 */
-  ins_encode(convert_long_double(src), Pop_Mem_D(dst));
+  ins_encode(convert_long_double(src), Pop_Mem_DPR(dst));
   ins_pipe( pipe_slow );
 %}
 
-instruct convL2XD_reg( regXD dst, eRegL src, eFlagsReg cr) %{
+instruct convL2D_reg( regD dst, eRegL src, eFlagsReg cr) %{
   predicate (UseSSE>=2);
   match(Set dst (ConvL2D src));
   effect( KILL cr );
@@ -12305,11 +11602,11 @@ instruct convL2XD_reg( regXD dst, eRegL src, eFlagsReg cr) %{
             "MOVSD  $dst,[ESP]\n\t"
             "ADD    ESP,8" %}
   opcode(0xDF, 0x5);  /* DF /5 */
-  ins_encode(convert_long_double2(src), Push_ResultXD(dst));
+  ins_encode(convert_long_double2(src), Push_ResultD(dst));
   ins_pipe( pipe_slow );
 %}
 
-instruct convL2X_reg( regX dst, eRegL src, eFlagsReg cr) %{
+instruct convL2F_reg( regF dst, eRegL src, eFlagsReg cr) %{
   predicate (UseSSE>=1);
   match(Set dst (ConvL2F src));
   effect( KILL cr );
@@ -12320,11 +11617,11 @@ instruct convL2X_reg( regX dst, eRegL src, eFlagsReg cr) %{
             "MOVSS  $dst,[ESP]\n\t"
             "ADD    ESP,8" %}
   opcode(0xDF, 0x5);  /* DF /5 */
-  ins_encode(convert_long_double2(src), Push_ResultX(dst,0x8));
+  ins_encode(convert_long_double2(src), Push_ResultF(dst,0x8));
   ins_pipe( pipe_slow );
 %}
 
-instruct convL2F_reg( stackSlotF dst, eRegL src, eFlagsReg cr) %{
+instruct convL2FPR_reg( stackSlotF dst, eRegL src, eFlagsReg cr) %{
   match(Set dst (ConvL2F src));
   effect( KILL cr );
   format %{ "PUSH   $src.hi\t# Convert long to single float\n\t"
@@ -12333,7 +11630,7 @@ instruct convL2F_reg( stackSlotF dst, eRegL src, eFlagsReg cr) %{
             "ADD    ESP,8\n\t"
             "FSTP_S $dst\t# F-round" %}
   opcode(0xDF, 0x5);  /* DF /5 */
-  ins_encode(convert_long_double(src), Pop_Mem_F(dst));
+  ins_encode(convert_long_double(src), Pop_Mem_FPR(dst));
   ins_pipe( pipe_slow );
 %}
 
@@ -12351,40 +11648,45 @@ instruct MoveF2I_stack_reg(eRegI dst, stackSlotF src) %{
   effect( DEF dst, USE src );
   ins_cost(100);
   format %{ "MOV    $dst,$src\t# MoveF2I_stack_reg" %}
-  opcode(0x8B);
-  ins_encode( OpcP, RegMem(dst,src));
+  ins_encode %{
+    __ movl($dst$$Register, Address(rsp, $src$$disp));
+  %}
   ins_pipe( ialu_reg_mem );
 %}
 
-instruct MoveF2I_reg_stack(stackSlotI dst, regF src) %{
+instruct MoveFPR2I_reg_stack(stackSlotI dst, regFPR src) %{
   predicate(UseSSE==0);
   match(Set dst (MoveF2I src));
   effect( DEF dst, USE src );
 
   ins_cost(125);
   format %{ "FST_S  $dst,$src\t# MoveF2I_reg_stack" %}
-  ins_encode( Pop_Mem_Reg_F(dst, src) );
+  ins_encode( Pop_Mem_Reg_FPR(dst, src) );
   ins_pipe( fpu_mem_reg );
 %}
 
-instruct MoveF2I_reg_stack_sse(stackSlotI dst, regX src) %{
+instruct MoveF2I_reg_stack_sse(stackSlotI dst, regF src) %{
   predicate(UseSSE>=1);
   match(Set dst (MoveF2I src));
   effect( DEF dst, USE src );
 
   ins_cost(95);
   format %{ "MOVSS  $dst,$src\t# MoveF2I_reg_stack_sse" %}
-  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x11), RegMem(src, dst));
+  ins_encode %{
+    __ movflt(Address(rsp, $dst$$disp), $src$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
-instruct MoveF2I_reg_reg_sse(eRegI dst, regX src) %{
+instruct MoveF2I_reg_reg_sse(eRegI dst, regF src) %{
   predicate(UseSSE>=2);
   match(Set dst (MoveF2I src));
   effect( DEF dst, USE src );
   ins_cost(85);
   format %{ "MOVD   $dst,$src\t# MoveF2I_reg_reg_sse" %}
-  ins_encode( MovX2I_reg(dst, src));
+  ins_encode %{
+    __ movdl($dst$$Register, $src$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
@@ -12394,13 +11696,14 @@ instruct MoveI2F_reg_stack(stackSlotF dst, eRegI src) %{
 
   ins_cost(100);
   format %{ "MOV    $dst,$src\t# MoveI2F_reg_stack" %}
-  opcode(0x89);
-  ins_encode( OpcPRegSS( dst, src ) );
+  ins_encode %{
+    __ movl(Address(rsp, $dst$$disp), $src$$Register);
+  %}
   ins_pipe( ialu_mem_reg );
 %}
 
 
-instruct MoveI2F_stack_reg(regF dst, stackSlotI src) %{
+instruct MoveI2FPR_stack_reg(regFPR dst, stackSlotI src) %{
   predicate(UseSSE==0);
   match(Set dst (MoveI2F src));
   effect(DEF dst, USE src);
@@ -12410,29 +11713,33 @@ instruct MoveI2F_stack_reg(regF dst, stackSlotI src) %{
             "FSTP   $dst\t# MoveI2F_stack_reg" %}
   opcode(0xD9);               /* D9 /0, FLD m32real */
   ins_encode( OpcP, RMopc_Mem_no_oop(0x00,src),
-              Pop_Reg_F(dst) );
+              Pop_Reg_FPR(dst) );
   ins_pipe( fpu_reg_mem );
 %}
 
-instruct MoveI2F_stack_reg_sse(regX dst, stackSlotI src) %{
+instruct MoveI2F_stack_reg_sse(regF dst, stackSlotI src) %{
   predicate(UseSSE>=1);
   match(Set dst (MoveI2F src));
   effect( DEF dst, USE src );
 
   ins_cost(95);
   format %{ "MOVSS  $dst,$src\t# MoveI2F_stack_reg_sse" %}
-  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x10), RegMem(dst,src));
+  ins_encode %{
+    __ movflt($dst$$XMMRegister, Address(rsp, $src$$disp));
+  %}
   ins_pipe( pipe_slow );
 %}
 
-instruct MoveI2F_reg_reg_sse(regX dst, eRegI src) %{
+instruct MoveI2F_reg_reg_sse(regF dst, eRegI src) %{
   predicate(UseSSE>=2);
   match(Set dst (MoveI2F src));
   effect( DEF dst, USE src );
 
   ins_cost(85);
   format %{ "MOVD   $dst,$src\t# MoveI2F_reg_reg_sse" %}
-  ins_encode( MovI2X_reg(dst, src) );
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+  %}
   ins_pipe( pipe_slow );
 %}
 
@@ -12448,29 +11755,30 @@ instruct MoveD2L_stack_reg(eRegL dst, stackSlotD src) %{
   ins_pipe( ialu_mem_long_reg );
 %}
 
-instruct MoveD2L_reg_stack(stackSlotL dst, regD src) %{
+instruct MoveDPR2L_reg_stack(stackSlotL dst, regDPR src) %{
   predicate(UseSSE<=1);
   match(Set dst (MoveD2L src));
   effect(DEF dst, USE src);
 
   ins_cost(125);
   format %{ "FST_D  $dst,$src\t# MoveD2L_reg_stack" %}
-  ins_encode( Pop_Mem_Reg_D(dst, src) );
+  ins_encode( Pop_Mem_Reg_DPR(dst, src) );
   ins_pipe( fpu_mem_reg );
 %}
 
-instruct MoveD2L_reg_stack_sse(stackSlotL dst, regXD src) %{
+instruct MoveD2L_reg_stack_sse(stackSlotL dst, regD src) %{
   predicate(UseSSE>=2);
   match(Set dst (MoveD2L src));
   effect(DEF dst, USE src);
   ins_cost(95);
-
   format %{ "MOVSD  $dst,$src\t# MoveD2L_reg_stack_sse" %}
-  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x11), RegMem(src,dst));
+  ins_encode %{
+    __ movdbl(Address(rsp, $dst$$disp), $src$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
-instruct MoveD2L_reg_reg_sse(eRegL dst, regXD src, regXD tmp) %{
+instruct MoveD2L_reg_reg_sse(eRegL dst, regD src, regD tmp) %{
   predicate(UseSSE>=2);
   match(Set dst (MoveD2L src));
   effect(DEF dst, USE src, TEMP tmp);
@@ -12478,7 +11786,11 @@ instruct MoveD2L_reg_reg_sse(eRegL dst, regXD src, regXD tmp) %{
   format %{ "MOVD   $dst.lo,$src\n\t"
             "PSHUFLW $tmp,$src,0x4E\n\t"
             "MOVD   $dst.hi,$tmp\t# MoveD2L_reg_reg_sse" %}
-  ins_encode( MovXD2L_reg(dst, src, tmp) );
+  ins_encode %{
+    __ movdl($dst$$Register, $src$$XMMRegister);
+    __ pshuflw($tmp$$XMMRegister, $src$$XMMRegister, 0x4e);
+    __ movdl(HIGH_FROM_LOW($dst$$Register), $tmp$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
@@ -12495,7 +11807,7 @@ instruct MoveL2D_reg_stack(stackSlotD dst, eRegL src) %{
 %}
 
 
-instruct MoveL2D_stack_reg(regD dst, stackSlotL src) %{
+instruct MoveL2DPR_stack_reg(regDPR dst, stackSlotL src) %{
   predicate(UseSSE<=1);
   match(Set dst (MoveL2D src));
   effect(DEF dst, USE src);
@@ -12505,34 +11817,38 @@ instruct MoveL2D_stack_reg(regD dst, stackSlotL src) %{
             "FSTP   $dst\t# MoveL2D_stack_reg" %}
   opcode(0xDD);               /* DD /0, FLD m64real */
   ins_encode( OpcP, RMopc_Mem_no_oop(0x00,src),
-              Pop_Reg_D(dst) );
+              Pop_Reg_DPR(dst) );
   ins_pipe( fpu_reg_mem );
 %}
 
 
-instruct MoveL2D_stack_reg_sse(regXD dst, stackSlotL src) %{
+instruct MoveL2D_stack_reg_sse(regD dst, stackSlotL src) %{
   predicate(UseSSE>=2 && UseXmmLoadAndClearUpper);
   match(Set dst (MoveL2D src));
   effect(DEF dst, USE src);
 
   ins_cost(95);
   format %{ "MOVSD  $dst,$src\t# MoveL2D_stack_reg_sse" %}
-  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x10), RegMem(dst,src));
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, Address(rsp, $src$$disp));
+  %}
   ins_pipe( pipe_slow );
 %}
 
-instruct MoveL2D_stack_reg_sse_partial(regXD dst, stackSlotL src) %{
+instruct MoveL2D_stack_reg_sse_partial(regD dst, stackSlotL src) %{
   predicate(UseSSE>=2 && !UseXmmLoadAndClearUpper);
   match(Set dst (MoveL2D src));
   effect(DEF dst, USE src);
 
   ins_cost(95);
   format %{ "MOVLPD $dst,$src\t# MoveL2D_stack_reg_sse" %}
-  ins_encode( Opcode(0x66), Opcode(0x0F), Opcode(0x12), RegMem(dst,src));
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, Address(rsp, $src$$disp));
+  %}
   ins_pipe( pipe_slow );
 %}
 
-instruct MoveL2D_reg_reg_sse(regXD dst, eRegL src, regXD tmp) %{
+instruct MoveL2D_reg_reg_sse(regD dst, eRegL src, regD tmp) %{
   predicate(UseSSE>=2);
   match(Set dst (MoveL2D src));
   effect(TEMP dst, USE src, TEMP tmp);
@@ -12540,149 +11856,192 @@ instruct MoveL2D_reg_reg_sse(regXD dst, eRegL src, regXD tmp) %{
   format %{ "MOVD   $dst,$src.lo\n\t"
             "MOVD   $tmp,$src.hi\n\t"
             "PUNPCKLDQ $dst,$tmp\t# MoveL2D_reg_reg_sse" %}
-  ins_encode( MovL2XD_reg(dst, src, tmp) );
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
+    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Replicate scalar to packed byte (1 byte) values in xmm
-instruct Repl8B_reg(regXD dst, regXD src) %{
+instruct Repl8B_reg(regD dst, regD src) %{
   predicate(UseSSE>=2);
   match(Set dst (Replicate8B src));
   format %{ "MOVDQA  $dst,$src\n\t"
             "PUNPCKLBW $dst,$dst\n\t"
             "PSHUFLW $dst,$dst,0x00\t! replicate8B" %}
-  ins_encode( pshufd_8x8(dst, src));
+  ins_encode %{
+    if ($dst$$reg != $src$$reg) {
+      __ movdqa($dst$$XMMRegister, $src$$XMMRegister);
+    }
+    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Replicate scalar to packed byte (1 byte) values in xmm
-instruct Repl8B_eRegI(regXD dst, eRegI src) %{
+instruct Repl8B_eRegI(regD dst, eRegI src) %{
   predicate(UseSSE>=2);
   match(Set dst (Replicate8B src));
   format %{ "MOVD    $dst,$src\n\t"
             "PUNPCKLBW $dst,$dst\n\t"
             "PSHUFLW $dst,$dst,0x00\t! replicate8B" %}
-  ins_encode( mov_i2x(dst, src), pshufd_8x8(dst, dst));
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Replicate scalar zero to packed byte (1 byte) values in xmm
-instruct Repl8B_immI0(regXD dst, immI0 zero) %{
+instruct Repl8B_immI0(regD dst, immI0 zero) %{
   predicate(UseSSE>=2);
   match(Set dst (Replicate8B zero));
   format %{ "PXOR  $dst,$dst\t! replicate8B" %}
-  ins_encode( pxor(dst, dst));
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
   ins_pipe( fpu_reg_reg );
 %}
 
 // Replicate scalar to packed shore (2 byte) values in xmm
-instruct Repl4S_reg(regXD dst, regXD src) %{
+instruct Repl4S_reg(regD dst, regD src) %{
   predicate(UseSSE>=2);
   match(Set dst (Replicate4S src));
   format %{ "PSHUFLW $dst,$src,0x00\t! replicate4S" %}
-  ins_encode( pshufd_4x16(dst, src));
+  ins_encode %{
+    __ pshuflw($dst$$XMMRegister, $src$$XMMRegister, 0x00);
+  %}
   ins_pipe( fpu_reg_reg );
 %}
 
 // Replicate scalar to packed shore (2 byte) values in xmm
-instruct Repl4S_eRegI(regXD dst, eRegI src) %{
+instruct Repl4S_eRegI(regD dst, eRegI src) %{
   predicate(UseSSE>=2);
   match(Set dst (Replicate4S src));
   format %{ "MOVD    $dst,$src\n\t"
             "PSHUFLW $dst,$dst,0x00\t! replicate4S" %}
-  ins_encode( mov_i2x(dst, src), pshufd_4x16(dst, dst));
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
   ins_pipe( fpu_reg_reg );
 %}
 
 // Replicate scalar zero to packed short (2 byte) values in xmm
-instruct Repl4S_immI0(regXD dst, immI0 zero) %{
+instruct Repl4S_immI0(regD dst, immI0 zero) %{
   predicate(UseSSE>=2);
   match(Set dst (Replicate4S zero));
   format %{ "PXOR  $dst,$dst\t! replicate4S" %}
-  ins_encode( pxor(dst, dst));
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
   ins_pipe( fpu_reg_reg );
 %}
 
 // Replicate scalar to packed char (2 byte) values in xmm
-instruct Repl4C_reg(regXD dst, regXD src) %{
+instruct Repl4C_reg(regD dst, regD src) %{
   predicate(UseSSE>=2);
   match(Set dst (Replicate4C src));
   format %{ "PSHUFLW $dst,$src,0x00\t! replicate4C" %}
-  ins_encode( pshufd_4x16(dst, src));
+  ins_encode %{
+    __ pshuflw($dst$$XMMRegister, $src$$XMMRegister, 0x00);
+  %}
   ins_pipe( fpu_reg_reg );
 %}
 
 // Replicate scalar to packed char (2 byte) values in xmm
-instruct Repl4C_eRegI(regXD dst, eRegI src) %{
+instruct Repl4C_eRegI(regD dst, eRegI src) %{
   predicate(UseSSE>=2);
   match(Set dst (Replicate4C src));
   format %{ "MOVD    $dst,$src\n\t"
             "PSHUFLW $dst,$dst,0x00\t! replicate4C" %}
-  ins_encode( mov_i2x(dst, src), pshufd_4x16(dst, dst));
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
   ins_pipe( fpu_reg_reg );
 %}
 
 // Replicate scalar zero to packed char (2 byte) values in xmm
-instruct Repl4C_immI0(regXD dst, immI0 zero) %{
+instruct Repl4C_immI0(regD dst, immI0 zero) %{
   predicate(UseSSE>=2);
   match(Set dst (Replicate4C zero));
   format %{ "PXOR  $dst,$dst\t! replicate4C" %}
-  ins_encode( pxor(dst, dst));
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
   ins_pipe( fpu_reg_reg );
 %}
 
 // Replicate scalar to packed integer (4 byte) values in xmm
-instruct Repl2I_reg(regXD dst, regXD src) %{
+instruct Repl2I_reg(regD dst, regD src) %{
   predicate(UseSSE>=2);
   match(Set dst (Replicate2I src));
   format %{ "PSHUFD $dst,$src,0x00\t! replicate2I" %}
-  ins_encode( pshufd(dst, src, 0x00));
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
+  %}
   ins_pipe( fpu_reg_reg );
 %}
 
 // Replicate scalar to packed integer (4 byte) values in xmm
-instruct Repl2I_eRegI(regXD dst, eRegI src) %{
+instruct Repl2I_eRegI(regD dst, eRegI src) %{
   predicate(UseSSE>=2);
   match(Set dst (Replicate2I src));
   format %{ "MOVD   $dst,$src\n\t"
             "PSHUFD $dst,$dst,0x00\t! replicate2I" %}
-  ins_encode( mov_i2x(dst, src), pshufd(dst, dst, 0x00));
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
   ins_pipe( fpu_reg_reg );
 %}
 
 // Replicate scalar zero to packed integer (2 byte) values in xmm
-instruct Repl2I_immI0(regXD dst, immI0 zero) %{
+instruct Repl2I_immI0(regD dst, immI0 zero) %{
   predicate(UseSSE>=2);
   match(Set dst (Replicate2I zero));
   format %{ "PXOR  $dst,$dst\t! replicate2I" %}
-  ins_encode( pxor(dst, dst));
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
   ins_pipe( fpu_reg_reg );
 %}
 
 // Replicate scalar to packed single precision floating point values in xmm
-instruct Repl2F_reg(regXD dst, regXD src) %{
+instruct Repl2F_reg(regD dst, regD src) %{
   predicate(UseSSE>=2);
   match(Set dst (Replicate2F src));
   format %{ "PSHUFD $dst,$src,0xe0\t! replicate2F" %}
-  ins_encode( pshufd(dst, src, 0xe0));
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0xe0);
+  %}
   ins_pipe( fpu_reg_reg );
 %}
 
 // Replicate scalar to packed single precision floating point values in xmm
-instruct Repl2F_regX(regXD dst, regX src) %{
+instruct Repl2F_regF(regD dst, regF src) %{
   predicate(UseSSE>=2);
   match(Set dst (Replicate2F src));
   format %{ "PSHUFD $dst,$src,0xe0\t! replicate2F" %}
-  ins_encode( pshufd(dst, src, 0xe0));
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0xe0);
+  %}
   ins_pipe( fpu_reg_reg );
 %}
 
 // Replicate scalar to packed single precision floating point values in xmm
-instruct Repl2F_immXF0(regXD dst, immXF0 zero) %{
+instruct Repl2F_immF0(regD dst, immF0 zero) %{
   predicate(UseSSE>=2);
   match(Set dst (Replicate2F zero));
   format %{ "PXOR  $dst,$dst\t! replicate2F" %}
-  ins_encode( pxor(dst, dst));
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
   ins_pipe( fpu_reg_reg );
 %}
 
@@ -12702,7 +12061,7 @@ instruct rep_stos(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlag
 %}
 
 instruct string_compare(eDIRegP str1, eCXRegI cnt1, eSIRegP str2, eDXRegI cnt2,
-                        eAXRegI result, regXD tmp1, eFlagsReg cr) %{
+                        eAXRegI result, regD tmp1, eFlagsReg cr) %{
   match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
   effect(TEMP tmp1, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr);
 
@@ -12717,7 +12076,7 @@ instruct string_compare(eDIRegP str1, eCXRegI cnt1, eSIRegP str2, eDXRegI cnt2,
 
 // fast string equals
 instruct string_equals(eDIRegP str1, eSIRegP str2, eCXRegI cnt, eAXRegI result,
-                       regXD tmp1, regXD tmp2, eBXRegI tmp3, eFlagsReg cr) %{
+                       regD tmp1, regD tmp2, eBXRegI tmp3, eFlagsReg cr) %{
   match(Set result (StrEquals (Binary str1 str2) cnt));
   effect(TEMP tmp1, TEMP tmp2, USE_KILL str1, USE_KILL str2, USE_KILL cnt, KILL tmp3, KILL cr);
 
@@ -12732,7 +12091,7 @@ instruct string_equals(eDIRegP str1, eSIRegP str2, eCXRegI cnt, eAXRegI result,
 
 // fast search of substring with known size.
 instruct string_indexof_con(eDIRegP str1, eDXRegI cnt1, eSIRegP str2, immI int_cnt2,
-                            eBXRegI result, regXD vec, eAXRegI cnt2, eCXRegI tmp, eFlagsReg cr) %{
+                            eBXRegI result, regD vec, eAXRegI cnt2, eCXRegI tmp, eFlagsReg cr) %{
   predicate(UseSSE42Intrinsics);
   match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 int_cnt2)));
   effect(TEMP vec, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, KILL cnt2, KILL tmp, KILL cr);
@@ -12759,7 +12118,7 @@ instruct string_indexof_con(eDIRegP str1, eDXRegI cnt1, eSIRegP str2, immI int_c
 %}
 
 instruct string_indexof(eDIRegP str1, eDXRegI cnt1, eSIRegP str2, eAXRegI cnt2,
-                        eBXRegI result, regXD vec, eCXRegI tmp, eFlagsReg cr) %{
+                        eBXRegI result, regD vec, eCXRegI tmp, eFlagsReg cr) %{
   predicate(UseSSE42Intrinsics);
   match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 cnt2)));
   effect(TEMP vec, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL tmp, KILL cr);
@@ -12776,7 +12135,7 @@ instruct string_indexof(eDIRegP str1, eDXRegI cnt1, eSIRegP str2, eAXRegI cnt2,
 
 // fast array equals
 instruct array_equals(eDIRegP ary1, eSIRegP ary2, eAXRegI result,
-                      regXD tmp1, regXD tmp2, eCXRegI tmp3, eBXRegI tmp4, eFlagsReg cr)
+                      regD tmp1, regD tmp2, eCXRegI tmp3, eBXRegI tmp4, eFlagsReg cr)
 %{
   match(Set result (AryEq ary1 ary2));
   effect(TEMP tmp1, TEMP tmp2, USE_KILL ary1, USE_KILL ary2, KILL tmp3, KILL tmp4, KILL cr);
@@ -13602,40 +12961,40 @@ instruct cmovPP_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, eRegP dst, eRegP s
 %}
 
 // Compare 2 longs and CMOVE doubles
-instruct cmovDD_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, regD dst, regD src) %{
+instruct cmovDDPR_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, regDPR dst, regDPR src) %{
   predicate( UseSSE<=1 && _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge );
   match(Set dst (CMoveD (Binary cmp flags) (Binary dst src)));
   ins_cost(200);
+  expand %{
+    fcmovDPR_regS(cmp,flags,dst,src);
+  %}
+%}
+
+// Compare 2 longs and CMOVE doubles
+instruct cmovDD_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, regD dst, regD src) %{
+  predicate( UseSSE>=2 && _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge );
+  match(Set dst (CMoveD (Binary cmp flags) (Binary dst src)));
+  ins_cost(200);
   expand %{
     fcmovD_regS(cmp,flags,dst,src);
   %}
 %}
 
-// Compare 2 longs and CMOVE doubles
-instruct cmovXDD_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, regXD dst, regXD src) %{
-  predicate( UseSSE>=2 && _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge );
-  match(Set dst (CMoveD (Binary cmp flags) (Binary dst src)));
-  ins_cost(200);
-  expand %{
-    fcmovXD_regS(cmp,flags,dst,src);
-  %}
-%}
-
-instruct cmovFF_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, regF dst, regF src) %{
+instruct cmovFFPR_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, regFPR dst, regFPR src) %{
   predicate( UseSSE==0 && _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge );
   match(Set dst (CMoveF (Binary cmp flags) (Binary dst src)));
   ins_cost(200);
   expand %{
-    fcmovF_regS(cmp,flags,dst,src);
+    fcmovFPR_regS(cmp,flags,dst,src);
   %}
 %}
 
-instruct cmovXX_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, regX dst, regX src) %{
+instruct cmovFF_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, regF dst, regF src) %{
   predicate( UseSSE>=1 && _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge );
   match(Set dst (CMoveF (Binary cmp flags) (Binary dst src)));
   ins_cost(200);
   expand %{
-    fcmovX_regS(cmp,flags,dst,src);
+    fcmovF_regS(cmp,flags,dst,src);
   %}
 %}
 
@@ -13730,40 +13089,40 @@ instruct cmovPP_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, eRegP dst, eRegP s
 %}
 
 // Compare 2 longs and CMOVE doubles
-instruct cmovDD_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, regD dst, regD src) %{
+instruct cmovDDPR_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, regDPR dst, regDPR src) %{
   predicate( UseSSE<=1 && _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne );
   match(Set dst (CMoveD (Binary cmp flags) (Binary dst src)));
   ins_cost(200);
+  expand %{
+    fcmovDPR_regS(cmp,flags,dst,src);
+  %}
+%}
+
+// Compare 2 longs and CMOVE doubles
+instruct cmovDD_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, regD dst, regD src) %{
+  predicate( UseSSE>=2 && _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne );
+  match(Set dst (CMoveD (Binary cmp flags) (Binary dst src)));
+  ins_cost(200);
   expand %{
     fcmovD_regS(cmp,flags,dst,src);
   %}
 %}
 
-// Compare 2 longs and CMOVE doubles
-instruct cmovXDD_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, regXD dst, regXD src) %{
-  predicate( UseSSE>=2 && _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne );
-  match(Set dst (CMoveD (Binary cmp flags) (Binary dst src)));
-  ins_cost(200);
-  expand %{
-    fcmovXD_regS(cmp,flags,dst,src);
-  %}
-%}
-
-instruct cmovFF_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, regF dst, regF src) %{
+instruct cmovFFPR_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, regFPR dst, regFPR src) %{
   predicate( UseSSE==0 && _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne );
   match(Set dst (CMoveF (Binary cmp flags) (Binary dst src)));
   ins_cost(200);
   expand %{
-    fcmovF_regS(cmp,flags,dst,src);
+    fcmovFPR_regS(cmp,flags,dst,src);
   %}
 %}
 
-instruct cmovXX_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, regX dst, regX src) %{
+instruct cmovFF_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, regF dst, regF src) %{
   predicate( UseSSE>=1 && _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne );
   match(Set dst (CMoveF (Binary cmp flags) (Binary dst src)));
   ins_cost(200);
   expand %{
-    fcmovX_regS(cmp,flags,dst,src);
+    fcmovF_regS(cmp,flags,dst,src);
   %}
 %}
 
@@ -13863,41 +13222,41 @@ instruct cmovPP_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, eRegP dst,
 %}
 
 // Compare 2 longs and CMOVE doubles
-instruct cmovDD_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, regD dst, regD src) %{
+instruct cmovDDPR_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, regDPR dst, regDPR src) %{
   predicate( UseSSE<=1 && _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt );
   match(Set dst (CMoveD (Binary cmp flags) (Binary dst src)));
   ins_cost(200);
+  expand %{
+    fcmovDPR_regS(cmp,flags,dst,src);
+  %}
+%}
+
+// Compare 2 longs and CMOVE doubles
+instruct cmovDD_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, regD dst, regD src) %{
+  predicate( UseSSE>=2 && _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt );
+  match(Set dst (CMoveD (Binary cmp flags) (Binary dst src)));
+  ins_cost(200);
   expand %{
     fcmovD_regS(cmp,flags,dst,src);
   %}
 %}
 
-// Compare 2 longs and CMOVE doubles
-instruct cmovXDD_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, regXD dst, regXD src) %{
-  predicate( UseSSE>=2 && _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt );
-  match(Set dst (CMoveD (Binary cmp flags) (Binary dst src)));
-  ins_cost(200);
-  expand %{
-    fcmovXD_regS(cmp,flags,dst,src);
-  %}
-%}
-
-instruct cmovFF_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, regF dst, regF src) %{
+instruct cmovFFPR_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, regFPR dst, regFPR src) %{
   predicate( UseSSE==0 && _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt );
   match(Set dst (CMoveF (Binary cmp flags) (Binary dst src)));
   ins_cost(200);
   expand %{
-    fcmovF_regS(cmp,flags,dst,src);
+    fcmovFPR_regS(cmp,flags,dst,src);
   %}
 %}
 
 
-instruct cmovXX_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, regX dst, regX src) %{
+instruct cmovFF_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, regF dst, regF src) %{
   predicate( UseSSE>=1 && _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt );
   match(Set dst (CMoveF (Binary cmp flags) (Binary dst src)));
   ins_cost(200);
   expand %{
-    fcmovX_regS(cmp,flags,dst,src);
+    fcmovF_regS(cmp,flags,dst,src);
   %}
 %}
 
@@ -14076,20 +13435,20 @@ instruct RethrowException()
 // inlined locking and unlocking
 
 
-instruct cmpFastLock( eFlagsReg cr, eRegP object, eRegP box, eAXRegI tmp, eRegP scr) %{
+instruct cmpFastLock( eFlagsReg cr, eRegP object, eBXRegP box, eAXRegI tmp, eRegP scr) %{
   match( Set cr (FastLock object box) );
-  effect( TEMP tmp, TEMP scr );
+  effect( TEMP tmp, TEMP scr, USE_KILL box );
   ins_cost(300);
-  format %{ "FASTLOCK $object, $box KILLS $tmp,$scr" %}
+  format %{ "FASTLOCK $object,$box\t! kills $box,$tmp,$scr" %}
   ins_encode( Fast_Lock(object,box,tmp,scr) );
   ins_pipe( pipe_slow );
 %}
 
 instruct cmpFastUnlock( eFlagsReg cr, eRegP object, eAXRegP box, eRegP tmp ) %{
   match( Set cr (FastUnlock object box) );
-  effect( TEMP tmp );
+  effect( TEMP tmp, USE_KILL box );
   ins_cost(300);
-  format %{ "FASTUNLOCK $object, $box, $tmp" %}
+  format %{ "FASTUNLOCK $object,$box\t! kills $box,$tmp" %}
   ins_encode( Fast_Unlock(object,box,tmp) );
   ins_pipe( pipe_slow );
 %}
diff --git a/hotspot/src/cpu/x86/vm/x86_64.ad b/hotspot/src/cpu/x86/vm/x86_64.ad
index 57e82bd4323..338db98c33d 100644
--- a/hotspot/src/cpu/x86/vm/x86_64.ad
+++ b/hotspot/src/cpu/x86/vm/x86_64.ad
@@ -552,7 +552,7 @@ source %{
 #define __ _masm.
 
 static int preserve_SP_size() {
-  return LP64_ONLY(1 +) 2;  // [rex,] op, rm(reg/reg)
+  return 3;  // rex.w, op, rm(reg/reg)
 }
 
 // !!!!! Special hack to get all types of calls to specify the byte offset
@@ -797,48 +797,35 @@ void encode_RegMem(CodeBuffer &cbuf,
   }
 }
 
-void encode_copy(CodeBuffer &cbuf, int dstenc, int srcenc)
-{
-  if (dstenc != srcenc) {
-    if (dstenc < 8) {
-      if (srcenc >= 8) {
-        emit_opcode(cbuf, Assembler::REX_B);
-        srcenc -= 8;
-      }
-    } else {
-      if (srcenc < 8) {
-        emit_opcode(cbuf, Assembler::REX_R);
-      } else {
-        emit_opcode(cbuf, Assembler::REX_RB);
-        srcenc -= 8;
-      }
-      dstenc -= 8;
-    }
-
-    emit_opcode(cbuf, 0x8B);
-    emit_rm(cbuf, 0x3, dstenc, srcenc);
-  }
-}
-
-void encode_CopyXD( CodeBuffer &cbuf, int dst_encoding, int src_encoding ) {
-  if( dst_encoding == src_encoding ) {
-    // reg-reg copy, use an empty encoding
-  } else {
-    MacroAssembler _masm(&cbuf);
-
-    __ movdqa(as_XMMRegister(dst_encoding), as_XMMRegister(src_encoding));
-  }
-}
-
 // This could be in MacroAssembler but it's fairly C2 specific
 void emit_cmpfp_fixup(MacroAssembler& _masm) {
   Label exit;
   __ jccb(Assembler::noParity, exit);
   __ pushf();
+  //
+  // comiss/ucomiss instructions set ZF,PF,CF flags and
+  // zero OF,AF,SF for NaN values.
+  // Fixup flags by zeroing ZF,PF so that compare of NaN
+  // values returns 'less than' result (CF is set).
+  // Leave the rest of flags unchanged.
+  //
+  //    7 6 5 4 3 2 1 0
+  //   |S|Z|r|A|r|P|r|C|  (r - reserved bit)
+  //    0 0 1 0 1 0 1 1   (0x2B)
+  //
   __ andq(Address(rsp, 0), 0xffffff2b);
   __ popf();
   __ bind(exit);
-  __ nop(); // (target for branch to avoid branch to branch)
+}
+
+void emit_cmpfp3(MacroAssembler& _masm, Register dst) {
+  Label done;
+  __ movl(dst, -1);
+  __ jcc(Assembler::parity, done);
+  __ jcc(Assembler::below, done);
+  __ setb(Assembler::notEqual, dst);
+  __ movzbl(dst, dst);
+  __ bind(done);
 }
 
 
@@ -1274,16 +1261,8 @@ uint MachSpillCopyNode::implementation(CodeBuffer* cbuf,
         // 64-bit
         int offset = ra_->reg2offset(src_first);
         if (cbuf) {
-          emit_opcode(*cbuf, UseXmmLoadAndClearUpper ? 0xF2 : 0x66);
-          if (Matcher::_regEncode[dst_first] >= 8) {
-            emit_opcode(*cbuf, Assembler::REX_R);
-          }
-          emit_opcode(*cbuf, 0x0F);
-          emit_opcode(*cbuf, UseXmmLoadAndClearUpper ? 0x10 : 0x12);
-          encode_RegMem(*cbuf,
-                        Matcher::_regEncode[dst_first],
-                        RSP_enc, 0x4, 0, offset,
-                        false);
+          MacroAssembler _masm(cbuf);
+          __ movdbl( as_XMMRegister(Matcher::_regEncode[dst_first]), Address(rsp, offset));
 #ifndef PRODUCT
         } else if (!do_size) {
           st->print("%s  %s, [rsp + #%d]\t# spill",
@@ -1294,25 +1273,17 @@ uint MachSpillCopyNode::implementation(CodeBuffer* cbuf,
         }
         return
           ((offset == 0) ? 0 : (offset < 0x80 ? 1 : 4)) +
-          ((Matcher::_regEncode[dst_first] < 8)
-           ? 5
-           : 6); // REX
+          ((Matcher::_regEncode[dst_first] >= 8)
+           ? 6
+           : (5 + ((UseAVX>0)?1:0))); // REX
       } else {
         // 32-bit
         assert(!((src_first & 1) == 0 && src_first + 1 == src_second), "no transform");
         assert(!((dst_first & 1) == 0 && dst_first + 1 == dst_second), "no transform");
         int offset = ra_->reg2offset(src_first);
         if (cbuf) {
-          emit_opcode(*cbuf, 0xF3);
-          if (Matcher::_regEncode[dst_first] >= 8) {
-            emit_opcode(*cbuf, Assembler::REX_R);
-          }
-          emit_opcode(*cbuf, 0x0F);
-          emit_opcode(*cbuf, 0x10);
-          encode_RegMem(*cbuf,
-                        Matcher::_regEncode[dst_first],
-                        RSP_enc, 0x4, 0, offset,
-                        false);
+          MacroAssembler _masm(cbuf);
+          __ movflt( as_XMMRegister(Matcher::_regEncode[dst_first]), Address(rsp, offset));
 #ifndef PRODUCT
         } else if (!do_size) {
           st->print("movss   %s, [rsp + #%d]\t# spill",
@@ -1322,9 +1293,9 @@ uint MachSpillCopyNode::implementation(CodeBuffer* cbuf,
         }
         return
           ((offset == 0) ? 0 : (offset < 0x80 ? 1 : 4)) +
-          ((Matcher::_regEncode[dst_first] < 8)
-           ? 5
-           : 6); // REX
+          ((Matcher::_regEncode[dst_first] >= 8)
+           ? 6
+           : (5 + ((UseAVX>0)?1:0))); // REX
       }
     }
   } else if (src_first_rc == rc_int) {
@@ -1450,25 +1421,8 @@ uint MachSpillCopyNode::implementation(CodeBuffer* cbuf,
           (dst_first & 1) == 0 && dst_first + 1 == dst_second) {
         // 64-bit
         if (cbuf) {
-          emit_opcode(*cbuf, 0x66);
-          if (Matcher::_regEncode[dst_first] < 8) {
-            if (Matcher::_regEncode[src_first] < 8) {
-              emit_opcode(*cbuf, Assembler::REX_W);
-            } else {
-              emit_opcode(*cbuf, Assembler::REX_WB);
-            }
-          } else {
-            if (Matcher::_regEncode[src_first] < 8) {
-              emit_opcode(*cbuf, Assembler::REX_WR);
-            } else {
-              emit_opcode(*cbuf, Assembler::REX_WRB);
-            }
-          }
-          emit_opcode(*cbuf, 0x0F);
-          emit_opcode(*cbuf, 0x6E);
-          emit_rm(*cbuf, 0x3,
-                  Matcher::_regEncode[dst_first] & 7,
-                  Matcher::_regEncode[src_first] & 7);
+          MacroAssembler _masm(cbuf);
+          __ movdq( as_XMMRegister(Matcher::_regEncode[dst_first]), as_Register(Matcher::_regEncode[src_first]));
 #ifndef PRODUCT
         } else if (!do_size) {
           st->print("movdq   %s, %s\t# spill",
@@ -1482,23 +1436,8 @@ uint MachSpillCopyNode::implementation(CodeBuffer* cbuf,
         assert(!((src_first & 1) == 0 && src_first + 1 == src_second), "no transform");
         assert(!((dst_first & 1) == 0 && dst_first + 1 == dst_second), "no transform");
         if (cbuf) {
-          emit_opcode(*cbuf, 0x66);
-          if (Matcher::_regEncode[dst_first] < 8) {
-            if (Matcher::_regEncode[src_first] >= 8) {
-              emit_opcode(*cbuf, Assembler::REX_B);
-            }
-          } else {
-            if (Matcher::_regEncode[src_first] < 8) {
-              emit_opcode(*cbuf, Assembler::REX_R);
-            } else {
-              emit_opcode(*cbuf, Assembler::REX_RB);
-            }
-          }
-          emit_opcode(*cbuf, 0x0F);
-          emit_opcode(*cbuf, 0x6E);
-          emit_rm(*cbuf, 0x3,
-                  Matcher::_regEncode[dst_first] & 7,
-                  Matcher::_regEncode[src_first] & 7);
+          MacroAssembler _masm(cbuf);
+          __ movdl( as_XMMRegister(Matcher::_regEncode[dst_first]), as_Register(Matcher::_regEncode[src_first]));
 #ifndef PRODUCT
         } else if (!do_size) {
           st->print("movdl   %s, %s\t# spill",
@@ -1507,9 +1446,9 @@ uint MachSpillCopyNode::implementation(CodeBuffer* cbuf,
 #endif
         }
         return
-          (Matcher::_regEncode[src_first] < 8 && Matcher::_regEncode[dst_first] < 8)
-          ? 4
-          : 5; // REX
+          (Matcher::_regEncode[src_first] >= 8 || Matcher::_regEncode[dst_first] >= 8)
+          ? 5
+          : (4 + ((UseAVX>0)?1:0)); // REX
       }
     }
   } else if (src_first_rc == rc_float) {
@@ -1521,16 +1460,8 @@ uint MachSpillCopyNode::implementation(CodeBuffer* cbuf,
         // 64-bit
         int offset = ra_->reg2offset(dst_first);
         if (cbuf) {
-          emit_opcode(*cbuf, 0xF2);
-          if (Matcher::_regEncode[src_first] >= 8) {
-              emit_opcode(*cbuf, Assembler::REX_R);
-          }
-          emit_opcode(*cbuf, 0x0F);
-          emit_opcode(*cbuf, 0x11);
-          encode_RegMem(*cbuf,
-                        Matcher::_regEncode[src_first],
-                        RSP_enc, 0x4, 0, offset,
-                        false);
+          MacroAssembler _masm(cbuf);
+          __ movdbl( Address(rsp, offset), as_XMMRegister(Matcher::_regEncode[src_first]));
 #ifndef PRODUCT
         } else if (!do_size) {
           st->print("movsd   [rsp + #%d], %s\t# spill",
@@ -1540,25 +1471,17 @@ uint MachSpillCopyNode::implementation(CodeBuffer* cbuf,
         }
         return
           ((offset == 0) ? 0 : (offset < 0x80 ? 1 : 4)) +
-          ((Matcher::_regEncode[src_first] < 8)
-           ? 5
-           : 6); // REX
+          ((Matcher::_regEncode[src_first] >= 8)
+           ? 6
+           : (5 + ((UseAVX>0)?1:0))); // REX
       } else {
         // 32-bit
         assert(!((src_first & 1) == 0 && src_first + 1 == src_second), "no transform");
         assert(!((dst_first & 1) == 0 && dst_first + 1 == dst_second), "no transform");
         int offset = ra_->reg2offset(dst_first);
         if (cbuf) {
-          emit_opcode(*cbuf, 0xF3);
-          if (Matcher::_regEncode[src_first] >= 8) {
-              emit_opcode(*cbuf, Assembler::REX_R);
-          }
-          emit_opcode(*cbuf, 0x0F);
-          emit_opcode(*cbuf, 0x11);
-          encode_RegMem(*cbuf,
-                        Matcher::_regEncode[src_first],
-                        RSP_enc, 0x4, 0, offset,
-                        false);
+          MacroAssembler _masm(cbuf);
+          __ movflt(Address(rsp, offset), as_XMMRegister(Matcher::_regEncode[src_first]));
 #ifndef PRODUCT
         } else if (!do_size) {
           st->print("movss   [rsp + #%d], %s\t# spill",
@@ -1568,9 +1491,9 @@ uint MachSpillCopyNode::implementation(CodeBuffer* cbuf,
         }
         return
           ((offset == 0) ? 0 : (offset < 0x80 ? 1 : 4)) +
-          ((Matcher::_regEncode[src_first] < 8)
-           ? 5
-           : 6); // REX
+          ((Matcher::_regEncode[src_first] >=8)
+           ? 6
+           : (5 + ((UseAVX>0)?1:0))); // REX
       }
     } else if (dst_first_rc == rc_int) {
       // xmm -> gpr
@@ -1578,25 +1501,8 @@ uint MachSpillCopyNode::implementation(CodeBuffer* cbuf,
           (dst_first & 1) == 0 && dst_first + 1 == dst_second) {
         // 64-bit
         if (cbuf) {
-          emit_opcode(*cbuf, 0x66);
-          if (Matcher::_regEncode[dst_first] < 8) {
-            if (Matcher::_regEncode[src_first] < 8) {
-              emit_opcode(*cbuf, Assembler::REX_W);
-            } else {
-              emit_opcode(*cbuf, Assembler::REX_WR); // attention!
-            }
-          } else {
-            if (Matcher::_regEncode[src_first] < 8) {
-              emit_opcode(*cbuf, Assembler::REX_WB); // attention!
-            } else {
-              emit_opcode(*cbuf, Assembler::REX_WRB);
-            }
-          }
-          emit_opcode(*cbuf, 0x0F);
-          emit_opcode(*cbuf, 0x7E);
-          emit_rm(*cbuf, 0x3,
-                  Matcher::_regEncode[src_first] & 7,
-                  Matcher::_regEncode[dst_first] & 7);
+          MacroAssembler _masm(cbuf);
+          __ movdq( as_Register(Matcher::_regEncode[dst_first]), as_XMMRegister(Matcher::_regEncode[src_first]));
 #ifndef PRODUCT
         } else if (!do_size) {
           st->print("movdq   %s, %s\t# spill",
@@ -1610,23 +1516,8 @@ uint MachSpillCopyNode::implementation(CodeBuffer* cbuf,
         assert(!((src_first & 1) == 0 && src_first + 1 == src_second), "no transform");
         assert(!((dst_first & 1) == 0 && dst_first + 1 == dst_second), "no transform");
         if (cbuf) {
-          emit_opcode(*cbuf, 0x66);
-          if (Matcher::_regEncode[dst_first] < 8) {
-            if (Matcher::_regEncode[src_first] >= 8) {
-              emit_opcode(*cbuf, Assembler::REX_R); // attention!
-            }
-          } else {
-            if (Matcher::_regEncode[src_first] < 8) {
-              emit_opcode(*cbuf, Assembler::REX_B); // attention!
-            } else {
-              emit_opcode(*cbuf, Assembler::REX_RB);
-            }
-          }
-          emit_opcode(*cbuf, 0x0F);
-          emit_opcode(*cbuf, 0x7E);
-          emit_rm(*cbuf, 0x3,
-                  Matcher::_regEncode[src_first] & 7,
-                  Matcher::_regEncode[dst_first] & 7);
+          MacroAssembler _masm(cbuf);
+          __ movdl( as_Register(Matcher::_regEncode[dst_first]), as_XMMRegister(Matcher::_regEncode[src_first]));
 #ifndef PRODUCT
         } else if (!do_size) {
           st->print("movdl   %s, %s\t# spill",
@@ -1635,9 +1526,9 @@ uint MachSpillCopyNode::implementation(CodeBuffer* cbuf,
 #endif
         }
         return
-          (Matcher::_regEncode[src_first] < 8 && Matcher::_regEncode[dst_first] < 8)
-          ? 4
-          : 5; // REX
+          (Matcher::_regEncode[src_first] >= 8 || Matcher::_regEncode[dst_first] >= 8)
+          ? 5
+          : (4 + ((UseAVX>0)?1:0)); // REX
       }
     } else if (dst_first_rc == rc_float) {
       // xmm -> xmm
@@ -1645,23 +1536,8 @@ uint MachSpillCopyNode::implementation(CodeBuffer* cbuf,
           (dst_first & 1) == 0 && dst_first + 1 == dst_second) {
         // 64-bit
         if (cbuf) {
-          emit_opcode(*cbuf, UseXmmRegToRegMoveAll ? 0x66 : 0xF2);
-          if (Matcher::_regEncode[dst_first] < 8) {
-            if (Matcher::_regEncode[src_first] >= 8) {
-              emit_opcode(*cbuf, Assembler::REX_B);
-            }
-          } else {
-            if (Matcher::_regEncode[src_first] < 8) {
-              emit_opcode(*cbuf, Assembler::REX_R);
-            } else {
-              emit_opcode(*cbuf, Assembler::REX_RB);
-            }
-          }
-          emit_opcode(*cbuf, 0x0F);
-          emit_opcode(*cbuf, UseXmmRegToRegMoveAll ? 0x28 : 0x10);
-          emit_rm(*cbuf, 0x3,
-                  Matcher::_regEncode[dst_first] & 7,
-                  Matcher::_regEncode[src_first] & 7);
+          MacroAssembler _masm(cbuf);
+          __ movdbl( as_XMMRegister(Matcher::_regEncode[dst_first]), as_XMMRegister(Matcher::_regEncode[src_first]));
 #ifndef PRODUCT
         } else if (!do_size) {
           st->print("%s  %s, %s\t# spill",
@@ -1671,32 +1547,16 @@ uint MachSpillCopyNode::implementation(CodeBuffer* cbuf,
 #endif
         }
         return
-          (Matcher::_regEncode[src_first] < 8 && Matcher::_regEncode[dst_first] < 8)
-          ? 4
-          : 5; // REX
+          (Matcher::_regEncode[src_first] >= 8 || Matcher::_regEncode[dst_first] >= 8)
+          ? 5
+          : (4 + ((UseAVX>0)?1:0)); // REX
       } else {
         // 32-bit
         assert(!((src_first & 1) == 0 && src_first + 1 == src_second), "no transform");
         assert(!((dst_first & 1) == 0 && dst_first + 1 == dst_second), "no transform");
         if (cbuf) {
-          if (!UseXmmRegToRegMoveAll)
-            emit_opcode(*cbuf, 0xF3);
-          if (Matcher::_regEncode[dst_first] < 8) {
-            if (Matcher::_regEncode[src_first] >= 8) {
-              emit_opcode(*cbuf, Assembler::REX_B);
-            }
-          } else {
-            if (Matcher::_regEncode[src_first] < 8) {
-              emit_opcode(*cbuf, Assembler::REX_R);
-            } else {
-              emit_opcode(*cbuf, Assembler::REX_RB);
-            }
-          }
-          emit_opcode(*cbuf, 0x0F);
-          emit_opcode(*cbuf, UseXmmRegToRegMoveAll ? 0x28 : 0x10);
-          emit_rm(*cbuf, 0x3,
-                  Matcher::_regEncode[dst_first] & 7,
-                  Matcher::_regEncode[src_first] & 7);
+          MacroAssembler _masm(cbuf);
+          __ movflt( as_XMMRegister(Matcher::_regEncode[dst_first]), as_XMMRegister(Matcher::_regEncode[src_first]));
 #ifndef PRODUCT
         } else if (!do_size) {
           st->print("%s  %s, %s\t# spill",
@@ -1705,10 +1565,10 @@ uint MachSpillCopyNode::implementation(CodeBuffer* cbuf,
                      Matcher::regName[src_first]);
 #endif
         }
-        return
-          (Matcher::_regEncode[src_first] < 8 && Matcher::_regEncode[dst_first] < 8)
-          ? (UseXmmRegToRegMoveAll ? 3 : 4)
-          : (UseXmmRegToRegMoveAll ? 4 : 5); // REX
+        return ((UseAVX>0) ? 5:
+          ((Matcher::_regEncode[src_first] >= 8 || Matcher::_regEncode[dst_first] >= 8)
+           ? (UseXmmRegToRegMoveAll ? 4 : 5)
+           : (UseXmmRegToRegMoveAll ? 3 : 4))); // REX
       }
     }
   }
@@ -2205,47 +2065,6 @@ encode %{
     emit_rm(cbuf, 0x3, $dst$$reg & 7, $src$$reg & 7);
   %}
 
-  enc_class cmpfp_fixup() %{
-      MacroAssembler _masm(&cbuf);
-      emit_cmpfp_fixup(_masm);
-  %}
-
-  enc_class cmpfp3(rRegI dst)
-  %{
-    int dstenc = $dst$$reg;
-
-    // movl $dst, -1
-    if (dstenc >= 8) {
-      emit_opcode(cbuf, Assembler::REX_B);
-    }
-    emit_opcode(cbuf, 0xB8 | (dstenc & 7));
-    emit_d32(cbuf, -1);
-
-    // jp,s done
-    emit_opcode(cbuf, 0x7A);
-    emit_d8(cbuf, dstenc < 4 ? 0x08 : 0x0A);
-
-    // jb,s done
-    emit_opcode(cbuf, 0x72);
-    emit_d8(cbuf, dstenc < 4 ? 0x06 : 0x08);
-
-    // setne $dst
-    if (dstenc >= 4) {
-      emit_opcode(cbuf, dstenc < 8 ? Assembler::REX : Assembler::REX_B);
-    }
-    emit_opcode(cbuf, 0x0F);
-    emit_opcode(cbuf, 0x95);
-    emit_opcode(cbuf, 0xC0 | (dstenc & 7));
-
-    // movzbl $dst, $dst
-    if (dstenc >= 4) {
-      emit_opcode(cbuf, dstenc < 8 ? Assembler::REX : Assembler::REX_RB);
-    }
-    emit_opcode(cbuf, 0x0F);
-    emit_opcode(cbuf, 0xB6);
-    emit_rm(cbuf, 0x3, dstenc & 7, dstenc & 7);
-  %}
-
   enc_class cdql_enc(no_rax_rdx_RegI div)
   %{
     // Full implementation of Java idiv and irem; checks for
@@ -2472,55 +2291,6 @@ encode %{
     emit_cc(cbuf, $secondary, $cop$$cmpcode);
   %}
 
-  enc_class enc_cmovf_branch(cmpOp cop, regF dst, regF src)
-  %{
-    // Invert sense of branch from sense of cmov
-    emit_cc(cbuf, 0x70, $cop$$cmpcode ^ 1);
-    emit_d8(cbuf, ($dst$$reg < 8 && $src$$reg < 8)
-                  ? (UseXmmRegToRegMoveAll ? 3 : 4)
-                  : (UseXmmRegToRegMoveAll ? 4 : 5) ); // REX
-    // UseXmmRegToRegMoveAll ? movaps(dst, src) : movss(dst, src)
-    if (!UseXmmRegToRegMoveAll) emit_opcode(cbuf, 0xF3);
-    if ($dst$$reg < 8) {
-      if ($src$$reg >= 8) {
-        emit_opcode(cbuf, Assembler::REX_B);
-      }
-    } else {
-      if ($src$$reg < 8) {
-        emit_opcode(cbuf, Assembler::REX_R);
-      } else {
-        emit_opcode(cbuf, Assembler::REX_RB);
-      }
-    }
-    emit_opcode(cbuf, 0x0F);
-    emit_opcode(cbuf, UseXmmRegToRegMoveAll ? 0x28 : 0x10);
-    emit_rm(cbuf, 0x3, $dst$$reg & 7, $src$$reg & 7);
-  %}
-
-  enc_class enc_cmovd_branch(cmpOp cop, regD dst, regD src)
-  %{
-    // Invert sense of branch from sense of cmov
-    emit_cc(cbuf, 0x70, $cop$$cmpcode ^ 1);
-    emit_d8(cbuf, $dst$$reg < 8 && $src$$reg < 8 ? 4 : 5); // REX
-
-    //  UseXmmRegToRegMoveAll ? movapd(dst, src) : movsd(dst, src)
-    emit_opcode(cbuf, UseXmmRegToRegMoveAll ? 0x66 : 0xF2);
-    if ($dst$$reg < 8) {
-      if ($src$$reg >= 8) {
-        emit_opcode(cbuf, Assembler::REX_B);
-      }
-    } else {
-      if ($src$$reg < 8) {
-        emit_opcode(cbuf, Assembler::REX_R);
-      } else {
-        emit_opcode(cbuf, Assembler::REX_RB);
-      }
-    }
-    emit_opcode(cbuf, 0x0F);
-    emit_opcode(cbuf, UseXmmRegToRegMoveAll ? 0x28 : 0x10);
-    emit_rm(cbuf, 0x3, $dst$$reg & 7, $src$$reg & 7);
-  %}
-
   enc_class enc_PartialSubtypeCheck()
   %{
     Register Rrdi = as_Register(RDI_enc); // result register
@@ -2751,68 +2521,6 @@ encode %{
     }
   %}
 
-  // Encode a reg-reg copy.  If it is useless, then empty encoding.
-  enc_class enc_copy(rRegI dst, rRegI src)
-  %{
-    encode_copy(cbuf, $dst$$reg, $src$$reg);
-  %}
-
-  // Encode xmm reg-reg copy.  If it is useless, then empty encoding.
-  enc_class enc_CopyXD( RegD dst, RegD src ) %{
-    encode_CopyXD( cbuf, $dst$$reg, $src$$reg );
-  %}
-
-  enc_class enc_copy_always(rRegI dst, rRegI src)
-  %{
-    int srcenc = $src$$reg;
-    int dstenc = $dst$$reg;
-
-    if (dstenc < 8) {
-      if (srcenc >= 8) {
-        emit_opcode(cbuf, Assembler::REX_B);
-        srcenc -= 8;
-      }
-    } else {
-      if (srcenc < 8) {
-        emit_opcode(cbuf, Assembler::REX_R);
-      } else {
-        emit_opcode(cbuf, Assembler::REX_RB);
-        srcenc -= 8;
-      }
-      dstenc -= 8;
-    }
-
-    emit_opcode(cbuf, 0x8B);
-    emit_rm(cbuf, 0x3, dstenc, srcenc);
-  %}
-
-  enc_class enc_copy_wide(rRegL dst, rRegL src)
-  %{
-    int srcenc = $src$$reg;
-    int dstenc = $dst$$reg;
-
-    if (dstenc != srcenc) {
-      if (dstenc < 8) {
-        if (srcenc < 8) {
-          emit_opcode(cbuf, Assembler::REX_W);
-        } else {
-          emit_opcode(cbuf, Assembler::REX_WB);
-          srcenc -= 8;
-        }
-      } else {
-        if (srcenc < 8) {
-          emit_opcode(cbuf, Assembler::REX_WR);
-        } else {
-          emit_opcode(cbuf, Assembler::REX_WRB);
-          srcenc -= 8;
-        }
-        dstenc -= 8;
-      }
-      emit_opcode(cbuf, 0x8B);
-      emit_rm(cbuf, 0x3, dstenc, srcenc);
-    }
-  %}
-
   enc_class Con32(immI src)
   %{
     // Output immediate
@@ -3212,92 +2920,19 @@ encode %{
   %}
 
   enc_class Push_ResultXD(regD dst) %{
-    int dstenc = $dst$$reg;
-
-    store_to_stackslot( cbuf, 0xDD, 0x03, 0 ); //FSTP [RSP]
-
-    // UseXmmLoadAndClearUpper ? movsd dst,[rsp] : movlpd dst,[rsp]
-    emit_opcode  (cbuf, UseXmmLoadAndClearUpper ? 0xF2 : 0x66);
-    if (dstenc >= 8) {
-      emit_opcode(cbuf, Assembler::REX_R);
-    }
-    emit_opcode  (cbuf, 0x0F );
-    emit_opcode  (cbuf, UseXmmLoadAndClearUpper ? 0x10 : 0x12 );
-    encode_RegMem(cbuf, dstenc, RSP_enc, 0x4, 0, 0, false);
-
-    // add rsp,8
-    emit_opcode(cbuf, Assembler::REX_W);
-    emit_opcode(cbuf,0x83);
-    emit_rm(cbuf,0x3, 0x0, RSP_enc);
-    emit_d8(cbuf,0x08);
+    MacroAssembler _masm(&cbuf);
+    __ fstp_d(Address(rsp, 0));
+    __ movdbl($dst$$XMMRegister, Address(rsp, 0));
+    __ addptr(rsp, 8);
   %}
 
   enc_class Push_SrcXD(regD src) %{
-    int srcenc = $src$$reg;
-
-    // subq rsp,#8
-    emit_opcode(cbuf, Assembler::REX_W);
-    emit_opcode(cbuf, 0x83);
-    emit_rm(cbuf, 0x3, 0x5, RSP_enc);
-    emit_d8(cbuf, 0x8);
-
-    // movsd [rsp],src
-    emit_opcode(cbuf, 0xF2);
-    if (srcenc >= 8) {
-      emit_opcode(cbuf, Assembler::REX_R);
-    }
-    emit_opcode(cbuf, 0x0F);
-    emit_opcode(cbuf, 0x11);
-    encode_RegMem(cbuf, srcenc, RSP_enc, 0x4, 0, 0, false);
-
-    // fldd [rsp]
-    emit_opcode(cbuf, 0x66);
-    emit_opcode(cbuf, 0xDD);
-    encode_RegMem(cbuf, 0x0, RSP_enc, 0x4, 0, 0, false);
-  %}
-
-
-  enc_class movq_ld(regD dst, memory mem) %{
     MacroAssembler _masm(&cbuf);
-    __ movq($dst$$XMMRegister, $mem$$Address);
+    __ subptr(rsp, 8);
+    __ movdbl(Address(rsp, 0), $src$$XMMRegister);
+    __ fld_d(Address(rsp, 0));
   %}
 
-  enc_class movq_st(memory mem, regD src) %{
-    MacroAssembler _masm(&cbuf);
-    __ movq($mem$$Address, $src$$XMMRegister);
-  %}
-
-  enc_class pshufd_8x8(regF dst, regF src) %{
-    MacroAssembler _masm(&cbuf);
-
-    encode_CopyXD(cbuf, $dst$$reg, $src$$reg);
-    __ punpcklbw(as_XMMRegister($dst$$reg), as_XMMRegister($dst$$reg));
-    __ pshuflw(as_XMMRegister($dst$$reg), as_XMMRegister($dst$$reg), 0x00);
-  %}
-
-  enc_class pshufd_4x16(regF dst, regF src) %{
-    MacroAssembler _masm(&cbuf);
-
-    __ pshuflw(as_XMMRegister($dst$$reg), as_XMMRegister($src$$reg), 0x00);
-  %}
-
-  enc_class pshufd(regD dst, regD src, int mode) %{
-    MacroAssembler _masm(&cbuf);
-
-    __ pshufd(as_XMMRegister($dst$$reg), as_XMMRegister($src$$reg), $mode);
-  %}
-
-  enc_class pxor(regD dst, regD src) %{
-    MacroAssembler _masm(&cbuf);
-
-    __ pxor(as_XMMRegister($dst$$reg), as_XMMRegister($src$$reg));
-  %}
-
-  enc_class mov_i2x(regD dst, rRegI src) %{
-    MacroAssembler _masm(&cbuf);
-
-    __ movdl(as_XMMRegister($dst$$reg), as_Register($src$$reg));
-  %}
 
   // obj: object to lock
   // box: box address (header location) -- killed
@@ -3534,303 +3169,6 @@ encode %{
                    RELOC_DISP32);
   %}
 
-  enc_class absF_encoding(regF dst)
-  %{
-    int dstenc = $dst$$reg;
-    address signmask_address = (address) StubRoutines::x86::float_sign_mask();
-
-    cbuf.set_insts_mark();
-    if (dstenc >= 8) {
-      emit_opcode(cbuf, Assembler::REX_R);
-      dstenc -= 8;
-    }
-    // XXX reg_mem doesn't support RIP-relative addressing yet
-    emit_opcode(cbuf, 0x0F);
-    emit_opcode(cbuf, 0x54);
-    emit_rm(cbuf, 0x0, dstenc, 0x5);  // 00 reg 101
-    emit_d32_reloc(cbuf, signmask_address);
-  %}
-
-  enc_class absD_encoding(regD dst)
-  %{
-    int dstenc = $dst$$reg;
-    address signmask_address = (address) StubRoutines::x86::double_sign_mask();
-
-    cbuf.set_insts_mark();
-    emit_opcode(cbuf, 0x66);
-    if (dstenc >= 8) {
-      emit_opcode(cbuf, Assembler::REX_R);
-      dstenc -= 8;
-    }
-    // XXX reg_mem doesn't support RIP-relative addressing yet
-    emit_opcode(cbuf, 0x0F);
-    emit_opcode(cbuf, 0x54);
-    emit_rm(cbuf, 0x0, dstenc, 0x5);  // 00 reg 101
-    emit_d32_reloc(cbuf, signmask_address);
-  %}
-
-  enc_class negF_encoding(regF dst)
-  %{
-    int dstenc = $dst$$reg;
-    address signflip_address = (address) StubRoutines::x86::float_sign_flip();
-
-    cbuf.set_insts_mark();
-    if (dstenc >= 8) {
-      emit_opcode(cbuf, Assembler::REX_R);
-      dstenc -= 8;
-    }
-    // XXX reg_mem doesn't support RIP-relative addressing yet
-    emit_opcode(cbuf, 0x0F);
-    emit_opcode(cbuf, 0x57);
-    emit_rm(cbuf, 0x0, dstenc, 0x5);  // 00 reg 101
-    emit_d32_reloc(cbuf, signflip_address);
-  %}
-
-  enc_class negD_encoding(regD dst)
-  %{
-    int dstenc = $dst$$reg;
-    address signflip_address = (address) StubRoutines::x86::double_sign_flip();
-
-    cbuf.set_insts_mark();
-    emit_opcode(cbuf, 0x66);
-    if (dstenc >= 8) {
-      emit_opcode(cbuf, Assembler::REX_R);
-      dstenc -= 8;
-    }
-    // XXX reg_mem doesn't support RIP-relative addressing yet
-    emit_opcode(cbuf, 0x0F);
-    emit_opcode(cbuf, 0x57);
-    emit_rm(cbuf, 0x0, dstenc, 0x5);  // 00 reg 101
-    emit_d32_reloc(cbuf, signflip_address);
-  %}
-
-  enc_class f2i_fixup(rRegI dst, regF src)
-  %{
-    int dstenc = $dst$$reg;
-    int srcenc = $src$$reg;
-
-    // cmpl $dst, #0x80000000
-    if (dstenc >= 8) {
-      emit_opcode(cbuf, Assembler::REX_B);
-    }
-    emit_opcode(cbuf, 0x81);
-    emit_rm(cbuf, 0x3, 0x7, dstenc & 7);
-    emit_d32(cbuf, 0x80000000);
-
-    // jne,s done
-    emit_opcode(cbuf, 0x75);
-    if (srcenc < 8 && dstenc < 8) {
-      emit_d8(cbuf, 0xF);
-    } else if (srcenc >= 8 && dstenc >= 8) {
-      emit_d8(cbuf, 0x11);
-    } else {
-      emit_d8(cbuf, 0x10);
-    }
-
-    // subq rsp, #8
-    emit_opcode(cbuf, Assembler::REX_W);
-    emit_opcode(cbuf, 0x83);
-    emit_rm(cbuf, 0x3, 0x5, RSP_enc);
-    emit_d8(cbuf, 8);
-
-    // movss [rsp], $src
-    emit_opcode(cbuf, 0xF3);
-    if (srcenc >= 8) {
-      emit_opcode(cbuf, Assembler::REX_R);
-    }
-    emit_opcode(cbuf, 0x0F);
-    emit_opcode(cbuf, 0x11);
-    encode_RegMem(cbuf, srcenc, RSP_enc, 0x4, 0, 0, false); // 2 bytes
-
-    // call f2i_fixup
-    cbuf.set_insts_mark();
-    emit_opcode(cbuf, 0xE8);
-    emit_d32_reloc(cbuf,
-                   (int)
-                   (StubRoutines::x86::f2i_fixup() - cbuf.insts_end() - 4),
-                   runtime_call_Relocation::spec(),
-                   RELOC_DISP32);
-
-    // popq $dst
-    if (dstenc >= 8) {
-      emit_opcode(cbuf, Assembler::REX_B);
-    }
-    emit_opcode(cbuf, 0x58 | (dstenc & 7));
-
-    // done:
-  %}
-
-  enc_class f2l_fixup(rRegL dst, regF src)
-  %{
-    int dstenc = $dst$$reg;
-    int srcenc = $src$$reg;
-    address const_address = (address) StubRoutines::x86::double_sign_flip();
-
-    // cmpq $dst, [0x8000000000000000]
-    cbuf.set_insts_mark();
-    emit_opcode(cbuf, dstenc < 8 ? Assembler::REX_W : Assembler::REX_WR);
-    emit_opcode(cbuf, 0x39);
-    // XXX reg_mem doesn't support RIP-relative addressing yet
-    emit_rm(cbuf, 0x0, dstenc & 7, 0x5); // 00 reg 101
-    emit_d32_reloc(cbuf, const_address);
-
-
-    // jne,s done
-    emit_opcode(cbuf, 0x75);
-    if (srcenc < 8 && dstenc < 8) {
-      emit_d8(cbuf, 0xF);
-    } else if (srcenc >= 8 && dstenc >= 8) {
-      emit_d8(cbuf, 0x11);
-    } else {
-      emit_d8(cbuf, 0x10);
-    }
-
-    // subq rsp, #8
-    emit_opcode(cbuf, Assembler::REX_W);
-    emit_opcode(cbuf, 0x83);
-    emit_rm(cbuf, 0x3, 0x5, RSP_enc);
-    emit_d8(cbuf, 8);
-
-    // movss [rsp], $src
-    emit_opcode(cbuf, 0xF3);
-    if (srcenc >= 8) {
-      emit_opcode(cbuf, Assembler::REX_R);
-    }
-    emit_opcode(cbuf, 0x0F);
-    emit_opcode(cbuf, 0x11);
-    encode_RegMem(cbuf, srcenc, RSP_enc, 0x4, 0, 0, false); // 2 bytes
-
-    // call f2l_fixup
-    cbuf.set_insts_mark();
-    emit_opcode(cbuf, 0xE8);
-    emit_d32_reloc(cbuf,
-                   (int)
-                   (StubRoutines::x86::f2l_fixup() - cbuf.insts_end() - 4),
-                   runtime_call_Relocation::spec(),
-                   RELOC_DISP32);
-
-    // popq $dst
-    if (dstenc >= 8) {
-      emit_opcode(cbuf, Assembler::REX_B);
-    }
-    emit_opcode(cbuf, 0x58 | (dstenc & 7));
-
-    // done:
-  %}
-
-  enc_class d2i_fixup(rRegI dst, regD src)
-  %{
-    int dstenc = $dst$$reg;
-    int srcenc = $src$$reg;
-
-    // cmpl $dst, #0x80000000
-    if (dstenc >= 8) {
-      emit_opcode(cbuf, Assembler::REX_B);
-    }
-    emit_opcode(cbuf, 0x81);
-    emit_rm(cbuf, 0x3, 0x7, dstenc & 7);
-    emit_d32(cbuf, 0x80000000);
-
-    // jne,s done
-    emit_opcode(cbuf, 0x75);
-    if (srcenc < 8 && dstenc < 8) {
-      emit_d8(cbuf, 0xF);
-    } else if (srcenc >= 8 && dstenc >= 8) {
-      emit_d8(cbuf, 0x11);
-    } else {
-      emit_d8(cbuf, 0x10);
-    }
-
-    // subq rsp, #8
-    emit_opcode(cbuf, Assembler::REX_W);
-    emit_opcode(cbuf, 0x83);
-    emit_rm(cbuf, 0x3, 0x5, RSP_enc);
-    emit_d8(cbuf, 8);
-
-    // movsd [rsp], $src
-    emit_opcode(cbuf, 0xF2);
-    if (srcenc >= 8) {
-      emit_opcode(cbuf, Assembler::REX_R);
-    }
-    emit_opcode(cbuf, 0x0F);
-    emit_opcode(cbuf, 0x11);
-    encode_RegMem(cbuf, srcenc, RSP_enc, 0x4, 0, 0, false); // 2 bytes
-
-    // call d2i_fixup
-    cbuf.set_insts_mark();
-    emit_opcode(cbuf, 0xE8);
-    emit_d32_reloc(cbuf,
-                   (int)
-                   (StubRoutines::x86::d2i_fixup() - cbuf.insts_end() - 4),
-                   runtime_call_Relocation::spec(),
-                   RELOC_DISP32);
-
-    // popq $dst
-    if (dstenc >= 8) {
-      emit_opcode(cbuf, Assembler::REX_B);
-    }
-    emit_opcode(cbuf, 0x58 | (dstenc & 7));
-
-    // done:
-  %}
-
-  enc_class d2l_fixup(rRegL dst, regD src)
-  %{
-    int dstenc = $dst$$reg;
-    int srcenc = $src$$reg;
-    address const_address = (address) StubRoutines::x86::double_sign_flip();
-
-    // cmpq $dst, [0x8000000000000000]
-    cbuf.set_insts_mark();
-    emit_opcode(cbuf, dstenc < 8 ? Assembler::REX_W : Assembler::REX_WR);
-    emit_opcode(cbuf, 0x39);
-    // XXX reg_mem doesn't support RIP-relative addressing yet
-    emit_rm(cbuf, 0x0, dstenc & 7, 0x5); // 00 reg 101
-    emit_d32_reloc(cbuf, const_address);
-
-
-    // jne,s done
-    emit_opcode(cbuf, 0x75);
-    if (srcenc < 8 && dstenc < 8) {
-      emit_d8(cbuf, 0xF);
-    } else if (srcenc >= 8 && dstenc >= 8) {
-      emit_d8(cbuf, 0x11);
-    } else {
-      emit_d8(cbuf, 0x10);
-    }
-
-    // subq rsp, #8
-    emit_opcode(cbuf, Assembler::REX_W);
-    emit_opcode(cbuf, 0x83);
-    emit_rm(cbuf, 0x3, 0x5, RSP_enc);
-    emit_d8(cbuf, 8);
-
-    // movsd [rsp], $src
-    emit_opcode(cbuf, 0xF2);
-    if (srcenc >= 8) {
-      emit_opcode(cbuf, Assembler::REX_R);
-    }
-    emit_opcode(cbuf, 0x0F);
-    emit_opcode(cbuf, 0x11);
-    encode_RegMem(cbuf, srcenc, RSP_enc, 0x4, 0, 0, false); // 2 bytes
-
-    // call d2l_fixup
-    cbuf.set_insts_mark();
-    emit_opcode(cbuf, 0xE8);
-    emit_d32_reloc(cbuf,
-                   (int)
-                   (StubRoutines::x86::d2l_fixup() - cbuf.insts_end() - 4),
-                   runtime_call_Relocation::spec(),
-                   RELOC_DISP32);
-
-    // popq $dst
-    if (dstenc >= 8) {
-      emit_opcode(cbuf, Assembler::REX_B);
-    }
-    emit_opcode(cbuf, 0x58 | (dstenc & 7));
-
-    // done:
-  %}
 %}
 
 
@@ -6156,8 +5494,9 @@ instruct loadF(regF dst, memory mem)
 
   ins_cost(145); // XXX
   format %{ "movss   $dst, $mem\t# float" %}
-  opcode(0xF3, 0x0F, 0x10);
-  ins_encode(OpcP, REX_reg_mem(dst, mem), OpcS, OpcT, reg_mem(dst, mem));
+  ins_encode %{
+    __ movflt($dst$$XMMRegister, $mem$$Address);
+  %}
   ins_pipe(pipe_slow); // XXX
 %}
 
@@ -6169,8 +5508,9 @@ instruct loadD_partial(regD dst, memory mem)
 
   ins_cost(145); // XXX
   format %{ "movlpd  $dst, $mem\t# double" %}
-  opcode(0x66, 0x0F, 0x12);
-  ins_encode(OpcP, REX_reg_mem(dst, mem), OpcS, OpcT, reg_mem(dst, mem));
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $mem$$Address);
+  %}
   ins_pipe(pipe_slow); // XXX
 %}
 
@@ -6181,8 +5521,9 @@ instruct loadD(regD dst, memory mem)
 
   ins_cost(145); // XXX
   format %{ "movsd   $dst, $mem\t# double" %}
-  opcode(0xF2, 0x0F, 0x10);
-  ins_encode(OpcP, REX_reg_mem(dst, mem), OpcS, OpcT, reg_mem(dst, mem));
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $mem$$Address);
+  %}
   ins_pipe(pipe_slow); // XXX
 %}
 
@@ -6191,7 +5532,9 @@ instruct loadA8B(regD dst, memory mem) %{
   match(Set dst (Load8B mem));
   ins_cost(125);
   format %{ "MOVQ  $dst,$mem\t! packed8B" %}
-  ins_encode( movq_ld(dst, mem));
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $mem$$Address);
+  %}
   ins_pipe( pipe_slow );
 %}
 
@@ -6200,7 +5543,9 @@ instruct loadA4S(regD dst, memory mem) %{
   match(Set dst (Load4S mem));
   ins_cost(125);
   format %{ "MOVQ  $dst,$mem\t! packed4S" %}
-  ins_encode( movq_ld(dst, mem));
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $mem$$Address);
+  %}
   ins_pipe( pipe_slow );
 %}
 
@@ -6209,7 +5554,9 @@ instruct loadA4C(regD dst, memory mem) %{
   match(Set dst (Load4C mem));
   ins_cost(125);
   format %{ "MOVQ  $dst,$mem\t! packed4C" %}
-  ins_encode( movq_ld(dst, mem));
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $mem$$Address);
+  %}
   ins_pipe( pipe_slow );
 %}
 
@@ -6218,16 +5565,20 @@ instruct load2IU(regD dst, memory mem) %{
   match(Set dst (Load2I mem));
   ins_cost(125);
   format %{ "MOVQ  $dst,$mem\t! packed2I" %}
-  ins_encode( movq_ld(dst, mem));
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $mem$$Address);
+  %}
   ins_pipe( pipe_slow );
 %}
 
 // Load Aligned Packed Single to XMM
 instruct loadA2F(regD dst, memory mem) %{
   match(Set dst (Load2F mem));
-  ins_cost(145);
+  ins_cost(125);
   format %{ "MOVQ  $dst,$mem\t! packed2F" %}
-  ins_encode( movq_ld(dst, mem));
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $mem$$Address);
+  %}
   ins_pipe( pipe_slow );
 %}
 
@@ -6540,8 +5891,9 @@ instruct loadConF0(regF dst, immF0 src)
   ins_cost(100);
 
   format %{ "xorps   $dst, $dst\t# float 0.0" %}
-  opcode(0x0F, 0x57);
-  ins_encode(REX_reg_reg(dst, dst), OpcP, OpcS, reg_reg(dst, dst));
+  ins_encode %{
+    __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
   ins_pipe(pipe_slow);
 %}
 
@@ -6562,8 +5914,9 @@ instruct loadConD0(regD dst, immD0 src)
   ins_cost(100);
 
   format %{ "xorpd   $dst, $dst\t# double 0.0" %}
-  opcode(0x66, 0x0F, 0x57);
-  ins_encode(OpcP, REX_reg_reg(dst, dst), OpcS, OpcT, reg_reg(dst, dst));
+  ins_encode %{
+    __ xorpd ($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
   ins_pipe(pipe_slow);
 %}
 
@@ -6606,8 +5959,9 @@ instruct loadSSF(regF dst, stackSlotF src)
 
   ins_cost(125);
   format %{ "movss   $dst, $src\t# float stk" %}
-  opcode(0xF3, 0x0F, 0x10);
-  ins_encode(OpcP, REX_reg_mem(dst, src), OpcS, OpcT, reg_mem(dst, src));
+  ins_encode %{
+    __ movflt($dst$$XMMRegister, Address(rsp, $src$$disp));
+  %}
   ins_pipe(pipe_slow); // XXX
 %}
 
@@ -6972,7 +6326,9 @@ instruct storeA8B(memory mem, regD src) %{
   match(Set mem (Store8B mem src));
   ins_cost(145);
   format %{ "MOVQ  $mem,$src\t! packed8B" %}
-  ins_encode( movq_st(mem, src));
+  ins_encode %{
+    __ movq($mem$$Address, $src$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
@@ -6981,7 +6337,9 @@ instruct storeA4C(memory mem, regD src) %{
   match(Set mem (Store4C mem src));
   ins_cost(145);
   format %{ "MOVQ  $mem,$src\t! packed4C" %}
-  ins_encode( movq_st(mem, src));
+  ins_encode %{
+    __ movq($mem$$Address, $src$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
@@ -6990,7 +6348,9 @@ instruct storeA2I(memory mem, regD src) %{
   match(Set mem (Store2I mem src));
   ins_cost(145);
   format %{ "MOVQ  $mem,$src\t! packed2I" %}
-  ins_encode( movq_st(mem, src));
+  ins_encode %{
+    __ movq($mem$$Address, $src$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
@@ -7024,7 +6384,9 @@ instruct storeA2F(memory mem, regD src) %{
   match(Set mem (Store2F mem src));
   ins_cost(145);
   format %{ "MOVQ  $mem,$src\t! packed2F" %}
-  ins_encode( movq_st(mem, src));
+  ins_encode %{
+    __ movq($mem$$Address, $src$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
@@ -7035,8 +6397,9 @@ instruct storeF(memory mem, regF src)
 
   ins_cost(95); // XXX
   format %{ "movss   $mem, $src\t# float" %}
-  opcode(0xF3, 0x0F, 0x11);
-  ins_encode(OpcP, REX_reg_mem(src, mem), OpcS, OpcT, reg_mem(src, mem));
+  ins_encode %{
+    __ movflt($mem$$Address, $src$$XMMRegister);
+  %}
   ins_pipe(pipe_slow); // XXX
 %}
 
@@ -7072,8 +6435,9 @@ instruct storeD(memory mem, regD src)
 
   ins_cost(95); // XXX
   format %{ "movsd   $mem, $src\t# double" %}
-  opcode(0xF2, 0x0F, 0x11);
-  ins_encode(OpcP, REX_reg_mem(src, mem), OpcS, OpcT, reg_mem(src, mem));
+  ins_encode %{
+    __ movdbl($mem$$Address, $src$$XMMRegister);
+  %}
   ins_pipe(pipe_slow); // XXX
 %}
 
@@ -7142,8 +6506,9 @@ instruct storeSSF(stackSlotF dst, regF src)
 
   ins_cost(95); // XXX
   format %{ "movss   $dst, $src\t# float stk" %}
-  opcode(0xF3, 0x0F, 0x11);
-  ins_encode(OpcP, REX_reg_mem(src, dst), OpcS, OpcT, reg_mem(src, dst));
+  ins_encode %{
+    __ movflt(Address(rsp, $dst$$disp), $src$$XMMRegister);
+  %}
   ins_pipe(pipe_slow); // XXX
 %}
 
@@ -7153,8 +6518,9 @@ instruct storeSSD(stackSlotD dst, regD src)
 
   ins_cost(95); // XXX
   format %{ "movsd   $dst, $src\t# double stk" %}
-  opcode(0xF2, 0x0F, 0x11);
-  ins_encode(OpcP, REX_reg_mem(src, dst), OpcS, OpcT, reg_mem(src, dst));
+  ins_encode %{
+    __ movdbl(Address(rsp, $dst$$disp), $src$$XMMRegister);
+  %}
   ins_pipe(pipe_slow); // XXX
 %}
 
@@ -7444,6 +6810,16 @@ instruct unnecessary_membar_volatile()
   ins_pipe(empty);
 %}
 
+instruct membar_storestore() %{
+  match(MemBarStoreStore);
+  ins_cost(0);
+
+  size(0);
+  format %{ "MEMBAR-storestore (empty encoding)" %}
+  ins_encode( );
+  ins_pipe(empty);
+%}
+
 //----------Move Instructions--------------------------------------------------
 
 instruct castX2P(rRegP dst, rRegL src)
@@ -7451,7 +6827,11 @@ instruct castX2P(rRegP dst, rRegL src)
   match(Set dst (CastX2P src));
 
   format %{ "movq    $dst, $src\t# long->ptr" %}
-  ins_encode(enc_copy_wide(dst, src));
+  ins_encode %{
+    if ($dst$$reg != $src$$reg) {
+      __ movptr($dst$$Register, $src$$Register);
+    }
+  %}
   ins_pipe(ialu_reg_reg); // XXX
 %}
 
@@ -7460,7 +6840,11 @@ instruct castP2X(rRegL dst, rRegP src)
   match(Set dst (CastP2X src));
 
   format %{ "movq    $dst, $src\t# ptr -> long" %}
-  ins_encode(enc_copy_wide(dst, src));
+  ins_encode %{
+    if ($dst$$reg != $src$$reg) {
+      __ movptr($dst$$Register, $src$$Register);
+    }
+  %}
   ins_pipe(ialu_reg_reg); // XXX
 %}
 
@@ -7813,7 +7197,13 @@ instruct cmovF_reg(cmpOp cop, rFlagsReg cr, regF dst, regF src)
   format %{ "jn$cop    skip\t# signed cmove float\n\t"
             "movss     $dst, $src\n"
     "skip:" %}
-  ins_encode(enc_cmovf_branch(cop, dst, src));
+  ins_encode %{
+    Label Lskip;
+    // Invert sense of branch from sense of CMOV
+    __ jccb((Assembler::Condition)($cop$$cmpcode^1), Lskip);
+    __ movflt($dst$$XMMRegister, $src$$XMMRegister);
+    __ bind(Lskip);
+  %}
   ins_pipe(pipe_slow);
 %}
 
@@ -7837,7 +7227,13 @@ instruct cmovF_regU(cmpOpU cop, rFlagsRegU cr, regF dst, regF src)
   format %{ "jn$cop    skip\t# unsigned cmove float\n\t"
             "movss     $dst, $src\n"
     "skip:" %}
-  ins_encode(enc_cmovf_branch(cop, dst, src));
+  ins_encode %{
+    Label Lskip;
+    // Invert sense of branch from sense of CMOV
+    __ jccb((Assembler::Condition)($cop$$cmpcode^1), Lskip);
+    __ movflt($dst$$XMMRegister, $src$$XMMRegister);
+    __ bind(Lskip);
+  %}
   ins_pipe(pipe_slow);
 %}
 
@@ -7857,7 +7253,13 @@ instruct cmovD_reg(cmpOp cop, rFlagsReg cr, regD dst, regD src)
   format %{ "jn$cop    skip\t# signed cmove double\n\t"
             "movsd     $dst, $src\n"
     "skip:" %}
-  ins_encode(enc_cmovd_branch(cop, dst, src));
+  ins_encode %{
+    Label Lskip;
+    // Invert sense of branch from sense of CMOV
+    __ jccb((Assembler::Condition)($cop$$cmpcode^1), Lskip);
+    __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
+    __ bind(Lskip);
+  %}
   ins_pipe(pipe_slow);
 %}
 
@@ -7869,7 +7271,13 @@ instruct cmovD_regU(cmpOpU cop, rFlagsRegU cr, regD dst, regD src)
   format %{ "jn$cop    skip\t# unsigned cmove double\n\t"
             "movsd     $dst, $src\n"
     "skip:" %}
-  ins_encode(enc_cmovd_branch(cop, dst, src));
+  ins_encode %{
+    Label Lskip;
+    // Invert sense of branch from sense of CMOV
+    __ jccb((Assembler::Condition)($cop$$cmpcode^1), Lskip);
+    __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
+    __ bind(Lskip);
+  %}
   ins_pipe(pipe_slow);
 %}
 
@@ -10191,17 +9599,18 @@ instruct cmpF_cc_reg(rFlagsRegU cr, regF src1, regF src2)
             "pushfq\t# saw NaN, set CF\n\t"
             "andq    [rsp], #0xffffff2b\n\t"
             "popfq\n"
-    "exit:   nop\t# avoid branch to branch" %}
-  opcode(0x0F, 0x2E);
-  ins_encode(REX_reg_reg(src1, src2), OpcP, OpcS, reg_reg(src1, src2),
-             cmpfp_fixup);
+    "exit:" %}
+  ins_encode %{
+    __ ucomiss($src1$$XMMRegister, $src2$$XMMRegister);
+    emit_cmpfp_fixup(_masm);
+  %}
   ins_pipe(pipe_slow);
 %}
 
 instruct cmpF_cc_reg_CF(rFlagsRegUCF cr, regF src1, regF src2) %{
   match(Set cr (CmpF src1 src2));
 
-  ins_cost(145);
+  ins_cost(100);
   format %{ "ucomiss $src1, $src2" %}
   ins_encode %{
     __ ucomiss($src1$$XMMRegister, $src2$$XMMRegister);
@@ -10219,10 +9628,11 @@ instruct cmpF_cc_mem(rFlagsRegU cr, regF src1, memory src2)
             "pushfq\t# saw NaN, set CF\n\t"
             "andq    [rsp], #0xffffff2b\n\t"
             "popfq\n"
-    "exit:   nop\t# avoid branch to branch" %}
-  opcode(0x0F, 0x2E);
-  ins_encode(REX_reg_mem(src1, src2), OpcP, OpcS, reg_mem(src1, src2),
-             cmpfp_fixup);
+    "exit:" %}
+  ins_encode %{
+    __ ucomiss($src1$$XMMRegister, $src2$$Address);
+    emit_cmpfp_fixup(_masm);
+  %}
   ins_pipe(pipe_slow);
 %}
 
@@ -10231,8 +9641,9 @@ instruct cmpF_cc_memCF(rFlagsRegUCF cr, regF src1, memory src2) %{
 
   ins_cost(100);
   format %{ "ucomiss $src1, $src2" %}
-  opcode(0x0F, 0x2E);
-  ins_encode(REX_reg_mem(src1, src2), OpcP, OpcS, reg_mem(src1, src2));
+  ins_encode %{
+    __ ucomiss($src1$$XMMRegister, $src2$$Address);
+  %}
   ins_pipe(pipe_slow);
 %}
 
@@ -10245,7 +9656,7 @@ instruct cmpF_cc_imm(rFlagsRegU cr, regF src, immF con) %{
             "pushfq\t# saw NaN, set CF\n\t"
             "andq    [rsp], #0xffffff2b\n\t"
             "popfq\n"
-    "exit:   nop\t# avoid branch to branch" %}
+    "exit:" %}
   ins_encode %{
     __ ucomiss($src$$XMMRegister, $constantaddress($con));
     emit_cmpfp_fixup(_masm);
@@ -10273,10 +9684,11 @@ instruct cmpD_cc_reg(rFlagsRegU cr, regD src1, regD src2)
             "pushfq\t# saw NaN, set CF\n\t"
             "andq    [rsp], #0xffffff2b\n\t"
             "popfq\n"
-    "exit:   nop\t# avoid branch to branch" %}
-  opcode(0x66, 0x0F, 0x2E);
-  ins_encode(OpcP, REX_reg_reg(src1, src2), OpcS, OpcT, reg_reg(src1, src2),
-             cmpfp_fixup);
+    "exit:" %}
+  ins_encode %{
+    __ ucomisd($src1$$XMMRegister, $src2$$XMMRegister);
+    emit_cmpfp_fixup(_masm);
+  %}
   ins_pipe(pipe_slow);
 %}
 
@@ -10301,10 +9713,11 @@ instruct cmpD_cc_mem(rFlagsRegU cr, regD src1, memory src2)
             "pushfq\t# saw NaN, set CF\n\t"
             "andq    [rsp], #0xffffff2b\n\t"
             "popfq\n"
-    "exit:   nop\t# avoid branch to branch" %}
-  opcode(0x66, 0x0F, 0x2E);
-  ins_encode(OpcP, REX_reg_mem(src1, src2), OpcS, OpcT, reg_mem(src1, src2),
-             cmpfp_fixup);
+    "exit:" %}
+  ins_encode %{
+    __ ucomisd($src1$$XMMRegister, $src2$$Address);
+    emit_cmpfp_fixup(_masm);
+  %}
   ins_pipe(pipe_slow);
 %}
 
@@ -10313,8 +9726,9 @@ instruct cmpD_cc_memCF(rFlagsRegUCF cr, regD src1, memory src2) %{
 
   ins_cost(100);
   format %{ "ucomisd $src1, $src2" %}
-  opcode(0x66, 0x0F, 0x2E);
-  ins_encode(OpcP, REX_reg_mem(src1, src2), OpcS, OpcT, reg_mem(src1, src2));
+  ins_encode %{
+    __ ucomisd($src1$$XMMRegister, $src2$$Address);
+  %}
   ins_pipe(pipe_slow);
 %}
 
@@ -10327,7 +9741,7 @@ instruct cmpD_cc_imm(rFlagsRegU cr, regD src, immD con) %{
             "pushfq\t# saw NaN, set CF\n\t"
             "andq    [rsp], #0xffffff2b\n\t"
             "popfq\n"
-    "exit:   nop\t# avoid branch to branch" %}
+    "exit:" %}
   ins_encode %{
     __ ucomisd($src$$XMMRegister, $constantaddress($con));
     emit_cmpfp_fixup(_masm);
@@ -10359,10 +9773,10 @@ instruct cmpF_reg(rRegI dst, regF src1, regF src2, rFlagsReg cr)
             "setne   $dst\n\t"
             "movzbl  $dst, $dst\n"
     "done:" %}
-
-  opcode(0x0F, 0x2E);
-  ins_encode(REX_reg_reg(src1, src2), OpcP, OpcS, reg_reg(src1, src2),
-             cmpfp3(dst));
+  ins_encode %{
+    __ ucomiss($src1$$XMMRegister, $src2$$XMMRegister);
+    emit_cmpfp3(_masm, $dst$$Register);
+  %}
   ins_pipe(pipe_slow);
 %}
 
@@ -10380,10 +9794,10 @@ instruct cmpF_mem(rRegI dst, regF src1, memory src2, rFlagsReg cr)
             "setne   $dst\n\t"
             "movzbl  $dst, $dst\n"
     "done:" %}
-
-  opcode(0x0F, 0x2E);
-  ins_encode(REX_reg_mem(src1, src2), OpcP, OpcS, reg_mem(src1, src2),
-             cmpfp3(dst));
+  ins_encode %{
+    __ ucomiss($src1$$XMMRegister, $src2$$Address);
+    emit_cmpfp3(_masm, $dst$$Register);
+  %}
   ins_pipe(pipe_slow);
 %}
 
@@ -10401,15 +9815,8 @@ instruct cmpF_imm(rRegI dst, regF src, immF con, rFlagsReg cr) %{
             "movzbl  $dst, $dst\n"
     "done:" %}
   ins_encode %{
-    Label L_done;
-    Register Rdst = $dst$$Register;
     __ ucomiss($src$$XMMRegister, $constantaddress($con));
-    __ movl(Rdst, -1);
-    __ jcc(Assembler::parity, L_done);
-    __ jcc(Assembler::below, L_done);
-    __ setb(Assembler::notEqual, Rdst);
-    __ movzbl(Rdst, Rdst);
-    __ bind(L_done);
+    emit_cmpfp3(_masm, $dst$$Register);
   %}
   ins_pipe(pipe_slow);
 %}
@@ -10428,10 +9835,10 @@ instruct cmpD_reg(rRegI dst, regD src1, regD src2, rFlagsReg cr)
             "setne   $dst\n\t"
             "movzbl  $dst, $dst\n"
     "done:" %}
-
-  opcode(0x66, 0x0F, 0x2E);
-  ins_encode(OpcP, REX_reg_reg(src1, src2), OpcS, OpcT, reg_reg(src1, src2),
-             cmpfp3(dst));
+  ins_encode %{
+    __ ucomisd($src1$$XMMRegister, $src2$$XMMRegister);
+    emit_cmpfp3(_masm, $dst$$Register);
+  %}
   ins_pipe(pipe_slow);
 %}
 
@@ -10449,10 +9856,10 @@ instruct cmpD_mem(rRegI dst, regD src1, memory src2, rFlagsReg cr)
             "setne   $dst\n\t"
             "movzbl  $dst, $dst\n"
     "done:" %}
-
-  opcode(0x66, 0x0F, 0x2E);
-  ins_encode(OpcP, REX_reg_mem(src1, src2), OpcS, OpcT, reg_mem(src1, src2),
-             cmpfp3(dst));
+  ins_encode %{
+    __ ucomisd($src1$$XMMRegister, $src2$$Address);
+    emit_cmpfp3(_masm, $dst$$Register);
+  %}
   ins_pipe(pipe_slow);
 %}
 
@@ -10470,377 +9877,12 @@ instruct cmpD_imm(rRegI dst, regD src, immD con, rFlagsReg cr) %{
             "movzbl  $dst, $dst\n"
     "done:" %}
   ins_encode %{
-    Register Rdst = $dst$$Register;
-    Label L_done;
     __ ucomisd($src$$XMMRegister, $constantaddress($con));
-    __ movl(Rdst, -1);
-    __ jcc(Assembler::parity, L_done);
-    __ jcc(Assembler::below, L_done);
-    __ setb(Assembler::notEqual, Rdst);
-    __ movzbl(Rdst, Rdst);
-    __ bind(L_done);
+    emit_cmpfp3(_masm, $dst$$Register);
   %}
   ins_pipe(pipe_slow);
 %}
 
-instruct addF_reg(regF dst, regF src)
-%{
-  match(Set dst (AddF dst src));
-
-  format %{ "addss   $dst, $src" %}
-  ins_cost(150); // XXX
-  opcode(0xF3, 0x0F, 0x58);
-  ins_encode(OpcP, REX_reg_reg(dst, src), OpcS, OpcT, reg_reg(dst, src));
-  ins_pipe(pipe_slow);
-%}
-
-instruct addF_mem(regF dst, memory src)
-%{
-  match(Set dst (AddF dst (LoadF src)));
-
-  format %{ "addss   $dst, $src" %}
-  ins_cost(150); // XXX
-  opcode(0xF3, 0x0F, 0x58);
-  ins_encode(OpcP, REX_reg_mem(dst, src), OpcS, OpcT, reg_mem(dst, src));
-  ins_pipe(pipe_slow);
-%}
-
-instruct addF_imm(regF dst, immF con) %{
-  match(Set dst (AddF dst con));
-  format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
-  ins_cost(150); // XXX
-  ins_encode %{
-    __ addss($dst$$XMMRegister, $constantaddress($con));
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-instruct addD_reg(regD dst, regD src)
-%{
-  match(Set dst (AddD dst src));
-
-  format %{ "addsd   $dst, $src" %}
-  ins_cost(150); // XXX
-  opcode(0xF2, 0x0F, 0x58);
-  ins_encode(OpcP, REX_reg_reg(dst, src), OpcS, OpcT, reg_reg(dst, src));
-  ins_pipe(pipe_slow);
-%}
-
-instruct addD_mem(regD dst, memory src)
-%{
-  match(Set dst (AddD dst (LoadD src)));
-
-  format %{ "addsd   $dst, $src" %}
-  ins_cost(150); // XXX
-  opcode(0xF2, 0x0F, 0x58);
-  ins_encode(OpcP, REX_reg_mem(dst, src), OpcS, OpcT, reg_mem(dst, src));
-  ins_pipe(pipe_slow);
-%}
-
-instruct addD_imm(regD dst, immD con) %{
-  match(Set dst (AddD dst con));
-  format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
-  ins_cost(150); // XXX
-  ins_encode %{
-    __ addsd($dst$$XMMRegister, $constantaddress($con));
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-instruct subF_reg(regF dst, regF src)
-%{
-  match(Set dst (SubF dst src));
-
-  format %{ "subss   $dst, $src" %}
-  ins_cost(150); // XXX
-  opcode(0xF3, 0x0F, 0x5C);
-  ins_encode(OpcP, REX_reg_reg(dst, src), OpcS, OpcT, reg_reg(dst, src));
-  ins_pipe(pipe_slow);
-%}
-
-instruct subF_mem(regF dst, memory src)
-%{
-  match(Set dst (SubF dst (LoadF src)));
-
-  format %{ "subss   $dst, $src" %}
-  ins_cost(150); // XXX
-  opcode(0xF3, 0x0F, 0x5C);
-  ins_encode(OpcP, REX_reg_mem(dst, src), OpcS, OpcT, reg_mem(dst, src));
-  ins_pipe(pipe_slow);
-%}
-
-instruct subF_imm(regF dst, immF con) %{
-  match(Set dst (SubF dst con));
-  format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
-  ins_cost(150); // XXX
-  ins_encode %{
-    __ subss($dst$$XMMRegister, $constantaddress($con));
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-instruct subD_reg(regD dst, regD src)
-%{
-  match(Set dst (SubD dst src));
-
-  format %{ "subsd   $dst, $src" %}
-  ins_cost(150); // XXX
-  opcode(0xF2, 0x0F, 0x5C);
-  ins_encode(OpcP, REX_reg_reg(dst, src), OpcS, OpcT, reg_reg(dst, src));
-  ins_pipe(pipe_slow);
-%}
-
-instruct subD_mem(regD dst, memory src)
-%{
-  match(Set dst (SubD dst (LoadD src)));
-
-  format %{ "subsd   $dst, $src" %}
-  ins_cost(150); // XXX
-  opcode(0xF2, 0x0F, 0x5C);
-  ins_encode(OpcP, REX_reg_mem(dst, src), OpcS, OpcT, reg_mem(dst, src));
-  ins_pipe(pipe_slow);
-%}
-
-instruct subD_imm(regD dst, immD con) %{
-  match(Set dst (SubD dst con));
-  format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
-  ins_cost(150); // XXX
-  ins_encode %{
-    __ subsd($dst$$XMMRegister, $constantaddress($con));
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-instruct mulF_reg(regF dst, regF src)
-%{
-  match(Set dst (MulF dst src));
-
-  format %{ "mulss   $dst, $src" %}
-  ins_cost(150); // XXX
-  opcode(0xF3, 0x0F, 0x59);
-  ins_encode(OpcP, REX_reg_reg(dst, src), OpcS, OpcT, reg_reg(dst, src));
-  ins_pipe(pipe_slow);
-%}
-
-instruct mulF_mem(regF dst, memory src)
-%{
-  match(Set dst (MulF dst (LoadF src)));
-
-  format %{ "mulss   $dst, $src" %}
-  ins_cost(150); // XXX
-  opcode(0xF3, 0x0F, 0x59);
-  ins_encode(OpcP, REX_reg_mem(dst, src), OpcS, OpcT, reg_mem(dst, src));
-  ins_pipe(pipe_slow);
-%}
-
-instruct mulF_imm(regF dst, immF con) %{
-  match(Set dst (MulF dst con));
-  format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
-  ins_cost(150); // XXX
-  ins_encode %{
-    __ mulss($dst$$XMMRegister, $constantaddress($con));
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-instruct mulD_reg(regD dst, regD src)
-%{
-  match(Set dst (MulD dst src));
-
-  format %{ "mulsd   $dst, $src" %}
-  ins_cost(150); // XXX
-  opcode(0xF2, 0x0F, 0x59);
-  ins_encode(OpcP, REX_reg_reg(dst, src), OpcS, OpcT, reg_reg(dst, src));
-  ins_pipe(pipe_slow);
-%}
-
-instruct mulD_mem(regD dst, memory src)
-%{
-  match(Set dst (MulD dst (LoadD src)));
-
-  format %{ "mulsd   $dst, $src" %}
-  ins_cost(150); // XXX
-  opcode(0xF2, 0x0F, 0x59);
-  ins_encode(OpcP, REX_reg_mem(dst, src), OpcS, OpcT, reg_mem(dst, src));
-  ins_pipe(pipe_slow);
-%}
-
-instruct mulD_imm(regD dst, immD con) %{
-  match(Set dst (MulD dst con));
-  format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
-  ins_cost(150); // XXX
-  ins_encode %{
-    __ mulsd($dst$$XMMRegister, $constantaddress($con));
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-instruct divF_reg(regF dst, regF src)
-%{
-  match(Set dst (DivF dst src));
-
-  format %{ "divss   $dst, $src" %}
-  ins_cost(150); // XXX
-  opcode(0xF3, 0x0F, 0x5E);
-  ins_encode(OpcP, REX_reg_reg(dst, src), OpcS, OpcT, reg_reg(dst, src));
-  ins_pipe(pipe_slow);
-%}
-
-instruct divF_mem(regF dst, memory src)
-%{
-  match(Set dst (DivF dst (LoadF src)));
-
-  format %{ "divss   $dst, $src" %}
-  ins_cost(150); // XXX
-  opcode(0xF3, 0x0F, 0x5E);
-  ins_encode(OpcP, REX_reg_mem(dst, src), OpcS, OpcT, reg_mem(dst, src));
-  ins_pipe(pipe_slow);
-%}
-
-instruct divF_imm(regF dst, immF con) %{
-  match(Set dst (DivF dst con));
-  format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
-  ins_cost(150); // XXX
-  ins_encode %{
-    __ divss($dst$$XMMRegister, $constantaddress($con));
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-instruct divD_reg(regD dst, regD src)
-%{
-  match(Set dst (DivD dst src));
-
-  format %{ "divsd   $dst, $src" %}
-  ins_cost(150); // XXX
-  opcode(0xF2, 0x0F, 0x5E);
-  ins_encode(OpcP, REX_reg_reg(dst, src), OpcS, OpcT, reg_reg(dst, src));
-  ins_pipe(pipe_slow);
-%}
-
-instruct divD_mem(regD dst, memory src)
-%{
-  match(Set dst (DivD dst (LoadD src)));
-
-  format %{ "divsd   $dst, $src" %}
-  ins_cost(150); // XXX
-  opcode(0xF2, 0x0F, 0x5E);
-  ins_encode(OpcP, REX_reg_mem(dst, src), OpcS, OpcT, reg_mem(dst, src));
-  ins_pipe(pipe_slow);
-%}
-
-instruct divD_imm(regD dst, immD con) %{
-  match(Set dst (DivD dst con));
-  format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
-  ins_cost(150); // XXX
-  ins_encode %{
-    __ divsd($dst$$XMMRegister, $constantaddress($con));
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-instruct sqrtF_reg(regF dst, regF src)
-%{
-  match(Set dst (ConvD2F (SqrtD (ConvF2D src))));
-
-  format %{ "sqrtss  $dst, $src" %}
-  ins_cost(150); // XXX
-  opcode(0xF3, 0x0F, 0x51);
-  ins_encode(OpcP, REX_reg_reg(dst, src), OpcS, OpcT, reg_reg(dst, src));
-  ins_pipe(pipe_slow);
-%}
-
-instruct sqrtF_mem(regF dst, memory src)
-%{
-  match(Set dst (ConvD2F (SqrtD (ConvF2D (LoadF src)))));
-
-  format %{ "sqrtss  $dst, $src" %}
-  ins_cost(150); // XXX
-  opcode(0xF3, 0x0F, 0x51);
-  ins_encode(OpcP, REX_reg_mem(dst, src), OpcS, OpcT, reg_mem(dst, src));
-  ins_pipe(pipe_slow);
-%}
-
-instruct sqrtF_imm(regF dst, immF con) %{
-  match(Set dst (ConvD2F (SqrtD (ConvF2D con))));
-  format %{ "sqrtss  $dst, [$constantaddress]\t# load from constant table: float=$con" %}
-  ins_cost(150); // XXX
-  ins_encode %{
-    __ sqrtss($dst$$XMMRegister, $constantaddress($con));
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-instruct sqrtD_reg(regD dst, regD src)
-%{
-  match(Set dst (SqrtD src));
-
-  format %{ "sqrtsd  $dst, $src" %}
-  ins_cost(150); // XXX
-  opcode(0xF2, 0x0F, 0x51);
-  ins_encode(OpcP, REX_reg_reg(dst, src), OpcS, OpcT, reg_reg(dst, src));
-  ins_pipe(pipe_slow);
-%}
-
-instruct sqrtD_mem(regD dst, memory src)
-%{
-  match(Set dst (SqrtD (LoadD src)));
-
-  format %{ "sqrtsd  $dst, $src" %}
-  ins_cost(150); // XXX
-  opcode(0xF2, 0x0F, 0x51);
-  ins_encode(OpcP, REX_reg_mem(dst, src), OpcS, OpcT, reg_mem(dst, src));
-  ins_pipe(pipe_slow);
-%}
-
-instruct sqrtD_imm(regD dst, immD con) %{
-  match(Set dst (SqrtD con));
-  format %{ "sqrtsd  $dst, [$constantaddress]\t# load from constant table: double=$con" %}
-  ins_cost(150); // XXX
-  ins_encode %{
-    __ sqrtsd($dst$$XMMRegister, $constantaddress($con));
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-instruct absF_reg(regF dst)
-%{
-  match(Set dst (AbsF dst));
-
-  format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
-  ins_encode(absF_encoding(dst));
-  ins_pipe(pipe_slow);
-%}
-
-instruct absD_reg(regD dst)
-%{
-  match(Set dst (AbsD dst));
-
-  format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
-            "# abs double by sign masking" %}
-  ins_encode(absD_encoding(dst));
-  ins_pipe(pipe_slow);
-%}
-
-instruct negF_reg(regF dst)
-%{
-  match(Set dst (NegF dst));
-
-  format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
-  ins_encode(negF_encoding(dst));
-  ins_pipe(pipe_slow);
-%}
-
-instruct negD_reg(regD dst)
-%{
-  match(Set dst (NegD dst));
-
-  format %{ "xorpd   $dst, [0x8000000000000000]\t"
-            "# neg double by sign flipping" %}
-  ins_encode(negD_encoding(dst));
-  ins_pipe(pipe_slow);
-%}
-
 // -----------Trig and Trancendental Instructions------------------------------
 instruct cosD_reg(regD dst) %{
   match(Set dst (CosD dst));
@@ -10929,8 +9971,9 @@ instruct convF2D_reg_reg(regD dst, regF src)
   match(Set dst (ConvF2D src));
 
   format %{ "cvtss2sd $dst, $src" %}
-  opcode(0xF3, 0x0F, 0x5A);
-  ins_encode(OpcP, REX_reg_reg(dst, src), OpcS, OpcT, reg_reg(dst, src));
+  ins_encode %{
+    __ cvtss2sd ($dst$$XMMRegister, $src$$XMMRegister);
+  %}
   ins_pipe(pipe_slow); // XXX
 %}
 
@@ -10939,8 +9982,9 @@ instruct convF2D_reg_mem(regD dst, memory src)
   match(Set dst (ConvF2D (LoadF src)));
 
   format %{ "cvtss2sd $dst, $src" %}
-  opcode(0xF3, 0x0F, 0x5A);
-  ins_encode(OpcP, REX_reg_mem(dst, src), OpcS, OpcT, reg_mem(dst, src));
+  ins_encode %{
+    __ cvtss2sd ($dst$$XMMRegister, $src$$Address);
+  %}
   ins_pipe(pipe_slow); // XXX
 %}
 
@@ -10949,8 +9993,9 @@ instruct convD2F_reg_reg(regF dst, regD src)
   match(Set dst (ConvD2F src));
 
   format %{ "cvtsd2ss $dst, $src" %}
-  opcode(0xF2, 0x0F, 0x5A);
-  ins_encode(OpcP, REX_reg_reg(dst, src), OpcS, OpcT, reg_reg(dst, src));
+  ins_encode %{
+    __ cvtsd2ss ($dst$$XMMRegister, $src$$XMMRegister);
+  %}
   ins_pipe(pipe_slow); // XXX
 %}
 
@@ -10959,8 +10004,9 @@ instruct convD2F_reg_mem(regF dst, memory src)
   match(Set dst (ConvD2F (LoadD src)));
 
   format %{ "cvtsd2ss $dst, $src" %}
-  opcode(0xF2, 0x0F, 0x5A);
-  ins_encode(OpcP, REX_reg_mem(dst, src), OpcS, OpcT, reg_mem(dst, src));
+  ins_encode %{
+    __ cvtsd2ss ($dst$$XMMRegister, $src$$Address);
+  %}
   ins_pipe(pipe_slow); // XXX
 %}
 
@@ -10978,9 +10024,17 @@ instruct convF2I_reg_reg(rRegI dst, regF src, rFlagsReg cr)
             "call    f2i_fixup\n\t"
             "popq    $dst\n"
     "done:   "%}
-  opcode(0xF3, 0x0F, 0x2C);
-  ins_encode(OpcP, REX_reg_reg(dst, src), OpcS, OpcT, reg_reg(dst, src),
-             f2i_fixup(dst, src));
+  ins_encode %{
+    Label done;
+    __ cvttss2sil($dst$$Register, $src$$XMMRegister);
+    __ cmpl($dst$$Register, 0x80000000);
+    __ jccb(Assembler::notEqual, done);
+    __ subptr(rsp, 8);
+    __ movflt(Address(rsp, 0), $src$$XMMRegister);
+    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2i_fixup())));
+    __ pop($dst$$Register);
+    __ bind(done);
+  %}
   ins_pipe(pipe_slow);
 %}
 
@@ -10997,9 +10051,18 @@ instruct convF2L_reg_reg(rRegL dst, regF src, rFlagsReg cr)
             "call    f2l_fixup\n\t"
             "popq    $dst\n"
     "done:   "%}
-  opcode(0xF3, 0x0F, 0x2C);
-  ins_encode(OpcP, REX_reg_reg_wide(dst, src), OpcS, OpcT, reg_reg(dst, src),
-             f2l_fixup(dst, src));
+  ins_encode %{
+    Label done;
+    __ cvttss2siq($dst$$Register, $src$$XMMRegister);
+    __ cmp64($dst$$Register,
+             ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
+    __ jccb(Assembler::notEqual, done);
+    __ subptr(rsp, 8);
+    __ movflt(Address(rsp, 0), $src$$XMMRegister);
+    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2l_fixup())));
+    __ pop($dst$$Register);
+    __ bind(done);
+  %}
   ins_pipe(pipe_slow);
 %}
 
@@ -11016,9 +10079,17 @@ instruct convD2I_reg_reg(rRegI dst, regD src, rFlagsReg cr)
             "call    d2i_fixup\n\t"
             "popq    $dst\n"
     "done:   "%}
-  opcode(0xF2, 0x0F, 0x2C);
-  ins_encode(OpcP, REX_reg_reg(dst, src), OpcS, OpcT, reg_reg(dst, src),
-             d2i_fixup(dst, src));
+  ins_encode %{
+    Label done;
+    __ cvttsd2sil($dst$$Register, $src$$XMMRegister);
+    __ cmpl($dst$$Register, 0x80000000);
+    __ jccb(Assembler::notEqual, done);
+    __ subptr(rsp, 8);
+    __ movdbl(Address(rsp, 0), $src$$XMMRegister);
+    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2i_fixup())));
+    __ pop($dst$$Register);
+    __ bind(done);
+  %}
   ins_pipe(pipe_slow);
 %}
 
@@ -11035,9 +10106,18 @@ instruct convD2L_reg_reg(rRegL dst, regD src, rFlagsReg cr)
             "call    d2l_fixup\n\t"
             "popq    $dst\n"
     "done:   "%}
-  opcode(0xF2, 0x0F, 0x2C);
-  ins_encode(OpcP, REX_reg_reg_wide(dst, src), OpcS, OpcT, reg_reg(dst, src),
-             d2l_fixup(dst, src));
+  ins_encode %{
+    Label done;
+    __ cvttsd2siq($dst$$Register, $src$$XMMRegister);
+    __ cmp64($dst$$Register,
+             ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
+    __ jccb(Assembler::notEqual, done);
+    __ subptr(rsp, 8);
+    __ movdbl(Address(rsp, 0), $src$$XMMRegister);
+    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2l_fixup())));
+    __ pop($dst$$Register);
+    __ bind(done);
+  %}
   ins_pipe(pipe_slow);
 %}
 
@@ -11047,8 +10127,9 @@ instruct convI2F_reg_reg(regF dst, rRegI src)
   match(Set dst (ConvI2F src));
 
   format %{ "cvtsi2ssl $dst, $src\t# i2f" %}
-  opcode(0xF3, 0x0F, 0x2A);
-  ins_encode(OpcP, REX_reg_reg(dst, src), OpcS, OpcT, reg_reg(dst, src));
+  ins_encode %{
+    __ cvtsi2ssl ($dst$$XMMRegister, $src$$Register);
+  %}
   ins_pipe(pipe_slow); // XXX
 %}
 
@@ -11057,8 +10138,9 @@ instruct convI2F_reg_mem(regF dst, memory src)
   match(Set dst (ConvI2F (LoadI src)));
 
   format %{ "cvtsi2ssl $dst, $src\t# i2f" %}
-  opcode(0xF3, 0x0F, 0x2A);
-  ins_encode(OpcP, REX_reg_mem(dst, src), OpcS, OpcT, reg_mem(dst, src));
+  ins_encode %{
+    __ cvtsi2ssl ($dst$$XMMRegister, $src$$Address);
+  %}
   ins_pipe(pipe_slow); // XXX
 %}
 
@@ -11068,8 +10150,9 @@ instruct convI2D_reg_reg(regD dst, rRegI src)
   match(Set dst (ConvI2D src));
 
   format %{ "cvtsi2sdl $dst, $src\t# i2d" %}
-  opcode(0xF2, 0x0F, 0x2A);
-  ins_encode(OpcP, REX_reg_reg(dst, src), OpcS, OpcT, reg_reg(dst, src));
+  ins_encode %{
+    __ cvtsi2sdl ($dst$$XMMRegister, $src$$Register);
+  %}
   ins_pipe(pipe_slow); // XXX
 %}
 
@@ -11078,8 +10161,9 @@ instruct convI2D_reg_mem(regD dst, memory src)
   match(Set dst (ConvI2D (LoadI src)));
 
   format %{ "cvtsi2sdl $dst, $src\t# i2d" %}
-  opcode(0xF2, 0x0F, 0x2A);
-  ins_encode(OpcP, REX_reg_mem(dst, src), OpcS, OpcT, reg_mem(dst, src));
+  ins_encode %{
+    __ cvtsi2sdl ($dst$$XMMRegister, $src$$Address);
+  %}
   ins_pipe(pipe_slow); // XXX
 %}
 
@@ -11116,8 +10200,9 @@ instruct convL2F_reg_reg(regF dst, rRegL src)
   match(Set dst (ConvL2F src));
 
   format %{ "cvtsi2ssq $dst, $src\t# l2f" %}
-  opcode(0xF3, 0x0F, 0x2A);
-  ins_encode(OpcP, REX_reg_reg_wide(dst, src), OpcS, OpcT, reg_reg(dst, src));
+  ins_encode %{
+    __ cvtsi2ssq ($dst$$XMMRegister, $src$$Register);
+  %}
   ins_pipe(pipe_slow); // XXX
 %}
 
@@ -11126,8 +10211,9 @@ instruct convL2F_reg_mem(regF dst, memory src)
   match(Set dst (ConvL2F (LoadL src)));
 
   format %{ "cvtsi2ssq $dst, $src\t# l2f" %}
-  opcode(0xF3, 0x0F, 0x2A);
-  ins_encode(OpcP, REX_reg_mem_wide(dst, src), OpcS, OpcT, reg_mem(dst, src));
+  ins_encode %{
+    __ cvtsi2ssq ($dst$$XMMRegister, $src$$Address);
+  %}
   ins_pipe(pipe_slow); // XXX
 %}
 
@@ -11136,8 +10222,9 @@ instruct convL2D_reg_reg(regD dst, rRegL src)
   match(Set dst (ConvL2D src));
 
   format %{ "cvtsi2sdq $dst, $src\t# l2d" %}
-  opcode(0xF2, 0x0F, 0x2A);
-  ins_encode(OpcP, REX_reg_reg_wide(dst, src), OpcS, OpcT, reg_reg(dst, src));
+  ins_encode %{
+    __ cvtsi2sdq ($dst$$XMMRegister, $src$$Register);
+  %}
   ins_pipe(pipe_slow); // XXX
 %}
 
@@ -11146,8 +10233,9 @@ instruct convL2D_reg_mem(regD dst, memory src)
   match(Set dst (ConvL2D (LoadL src)));
 
   format %{ "cvtsi2sdq $dst, $src\t# l2d" %}
-  opcode(0xF2, 0x0F, 0x2A);
-  ins_encode(OpcP, REX_reg_mem_wide(dst, src), OpcS, OpcT, reg_mem(dst, src));
+  ins_encode %{
+    __ cvtsi2sdq ($dst$$XMMRegister, $src$$Address);
+  %}
   ins_pipe(pipe_slow); // XXX
 %}
 
@@ -11186,7 +10274,11 @@ instruct convI2L_reg_reg_zex(rRegL dst, rRegI src, immL_32bits mask)
   match(Set dst (AndL (ConvI2L src) mask));
 
   format %{ "movl    $dst, $src\t# i2l zero-extend\n\t" %}
-  ins_encode(enc_copy(dst, src));
+  ins_encode %{
+    if ($dst$$reg != $src$$reg) {
+      __ movl($dst$$Register, $src$$Register);
+    }
+  %}
   ins_pipe(ialu_reg_reg);
 %}
 
@@ -11196,8 +10288,9 @@ instruct convI2L_reg_mem_zex(rRegL dst, memory src, immL_32bits mask)
   match(Set dst (AndL (ConvI2L (LoadI src)) mask));
 
   format %{ "movl    $dst, $src\t# i2l zero-extend\n\t" %}
-  opcode(0x8B);
-  ins_encode(REX_reg_mem(dst, src), OpcP, reg_mem(dst, src));
+  ins_encode %{
+    __ movl($dst$$Register, $src$$Address);
+  %}
   ins_pipe(ialu_reg_mem);
 %}
 
@@ -11206,7 +10299,9 @@ instruct zerox_long_reg_reg(rRegL dst, rRegL src, immL_32bits mask)
   match(Set dst (AndL src mask));
 
   format %{ "movl    $dst, $src\t# zero-extend long" %}
-  ins_encode(enc_copy_always(dst, src));
+  ins_encode %{
+    __ movl($dst$$Register, $src$$Register);
+  %}
   ins_pipe(ialu_reg_reg);
 %}
 
@@ -11215,7 +10310,9 @@ instruct convL2I_reg_reg(rRegI dst, rRegL src)
   match(Set dst (ConvL2I src));
 
   format %{ "movl    $dst, $src\t# l2i" %}
-  ins_encode(enc_copy_always(dst, src));
+  ins_encode %{
+    __ movl($dst$$Register, $src$$Register);
+  %}
   ins_pipe(ialu_reg_reg);
 %}
 
@@ -11226,8 +10323,9 @@ instruct MoveF2I_stack_reg(rRegI dst, stackSlotF src) %{
 
   ins_cost(125);
   format %{ "movl    $dst, $src\t# MoveF2I_stack_reg" %}
-  opcode(0x8B);
-  ins_encode(REX_reg_mem(dst, src), OpcP, reg_mem(dst, src));
+  ins_encode %{
+    __ movl($dst$$Register, Address(rsp, $src$$disp));
+  %}
   ins_pipe(ialu_reg_mem);
 %}
 
@@ -11237,8 +10335,9 @@ instruct MoveI2F_stack_reg(regF dst, stackSlotI src) %{
 
   ins_cost(125);
   format %{ "movss   $dst, $src\t# MoveI2F_stack_reg" %}
-  opcode(0xF3, 0x0F, 0x10);
-  ins_encode(OpcP, REX_reg_mem(dst, src), OpcS, OpcT, reg_mem(dst, src));
+  ins_encode %{
+    __ movflt($dst$$XMMRegister, Address(rsp, $src$$disp));
+  %}
   ins_pipe(pipe_slow);
 %}
 
@@ -11248,8 +10347,9 @@ instruct MoveD2L_stack_reg(rRegL dst, stackSlotD src) %{
 
   ins_cost(125);
   format %{ "movq    $dst, $src\t# MoveD2L_stack_reg" %}
-  opcode(0x8B);
-  ins_encode(REX_reg_mem_wide(dst, src), OpcP, reg_mem(dst, src));
+  ins_encode %{
+    __ movq($dst$$Register, Address(rsp, $src$$disp));
+  %}
   ins_pipe(ialu_reg_mem);
 %}
 
@@ -11260,8 +10360,9 @@ instruct MoveL2D_stack_reg_partial(regD dst, stackSlotL src) %{
 
   ins_cost(125);
   format %{ "movlpd  $dst, $src\t# MoveL2D_stack_reg" %}
-  opcode(0x66, 0x0F, 0x12);
-  ins_encode(OpcP, REX_reg_mem(dst, src), OpcS, OpcT, reg_mem(dst, src));
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, Address(rsp, $src$$disp));
+  %}
   ins_pipe(pipe_slow);
 %}
 
@@ -11272,8 +10373,9 @@ instruct MoveL2D_stack_reg(regD dst, stackSlotL src) %{
 
   ins_cost(125);
   format %{ "movsd   $dst, $src\t# MoveL2D_stack_reg" %}
-  opcode(0xF2, 0x0F, 0x10);
-  ins_encode(OpcP, REX_reg_mem(dst, src), OpcS, OpcT, reg_mem(dst, src));
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, Address(rsp, $src$$disp));
+  %}
   ins_pipe(pipe_slow);
 %}
 
@@ -11284,8 +10386,9 @@ instruct MoveF2I_reg_stack(stackSlotI dst, regF src) %{
 
   ins_cost(95); // XXX
   format %{ "movss   $dst, $src\t# MoveF2I_reg_stack" %}
-  opcode(0xF3, 0x0F, 0x11);
-  ins_encode(OpcP, REX_reg_mem(src, dst), OpcS, OpcT, reg_mem(src, dst));
+  ins_encode %{
+    __ movflt(Address(rsp, $dst$$disp), $src$$XMMRegister);
+  %}
   ins_pipe(pipe_slow);
 %}
 
@@ -11295,8 +10398,9 @@ instruct MoveI2F_reg_stack(stackSlotF dst, rRegI src) %{
 
   ins_cost(100);
   format %{ "movl    $dst, $src\t# MoveI2F_reg_stack" %}
-  opcode(0x89);
-  ins_encode(REX_reg_mem(src, dst), OpcP, reg_mem(src, dst));
+  ins_encode %{
+    __ movl(Address(rsp, $dst$$disp), $src$$Register);
+  %}
   ins_pipe( ialu_mem_reg );
 %}
 
@@ -11306,8 +10410,9 @@ instruct MoveD2L_reg_stack(stackSlotL dst, regD src) %{
 
   ins_cost(95); // XXX
   format %{ "movsd   $dst, $src\t# MoveL2D_reg_stack" %}
-  opcode(0xF2, 0x0F, 0x11);
-  ins_encode(OpcP, REX_reg_mem(src, dst), OpcS, OpcT, reg_mem(src, dst));
+  ins_encode %{
+    __ movdbl(Address(rsp, $dst$$disp), $src$$XMMRegister);
+  %}
   ins_pipe(pipe_slow);
 %}
 
@@ -11317,8 +10422,9 @@ instruct MoveL2D_reg_stack(stackSlotD dst, rRegL src) %{
 
   ins_cost(100);
   format %{ "movq    $dst, $src\t# MoveL2D_reg_stack" %}
-  opcode(0x89);
-  ins_encode(REX_reg_mem_wide(src, dst), OpcP, reg_mem(src, dst));
+  ins_encode %{
+    __ movq(Address(rsp, $dst$$disp), $src$$Register);
+  %}
   ins_pipe(ialu_mem_reg);
 %}
 
@@ -11327,7 +10433,9 @@ instruct MoveF2I_reg_reg(rRegI dst, regF src) %{
   effect(DEF dst, USE src);
   ins_cost(85);
   format %{ "movd    $dst,$src\t# MoveF2I" %}
-  ins_encode %{ __ movdl($dst$$Register, $src$$XMMRegister); %}
+  ins_encode %{
+    __ movdl($dst$$Register, $src$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
@@ -11336,7 +10444,9 @@ instruct MoveD2L_reg_reg(rRegL dst, regD src) %{
   effect(DEF dst, USE src);
   ins_cost(85);
   format %{ "movd    $dst,$src\t# MoveD2L" %}
-  ins_encode %{ __ movdq($dst$$Register, $src$$XMMRegister); %}
+  ins_encode %{
+    __ movdq($dst$$Register, $src$$XMMRegister);
+  %}
   ins_pipe( pipe_slow );
 %}
 
@@ -11346,7 +10456,9 @@ instruct MoveI2F_reg_reg(regF dst, rRegI src) %{
   effect(DEF dst, USE src);
   ins_cost(300);
   format %{ "movd    $dst,$src\t# MoveI2F" %}
-  ins_encode %{ __ movdl($dst$$XMMRegister, $src$$Register); %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+  %}
   ins_pipe( pipe_slow );
 %}
 
@@ -11355,7 +10467,9 @@ instruct MoveL2D_reg_reg(regD dst, rRegL src) %{
   effect(DEF dst, USE src);
   ins_cost(300);
   format %{ "movd    $dst,$src\t# MoveL2D" %}
-  ins_encode %{ __ movdq($dst$$XMMRegister, $src$$Register); %}
+  ins_encode %{
+     __ movdq($dst$$XMMRegister, $src$$Register);
+  %}
   ins_pipe( pipe_slow );
 %}
 
@@ -11365,7 +10479,13 @@ instruct Repl8B_reg(regD dst, regD src) %{
   format %{ "MOVDQA  $dst,$src\n\t"
             "PUNPCKLBW $dst,$dst\n\t"
             "PSHUFLW $dst,$dst,0x00\t! replicate8B" %}
-  ins_encode( pshufd_8x8(dst, src));
+  ins_encode %{
+    if ($dst$$reg != $src$$reg) {
+      __ movdqa($dst$$XMMRegister, $src$$XMMRegister);
+    }
+    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
   ins_pipe( pipe_slow );
 %}
 
@@ -11375,7 +10495,11 @@ instruct Repl8B_rRegI(regD dst, rRegI src) %{
   format %{ "MOVD    $dst,$src\n\t"
             "PUNPCKLBW $dst,$dst\n\t"
             "PSHUFLW $dst,$dst,0x00\t! replicate8B" %}
-  ins_encode( mov_i2x(dst, src), pshufd_8x8(dst, dst));
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
   ins_pipe( pipe_slow );
 %}
 
@@ -11383,7 +10507,9 @@ instruct Repl8B_rRegI(regD dst, rRegI src) %{
 instruct Repl8B_immI0(regD dst, immI0 zero) %{
   match(Set dst (Replicate8B zero));
   format %{ "PXOR  $dst,$dst\t! replicate8B" %}
-  ins_encode( pxor(dst, dst));
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
   ins_pipe( fpu_reg_reg );
 %}
 
@@ -11391,7 +10517,9 @@ instruct Repl8B_immI0(regD dst, immI0 zero) %{
 instruct Repl4S_reg(regD dst, regD src) %{
   match(Set dst (Replicate4S src));
   format %{ "PSHUFLW $dst,$src,0x00\t! replicate4S" %}
-  ins_encode( pshufd_4x16(dst, src));
+  ins_encode %{
+    __ pshuflw($dst$$XMMRegister, $src$$XMMRegister, 0x00);
+  %}
   ins_pipe( fpu_reg_reg );
 %}
 
@@ -11400,7 +10528,10 @@ instruct Repl4S_rRegI(regD dst, rRegI src) %{
   match(Set dst (Replicate4S src));
   format %{ "MOVD    $dst,$src\n\t"
             "PSHUFLW $dst,$dst,0x00\t! replicate4S" %}
-  ins_encode( mov_i2x(dst, src), pshufd_4x16(dst, dst));
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
   ins_pipe( fpu_reg_reg );
 %}
 
@@ -11408,7 +10539,9 @@ instruct Repl4S_rRegI(regD dst, rRegI src) %{
 instruct Repl4S_immI0(regD dst, immI0 zero) %{
   match(Set dst (Replicate4S zero));
   format %{ "PXOR  $dst,$dst\t! replicate4S" %}
-  ins_encode( pxor(dst, dst));
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
   ins_pipe( fpu_reg_reg );
 %}
 
@@ -11416,7 +10549,9 @@ instruct Repl4S_immI0(regD dst, immI0 zero) %{
 instruct Repl4C_reg(regD dst, regD src) %{
   match(Set dst (Replicate4C src));
   format %{ "PSHUFLW $dst,$src,0x00\t! replicate4C" %}
-  ins_encode( pshufd_4x16(dst, src));
+  ins_encode %{
+    __ pshuflw($dst$$XMMRegister, $src$$XMMRegister, 0x00);
+  %}
   ins_pipe( fpu_reg_reg );
 %}
 
@@ -11425,7 +10560,10 @@ instruct Repl4C_rRegI(regD dst, rRegI src) %{
   match(Set dst (Replicate4C src));
   format %{ "MOVD    $dst,$src\n\t"
             "PSHUFLW $dst,$dst,0x00\t! replicate4C" %}
-  ins_encode( mov_i2x(dst, src), pshufd_4x16(dst, dst));
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
   ins_pipe( fpu_reg_reg );
 %}
 
@@ -11433,7 +10571,9 @@ instruct Repl4C_rRegI(regD dst, rRegI src) %{
 instruct Repl4C_immI0(regD dst, immI0 zero) %{
   match(Set dst (Replicate4C zero));
   format %{ "PXOR  $dst,$dst\t! replicate4C" %}
-  ins_encode( pxor(dst, dst));
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
   ins_pipe( fpu_reg_reg );
 %}
 
@@ -11441,7 +10581,9 @@ instruct Repl4C_immI0(regD dst, immI0 zero) %{
 instruct Repl2I_reg(regD dst, regD src) %{
   match(Set dst (Replicate2I src));
   format %{ "PSHUFD $dst,$src,0x00\t! replicate2I" %}
-  ins_encode( pshufd(dst, src, 0x00));
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
+  %}
   ins_pipe( fpu_reg_reg );
 %}
 
@@ -11450,7 +10592,10 @@ instruct Repl2I_rRegI(regD dst, rRegI src) %{
   match(Set dst (Replicate2I src));
   format %{ "MOVD   $dst,$src\n\t"
             "PSHUFD $dst,$dst,0x00\t! replicate2I" %}
-  ins_encode( mov_i2x(dst, src), pshufd(dst, dst, 0x00));
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+  %}
   ins_pipe( fpu_reg_reg );
 %}
 
@@ -11458,7 +10603,9 @@ instruct Repl2I_rRegI(regD dst, rRegI src) %{
 instruct Repl2I_immI0(regD dst, immI0 zero) %{
   match(Set dst (Replicate2I zero));
   format %{ "PXOR  $dst,$dst\t! replicate2I" %}
-  ins_encode( pxor(dst, dst));
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
   ins_pipe( fpu_reg_reg );
 %}
 
@@ -11466,7 +10613,9 @@ instruct Repl2I_immI0(regD dst, immI0 zero) %{
 instruct Repl2F_reg(regD dst, regD src) %{
   match(Set dst (Replicate2F src));
   format %{ "PSHUFD $dst,$src,0xe0\t! replicate2F" %}
-  ins_encode( pshufd(dst, src, 0xe0));
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0xe0);
+  %}
   ins_pipe( fpu_reg_reg );
 %}
 
@@ -11474,7 +10623,9 @@ instruct Repl2F_reg(regD dst, regD src) %{
 instruct Repl2F_regF(regD dst, regF src) %{
   match(Set dst (Replicate2F src));
   format %{ "PSHUFD $dst,$src,0xe0\t! replicate2F" %}
-  ins_encode( pshufd(dst, src, 0xe0));
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0xe0);
+  %}
   ins_pipe( fpu_reg_reg );
 %}
 
@@ -11482,7 +10633,9 @@ instruct Repl2F_regF(regD dst, regF src) %{
 instruct Repl2F_immF0(regD dst, immF0 zero) %{
   match(Set dst (Replicate2F zero));
   format %{ "PXOR  $dst,$dst\t! replicate2F" %}
-  ins_encode( pxor(dst, dst));
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
+  %}
   ins_pipe( fpu_reg_reg );
 %}
 
@@ -12162,12 +11315,12 @@ instruct partialSubtypeCheck(rdi_RegP result,
   effect(KILL rcx, KILL cr);
 
   ins_cost(1100);  // slightly larger than the next version
-  format %{ "movq    rdi, [$sub + (sizeof(oopDesc) + Klass::secondary_supers_offset_in_bytes())]\n\t"
+  format %{ "movq    rdi, [$sub + in_bytes(Klass::secondary_supers_offset())]\n\t"
             "movl    rcx, [rdi + arrayOopDesc::length_offset_in_bytes()]\t# length to scan\n\t"
             "addq    rdi, arrayOopDex::base_offset_in_bytes(T_OBJECT)\t# Skip to start of data; set NZ in case count is zero\n\t"
             "repne   scasq\t# Scan *rdi++ for a match with rax while rcx--\n\t"
             "jne,s   miss\t\t# Missed: rdi not-zero\n\t"
-            "movq    [$sub + (sizeof(oopDesc) + Klass::secondary_super_cache_offset_in_bytes())], $super\t# Hit: update cache\n\t"
+            "movq    [$sub + in_bytes(Klass::secondary_super_cache_offset())], $super\t# Hit: update cache\n\t"
             "xorq    $result, $result\t\t Hit: rdi zero\n\t"
     "miss:\t" %}
 
@@ -12185,12 +11338,12 @@ instruct partialSubtypeCheck_vs_Zero(rFlagsReg cr,
   effect(KILL rcx, KILL result);
 
   ins_cost(1000);
-  format %{ "movq    rdi, [$sub + (sizeof(oopDesc) + Klass::secondary_supers_offset_in_bytes())]\n\t"
+  format %{ "movq    rdi, [$sub + in_bytes(Klass::secondary_supers_offset())]\n\t"
             "movl    rcx, [rdi + arrayOopDesc::length_offset_in_bytes()]\t# length to scan\n\t"
             "addq    rdi, arrayOopDex::base_offset_in_bytes(T_OBJECT)\t# Skip to start of data; set NZ in case count is zero\n\t"
             "repne   scasq\t# Scan *rdi++ for a match with rax while cx-- != 0\n\t"
             "jne,s   miss\t\t# Missed: flags nz\n\t"
-            "movq    [$sub + (sizeof(oopDesc) + Klass::secondary_super_cache_offset_in_bytes())], $super\t# Hit: update cache\n\t"
+            "movq    [$sub + in_bytes(Klass::secondary_super_cache_offset())], $super\t# Hit: update cache\n\t"
     "miss:\t" %}
 
   opcode(0x0); // No need to XOR RDI
@@ -12358,13 +11511,13 @@ instruct jmpConUCF2_short(cmpOpUCF2 cop, rFlagsRegUCF cmp, label labl) %{
 // inlined locking and unlocking
 
 instruct cmpFastLock(rFlagsReg cr,
-                     rRegP object, rRegP box, rax_RegI tmp, rRegP scr)
+                     rRegP object, rbx_RegP box, rax_RegI tmp, rRegP scr)
 %{
   match(Set cr (FastLock object box));
-  effect(TEMP tmp, TEMP scr);
+  effect(TEMP tmp, TEMP scr, USE_KILL box);
 
   ins_cost(300);
-  format %{ "fastlock $object,$box,$tmp,$scr" %}
+  format %{ "fastlock $object,$box\t! kills $box,$tmp,$scr" %}
   ins_encode(Fast_Lock(object, box, tmp, scr));
   ins_pipe(pipe_slow);
 %}
@@ -12373,10 +11526,10 @@ instruct cmpFastUnlock(rFlagsReg cr,
                        rRegP object, rax_RegP box, rRegP tmp)
 %{
   match(Set cr (FastUnlock object box));
-  effect(TEMP tmp);
+  effect(TEMP tmp, USE_KILL box);
 
   ins_cost(300);
-  format %{ "fastunlock $object, $box, $tmp" %}
+  format %{ "fastunlock $object,$box\t! kills $box,$tmp" %}
   ins_encode(Fast_Unlock(object, box, tmp));
   ins_pipe(pipe_slow);
 %}
diff --git a/hotspot/src/os/bsd/vm/os_bsd.cpp b/hotspot/src/os/bsd/vm/os_bsd.cpp
index c4af904a338..c6e27b273fd 100644
--- a/hotspot/src/os/bsd/vm/os_bsd.cpp
+++ b/hotspot/src/os/bsd/vm/os_bsd.cpp
@@ -2835,7 +2835,7 @@ void os::realign_memory(char *addr, size_t bytes, size_t alignment_hint) {
 #endif
 }
 
-void os::free_memory(char *addr, size_t bytes) {
+void os::free_memory(char *addr, size_t bytes, size_t alignment_hint) {
   ::madvise(addr, bytes, MADV_DONTNEED);
 }
 
diff --git a/hotspot/src/os/linux/vm/os_linux.cpp b/hotspot/src/os/linux/vm/os_linux.cpp
index a141f6a665c..0946b753eab 100644
--- a/hotspot/src/os/linux/vm/os_linux.cpp
+++ b/hotspot/src/os/linux/vm/os_linux.cpp
@@ -2546,8 +2546,8 @@ void os::realign_memory(char *addr, size_t bytes, size_t alignment_hint) {
   }
 }
 
-void os::free_memory(char *addr, size_t bytes) {
-  commit_memory(addr, bytes, false);
+void os::free_memory(char *addr, size_t bytes, size_t alignment_hint) {
+  commit_memory(addr, bytes, alignment_hint, false);
 }
 
 void os::numa_make_global(char *addr, size_t bytes) {
diff --git a/hotspot/src/os/posix/vm/os_posix.cpp b/hotspot/src/os/posix/vm/os_posix.cpp
index 4795d06fbeb..42078342ec9 100644
--- a/hotspot/src/os/posix/vm/os_posix.cpp
+++ b/hotspot/src/os/posix/vm/os_posix.cpp
@@ -59,6 +59,10 @@ void os::check_or_create_dump(void* exceptionRecord, void* contextRecord, char*
   VMError::report_coredump_status(buffer, success);
 }
 
+int os::get_last_error() {
+  return errno;
+}
+
 bool os::is_debugger_attached() {
   // not implemented
   return false;
diff --git a/hotspot/src/os/solaris/vm/os_solaris.cpp b/hotspot/src/os/solaris/vm/os_solaris.cpp
index f4043e19595..925152c3bb1 100644
--- a/hotspot/src/os/solaris/vm/os_solaris.cpp
+++ b/hotspot/src/os/solaris/vm/os_solaris.cpp
@@ -2821,7 +2821,7 @@ bool os::commit_memory(char* addr, size_t bytes, size_t alignment_hint,
 }
 
 // Uncommit the pages in a specified region.
-void os::free_memory(char* addr, size_t bytes) {
+void os::free_memory(char* addr, size_t bytes, size_t alignment_hint) {
   if (madvise(addr, bytes, MADV_FREE) < 0) {
     debug_only(warning("MADV_FREE failed."));
     return;
diff --git a/hotspot/src/os/windows/vm/os_windows.cpp b/hotspot/src/os/windows/vm/os_windows.cpp
index 01ccc179e3b..889faf54010 100644
--- a/hotspot/src/os/windows/vm/os_windows.cpp
+++ b/hotspot/src/os/windows/vm/os_windows.cpp
@@ -132,7 +132,6 @@ PVOID  topLevelVectoredExceptionHandler = NULL;
 // save DLL module handle, used by GetModuleFileName
 
 HINSTANCE vm_lib_handle;
-static int getLastErrorString(char *buf, size_t len);
 
 BOOL WINAPI DllMain(HINSTANCE hinst, DWORD reason, LPVOID reserved) {
   switch (reason) {
@@ -1452,7 +1451,7 @@ void * os::dll_load(const char *name, char *ebuf, int ebuflen)
     return result;
   }
 
-  long errcode = GetLastError();
+  DWORD errcode = GetLastError();
   if (errcode == ERROR_MOD_NOT_FOUND) {
     strncpy(ebuf, "Can't find dependent libraries", ebuflen-1);
     ebuf[ebuflen-1]='\0';
@@ -1463,11 +1462,11 @@ void * os::dll_load(const char *name, char *ebuf, int ebuflen)
   // If we can read dll-info and find that dll was built
   // for an architecture other than Hotspot is running in
   // - then print to buffer "DLL was built for a different architecture"
-  // else call getLastErrorString to obtain system error message
+  // else call os::lasterror to obtain system error message
 
   // Read system error message into ebuf
   // It may or may not be overwritten below (in the for loop and just above)
-  getLastErrorString(ebuf, (size_t) ebuflen);
+  lasterror(ebuf, (size_t) ebuflen);
   ebuf[ebuflen-1]='\0';
   int file_descriptor=::open(name, O_RDONLY | O_BINARY, 0);
   if (file_descriptor<0)
@@ -1500,7 +1499,7 @@ void * os::dll_load(const char *name, char *ebuf, int ebuflen)
   ::close(file_descriptor);
   if (failed_to_get_lib_arch)
   {
-    // file i/o error - report getLastErrorString(...) msg
+    // file i/o error - report os::lasterror(...) msg
     return NULL;
   }
 
@@ -1543,7 +1542,7 @@ void * os::dll_load(const char *name, char *ebuf, int ebuflen)
     "Didn't find runing architecture code in arch_array");
 
   // If the architure is right
-  // but some other error took place - report getLastErrorString(...) msg
+  // but some other error took place - report os::lasterror(...) msg
   if (lib_arch == running_arch)
   {
     return NULL;
@@ -1775,12 +1774,12 @@ void os::print_jni_name_suffix_on(outputStream* st, int args_size) {
 // This method is a copy of JDK's sysGetLastErrorString
 // from src/windows/hpi/src/system_md.c
 
-size_t os::lasterror(char *buf, size_t len) {
-  long errval;
+size_t os::lasterror(char* buf, size_t len) {
+  DWORD errval;
 
   if ((errval = GetLastError()) != 0) {
-      /* DOS error */
-    int n = (int)FormatMessage(
+    // DOS error
+    size_t n = (size_t)FormatMessage(
           FORMAT_MESSAGE_FROM_SYSTEM|FORMAT_MESSAGE_IGNORE_INSERTS,
           NULL,
           errval,
@@ -1789,7 +1788,7 @@ size_t os::lasterror(char *buf, size_t len) {
           (DWORD)len,
           NULL);
     if (n > 3) {
-      /* Drop final '.', CR, LF */
+      // Drop final '.', CR, LF
       if (buf[n - 1] == '\n') n--;
       if (buf[n - 1] == '\r') n--;
       if (buf[n - 1] == '.') n--;
@@ -1799,17 +1798,25 @@ size_t os::lasterror(char *buf, size_t len) {
   }
 
   if (errno != 0) {
-    /* C runtime error that has no corresponding DOS error code */
-    const char *s = strerror(errno);
+    // C runtime error that has no corresponding DOS error code
+    const char* s = strerror(errno);
     size_t n = strlen(s);
     if (n >= len) n = len - 1;
     strncpy(buf, s, n);
     buf[n] = '\0';
     return n;
   }
+
   return 0;
 }
 
+int os::get_last_error() {
+  DWORD error = GetLastError();
+  if (error == 0)
+    error = errno;
+  return (int)error;
+}
+
 // sun.misc.Signal
 // NOTE that this is a workaround for an apparent kernel bug where if
 // a signal handler for SIGBREAK is installed then that signal handler
@@ -3130,7 +3137,7 @@ bool os::unguard_memory(char* addr, size_t bytes) {
 }
 
 void os::realign_memory(char *addr, size_t bytes, size_t alignment_hint) { }
-void os::free_memory(char *addr, size_t bytes)         { }
+void os::free_memory(char *addr, size_t bytes, size_t alignment_hint)    { }
 void os::numa_make_global(char *addr, size_t bytes)    { }
 void os::numa_make_local(char *addr, size_t bytes, int lgrp_hint)    { }
 bool os::numa_topology_changed()                       { return false; }
@@ -4746,7 +4753,7 @@ bool os::check_heap(bool force) {
           fatal("corrupted C heap");
         }
       }
-      int err = GetLastError();
+      DWORD err = GetLastError();
       if (err != ERROR_NO_MORE_ITEMS && err != ERROR_CALL_NOT_IMPLEMENTED) {
         fatal(err_msg("heap walk aborted with error %d", err));
       }
@@ -4778,45 +4785,6 @@ LONG WINAPI os::win32::serialize_fault_filter(struct _EXCEPTION_POINTERS* e) {
   return EXCEPTION_CONTINUE_SEARCH;
 }
 
-static int getLastErrorString(char *buf, size_t len)
-{
-    long errval;
-
-    if ((errval = GetLastError()) != 0)
-    {
-      /* DOS error */
-      size_t n = (size_t)FormatMessage(
-            FORMAT_MESSAGE_FROM_SYSTEM|FORMAT_MESSAGE_IGNORE_INSERTS,
-            NULL,
-            errval,
-            0,
-            buf,
-            (DWORD)len,
-            NULL);
-      if (n > 3) {
-        /* Drop final '.', CR, LF */
-        if (buf[n - 1] == '\n') n--;
-        if (buf[n - 1] == '\r') n--;
-        if (buf[n - 1] == '.') n--;
-        buf[n] = '\0';
-      }
-      return (int)n;
-    }
-
-    if (errno != 0)
-    {
-      /* C runtime error that has no corresponding DOS error code */
-      const char *s = strerror(errno);
-      size_t n = strlen(s);
-      if (n >= len) n = len - 1;
-      strncpy(buf, s, n);
-      buf[n] = '\0';
-      return (int)n;
-    }
-    return 0;
-}
-
-
 // We don't build a headless jre for Windows
 bool os::is_headless_jre() { return false; }
 
diff --git a/hotspot/src/os_cpu/bsd_x86/vm/os_bsd_x86.hpp b/hotspot/src/os_cpu/bsd_x86/vm/os_bsd_x86.hpp
index 7a4e71081cf..f109e246fae 100644
--- a/hotspot/src/os_cpu/bsd_x86/vm/os_bsd_x86.hpp
+++ b/hotspot/src/os_cpu/bsd_x86/vm/os_bsd_x86.hpp
@@ -28,6 +28,8 @@
   static void setup_fpu();
   static bool supports_sse();
 
+  static jlong rdtsc();
+
   static bool is_allocatable(size_t bytes);
 
   // Used to register dynamic code cache area with the OS
diff --git a/hotspot/src/os_cpu/bsd_x86/vm/os_bsd_x86.inline.hpp b/hotspot/src/os_cpu/bsd_x86/vm/os_bsd_x86.inline.hpp
new file mode 100644
index 00000000000..67b2482aaf3
--- /dev/null
+++ b/hotspot/src/os_cpu/bsd_x86/vm/os_bsd_x86.inline.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef OS_CPU_BSD_X86_VM_OS_BSD_X86_INLINE_HPP
+#define OS_CPU_BSD_X86_VM_OS_BSD_X86_INLINE_HPP
+
+#include "runtime/os.hpp"
+
+// See http://www.technovelty.org/code/c/reading-rdtsc.htl for details
+inline jlong os::rdtsc() {
+#ifndef AMD64
+  // 64 bit result in edx:eax
+  uint64_t res;
+  __asm__ __volatile__ ("rdtsc" : "=A" (res));
+  return (jlong)res;
+#else
+  uint64_t res;
+  uint32_t ts1, ts2;
+  __asm__ __volatile__ ("rdtsc" : "=a" (ts1), "=d" (ts2));
+  res = ((uint64_t)ts1 | (uint64_t)ts2 << 32);
+  return (jlong)res;
+#endif // AMD64
+}
+
+#endif // OS_CPU_BSD_X86_VM_OS_BSD_X86_INLINE_HPP
diff --git a/hotspot/src/os_cpu/linux_x86/vm/os_linux_x86.hpp b/hotspot/src/os_cpu/linux_x86/vm/os_linux_x86.hpp
index 64954d480f8..9bb22f8e6ba 100644
--- a/hotspot/src/os_cpu/linux_x86/vm/os_linux_x86.hpp
+++ b/hotspot/src/os_cpu/linux_x86/vm/os_linux_x86.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2011, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -28,6 +28,8 @@
   static void setup_fpu();
   static bool supports_sse();
 
+  static jlong rdtsc();
+
   static bool is_allocatable(size_t bytes);
 
   // Used to register dynamic code cache area with the OS
diff --git a/hotspot/src/os_cpu/linux_x86/vm/os_linux_x86.inline.hpp b/hotspot/src/os_cpu/linux_x86/vm/os_linux_x86.inline.hpp
new file mode 100644
index 00000000000..fee719b01b8
--- /dev/null
+++ b/hotspot/src/os_cpu/linux_x86/vm/os_linux_x86.inline.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef OS_CPU_LINUX_X86_VM_OS_LINUX_X86_INLINE_HPP
+#define OS_CPU_LINUX_X86_VM_OS_LINUX_X86_INLINE_HPP
+
+#include "runtime/os.hpp"
+
+// See http://www.technovelty.org/code/c/reading-rdtsc.htl for details
+inline jlong os::rdtsc() {
+#ifndef AMD64
+  // 64 bit result in edx:eax
+  uint64_t res;
+  __asm__ __volatile__ ("rdtsc" : "=A" (res));
+  return (jlong)res;
+#else
+  uint64_t res;
+  uint32_t ts1, ts2;
+  __asm__ __volatile__ ("rdtsc" : "=a" (ts1), "=d" (ts2));
+  res = ((uint64_t)ts1 | (uint64_t)ts2 << 32);
+  return (jlong)res;
+#endif // AMD64
+}
+
+#endif // OS_CPU_LINUX_X86_VM_OS_LINUX_X86_INLINE_HPP
diff --git a/hotspot/src/os_cpu/solaris_x86/vm/os_solaris_x86.hpp b/hotspot/src/os_cpu/solaris_x86/vm/os_solaris_x86.hpp
index 5841fb3eb59..fd4f15282b4 100644
--- a/hotspot/src/os_cpu/solaris_x86/vm/os_solaris_x86.hpp
+++ b/hotspot/src/os_cpu/solaris_x86/vm/os_solaris_x86.hpp
@@ -46,6 +46,8 @@
 
   static bool supports_sse();
 
+  static jlong rdtsc();
+
   static bool is_allocatable(size_t bytes);
 
   // Used to register dynamic code cache area with the OS
diff --git a/hotspot/src/os_cpu/solaris_x86/vm/os_solaris_x86.inline.hpp b/hotspot/src/os_cpu/solaris_x86/vm/os_solaris_x86.inline.hpp
new file mode 100644
index 00000000000..986884e031f
--- /dev/null
+++ b/hotspot/src/os_cpu/solaris_x86/vm/os_solaris_x86.inline.hpp
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef OS_CPU_SOLARIS_X86_VM_OS_SOLARIS_X86_INLINE_HPP
+#define OS_CPU_SOLARIS_X86_VM_OS_SOLARIS_X86_INLINE_HPP
+
+#include "runtime/os.hpp"
+
+inline jlong os::rdtsc() { return _raw_rdtsc(); }
+
+#endif // OS_CPU_SOLARIS_X86_VM_OS_SOLARIS_X86_INLINE_HPP
diff --git a/hotspot/src/os_cpu/solaris_x86/vm/solaris_x86_32.il b/hotspot/src/os_cpu/solaris_x86/vm/solaris_x86_32.il
index 056cb140293..b635a8292e6 100644
--- a/hotspot/src/os_cpu/solaris_x86/vm/solaris_x86_32.il
+++ b/hotspot/src/os_cpu/solaris_x86/vm/solaris_x86_32.il
@@ -43,6 +43,11 @@
       movl     %ebp, %eax 
       .end
 
+  // Support for os::rdtsc()
+      .inline _raw_rdtsc,0
+      rdtsc
+      .end
+
   // Support for jint Atomic::add(jint inc, volatile jint* dest)
   // An additional bool (os::is_MP()) is passed as the last argument.
       .inline _Atomic_add,3
@@ -113,7 +118,6 @@
       fistpll   (%eax)
       .end
 
-
   // Support for OrderAccess::acquire()
       .inline _OrderAccess_acquire,0
       movl     0(%esp), %eax
diff --git a/hotspot/src/os_cpu/solaris_x86/vm/solaris_x86_64.il b/hotspot/src/os_cpu/solaris_x86/vm/solaris_x86_64.il
index b260375b164..fb7946b8c5c 100644
--- a/hotspot/src/os_cpu/solaris_x86/vm/solaris_x86_64.il
+++ b/hotspot/src/os_cpu/solaris_x86/vm/solaris_x86_64.il
@@ -30,12 +30,19 @@
       movq     %fs:0, %rax 
       .end
 
-  // Get the frame pointer from current frame.
+  // Get current fp
       .inline _get_current_fp,0
       .volatile
       movq     %rbp, %rax 
       .end
 
+  // Support for os::rdtsc()
+      .inline _raw_rdtsc,0
+      rdtsc
+      salq     $32, %rdx
+      orq      %rdx, %rax
+      .end
+
   // Support for jint Atomic::add(jint add_value, volatile jint* dest)
       .inline _Atomic_add,2
       movl     %edi, %eax      // save add_value for return
diff --git a/hotspot/src/os_cpu/windows_x86/vm/os_windows_x86.hpp b/hotspot/src/os_cpu/windows_x86/vm/os_windows_x86.hpp
index e7c3303c0f7..74e3519c901 100644
--- a/hotspot/src/os_cpu/windows_x86/vm/os_windows_x86.hpp
+++ b/hotspot/src/os_cpu/windows_x86/vm/os_windows_x86.hpp
@@ -58,6 +58,8 @@
   static void setup_fpu();
   static bool supports_sse() { return true; }
 
+  static jlong rdtsc();
+
   static bool      register_code_area(char *low, char *high);
 
 #endif // OS_CPU_WINDOWS_X86_VM_OS_WINDOWS_X86_HPP
diff --git a/hotspot/src/os_cpu/windows_x86/vm/os_windows_x86.inline.hpp b/hotspot/src/os_cpu/windows_x86/vm/os_windows_x86.inline.hpp
new file mode 100644
index 00000000000..d108e3e4202
--- /dev/null
+++ b/hotspot/src/os_cpu/windows_x86/vm/os_windows_x86.inline.hpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef OS_CPU_WINDOWS_X86_VM_OS_WINDOWS_X86_INLINE_HPP
+#define OS_CPU_WINDOWS_X86_VM_OS_WINDOWS_X86_INLINE_HPP
+
+#include "runtime/os.hpp"
+
+inline jlong os::rdtsc() {
+  // 32 bit: 64 bit result in edx:eax
+  // 64 bit: 64 bit value in rax
+  uint64_t res;
+  res = (uint64_t)__rdtsc();
+  return (jlong)res;
+}
+
+#endif // OS_CPU_WINDOWS_X86_VM_OS_WINDOWS_X86_INLINE_HPP
diff --git a/hotspot/src/share/vm/adlc/formssel.cpp b/hotspot/src/share/vm/adlc/formssel.cpp
index c1d39849bee..3c69d6a0da8 100644
--- a/hotspot/src/share/vm/adlc/formssel.cpp
+++ b/hotspot/src/share/vm/adlc/formssel.cpp
@@ -627,6 +627,7 @@ bool InstructForm::is_wide_memory_kill(FormDict &globals) const {
   if( strcmp(_matrule->_opType,"MemBarAcquire") == 0 ) return true;
   if( strcmp(_matrule->_opType,"MemBarReleaseLock") == 0 ) return true;
   if( strcmp(_matrule->_opType,"MemBarAcquireLock") == 0 ) return true;
+  if( strcmp(_matrule->_opType,"MemBarStoreStore") == 0 ) return true;
 
   return false;
 }
@@ -3978,7 +3979,8 @@ bool MatchRule::is_ideal_membar() const {
     !strcmp(_opType,"MemBarAcquireLock") ||
     !strcmp(_opType,"MemBarReleaseLock") ||
     !strcmp(_opType,"MemBarVolatile" ) ||
-    !strcmp(_opType,"MemBarCPUOrder" ) ;
+    !strcmp(_opType,"MemBarCPUOrder" ) ||
+    !strcmp(_opType,"MemBarStoreStore" );
 }
 
 bool MatchRule::is_ideal_loadPC() const {
diff --git a/hotspot/src/share/vm/asm/assembler.cpp b/hotspot/src/share/vm/asm/assembler.cpp
index 86011e97496..2bcdcbc884d 100644
--- a/hotspot/src/share/vm/asm/assembler.cpp
+++ b/hotspot/src/share/vm/asm/assembler.cpp
@@ -61,6 +61,7 @@ AbstractAssembler::AbstractAssembler(CodeBuffer* code) {
   _code_limit  = cs->limit();
   _code_pos    = cs->end();
   _oop_recorder= code->oop_recorder();
+  DEBUG_ONLY( _short_branch_delta = 0; )
   if (_code_begin == NULL)  {
     vm_exit_out_of_memory(0, err_msg("CodeCache: no room for %s",
                                      code->name()));
diff --git a/hotspot/src/share/vm/asm/assembler.hpp b/hotspot/src/share/vm/asm/assembler.hpp
index 8db7eef2ea5..c25aa3fca3f 100644
--- a/hotspot/src/share/vm/asm/assembler.hpp
+++ b/hotspot/src/share/vm/asm/assembler.hpp
@@ -241,6 +241,33 @@ class AbstractAssembler : public ResourceObj  {
   // Make it return true on platforms which need to verify
   // instruction boundaries for some operations.
   inline static bool pd_check_instruction_mark();
+
+  // Add delta to short branch distance to verify that it still fit into imm8.
+  int _short_branch_delta;
+
+  int  short_branch_delta() const { return _short_branch_delta; }
+  void set_short_branch_delta()   { _short_branch_delta = 32; }
+  void clear_short_branch_delta() { _short_branch_delta = 0; }
+
+  class ShortBranchVerifier: public StackObj {
+   private:
+    AbstractAssembler* _assm;
+
+   public:
+    ShortBranchVerifier(AbstractAssembler* assm) : _assm(assm) {
+      assert(assm->short_branch_delta() == 0, "overlapping instructions");
+      _assm->set_short_branch_delta();
+    }
+    ~ShortBranchVerifier() {
+      _assm->clear_short_branch_delta();
+    }
+  };
+  #else
+  // Dummy in product.
+  class ShortBranchVerifier: public StackObj {
+   public:
+    ShortBranchVerifier(AbstractAssembler* assm) {}
+  };
   #endif
 
   // Label functions
diff --git a/hotspot/src/share/vm/c1/c1_LIR.cpp b/hotspot/src/share/vm/c1/c1_LIR.cpp
index 267b966641c..629d849bc60 100644
--- a/hotspot/src/share/vm/c1/c1_LIR.cpp
+++ b/hotspot/src/share/vm/c1/c1_LIR.cpp
@@ -854,6 +854,9 @@ void LIR_OpVisitState::visit(LIR_Op* op) {
       if (opTypeCheck->_info_for_exception)       do_info(opTypeCheck->_info_for_exception);
       if (opTypeCheck->_info_for_patch)           do_info(opTypeCheck->_info_for_patch);
       if (opTypeCheck->_object->is_valid())       do_input(opTypeCheck->_object);
+      if (op->code() == lir_store_check && opTypeCheck->_object->is_valid()) {
+        do_temp(opTypeCheck->_object);
+      }
       if (opTypeCheck->_array->is_valid())        do_input(opTypeCheck->_array);
       if (opTypeCheck->_tmp1->is_valid())         do_temp(opTypeCheck->_tmp1);
       if (opTypeCheck->_tmp2->is_valid())         do_temp(opTypeCheck->_tmp2);
diff --git a/hotspot/src/share/vm/c1/c1_LIRGenerator.cpp b/hotspot/src/share/vm/c1/c1_LIRGenerator.cpp
index 0491d71565e..5f3155d412a 100644
--- a/hotspot/src/share/vm/c1/c1_LIRGenerator.cpp
+++ b/hotspot/src/share/vm/c1/c1_LIRGenerator.cpp
@@ -1256,8 +1256,7 @@ void LIRGenerator::do_getClass(Intrinsic* x) {
     info = state_for(x);
   }
   __ move(new LIR_Address(rcvr.result(), oopDesc::klass_offset_in_bytes(), T_OBJECT), result, info);
-  __ move_wide(new LIR_Address(result, Klass::java_mirror_offset_in_bytes() +
-                               klassOopDesc::klass_part_offset_in_bytes(), T_OBJECT), result);
+  __ move_wide(new LIR_Address(result, in_bytes(Klass::java_mirror_offset()), T_OBJECT), result);
 }
 
 
diff --git a/hotspot/src/share/vm/c1/c1_Optimizer.cpp b/hotspot/src/share/vm/c1/c1_Optimizer.cpp
index 5c3640b1ad7..e1a0ef7bfdd 100644
--- a/hotspot/src/share/vm/c1/c1_Optimizer.cpp
+++ b/hotspot/src/share/vm/c1/c1_Optimizer.cpp
@@ -122,18 +122,32 @@ void CE_Eliminator::block_do(BlockBegin* block) {
   if (sux != f_goto->default_sux()) return;
 
   // check if at least one word was pushed on sux_state
+  // inlining depths must match
+  ValueStack* if_state = if_->state();
   ValueStack* sux_state = sux->state();
-  if (sux_state->stack_size() <= if_->state()->stack_size()) return;
+  if (if_state->scope()->level() > sux_state->scope()->level()) {
+    while (sux_state->scope() != if_state->scope()) {
+      if_state = if_state->caller_state();
+      assert(if_state != NULL, "states do not match up");
+    }
+  } else if (if_state->scope()->level() < sux_state->scope()->level()) {
+    while (sux_state->scope() != if_state->scope()) {
+      sux_state = sux_state->caller_state();
+      assert(sux_state != NULL, "states do not match up");
+    }
+  }
+
+  if (sux_state->stack_size() <= if_state->stack_size()) return;
 
   // check if phi function is present at end of successor stack and that
   // only this phi was pushed on the stack
-  Value sux_phi = sux_state->stack_at(if_->state()->stack_size());
+  Value sux_phi = sux_state->stack_at(if_state->stack_size());
   if (sux_phi == NULL || sux_phi->as_Phi() == NULL || sux_phi->as_Phi()->block() != sux) return;
-  if (sux_phi->type()->size() != sux_state->stack_size() - if_->state()->stack_size()) return;
+  if (sux_phi->type()->size() != sux_state->stack_size() - if_state->stack_size()) return;
 
   // get the values that were pushed in the true- and false-branch
-  Value t_value = t_goto->state()->stack_at(if_->state()->stack_size());
-  Value f_value = f_goto->state()->stack_at(if_->state()->stack_size());
+  Value t_value = t_goto->state()->stack_at(if_state->stack_size());
+  Value f_value = f_goto->state()->stack_at(if_state->stack_size());
 
   // backend does not support floats
   assert(t_value->type()->base() == f_value->type()->base(), "incompatible types");
@@ -180,11 +194,7 @@ void CE_Eliminator::block_do(BlockBegin* block) {
   Goto* goto_ = new Goto(sux, state_before, if_->is_safepoint() || t_goto->is_safepoint() || f_goto->is_safepoint());
 
   // prepare state for Goto
-  ValueStack* goto_state = if_->state();
-  while (sux_state->scope() != goto_state->scope()) {
-    goto_state = goto_state->caller_state();
-    assert(goto_state != NULL, "states do not match up");
-  }
+  ValueStack* goto_state = if_state;
   goto_state = goto_state->copy(ValueStack::StateAfter, goto_state->bci());
   goto_state->push(result->type(), result);
   assert(goto_state->is_same(sux_state), "states must match now");
diff --git a/hotspot/src/share/vm/ci/ciInstanceKlass.cpp b/hotspot/src/share/vm/ci/ciInstanceKlass.cpp
index f10b92fa33d..b7c2ab75850 100644
--- a/hotspot/src/share/vm/ci/ciInstanceKlass.cpp
+++ b/hotspot/src/share/vm/ci/ciInstanceKlass.cpp
@@ -54,7 +54,7 @@ ciInstanceKlass::ciInstanceKlass(KlassHandle h_k) :
   _flags = ciFlags(access_flags);
   _has_finalizer = access_flags.has_finalizer();
   _has_subklass = ik->subklass() != NULL;
-  _init_state = (instanceKlass::ClassState)ik->get_init_state();
+  _init_state = ik->init_state();
   _nonstatic_field_size = ik->nonstatic_field_size();
   _has_nonstatic_fields = ik->has_nonstatic_fields();
   _nonstatic_fields = NULL; // initialized lazily by compute_nonstatic_fields:
@@ -118,7 +118,7 @@ ciInstanceKlass::ciInstanceKlass(ciSymbol* name,
 void ciInstanceKlass::compute_shared_init_state() {
   GUARDED_VM_ENTRY(
     instanceKlass* ik = get_instanceKlass();
-    _init_state = (instanceKlass::ClassState)ik->get_init_state();
+    _init_state = ik->init_state();
   )
 }
 
diff --git a/hotspot/src/share/vm/ci/ciTypeFlow.cpp b/hotspot/src/share/vm/ci/ciTypeFlow.cpp
index a5042d9676f..6678b4c4b3a 100644
--- a/hotspot/src/share/vm/ci/ciTypeFlow.cpp
+++ b/hotspot/src/share/vm/ci/ciTypeFlow.cpp
@@ -1589,7 +1589,7 @@ ciTypeFlow::Block::Block(ciTypeFlow* outer,
   _next = NULL;
   _on_work_list = false;
   _backedge_copy = false;
-  _exception_entry = false;
+  _has_monitorenter = false;
   _trap_bci = -1;
   _trap_index = 0;
   df_init();
@@ -2182,6 +2182,10 @@ bool ciTypeFlow::clone_loop_heads(Loop* lp, StateVector* temp_vector, JsrSet* te
         !head->is_clonable_exit(lp))
       continue;
 
+    // Avoid BoxLock merge.
+    if (EliminateNestedLocks && head->has_monitorenter())
+      continue;
+
     // check not already cloned
     if (head->backedge_copy_count() != 0)
       continue;
@@ -2322,6 +2326,10 @@ void ciTypeFlow::flow_block(ciTypeFlow::Block* block,
     // Watch for bailouts.
     if (failing())  return;
 
+    if (str.cur_bc() == Bytecodes::_monitorenter) {
+      block->set_has_monitorenter();
+    }
+
     if (res) {
 
       // We have encountered a trap.  Record it in this block.
diff --git a/hotspot/src/share/vm/ci/ciTypeFlow.hpp b/hotspot/src/share/vm/ci/ciTypeFlow.hpp
index 5cc5c90c891..8a8d241dac8 100644
--- a/hotspot/src/share/vm/ci/ciTypeFlow.hpp
+++ b/hotspot/src/share/vm/ci/ciTypeFlow.hpp
@@ -544,15 +544,19 @@ public:
     // Has this block been cloned for a loop backedge?
     bool                             _backedge_copy;
 
+    // This block is entry to irreducible loop.
+    bool                             _irreducible_entry;
+
+    // This block has monitor entry point.
+    bool                             _has_monitorenter;
+
     // A pointer used for our internal work list
-    Block*                           _next;
     bool                             _on_work_list;      // on the work list
+    Block*                           _next;
     Block*                           _rpo_next;          // Reverse post order list
 
     // Loop info
     Loop*                            _loop;              // nearest loop
-    bool                             _irreducible_entry; // entry to irreducible loop
-    bool                             _exception_entry;   // entry to exception handler
 
     ciBlock*     ciblock() const     { return _ciblock; }
     StateVector* state() const     { return _state; }
@@ -689,6 +693,8 @@ public:
     bool   is_loop_head() const          { return _loop && _loop->head() == this; }
     void   set_irreducible_entry(bool c) { _irreducible_entry = c; }
     bool   is_irreducible_entry() const  { return _irreducible_entry; }
+    void   set_has_monitorenter()        { _has_monitorenter = true; }
+    bool   has_monitorenter() const      { return _has_monitorenter; }
     bool   is_visited() const            { return has_pre_order(); }
     bool   is_post_visited() const       { return has_post_order(); }
     bool   is_clonable_exit(Loop* lp);
diff --git a/hotspot/src/share/vm/classfile/classFileParser.cpp b/hotspot/src/share/vm/classfile/classFileParser.cpp
index b28b8ad7e19..83bd0381ce2 100644
--- a/hotspot/src/share/vm/classfile/classFileParser.cpp
+++ b/hotspot/src/share/vm/classfile/classFileParser.cpp
@@ -45,6 +45,7 @@
 #include "oops/methodOop.hpp"
 #include "oops/symbol.hpp"
 #include "prims/jvmtiExport.hpp"
+#include "prims/jvmtiThreadState.hpp"
 #include "runtime/javaCalls.hpp"
 #include "runtime/perfData.hpp"
 #include "runtime/reflection.hpp"
@@ -1050,7 +1051,7 @@ static FieldAllocationType basic_type_to_atype(bool is_static, BasicType type) {
 
 class FieldAllocationCount: public ResourceObj {
  public:
-  unsigned int count[MAX_FIELD_ALLOCATION_TYPE];
+  u2 count[MAX_FIELD_ALLOCATION_TYPE];
 
   FieldAllocationCount() {
     for (int i = 0; i < MAX_FIELD_ALLOCATION_TYPE; i++) {
@@ -1060,6 +1061,8 @@ class FieldAllocationCount: public ResourceObj {
 
   FieldAllocationType update(bool is_static, BasicType type) {
     FieldAllocationType atype = basic_type_to_atype(is_static, type);
+    // Make sure there is no overflow with injected fields.
+    assert(count[atype] < 0xFFFF, "More than 65535 fields");
     count[atype]++;
     return atype;
   }
@@ -1070,7 +1073,7 @@ typeArrayHandle ClassFileParser::parse_fields(Symbol* class_name,
                                               constantPoolHandle cp, bool is_interface,
                                               FieldAllocationCount *fac,
                                               objArrayHandle* fields_annotations,
-                                              int* java_fields_count_ptr, TRAPS) {
+                                              u2* java_fields_count_ptr, TRAPS) {
   ClassFileStream* cfs = stream();
   typeArrayHandle nullHandle;
   cfs->guarantee_more(2, CHECK_(nullHandle));  // length
@@ -2639,8 +2642,11 @@ instanceKlassHandle ClassFileParser::parseClassFile(Symbol* name,
                                                     TempNewSymbol& parsed_name,
                                                     bool verify,
                                                     TRAPS) {
-  // So that JVMTI can cache class file in the state before retransformable agents
-  // have modified it
+  // When a retransformable agent is attached, JVMTI caches the
+  // class bytes that existed before the first retransformation.
+  // If RedefineClasses() was used before the retransformable
+  // agent attached, then the cached class bytes may not be the
+  // original class bytes.
   unsigned char *cached_class_file_bytes = NULL;
   jint cached_class_file_length;
 
@@ -2660,6 +2666,25 @@ instanceKlassHandle ClassFileParser::parseClassFile(Symbol* name,
   _max_bootstrap_specifier_index = -1;
 
   if (JvmtiExport::should_post_class_file_load_hook()) {
+    // Get the cached class file bytes (if any) from the class that
+    // is being redefined or retransformed. We use jvmti_thread_state()
+    // instead of JvmtiThreadState::state_for(jt) so we don't allocate
+    // a JvmtiThreadState any earlier than necessary. This will help
+    // avoid the bug described by 7126851.
+    JvmtiThreadState *state = jt->jvmti_thread_state();
+    if (state != NULL) {
+      KlassHandle *h_class_being_redefined =
+                     state->get_class_being_redefined();
+      if (h_class_being_redefined != NULL) {
+        instanceKlassHandle ikh_class_being_redefined =
+          instanceKlassHandle(THREAD, (*h_class_being_redefined)());
+        cached_class_file_bytes =
+          ikh_class_being_redefined->get_cached_class_file_bytes();
+        cached_class_file_length =
+          ikh_class_being_redefined->get_cached_class_file_len();
+      }
+    }
+
     unsigned char* ptr = cfs->buffer();
     unsigned char* end_ptr = cfs->buffer() + cfs->length();
 
@@ -2843,7 +2868,7 @@ instanceKlassHandle ClassFileParser::parseClassFile(Symbol* name,
       local_interfaces = parse_interfaces(cp, itfs_len, class_loader, protection_domain, _class_name, CHECK_(nullHandle));
     }
 
-    int java_fields_count = 0;
+    u2 java_fields_count = 0;
     // Fields (offsets are filled in later)
     FieldAllocationCount fac;
     objArrayHandle fields_annotations;
diff --git a/hotspot/src/share/vm/classfile/classFileParser.hpp b/hotspot/src/share/vm/classfile/classFileParser.hpp
index 8e0db5651c1..fef48eb61f4 100644
--- a/hotspot/src/share/vm/classfile/classFileParser.hpp
+++ b/hotspot/src/share/vm/classfile/classFileParser.hpp
@@ -91,7 +91,7 @@ class ClassFileParser VALUE_OBJ_CLASS_SPEC {
                                constantPoolHandle cp, bool is_interface,
                                FieldAllocationCount *fac,
                                objArrayHandle* fields_annotations,
-                               int* java_fields_count_ptr, TRAPS);
+                               u2* java_fields_count_ptr, TRAPS);
 
   // Method parsing
   methodHandle parse_method(constantPoolHandle cp, bool is_interface,
diff --git a/hotspot/src/share/vm/classfile/vmSymbols.hpp b/hotspot/src/share/vm/classfile/vmSymbols.hpp
index 16652e5efb7..40f36ad66cf 100644
--- a/hotspot/src/share/vm/classfile/vmSymbols.hpp
+++ b/hotspot/src/share/vm/classfile/vmSymbols.hpp
@@ -296,6 +296,7 @@
   template(finalize_method_name,                      "finalize")                                 \
   template(reference_lock_name,                       "lock")                                     \
   template(reference_discovered_name,                 "discovered")                               \
+  template(run_finalization_name,                     "runFinalization")                          \
   template(run_finalizers_on_exit_name,               "runFinalizersOnExit")                      \
   template(uncaughtException_name,                    "uncaughtException")                        \
   template(dispatchUncaughtException_name,            "dispatchUncaughtException")                \
diff --git a/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/compactibleFreeListSpace.cpp b/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/compactibleFreeListSpace.cpp
index 2f960435043..2cb5e2f3cef 100644
--- a/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/compactibleFreeListSpace.cpp
+++ b/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/compactibleFreeListSpace.cpp
@@ -2598,7 +2598,7 @@ void CompactibleFreeListSpace::printFLCensus(size_t sweep_count) const {
 AdaptiveWeightedAverage CFLS_LAB::_blocks_to_claim[]    =
   VECTOR_257(AdaptiveWeightedAverage(OldPLABWeight, (float)CMSParPromoteBlocksToClaim));
 size_t CFLS_LAB::_global_num_blocks[]  = VECTOR_257(0);
-int    CFLS_LAB::_global_num_workers[] = VECTOR_257(0);
+uint   CFLS_LAB::_global_num_workers[] = VECTOR_257(0);
 
 CFLS_LAB::CFLS_LAB(CompactibleFreeListSpace* cfls) :
   _cfls(cfls)
@@ -2732,7 +2732,7 @@ void CFLS_LAB::retire(int tid) {
         // Update globals stats for num_blocks used
         _global_num_blocks[i] += (_num_blocks[i] - num_retire);
         _global_num_workers[i]++;
-        assert(_global_num_workers[i] <= (ssize_t)ParallelGCThreads, "Too big");
+        assert(_global_num_workers[i] <= ParallelGCThreads, "Too big");
         if (num_retire > 0) {
           _cfls->_indexedFreeList[i].prepend(&_indexedFreeList[i]);
           // Reset this list.
diff --git a/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/compactibleFreeListSpace.hpp b/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/compactibleFreeListSpace.hpp
index 8b7e1077283..90d2f5f3918 100644
--- a/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/compactibleFreeListSpace.hpp
+++ b/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/compactibleFreeListSpace.hpp
@@ -631,7 +631,7 @@ class CFLS_LAB : public CHeapObj {
   static AdaptiveWeightedAverage
                  _blocks_to_claim  [CompactibleFreeListSpace::IndexSetSize];
   static size_t _global_num_blocks [CompactibleFreeListSpace::IndexSetSize];
-  static int    _global_num_workers[CompactibleFreeListSpace::IndexSetSize];
+  static uint   _global_num_workers[CompactibleFreeListSpace::IndexSetSize];
   size_t        _num_blocks        [CompactibleFreeListSpace::IndexSetSize];
 
   // Internal work method
diff --git a/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp b/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp
index bff5b40ab9a..39b57341ee8 100644
--- a/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp
+++ b/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp
@@ -3779,7 +3779,7 @@ class CMSConcMarkingTask: public YieldingFlexibleGangTask {
     terminator()->reset_for_reuse(active_workers);
   }
 
-  void work(int i);
+  void work(uint worker_id);
   bool should_yield() {
     return    ConcurrentMarkSweepThread::should_yield()
            && !_collector->foregroundGCIsActive()
@@ -3852,7 +3852,7 @@ void CMSConcMarkingTerminator::yield() {
 //    . if neither is available, offer termination
 // -- Terminate and return result
 //
-void CMSConcMarkingTask::work(int i) {
+void CMSConcMarkingTask::work(uint worker_id) {
   elapsedTimer _timer;
   ResourceMark rm;
   HandleMark hm;
@@ -3860,37 +3860,40 @@ void CMSConcMarkingTask::work(int i) {
   DEBUG_ONLY(_collector->verify_overflow_empty();)
 
   // Before we begin work, our work queue should be empty
-  assert(work_queue(i)->size() == 0, "Expected to be empty");
+  assert(work_queue(worker_id)->size() == 0, "Expected to be empty");
   // Scan the bitmap covering _cms_space, tracing through grey objects.
   _timer.start();
-  do_scan_and_mark(i, _cms_space);
+  do_scan_and_mark(worker_id, _cms_space);
   _timer.stop();
   if (PrintCMSStatistics != 0) {
     gclog_or_tty->print_cr("Finished cms space scanning in %dth thread: %3.3f sec",
-      i, _timer.seconds()); // XXX: need xxx/xxx type of notation, two timers
+      worker_id, _timer.seconds());
+      // XXX: need xxx/xxx type of notation, two timers
   }
 
   // ... do the same for the _perm_space
   _timer.reset();
   _timer.start();
-  do_scan_and_mark(i, _perm_space);
+  do_scan_and_mark(worker_id, _perm_space);
   _timer.stop();
   if (PrintCMSStatistics != 0) {
     gclog_or_tty->print_cr("Finished perm space scanning in %dth thread: %3.3f sec",
-      i, _timer.seconds()); // XXX: need xxx/xxx type of notation, two timers
+      worker_id, _timer.seconds());
+      // XXX: need xxx/xxx type of notation, two timers
   }
 
   // ... do work stealing
   _timer.reset();
   _timer.start();
-  do_work_steal(i);
+  do_work_steal(worker_id);
   _timer.stop();
   if (PrintCMSStatistics != 0) {
     gclog_or_tty->print_cr("Finished work stealing in %dth thread: %3.3f sec",
-      i, _timer.seconds()); // XXX: need xxx/xxx type of notation, two timers
+      worker_id, _timer.seconds());
+      // XXX: need xxx/xxx type of notation, two timers
   }
   assert(_collector->_markStack.isEmpty(), "Should have been emptied");
-  assert(work_queue(i)->size() == 0, "Should have been emptied");
+  assert(work_queue(worker_id)->size() == 0, "Should have been emptied");
   // Note that under the current task protocol, the
   // following assertion is true even of the spaces
   // expanded since the completion of the concurrent
@@ -3946,7 +3949,7 @@ void CMSConcMarkingTask::do_scan_and_mark(int i, CompactibleFreeListSpace* sp) {
   // We allow that there may be no tasks to do here because
   // we are restarting after a stack overflow.
   assert(pst->valid() || n_tasks == 0, "Uninitialized use?");
-  int nth_task = 0;
+  uint nth_task = 0;
 
   HeapWord* aligned_start = sp->bottom();
   if (sp->used_region().contains(_restart_addr)) {
@@ -5075,7 +5078,7 @@ class CMSParRemarkTask: public AbstractGangTask {
   ParallelTaskTerminator* terminator() { return &_term; }
   int n_workers() { return _n_workers; }
 
-  void work(int i);
+  void work(uint worker_id);
 
  private:
   // Work method in support of parallel rescan ... of young gen spaces
@@ -5096,7 +5099,7 @@ class CMSParRemarkTask: public AbstractGangTask {
 // also is passed to do_dirty_card_rescan_tasks() and to
 // do_work_steal() to select the i-th task_queue.
 
-void CMSParRemarkTask::work(int i) {
+void CMSParRemarkTask::work(uint worker_id) {
   elapsedTimer _timer;
   ResourceMark rm;
   HandleMark   hm;
@@ -5107,7 +5110,7 @@ void CMSParRemarkTask::work(int i) {
   Par_MarkRefsIntoAndScanClosure par_mrias_cl(_collector,
     _collector->_span, _collector->ref_processor(),
     &(_collector->_markBitMap),
-    work_queue(i), &(_collector->_revisitStack));
+    work_queue(worker_id), &(_collector->_revisitStack));
 
   // Rescan young gen roots first since these are likely
   // coarsely partitioned and may, on that account, constitute
@@ -5128,15 +5131,15 @@ void CMSParRemarkTask::work(int i) {
     assert(ect <= _collector->_eden_chunk_capacity, "out of bounds");
     assert(sct <= _collector->_survivor_chunk_capacity, "out of bounds");
 
-    do_young_space_rescan(i, &par_mrias_cl, to_space, NULL, 0);
-    do_young_space_rescan(i, &par_mrias_cl, from_space, sca, sct);
-    do_young_space_rescan(i, &par_mrias_cl, eden_space, eca, ect);
+    do_young_space_rescan(worker_id, &par_mrias_cl, to_space, NULL, 0);
+    do_young_space_rescan(worker_id, &par_mrias_cl, from_space, sca, sct);
+    do_young_space_rescan(worker_id, &par_mrias_cl, eden_space, eca, ect);
 
     _timer.stop();
     if (PrintCMSStatistics != 0) {
       gclog_or_tty->print_cr(
         "Finished young gen rescan work in %dth thread: %3.3f sec",
-        i, _timer.seconds());
+        worker_id, _timer.seconds());
     }
   }
 
@@ -5158,7 +5161,7 @@ void CMSParRemarkTask::work(int i) {
   if (PrintCMSStatistics != 0) {
     gclog_or_tty->print_cr(
       "Finished remaining root rescan work in %dth thread: %3.3f sec",
-      i, _timer.seconds());
+      worker_id, _timer.seconds());
   }
 
   // ---------- rescan dirty cards ------------
@@ -5167,26 +5170,26 @@ void CMSParRemarkTask::work(int i) {
 
   // Do the rescan tasks for each of the two spaces
   // (cms_space and perm_space) in turn.
-  // "i" is passed to select the "i-th" task_queue
-  do_dirty_card_rescan_tasks(_cms_space, i, &par_mrias_cl);
-  do_dirty_card_rescan_tasks(_perm_space, i, &par_mrias_cl);
+  // "worker_id" is passed to select the task_queue for "worker_id"
+  do_dirty_card_rescan_tasks(_cms_space, worker_id, &par_mrias_cl);
+  do_dirty_card_rescan_tasks(_perm_space, worker_id, &par_mrias_cl);
   _timer.stop();
   if (PrintCMSStatistics != 0) {
     gclog_or_tty->print_cr(
       "Finished dirty card rescan work in %dth thread: %3.3f sec",
-      i, _timer.seconds());
+      worker_id, _timer.seconds());
   }
 
   // ---------- steal work from other threads ...
   // ---------- ... and drain overflow list.
   _timer.reset();
   _timer.start();
-  do_work_steal(i, &par_mrias_cl, _collector->hash_seed(i));
+  do_work_steal(worker_id, &par_mrias_cl, _collector->hash_seed(worker_id));
   _timer.stop();
   if (PrintCMSStatistics != 0) {
     gclog_or_tty->print_cr(
       "Finished work stealing in %dth thread: %3.3f sec",
-      i, _timer.seconds());
+      worker_id, _timer.seconds());
   }
 }
 
@@ -5207,8 +5210,8 @@ CMSParRemarkTask::do_young_space_rescan(int i,
   SequentialSubTasksDone* pst = space->par_seq_tasks();
   assert(pst->valid(), "Uninitialized use?");
 
-  int nth_task = 0;
-  int n_tasks  = pst->n_tasks();
+  uint nth_task = 0;
+  uint n_tasks  = pst->n_tasks();
 
   HeapWord *start, *end;
   while (!pst->is_task_claimed(/* reference */ nth_task)) {
@@ -5220,12 +5223,12 @@ CMSParRemarkTask::do_young_space_rescan(int i,
     } else if (nth_task == 0) {
       start = space->bottom();
       end   = chunk_array[nth_task];
-    } else if (nth_task < (jint)chunk_top) {
+    } else if (nth_task < (uint)chunk_top) {
       assert(nth_task >= 1, "Control point invariant");
       start = chunk_array[nth_task - 1];
       end   = chunk_array[nth_task];
     } else {
-      assert(nth_task == (jint)chunk_top, "Control point invariant");
+      assert(nth_task == (uint)chunk_top, "Control point invariant");
       start = chunk_array[chunk_top - 1];
       end   = space->top();
     }
@@ -5288,7 +5291,7 @@ CMSParRemarkTask::do_dirty_card_rescan_tasks(
 
   SequentialSubTasksDone* pst = sp->conc_par_seq_tasks();
   assert(pst->valid(), "Uninitialized use?");
-  int nth_task = 0;
+  uint nth_task = 0;
   const int alignment = CardTableModRefBS::card_size * BitsPerWord;
   MemRegion span = sp->used_region();
   HeapWord* start_addr = span.start();
@@ -5736,26 +5739,26 @@ public:
                      CMSParKeepAliveClosure* keep_alive,
                      int* seed);
 
-  virtual void work(int i);
+  virtual void work(uint worker_id);
 };
 
-void CMSRefProcTaskProxy::work(int i) {
+void CMSRefProcTaskProxy::work(uint worker_id) {
   assert(_collector->_span.equals(_span), "Inconsistency in _span");
   CMSParKeepAliveClosure par_keep_alive(_collector, _span,
                                         _mark_bit_map,
                                         &_collector->_revisitStack,
-                                        work_queue(i));
+                                        work_queue(worker_id));
   CMSParDrainMarkingStackClosure par_drain_stack(_collector, _span,
                                                  _mark_bit_map,
                                                  &_collector->_revisitStack,
-                                                 work_queue(i));
+                                                 work_queue(worker_id));
   CMSIsAliveClosure is_alive_closure(_span, _mark_bit_map);
-  _task.work(i, is_alive_closure, par_keep_alive, par_drain_stack);
+  _task.work(worker_id, is_alive_closure, par_keep_alive, par_drain_stack);
   if (_task.marks_oops_alive()) {
-    do_work_steal(i, &par_drain_stack, &par_keep_alive,
-                  _collector->hash_seed(i));
+    do_work_steal(worker_id, &par_drain_stack, &par_keep_alive,
+                  _collector->hash_seed(worker_id));
   }
-  assert(work_queue(i)->size() == 0, "work_queue should be empty");
+  assert(work_queue(worker_id)->size() == 0, "work_queue should be empty");
   assert(_collector->_overflow_list == NULL, "non-empty _overflow_list");
 }
 
@@ -5769,9 +5772,9 @@ public:
       _task(task)
   { }
 
-  virtual void work(int i)
+  virtual void work(uint worker_id)
   {
-    _task.work(i);
+    _task.work(worker_id);
   }
 };
 
diff --git a/hotspot/src/share/vm/gc_implementation/g1/collectionSetChooser.cpp b/hotspot/src/share/vm/gc_implementation/g1/collectionSetChooser.cpp
index 544b5a8676c..354fefbf71f 100644
--- a/hotspot/src/share/vm/gc_implementation/g1/collectionSetChooser.cpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/collectionSetChooser.cpp
@@ -264,7 +264,7 @@ prepareForAddMarkedHeapRegionsPar(size_t n_regions, size_t chunkSize) {
     // or some improperly initialized variable with leads to no
     // active threads, protect against that in a product build.
     n_threads = MAX2(G1CollectedHeap::heap()->workers()->active_workers(),
-                     1);
+                     1U);
   }
   size_t max_waste = n_threads * chunkSize;
   // it should be aligned with respect to chunkSize
diff --git a/hotspot/src/share/vm/gc_implementation/g1/concurrentMark.cpp b/hotspot/src/share/vm/gc_implementation/g1/concurrentMark.cpp
index e208929a2a1..4712d803542 100644
--- a/hotspot/src/share/vm/gc_implementation/g1/concurrentMark.cpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/concurrentMark.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -31,6 +31,7 @@
 #include "gc_implementation/g1/g1ErgoVerbose.hpp"
 #include "gc_implementation/g1/g1OopClosures.inline.hpp"
 #include "gc_implementation/g1/g1RemSet.hpp"
+#include "gc_implementation/g1/heapRegion.inline.hpp"
 #include "gc_implementation/g1/heapRegionRemSet.hpp"
 #include "gc_implementation/g1/heapRegionSeq.inline.hpp"
 #include "gc_implementation/shared/vmGCOperations.hpp"
@@ -183,12 +184,11 @@ CMMarkStack::CMMarkStack(ConcurrentMark* cm) :
 void CMMarkStack::allocate(size_t size) {
   _base = NEW_C_HEAP_ARRAY(oop, size);
   if (_base == NULL) {
-    vm_exit_during_initialization("Failed to allocate "
-                                  "CM region mark stack");
+    vm_exit_during_initialization("Failed to allocate CM region mark stack");
   }
   _index = 0;
   _capacity = (jint) size;
-  _oops_do_bound = -1;
+  _saved_index = -1;
   NOT_PRODUCT(_max_depth = 0);
 }
 
@@ -283,7 +283,6 @@ bool CMMarkStack::par_pop_arr(oop* ptr_arr, int max, int* n) {
   }
 }
 
-
 CMRegionStack::CMRegionStack() : _base(NULL) {}
 
 void CMRegionStack::allocate(size_t size) {
@@ -302,6 +301,8 @@ CMRegionStack::~CMRegionStack() {
 }
 
 void CMRegionStack::push_lock_free(MemRegion mr) {
+  guarantee(false, "push_lock_free(): don't call this any more");
+
   assert(mr.word_size() > 0, "Precondition");
   while (true) {
     jint index = _index;
@@ -325,6 +326,8 @@ void CMRegionStack::push_lock_free(MemRegion mr) {
 // marking / remark phases. Should only be called in tandem with
 // other lock-free pops.
 MemRegion CMRegionStack::pop_lock_free() {
+  guarantee(false, "pop_lock_free(): don't call this any more");
+
   while (true) {
     jint index = _index;
 
@@ -390,6 +393,8 @@ MemRegion CMRegionStack::pop_with_lock() {
 #endif
 
 bool CMRegionStack::invalidate_entries_into_cset() {
+  guarantee(false, "invalidate_entries_into_cset(): don't call this any more");
+
   bool result = false;
   G1CollectedHeap* g1h = G1CollectedHeap::heap();
   for (int i = 0; i < _oops_do_bound; ++i) {
@@ -438,14 +443,29 @@ bool CMMarkStack::drain(OopClosureClass* cl, CMBitMap* bm, bool yield_after) {
   return res;
 }
 
+void CMMarkStack::note_start_of_gc() {
+  assert(_saved_index == -1,
+         "note_start_of_gc()/end_of_gc() bracketed incorrectly");
+  _saved_index = _index;
+}
+
+void CMMarkStack::note_end_of_gc() {
+  // This is intentionally a guarantee, instead of an assert. If we
+  // accidentally add something to the mark stack during GC, it
+  // will be a correctness issue so it's better if we crash. we'll
+  // only check this once per GC anyway, so it won't be a performance
+  // issue in any way.
+  guarantee(_saved_index == _index,
+            err_msg("saved index: %d index: %d", _saved_index, _index));
+  _saved_index = -1;
+}
+
 void CMMarkStack::oops_do(OopClosure* f) {
-  if (_index == 0) return;
-  assert(_oops_do_bound != -1 && _oops_do_bound <= _index,
-         "Bound must be set.");
-  for (int i = 0; i < _oops_do_bound; i++) {
+  assert(_saved_index == _index,
+         err_msg("saved index: %d index: %d", _saved_index, _index));
+  for (int i = 0; i < _index; i += 1) {
     f->do_oop(&_base[i]);
   }
-  _oops_do_bound = -1;
 }
 
 bool ConcurrentMark::not_yet_marked(oop obj) const {
@@ -458,8 +478,8 @@ bool ConcurrentMark::not_yet_marked(oop obj) const {
 #pragma warning( disable:4355 ) // 'this' : used in base member initializer list
 #endif // _MSC_VER
 
-size_t ConcurrentMark::scale_parallel_threads(size_t n_par_threads) {
-  return MAX2((n_par_threads + 2) / 4, (size_t)1);
+uint ConcurrentMark::scale_parallel_threads(uint n_par_threads) {
+  return MAX2((n_par_threads + 2) / 4, 1U);
 }
 
 ConcurrentMark::ConcurrentMark(ReservedSpace rs,
@@ -486,7 +506,7 @@ ConcurrentMark::ConcurrentMark(ReservedSpace rs,
   _regionStack(),
   // _finger set in set_non_marking_state
 
-  _max_task_num(MAX2(ParallelGCThreads, (size_t)1)),
+  _max_task_num(MAX2((uint)ParallelGCThreads, 1U)),
   // _active_tasks set in set_non_marking_state
   // _tasks set inside the constructor
   _task_queues(new CMTaskQueueSet((int) _max_task_num)),
@@ -506,7 +526,6 @@ ConcurrentMark::ConcurrentMark(ReservedSpace rs,
   _cleanup_times(),
   _total_counting_time(0.0),
   _total_rs_scrub_time(0.0),
-
   _parallel_workers(NULL) {
   CMVerboseLevel verbose_level = (CMVerboseLevel) G1MarkingVerboseLevel;
   if (verbose_level < no_verbose) {
@@ -568,7 +587,7 @@ ConcurrentMark::ConcurrentMark(ReservedSpace rs,
       // notice that ConcGCThreads overwrites G1MarkingOverheadPercent
       // if both are set
 
-      _parallel_marking_threads = ConcGCThreads;
+      _parallel_marking_threads = (uint) ConcGCThreads;
       _max_parallel_marking_threads = _parallel_marking_threads;
       _sleep_factor             = 0.0;
       _marking_task_overhead    = 1.0;
@@ -589,12 +608,12 @@ ConcurrentMark::ConcurrentMark(ReservedSpace rs,
       double sleep_factor =
                          (1.0 - marking_task_overhead) / marking_task_overhead;
 
-      _parallel_marking_threads = (size_t) marking_thread_num;
+      _parallel_marking_threads = (uint) marking_thread_num;
       _max_parallel_marking_threads = _parallel_marking_threads;
       _sleep_factor             = sleep_factor;
       _marking_task_overhead    = marking_task_overhead;
     } else {
-      _parallel_marking_threads = scale_parallel_threads(ParallelGCThreads);
+      _parallel_marking_threads = scale_parallel_threads((uint)ParallelGCThreads);
       _max_parallel_marking_threads = _parallel_marking_threads;
       _sleep_factor             = 0.0;
       _marking_task_overhead    = 1.0;
@@ -618,7 +637,7 @@ ConcurrentMark::ConcurrentMark(ReservedSpace rs,
 
     guarantee(parallel_marking_threads() > 0, "peace of mind");
     _parallel_workers = new FlexibleWorkGang("G1 Parallel Marking Threads",
-         (int) _max_parallel_marking_threads, false, true);
+         _max_parallel_marking_threads, false, true);
     if (_parallel_workers == NULL) {
       vm_exit_during_initialization("Failed necessary allocation.");
     } else {
@@ -691,7 +710,7 @@ void ConcurrentMark::reset() {
   set_concurrent_marking_in_progress();
 }
 
-void ConcurrentMark::set_phase(size_t active_tasks, bool concurrent) {
+void ConcurrentMark::set_phase(uint active_tasks, bool concurrent) {
   assert(active_tasks <= _max_task_num, "we should not have more");
 
   _active_tasks = active_tasks;
@@ -727,12 +746,8 @@ void ConcurrentMark::set_non_marking_state() {
 }
 
 ConcurrentMark::~ConcurrentMark() {
-  for (int i = 0; i < (int) _max_task_num; ++i) {
-    delete _task_queues->queue(i);
-    delete _tasks[i];
-  }
-  delete _task_queues;
-  FREE_C_HEAP_ARRAY(CMTask*, _max_task_num);
+  // The ConcurrentMark instance is never freed.
+  ShouldNotReachHere();
 }
 
 // This closure is used to mark refs into the g1 generation
@@ -788,7 +803,7 @@ class NoteStartOfMarkHRClosure: public HeapRegionClosure {
 public:
   bool doHeapRegion(HeapRegion* r) {
     if (!r->continuesHumongous()) {
-      r->note_start_of_marking(true);
+      r->note_start_of_marking();
     }
     return false;
   }
@@ -809,6 +824,10 @@ void ConcurrentMark::checkpointRootsInitialPre() {
 
   // Initialise marking structures. This has to be done in a STW phase.
   reset();
+
+  // For each region note start of marking.
+  NoteStartOfMarkHRClosure startcl;
+  g1h->heap_region_iterate(&startcl);
 }
 
 
@@ -823,10 +842,6 @@ void ConcurrentMark::checkpointRootsInitialPost() {
   // every remark and we'll eventually not need to cause one.
   force_overflow_stw()->init();
 
-  // For each region note start of marking.
-  NoteStartOfMarkHRClosure startcl;
-  g1h->heap_region_iterate(&startcl);
-
   // Start Concurrent Marking weak-reference discovery.
   ReferenceProcessor* rp = g1h->ref_processor_cm();
   // enable ("weak") refs discovery
@@ -951,22 +966,9 @@ bool ForceOverflowSettings::should_force() {
 }
 #endif // !PRODUCT
 
-void ConcurrentMark::grayRoot(oop p) {
-  HeapWord* addr = (HeapWord*) p;
-  // We can't really check against _heap_start and _heap_end, since it
-  // is possible during an evacuation pause with piggy-backed
-  // initial-mark that the committed space is expanded during the
-  // pause without CM observing this change. So the assertions below
-  // is a bit conservative; but better than nothing.
-  assert(_g1h->g1_committed().contains(addr),
-         "address should be within the heap bounds");
-
-  if (!_nextMarkBitMap->isMarked(addr)) {
-    _nextMarkBitMap->parMark(addr);
-  }
-}
-
 void ConcurrentMark::grayRegionIfNecessary(MemRegion mr) {
+  guarantee(false, "grayRegionIfNecessary(): don't call this any more");
+
   // The objects on the region have already been marked "in bulk" by
   // the caller. We only need to decide whether to push the region on
   // the region stack or not.
@@ -1012,6 +1014,8 @@ void ConcurrentMark::grayRegionIfNecessary(MemRegion mr) {
 }
 
 void ConcurrentMark::markAndGrayObjectIfNecessary(oop p) {
+  guarantee(false, "markAndGrayObjectIfNecessary(): don't call this any more");
+
   // The object is not marked by the caller. We need to at least mark
   // it and maybe push in on the stack.
 
@@ -1048,7 +1052,7 @@ private:
   ConcurrentMarkThread* _cmt;
 
 public:
-  void work(int worker_i) {
+  void work(uint worker_id) {
     assert(Thread::current()->is_ConcurrentGC_thread(),
            "this should only be done by a conc GC thread");
     ResourceMark rm;
@@ -1057,8 +1061,8 @@ public:
 
     ConcurrentGCThread::stsJoin();
 
-    assert((size_t) worker_i < _cm->active_tasks(), "invariant");
-    CMTask* the_task = _cm->task(worker_i);
+    assert(worker_id < _cm->active_tasks(), "invariant");
+    CMTask* the_task = _cm->task(worker_id);
     the_task->record_start_time();
     if (!_cm->has_aborted()) {
       do {
@@ -1076,7 +1080,7 @@ public:
         double elapsed_time_sec = end_time_sec - start_time_sec;
         _cm->clear_has_overflown();
 
-        bool ret = _cm->do_yield_check(worker_i);
+        bool ret = _cm->do_yield_check(worker_id);
 
         jlong sleep_time_ms;
         if (!_cm->has_aborted() && the_task->has_aborted()) {
@@ -1105,7 +1109,7 @@ public:
     ConcurrentGCThread::stsLeave();
 
     double end_vtime = os::elapsedVTime();
-    _cm->update_accum_task_vtime(worker_i, end_vtime - start_vtime);
+    _cm->update_accum_task_vtime(worker_id, end_vtime - start_vtime);
   }
 
   CMConcurrentMarkingTask(ConcurrentMark* cm,
@@ -1117,9 +1121,9 @@ public:
 
 // Calculates the number of active workers for a concurrent
 // phase.
-size_t ConcurrentMark::calc_parallel_marking_threads() {
+uint ConcurrentMark::calc_parallel_marking_threads() {
   if (G1CollectedHeap::use_parallel_gc_threads()) {
-    size_t n_conc_workers = 0;
+    uint n_conc_workers = 0;
     if (!UseDynamicNumberOfGCThreads ||
         (!FLAG_IS_DEFAULT(ConcGCThreads) &&
          !ForceDynamicNumberOfGCThreads)) {
@@ -1159,7 +1163,7 @@ void ConcurrentMark::markFromRoots() {
   assert(parallel_marking_threads() <= max_parallel_marking_threads(),
     "Maximum number of marking threads exceeded");
 
-  size_t active_workers = MAX2((size_t) 1, parallel_marking_threads());
+  uint active_workers = MAX2(1U, parallel_marking_threads());
 
   // Parallel task terminator is set in "set_phase()"
   set_phase(active_workers, true /* concurrent */);
@@ -1229,7 +1233,6 @@ void ConcurrentMark::checkpointRootsFinal(bool clear_all_soft_refs) {
                                        true /* expected_active */);
 
     if (VerifyDuringGC) {
-
       HandleMark hm;  // handle scope
       gclog_or_tty->print(" VerifyDuringGC:(after)");
       Universe::heap()->prepare_for_verify();
@@ -1503,7 +1506,7 @@ class G1ParFinalCountTask: public AbstractGangTask {
 protected:
   G1CollectedHeap* _g1h;
   CMBitMap* _bm;
-  size_t _n_workers;
+  uint    _n_workers;
   size_t *_live_bytes;
   size_t *_used_bytes;
   BitMap* _region_bm;
@@ -1535,13 +1538,13 @@ public:
     FREE_C_HEAP_ARRAY(size_t, _used_bytes);
   }
 
-  void work(int i) {
+  void work(uint worker_id) {
     CalcLiveObjectsClosure calccl(true /*final*/,
                                   _bm, _g1h->concurrent_mark(),
                                   _region_bm, _card_bm);
     calccl.no_yield();
     if (G1CollectedHeap::use_parallel_gc_threads()) {
-      _g1h->heap_region_par_iterate_chunked(&calccl, i,
+      _g1h->heap_region_par_iterate_chunked(&calccl, worker_id,
                                             (int) _n_workers,
                                             HeapRegion::FinalCountClaimValue);
     } else {
@@ -1549,19 +1552,19 @@ public:
     }
     assert(calccl.complete(), "Shouldn't have yielded!");
 
-    assert((size_t) i < _n_workers, "invariant");
-    _live_bytes[i] = calccl.tot_live();
-    _used_bytes[i] = calccl.tot_used();
+    assert(worker_id < _n_workers, "invariant");
+    _live_bytes[worker_id] = calccl.tot_live();
+    _used_bytes[worker_id] = calccl.tot_used();
   }
   size_t live_bytes()  {
     size_t live_bytes = 0;
-    for (size_t i = 0; i < _n_workers; ++i)
+    for (uint i = 0; i < _n_workers; ++i)
       live_bytes += _live_bytes[i];
     return live_bytes;
   }
   size_t used_bytes()  {
     size_t used_bytes = 0;
-    for (size_t i = 0; i < _n_workers; ++i)
+    for (uint i = 0; i < _n_workers; ++i)
       used_bytes += _used_bytes[i];
     return used_bytes;
   }
@@ -1646,18 +1649,18 @@ public:
     AbstractGangTask("G1 note end"), _g1h(g1h),
     _max_live_bytes(0), _freed_bytes(0), _cleanup_list(cleanup_list) { }
 
-  void work(int i) {
+  void work(uint worker_id) {
     double start = os::elapsedTime();
     FreeRegionList local_cleanup_list("Local Cleanup List");
     OldRegionSet old_proxy_set("Local Cleanup Old Proxy Set");
     HumongousRegionSet humongous_proxy_set("Local Cleanup Humongous Proxy Set");
     HRRSCleanupTask hrrs_cleanup_task;
-    G1NoteEndOfConcMarkClosure g1_note_end(_g1h, i, &local_cleanup_list,
+    G1NoteEndOfConcMarkClosure g1_note_end(_g1h, worker_id, &local_cleanup_list,
                                            &old_proxy_set,
                                            &humongous_proxy_set,
                                            &hrrs_cleanup_task);
     if (G1CollectedHeap::use_parallel_gc_threads()) {
-      _g1h->heap_region_par_iterate_chunked(&g1_note_end, i,
+      _g1h->heap_region_par_iterate_chunked(&g1_note_end, worker_id,
                                             _g1h->workers()->active_workers(),
                                             HeapRegion::NoteEndClaimValue);
     } else {
@@ -1701,8 +1704,8 @@ public:
     double end = os::elapsedTime();
     if (G1PrintParCleanupStats) {
       gclog_or_tty->print("     Worker thread %d [%8.3f..%8.3f = %8.3f ms] "
-                          "claimed %d regions (tot = %8.3f ms, max = %8.3f ms).\n",
-                          i, start, end, (end-start)*1000.0,
+                          "claimed %u regions (tot = %8.3f ms, max = %8.3f ms).\n",
+                          worker_id, start, end, (end-start)*1000.0,
                           g1_note_end.regions_claimed(),
                           g1_note_end.claimed_region_time_sec()*1000.0,
                           g1_note_end.max_region_time_sec()*1000.0);
@@ -1724,9 +1727,9 @@ public:
     _region_bm(region_bm), _card_bm(card_bm)
   {}
 
-  void work(int i) {
+  void work(uint worker_id) {
     if (G1CollectedHeap::use_parallel_gc_threads()) {
-      _g1rs->scrub_par(_region_bm, _card_bm, i,
+      _g1rs->scrub_par(_region_bm, _card_bm, worker_id,
                        HeapRegion::ScrubRemSetClaimValue);
     } else {
       _g1rs->scrub(_region_bm, _card_bm);
@@ -1766,7 +1769,7 @@ void ConcurrentMark::cleanup() {
 
   HeapRegionRemSet::reset_for_cleanup_tasks();
 
-  size_t n_workers;
+  uint n_workers;
 
   // Do counting once more with the world stopped for good measure.
   G1ParFinalCountTask g1_par_count_task(g1h, nextMarkBitMap(),
@@ -1778,7 +1781,7 @@ void ConcurrentMark::cleanup() {
 
     g1h->set_par_threads();
     n_workers = g1h->n_par_threads();
-    assert(g1h->n_par_threads() == (int) n_workers,
+    assert(g1h->n_par_threads() == n_workers,
            "Should not have been reset");
     g1h->workers()->run_task(&g1_par_count_task);
     // Done with the parallel phase so reset to 0.
@@ -1884,10 +1887,6 @@ void ConcurrentMark::cleanup() {
   double end = os::elapsedTime();
   _cleanup_times.add((end - start) * 1000.0);
 
-  // G1CollectedHeap::heap()->print();
-  // gclog_or_tty->print_cr("HEAP GC TIME STAMP : %d",
-  // G1CollectedHeap::heap()->get_gc_time_stamp());
-
   if (PrintGC || PrintGCDetails) {
     g1h->print_size_transition(gclog_or_tty,
                                start_used_bytes,
@@ -2169,13 +2168,13 @@ public:
     AbstractGangTask("Process reference objects in parallel"),
     _proc_task(proc_task), _g1h(g1h), _cm(cm) { }
 
-  virtual void work(int i) {
-    CMTask* marking_task = _cm->task(i);
+  virtual void work(uint worker_id) {
+    CMTask* marking_task = _cm->task(worker_id);
     G1CMIsAliveClosure g1_is_alive(_g1h);
     G1CMParKeepAliveAndDrainClosure g1_par_keep_alive(_cm, marking_task);
     G1CMParDrainMarkingStackClosure g1_par_drain(_cm, marking_task);
 
-    _proc_task.work(i, g1_is_alive, g1_par_keep_alive, g1_par_drain);
+    _proc_task.work(worker_id, g1_is_alive, g1_par_keep_alive, g1_par_drain);
   }
 };
 
@@ -2201,8 +2200,8 @@ public:
     AbstractGangTask("Enqueue reference objects in parallel"),
     _enq_task(enq_task) { }
 
-  virtual void work(int i) {
-    _enq_task.work(i);
+  virtual void work(uint worker_id) {
+    _enq_task.work(worker_id);
   }
 };
 
@@ -2249,8 +2248,8 @@ void ConcurrentMark::weakRefsWork(bool clear_all_soft_refs) {
 
     // We use the work gang from the G1CollectedHeap and we utilize all
     // the worker threads.
-    int active_workers = g1h->workers() ? g1h->workers()->active_workers() : 1;
-    active_workers = MAX2(MIN2(active_workers, (int)_max_task_num), 1);
+    uint active_workers = g1h->workers() ? g1h->workers()->active_workers() : 1U;
+    active_workers = MAX2(MIN2(active_workers, _max_task_num), 1U);
 
     G1CMRefProcTaskExecutor par_task_executor(g1h, this,
                                               g1h->workers(), active_workers);
@@ -2314,11 +2313,11 @@ private:
   ConcurrentMark *_cm;
 
 public:
-  void work(int worker_i) {
+  void work(uint worker_id) {
     // Since all available tasks are actually started, we should
     // only proceed if we're supposed to be actived.
-    if ((size_t)worker_i < _cm->active_tasks()) {
-      CMTask* task = _cm->task(worker_i);
+    if (worker_id < _cm->active_tasks()) {
+      CMTask* task = _cm->task(worker_id);
       task->record_start_time();
       do {
         task->do_marking_step(1000000000.0 /* something very large */,
@@ -2347,10 +2346,10 @@ void ConcurrentMark::checkpointRootsFinalWork() {
   if (G1CollectedHeap::use_parallel_gc_threads()) {
     G1CollectedHeap::StrongRootsScope srs(g1h);
     // this is remark, so we'll use up all active threads
-    int active_workers = g1h->workers()->active_workers();
+    uint active_workers = g1h->workers()->active_workers();
     if (active_workers == 0) {
       assert(active_workers > 0, "Should have been set earlier");
-      active_workers = ParallelGCThreads;
+      active_workers = (uint) ParallelGCThreads;
       g1h->workers()->set_active_workers(active_workers);
     }
     set_phase(active_workers, false /* concurrent */);
@@ -2366,7 +2365,7 @@ void ConcurrentMark::checkpointRootsFinalWork() {
   } else {
     G1CollectedHeap::StrongRootsScope srs(g1h);
     // this is remark, so we'll use up all available threads
-    int active_workers = 1;
+    uint active_workers = 1;
     set_phase(active_workers, false /* concurrent */);
 
     CMRemarkTask remarkTask(this, active_workers);
@@ -2674,6 +2673,8 @@ void ConcurrentMark::deal_with_reference(oop obj) {
 }
 
 void ConcurrentMark::drainAllSATBBuffers() {
+  guarantee(false, "drainAllSATBBuffers(): don't call this any more");
+
   CMGlobalObjectClosure oc(this);
   SATBMarkQueueSet& satb_mq_set = JavaThread::satb_mark_queue_set();
   satb_mq_set.set_closure(&oc);
@@ -2692,12 +2693,6 @@ void ConcurrentMark::drainAllSATBBuffers() {
   assert(satb_mq_set.completed_buffers_num() == 0, "invariant");
 }
 
-void ConcurrentMark::markPrev(oop p) {
-  // Note we are overriding the read-only view of the prev map here, via
-  // the cast.
-  ((CMBitMap*)_prevMarkBitMap)->mark((HeapWord*)p);
-}
-
 void ConcurrentMark::clear(oop p) {
   assert(p != NULL && p->is_oop(), "expected an oop");
   HeapWord* addr = (HeapWord*)p;
@@ -2707,13 +2702,21 @@ void ConcurrentMark::clear(oop p) {
   _nextMarkBitMap->clear(addr);
 }
 
-void ConcurrentMark::clearRangeBothMaps(MemRegion mr) {
+void ConcurrentMark::clearRangePrevBitmap(MemRegion mr) {
   // Note we are overriding the read-only view of the prev map here, via
   // the cast.
   ((CMBitMap*)_prevMarkBitMap)->clearRange(mr);
+}
+
+void ConcurrentMark::clearRangeNextBitmap(MemRegion mr) {
   _nextMarkBitMap->clearRange(mr);
 }
 
+void ConcurrentMark::clearRangeBothBitmaps(MemRegion mr) {
+  clearRangePrevBitmap(mr);
+  clearRangeNextBitmap(mr);
+}
+
 HeapRegion*
 ConcurrentMark::claim_region(int task_num) {
   // "checkpoint" the finger
@@ -2808,6 +2811,9 @@ ConcurrentMark::claim_region(int task_num) {
 }
 
 bool ConcurrentMark::invalidate_aborted_regions_in_cset() {
+  guarantee(false, "invalidate_aborted_regions_in_cset(): "
+                   "don't call this any more");
+
   bool result = false;
   for (int i = 0; i < (int)_max_task_num; ++i) {
     CMTask* the_task = _tasks[i];
@@ -2859,25 +2865,136 @@ void ConcurrentMark::oops_do(OopClosure* cl) {
     // ...then over the contents of the all the task queues.
     queue->oops_do(cl);
   }
-
-  // Invalidate any entries, that are in the region stack, that
-  // point into the collection set
-  if (_regionStack.invalidate_entries_into_cset()) {
-    // otherwise, any gray objects copied during the evacuation pause
-    // might not be visited.
-    assert(_should_gray_objects, "invariant");
-  }
-
-  // Invalidate any aborted regions, recorded in the individual CM
-  // tasks, that point into the collection set.
-  if (invalidate_aborted_regions_in_cset()) {
-    // otherwise, any gray objects copied during the evacuation pause
-    // might not be visited.
-    assert(_should_gray_objects, "invariant");
-  }
-
 }
 
+#ifndef PRODUCT
+enum VerifyNoCSetOopsPhase {
+  VerifyNoCSetOopsStack,
+  VerifyNoCSetOopsQueues,
+  VerifyNoCSetOopsSATBCompleted,
+  VerifyNoCSetOopsSATBThread
+};
+
+class VerifyNoCSetOopsClosure : public OopClosure, public ObjectClosure  {
+private:
+  G1CollectedHeap* _g1h;
+  VerifyNoCSetOopsPhase _phase;
+  int _info;
+
+  const char* phase_str() {
+    switch (_phase) {
+    case VerifyNoCSetOopsStack:         return "Stack";
+    case VerifyNoCSetOopsQueues:        return "Queue";
+    case VerifyNoCSetOopsSATBCompleted: return "Completed SATB Buffers";
+    case VerifyNoCSetOopsSATBThread:    return "Thread SATB Buffers";
+    default:                            ShouldNotReachHere();
+    }
+    return NULL;
+  }
+
+  void do_object_work(oop obj) {
+    guarantee(!_g1h->obj_in_cs(obj),
+              err_msg("obj: "PTR_FORMAT" in CSet, phase: %s, info: %d",
+                      (void*) obj, phase_str(), _info));
+  }
+
+public:
+  VerifyNoCSetOopsClosure() : _g1h(G1CollectedHeap::heap()) { }
+
+  void set_phase(VerifyNoCSetOopsPhase phase, int info = -1) {
+    _phase = phase;
+    _info = info;
+  }
+
+  virtual void do_oop(oop* p) {
+    oop obj = oopDesc::load_decode_heap_oop(p);
+    do_object_work(obj);
+  }
+
+  virtual void do_oop(narrowOop* p) {
+    // We should not come across narrow oops while scanning marking
+    // stacks and SATB buffers.
+    ShouldNotReachHere();
+  }
+
+  virtual void do_object(oop obj) {
+    do_object_work(obj);
+  }
+};
+
+void ConcurrentMark::verify_no_cset_oops(bool verify_stacks,
+                                         bool verify_enqueued_buffers,
+                                         bool verify_thread_buffers,
+                                         bool verify_fingers) {
+  assert(SafepointSynchronize::is_at_safepoint(), "should be at a safepoint");
+  if (!G1CollectedHeap::heap()->mark_in_progress()) {
+    return;
+  }
+
+  VerifyNoCSetOopsClosure cl;
+
+  if (verify_stacks) {
+    // Verify entries on the global mark stack
+    cl.set_phase(VerifyNoCSetOopsStack);
+    _markStack.oops_do(&cl);
+
+    // Verify entries on the task queues
+    for (int i = 0; i < (int) _max_task_num; i += 1) {
+      cl.set_phase(VerifyNoCSetOopsQueues, i);
+      OopTaskQueue* queue = _task_queues->queue(i);
+      queue->oops_do(&cl);
+    }
+  }
+
+  SATBMarkQueueSet& satb_qs = JavaThread::satb_mark_queue_set();
+
+  // Verify entries on the enqueued SATB buffers
+  if (verify_enqueued_buffers) {
+    cl.set_phase(VerifyNoCSetOopsSATBCompleted);
+    satb_qs.iterate_completed_buffers_read_only(&cl);
+  }
+
+  // Verify entries on the per-thread SATB buffers
+  if (verify_thread_buffers) {
+    cl.set_phase(VerifyNoCSetOopsSATBThread);
+    satb_qs.iterate_thread_buffers_read_only(&cl);
+  }
+
+  if (verify_fingers) {
+    // Verify the global finger
+    HeapWord* global_finger = finger();
+    if (global_finger != NULL && global_finger < _heap_end) {
+      // The global finger always points to a heap region boundary. We
+      // use heap_region_containing_raw() to get the containing region
+      // given that the global finger could be pointing to a free region
+      // which subsequently becomes continues humongous. If that
+      // happens, heap_region_containing() will return the bottom of the
+      // corresponding starts humongous region and the check below will
+      // not hold any more.
+      HeapRegion* global_hr = _g1h->heap_region_containing_raw(global_finger);
+      guarantee(global_finger == global_hr->bottom(),
+                err_msg("global finger: "PTR_FORMAT" region: "HR_FORMAT,
+                        global_finger, HR_FORMAT_PARAMS(global_hr)));
+    }
+
+    // Verify the task fingers
+    assert(parallel_marking_threads() <= _max_task_num, "sanity");
+    for (int i = 0; i < (int) parallel_marking_threads(); i += 1) {
+      CMTask* task = _tasks[i];
+      HeapWord* task_finger = task->finger();
+      if (task_finger != NULL && task_finger < _heap_end) {
+        // See above note on the global finger verification.
+        HeapRegion* task_hr = _g1h->heap_region_containing_raw(task_finger);
+        guarantee(task_finger == task_hr->bottom() ||
+                  !task_hr->in_collection_set(),
+                  err_msg("task finger: "PTR_FORMAT" region: "HR_FORMAT,
+                          task_finger, HR_FORMAT_PARAMS(task_hr)));
+      }
+    }
+  }
+}
+#endif // PRODUCT
+
 void ConcurrentMark::clear_marking_state(bool clear_overflow) {
   _markStack.setEmpty();
   _markStack.clear_overflow();
@@ -2921,7 +3038,7 @@ class CSetMarkOopClosure: public OopClosure {
   int              _ms_size;
   int              _ms_ind;
   int              _array_increment;
-  int              _worker_i;
+  uint             _worker_id;
 
   bool push(oop obj, int arr_ind = 0) {
     if (_ms_ind == _ms_size) {
@@ -2971,7 +3088,7 @@ class CSetMarkOopClosure: public OopClosure {
   }
 
 public:
-  CSetMarkOopClosure(ConcurrentMark* cm, int ms_size, int worker_i) :
+  CSetMarkOopClosure(ConcurrentMark* cm, int ms_size, uint worker_id) :
     _g1h(G1CollectedHeap::heap()),
     _cm(cm),
     _bm(cm->nextMarkBitMap()),
@@ -2979,7 +3096,7 @@ public:
     _ms(NEW_C_HEAP_ARRAY(oop, ms_size)),
     _array_ind_stack(NEW_C_HEAP_ARRAY(jint, ms_size)),
     _array_increment(MAX2(ms_size/8, 16)),
-    _worker_i(worker_i) { }
+    _worker_id(worker_id) { }
 
   ~CSetMarkOopClosure() {
     FREE_C_HEAP_ARRAY(oop, _ms);
@@ -3024,14 +3141,14 @@ class CSetMarkBitMapClosure: public BitMapClosure {
   CMBitMap*          _bitMap;
   ConcurrentMark*    _cm;
   CSetMarkOopClosure _oop_cl;
-  int                _worker_i;
+  uint               _worker_id;
 
 public:
-  CSetMarkBitMapClosure(ConcurrentMark* cm, int ms_size, int worker_i) :
+  CSetMarkBitMapClosure(ConcurrentMark* cm, int ms_size, int worker_id) :
     _g1h(G1CollectedHeap::heap()),
     _bitMap(cm->nextMarkBitMap()),
-    _oop_cl(cm, ms_size, worker_i),
-    _worker_i(worker_i) { }
+    _oop_cl(cm, ms_size, worker_id),
+    _worker_id(worker_id) { }
 
   bool do_bit(size_t offset) {
     // convert offset into a HeapWord*
@@ -3056,17 +3173,17 @@ public:
 class CompleteMarkingInCSetHRClosure: public HeapRegionClosure {
   CMBitMap*             _bm;
   CSetMarkBitMapClosure _bit_cl;
-  int                   _worker_i;
+  uint                  _worker_id;
 
   enum SomePrivateConstants {
     MSSize = 1000
   };
 
 public:
-  CompleteMarkingInCSetHRClosure(ConcurrentMark* cm, int worker_i) :
+  CompleteMarkingInCSetHRClosure(ConcurrentMark* cm, int worker_id) :
     _bm(cm->nextMarkBitMap()),
-    _bit_cl(cm, MSSize, worker_i),
-    _worker_i(worker_i) { }
+    _bit_cl(cm, MSSize, worker_id),
+    _worker_id(worker_id) { }
 
   bool doHeapRegion(HeapRegion* hr) {
     if (hr->claimHeapRegion(HeapRegion::CompleteMarkCSetClaimValue)) {
@@ -3085,19 +3202,6 @@ public:
   }
 };
 
-class SetClaimValuesInCSetHRClosure: public HeapRegionClosure {
-  jint _claim_value;
-
-public:
-  SetClaimValuesInCSetHRClosure(jint claim_value) :
-    _claim_value(claim_value) { }
-
-  bool doHeapRegion(HeapRegion* hr) {
-    hr->set_claim_value(_claim_value);
-    return false;
-  }
-};
-
 class G1ParCompleteMarkInCSetTask: public AbstractGangTask {
 protected:
   G1CollectedHeap* _g1h;
@@ -3109,14 +3213,17 @@ public:
     AbstractGangTask("Complete Mark in CSet"),
     _g1h(g1h), _cm(cm) { }
 
-  void work(int worker_i) {
-    CompleteMarkingInCSetHRClosure cmplt(_cm, worker_i);
-    HeapRegion* hr = _g1h->start_cset_region_for_worker(worker_i);
+  void work(uint worker_id) {
+    CompleteMarkingInCSetHRClosure cmplt(_cm, worker_id);
+    HeapRegion* hr = _g1h->start_cset_region_for_worker(worker_id);
     _g1h->collection_set_iterate_from(hr, &cmplt);
   }
 };
 
 void ConcurrentMark::complete_marking_in_collection_set() {
+  guarantee(false, "complete_marking_in_collection_set(): "
+                   "don't call this any more");
+
   G1CollectedHeap* g1h =  G1CollectedHeap::heap();
 
   if (!g1h->mark_in_progress()) {
@@ -3140,9 +3247,8 @@ void ConcurrentMark::complete_marking_in_collection_set() {
 
   assert(g1h->check_cset_heap_region_claim_values(HeapRegion::CompleteMarkCSetClaimValue), "sanity");
 
-  // Now reset the claim values in the regions in the collection set.
-  SetClaimValuesInCSetHRClosure set_cv_cl(HeapRegion::InitialClaimValue);
-  g1h->collection_set_iterate(&set_cv_cl);
+  // Reset the claim values in the regions in the collection set.
+  g1h->reset_cset_heap_region_claim_values();
 
   assert(g1h->check_cset_heap_region_claim_values(HeapRegion::InitialClaimValue), "sanity");
 
@@ -3165,6 +3271,8 @@ void ConcurrentMark::complete_marking_in_collection_set() {
 // newCSet().
 
 void ConcurrentMark::newCSet() {
+  guarantee(false, "newCSet(): don't call this any more");
+
   if (!concurrent_marking_in_progress()) {
     // nothing to do if marking is not in progress
     return;
@@ -3203,6 +3311,8 @@ void ConcurrentMark::newCSet() {
 }
 
 void ConcurrentMark::registerCSetRegion(HeapRegion* hr) {
+  guarantee(false, "registerCSetRegion(): don't call this any more");
+
   if (!concurrent_marking_in_progress()) return;
 
   HeapWord* region_end = hr->end();
@@ -3214,6 +3324,9 @@ void ConcurrentMark::registerCSetRegion(HeapRegion* hr) {
 // Resets the region fields of active CMTasks whose values point
 // into the collection set.
 void ConcurrentMark::reset_active_task_region_fields_in_cset() {
+  guarantee(false, "reset_active_task_region_fields_in_cset(): "
+                   "don't call this any more");
+
   assert(SafepointSynchronize::is_at_safepoint(), "should be in STW");
   assert(parallel_marking_threads() <= _max_task_num, "sanity");
 
@@ -3307,13 +3420,13 @@ void ConcurrentMark::print_worker_threads_on(outputStream* st) const {
 // the CMS bit map. Called at the first checkpoint.
 
 // We take a break if someone is trying to stop the world.
-bool ConcurrentMark::do_yield_check(int worker_i) {
+bool ConcurrentMark::do_yield_check(uint worker_id) {
   if (should_yield()) {
-    if (worker_i == 0) {
+    if (worker_id == 0) {
       _g1h->g1_policy()->record_concurrent_pause();
     }
     cmThread()->yield();
-    if (worker_i == 0) {
+    if (worker_id == 0) {
       _g1h->g1_policy()->record_concurrent_pause_end();
     }
     return true;
@@ -3924,6 +4037,10 @@ void CMTask::drain_satb_buffers() {
 }
 
 void CMTask::drain_region_stack(BitMapClosure* bc) {
+  assert(_cm->region_stack_empty(), "region stack should be empty");
+  assert(_aborted_region.is_empty(), "aborted region should be empty");
+  return;
+
   if (has_aborted()) return;
 
   assert(_region_finger == NULL,
diff --git a/hotspot/src/share/vm/gc_implementation/g1/concurrentMark.hpp b/hotspot/src/share/vm/gc_implementation/g1/concurrentMark.hpp
index 6383227d90e..1a407848499 100644
--- a/hotspot/src/share/vm/gc_implementation/g1/concurrentMark.hpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/concurrentMark.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -166,10 +166,10 @@ class CMBitMap : public CMBitMapRO {
 // Ideally this should be GrowableArray<> just like MSC's marking stack(s).
 class CMMarkStack VALUE_OBJ_CLASS_SPEC {
   ConcurrentMark* _cm;
-  oop*   _base;      // bottom of stack
-  jint   _index;     // one more than last occupied index
-  jint   _capacity;  // max #elements
-  jint   _oops_do_bound;  // Number of elements to include in next iteration.
+  oop*   _base;        // bottom of stack
+  jint   _index;       // one more than last occupied index
+  jint   _capacity;    // max #elements
+  jint   _saved_index; // value of _index saved at start of GC
   NOT_PRODUCT(jint _max_depth;)  // max depth plumbed during run
 
   bool   _overflow;
@@ -247,16 +247,12 @@ class CMMarkStack VALUE_OBJ_CLASS_SPEC {
 
   void setEmpty()   { _index = 0; clear_overflow(); }
 
-  // Record the current size; a subsequent "oops_do" will iterate only over
-  // indices valid at the time of this call.
-  void set_oops_do_bound(jint bound = -1) {
-    if (bound == -1) {
-      _oops_do_bound = _index;
-    } else {
-      _oops_do_bound = bound;
-    }
-  }
-  jint oops_do_bound() { return _oops_do_bound; }
+  // Record the current index.
+  void note_start_of_gc();
+
+  // Make sure that we have not added any entries to the stack during GC.
+  void note_end_of_gc();
+
   // iterate over the oops in the mark stack, up to the bound recorded via
   // the call above.
   void oops_do(OopClosure* f);
@@ -374,9 +370,9 @@ class ConcurrentMark: public CHeapObj {
 protected:
   ConcurrentMarkThread* _cmThread;   // the thread doing the work
   G1CollectedHeap*      _g1h;        // the heap.
-  size_t                _parallel_marking_threads; // the number of marking
+  uint                  _parallel_marking_threads; // the number of marking
                                                    // threads we're use
-  size_t                _max_parallel_marking_threads; // max number of marking
+  uint                  _max_parallel_marking_threads; // max number of marking
                                                    // threads we'll ever use
   double                _sleep_factor; // how much we have to sleep, with
                                        // respect to the work we just did, to
@@ -412,8 +408,8 @@ protected:
                                     // last claimed region
 
   // marking tasks
-  size_t                  _max_task_num; // maximum task number
-  size_t                  _active_tasks; // task num currently active
+  uint                    _max_task_num; // maximum task number
+  uint                    _active_tasks; // task num currently active
   CMTask**                _tasks;        // task queue array (max_task_num len)
   CMTaskQueueSet*         _task_queues;  // task queue set
   ParallelTaskTerminator  _terminator;   // for termination
@@ -492,7 +488,7 @@ protected:
 
   // It should be called to indicate which phase we're in (concurrent
   // mark or remark) and how many threads are currently active.
-  void set_phase(size_t active_tasks, bool concurrent);
+  void set_phase(uint active_tasks, bool concurrent);
   // We do this after we're done with marking so that the marking data
   // structures are initialised to a sensible and predictable state.
   void set_non_marking_state();
@@ -505,8 +501,8 @@ protected:
   }
 
   // accessor methods
-  size_t parallel_marking_threads() { return _parallel_marking_threads; }
-  size_t max_parallel_marking_threads() { return _max_parallel_marking_threads;}
+  uint parallel_marking_threads() { return _parallel_marking_threads; }
+  uint max_parallel_marking_threads() { return _max_parallel_marking_threads;}
   double sleep_factor()             { return _sleep_factor; }
   double marking_task_overhead()    { return _marking_task_overhead;}
   double cleanup_sleep_factor()     { return _cleanup_sleep_factor; }
@@ -514,7 +510,7 @@ protected:
 
   HeapWord*               finger()        { return _finger;   }
   bool                    concurrent()    { return _concurrent; }
-  size_t                  active_tasks()  { return _active_tasks; }
+  uint                    active_tasks()  { return _active_tasks; }
   ParallelTaskTerminator* terminator()    { return &_terminator; }
 
   // It claims the next available region to be scanned by a marking
@@ -715,19 +711,18 @@ public:
   // Returns the number of GC threads to be used in a concurrent
   // phase based on the number of GC threads being used in a STW
   // phase.
-  size_t scale_parallel_threads(size_t n_par_threads);
+  uint scale_parallel_threads(uint n_par_threads);
 
   // Calculates the number of GC threads to be used in a concurrent phase.
-  size_t calc_parallel_marking_threads();
+  uint calc_parallel_marking_threads();
 
   // The following three are interaction between CM and
   // G1CollectedHeap
 
   // This notifies CM that a root during initial-mark needs to be
-  // grayed and it's MT-safe. Currently, we just mark it. But, in the
-  // future, we can experiment with pushing it on the stack and we can
-  // do this without changing G1CollectedHeap.
-  void grayRoot(oop p);
+  // grayed. It is MT-safe.
+  inline void grayRoot(oop obj, size_t word_size);
+
   // It's used during evacuation pauses to gray a region, if
   // necessary, and it's MT-safe. It assumes that the caller has
   // marked any objects on that region. If _should_gray_objects is
@@ -735,6 +730,7 @@ public:
   // pushed on the region stack, if it is located below the global
   // finger, otherwise we do nothing.
   void grayRegionIfNecessary(MemRegion mr);
+
   // It's used during evacuation pauses to mark and, if necessary,
   // gray a single object and it's MT-safe. It assumes the caller did
   // not mark the object. If _should_gray_objects is true and we're
@@ -791,24 +787,40 @@ public:
 
   // Mark in the previous bitmap.  NB: this is usually read-only, so use
   // this carefully!
-  void markPrev(oop p);
+  inline void markPrev(oop p);
+  inline void markNext(oop p);
   void clear(oop p);
-  // Clears marks for all objects in the given range, for both prev and
-  // next bitmaps.  NB: the previous bitmap is usually read-only, so use
-  // this carefully!
-  void clearRangeBothMaps(MemRegion mr);
+  // Clears marks for all objects in the given range, for the prev,
+  // next, or both bitmaps.  NB: the previous bitmap is usually
+  // read-only, so use this carefully!
+  void clearRangePrevBitmap(MemRegion mr);
+  void clearRangeNextBitmap(MemRegion mr);
+  void clearRangeBothBitmaps(MemRegion mr);
 
-  // Record the current top of the mark and region stacks; a
-  // subsequent oops_do() on the mark stack and
-  // invalidate_entries_into_cset() on the region stack will iterate
-  // only over indices valid at the time of this call.
-  void set_oops_do_bound() {
-    _markStack.set_oops_do_bound();
-    _regionStack.set_oops_do_bound();
+  // Notify data structures that a GC has started.
+  void note_start_of_gc() {
+    _markStack.note_start_of_gc();
   }
+
+  // Notify data structures that a GC is finished.
+  void note_end_of_gc() {
+    _markStack.note_end_of_gc();
+  }
+
   // Iterate over the oops in the mark stack and all local queues. It
   // also calls invalidate_entries_into_cset() on the region stack.
   void oops_do(OopClosure* f);
+
+  // Verify that there are no CSet oops on the stacks (taskqueues /
+  // global mark stack), enqueued SATB buffers, per-thread SATB
+  // buffers, and fingers (global / per-task). The boolean parameters
+  // decide which of the above data structures to verify. If marking
+  // is not in progress, it's a no-op.
+  void verify_no_cset_oops(bool verify_stacks,
+                           bool verify_enqueued_buffers,
+                           bool verify_thread_buffers,
+                           bool verify_fingers) PRODUCT_RETURN;
+
   // It is called at the end of an evacuation pause during marking so
   // that CM is notified of where the new end of the heap is. It
   // doesn't do anything if concurrent_marking_in_progress() is false,
@@ -873,7 +885,7 @@ public:
     return _prevMarkBitMap->isMarked(addr);
   }
 
-  inline bool do_yield_check(int worker_i = 0);
+  inline bool do_yield_check(uint worker_i = 0);
   inline bool should_yield();
 
   // Called to abort the marking cycle after a Full GC takes palce.
@@ -1166,6 +1178,7 @@ public:
   // It keeps picking SATB buffers and processing them until no SATB
   // buffers are available.
   void drain_satb_buffers();
+
   // It keeps popping regions from the region stack and processing
   // them until the region stack is empty.
   void drain_region_stack(BitMapClosure* closure);
diff --git a/hotspot/src/share/vm/gc_implementation/g1/concurrentMark.inline.hpp b/hotspot/src/share/vm/gc_implementation/g1/concurrentMark.inline.hpp
index fdba22a0ebc..d72db9ea78f 100644
--- a/hotspot/src/share/vm/gc_implementation/g1/concurrentMark.inline.hpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/concurrentMark.inline.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -153,4 +153,46 @@ inline void CMTask::deal_with_reference(oop obj) {
   }
 }
 
+inline void ConcurrentMark::markPrev(oop p) {
+  assert(!_prevMarkBitMap->isMarked((HeapWord*) p), "sanity");
+  // Note we are overriding the read-only view of the prev map here, via
+  // the cast.
+  ((CMBitMap*)_prevMarkBitMap)->mark((HeapWord*) p);
+}
+
+inline void ConcurrentMark::markNext(oop p) {
+  assert(!_nextMarkBitMap->isMarked((HeapWord*) p), "sanity");
+  _nextMarkBitMap->mark((HeapWord*) p);
+}
+
+inline void ConcurrentMark::grayRoot(oop obj, size_t word_size) {
+  HeapWord* addr = (HeapWord*) obj;
+
+  // Currently we don't do anything with word_size but we will use it
+  // in the very near future in the liveness calculation piggy-backing
+  // changes.
+
+#ifdef ASSERT
+  HeapRegion* hr = _g1h->heap_region_containing(addr);
+  assert(hr != NULL, "sanity");
+  assert(!hr->is_survivor(), "should not allocate survivors during IM");
+  assert(addr < hr->next_top_at_mark_start(),
+         err_msg("addr: "PTR_FORMAT" hr: "HR_FORMAT" NTAMS: "PTR_FORMAT,
+                 addr, HR_FORMAT_PARAMS(hr), hr->next_top_at_mark_start()));
+  // We cannot assert that word_size == obj->size() given that obj
+  // might not be in a consistent state (another thread might be in
+  // the process of copying it). So the best thing we can do is to
+  // assert that word_size is under an upper bound which is its
+  // containing region's capacity.
+  assert(word_size * HeapWordSize <= hr->capacity(),
+         err_msg("size: "SIZE_FORMAT" capacity: "SIZE_FORMAT" "HR_FORMAT,
+                 word_size * HeapWordSize, hr->capacity(),
+                 HR_FORMAT_PARAMS(hr)));
+#endif // ASSERT
+
+  if (!_nextMarkBitMap->isMarked(addr)) {
+    _nextMarkBitMap->parMark(addr);
+  }
+}
+
 #endif // SHARE_VM_GC_IMPLEMENTATION_G1_CONCURRENTMARK_INLINE_HPP
diff --git a/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp b/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp
index 03280b448f9..f1645f32b0b 100644
--- a/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -32,9 +32,11 @@
 #include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
 #include "gc_implementation/g1/g1CollectorPolicy.hpp"
 #include "gc_implementation/g1/g1ErgoVerbose.hpp"
+#include "gc_implementation/g1/g1EvacFailure.hpp"
 #include "gc_implementation/g1/g1MarkSweep.hpp"
 #include "gc_implementation/g1/g1OopClosures.inline.hpp"
 #include "gc_implementation/g1/g1RemSet.inline.hpp"
+#include "gc_implementation/g1/heapRegion.inline.hpp"
 #include "gc_implementation/g1/heapRegionRemSet.hpp"
 #include "gc_implementation/g1/heapRegionSeq.inline.hpp"
 #include "gc_implementation/g1/vm_operations_g1.hpp"
@@ -591,17 +593,29 @@ HeapRegion* G1CollectedHeap::new_region(size_t word_size, bool do_expand) {
     }
     res = new_region_try_secondary_free_list();
   }
-  if (res == NULL && do_expand) {
+  if (res == NULL && do_expand && _expand_heap_after_alloc_failure) {
+    // Currently, only attempts to allocate GC alloc regions set
+    // do_expand to true. So, we should only reach here during a
+    // safepoint. If this assumption changes we might have to
+    // reconsider the use of _expand_heap_after_alloc_failure.
+    assert(SafepointSynchronize::is_at_safepoint(), "invariant");
+
     ergo_verbose1(ErgoHeapSizing,
                   "attempt heap expansion",
                   ergo_format_reason("region allocation request failed")
                   ergo_format_byte("allocation request"),
                   word_size * HeapWordSize);
     if (expand(word_size * HeapWordSize)) {
-      // Even though the heap was expanded, it might not have reached
-      // the desired size. So, we cannot assume that the allocation
-      // will succeed.
+      // Given that expand() succeeded in expanding the heap, and we
+      // always expand the heap by an amount aligned to the heap
+      // region size, the free list should in theory not be empty. So
+      // it would probably be OK to use remove_head(). But the extra
+      // check for NULL is unlikely to be a performance issue here (we
+      // just expanded the heap!) so let's just be conservative and
+      // use remove_head_or_null().
       res = _free_list.remove_head_or_null();
+    } else {
+      _expand_heap_after_alloc_failure = false;
     }
   }
   return res;
@@ -1165,9 +1179,9 @@ public:
       _g1(g1)
   { }
 
-  void work(int i) {
-    RebuildRSOutOfRegionClosure rebuild_rs(_g1, i);
-    _g1->heap_region_par_iterate_chunked(&rebuild_rs, i,
+  void work(uint worker_id) {
+    RebuildRSOutOfRegionClosure rebuild_rs(_g1, worker_id);
+    _g1->heap_region_par_iterate_chunked(&rebuild_rs, worker_id,
                                           _g1->workers()->active_workers(),
                                          HeapRegion::RebuildRSClaimValue);
   }
@@ -1374,7 +1388,7 @@ bool G1CollectedHeap::do_collection(bool explicit_gc,
 
     // Rebuild remembered sets of all regions.
     if (G1CollectedHeap::use_parallel_gc_threads()) {
-      int n_workers =
+      uint n_workers =
         AdaptiveSizePolicy::calc_active_workers(workers()->total_workers(),
                                        workers()->active_workers(),
                                        Threads::number_of_non_daemon_threads());
@@ -1838,6 +1852,7 @@ G1CollectedHeap::G1CollectedHeap(G1CollectorPolicy* policy_) :
   _young_list(new YoungList(this)),
   _gc_time_stamp(0),
   _retained_old_gc_alloc_region(NULL),
+  _expand_heap_after_alloc_failure(true),
   _surviving_young_words(NULL),
   _full_collections_completed(0),
   _in_cset_fast_test(NULL),
@@ -2519,11 +2534,11 @@ void G1CollectedHeap::heap_region_iterate_from(HeapRegion* r,
 
 void
 G1CollectedHeap::heap_region_par_iterate_chunked(HeapRegionClosure* cl,
-                                                 int worker,
-                                                 int no_of_par_workers,
+                                                 uint worker,
+                                                 uint no_of_par_workers,
                                                  jint claim_value) {
   const size_t regions = n_regions();
-  const size_t max_workers = (G1CollectedHeap::use_parallel_gc_threads() ?
+  const uint max_workers = (G1CollectedHeap::use_parallel_gc_threads() ?
                              no_of_par_workers :
                              1);
   assert(UseDynamicNumberOfGCThreads ||
@@ -2605,12 +2620,16 @@ public:
   }
 };
 
-void
-G1CollectedHeap::reset_heap_region_claim_values() {
+void G1CollectedHeap::reset_heap_region_claim_values() {
   ResetClaimValuesClosure blk;
   heap_region_iterate(&blk);
 }
 
+void G1CollectedHeap::reset_cset_heap_region_claim_values() {
+  ResetClaimValuesClosure blk;
+  collection_set_iterate(&blk);
+}
+
 #ifdef ASSERT
 // This checks whether all regions in the heap have the correct claim
 // value. I also piggy-backed on this a check to ensure that the
@@ -2739,7 +2758,7 @@ HeapRegion* G1CollectedHeap::start_cset_region_for_worker(int worker_i) {
   result = g1_policy()->collection_set();
   if (G1CollectedHeap::use_parallel_gc_threads()) {
     size_t cs_size = g1_policy()->cset_region_length();
-    int active_workers = workers()->active_workers();
+    uint active_workers = workers()->active_workers();
     assert(UseDynamicNumberOfGCThreads ||
              active_workers == workers()->total_workers(),
              "Unless dynamic should use total workers");
@@ -3000,14 +3019,20 @@ public:
       } else {
         VerifyObjsInRegionClosure not_dead_yet_cl(r, _vo);
         r->object_iterate(&not_dead_yet_cl);
-        if (r->max_live_bytes() < not_dead_yet_cl.live_bytes()) {
-          gclog_or_tty->print_cr("["PTR_FORMAT","PTR_FORMAT"] "
-                                 "max_live_bytes "SIZE_FORMAT" "
-                                 "< calculated "SIZE_FORMAT,
-                                 r->bottom(), r->end(),
-                                 r->max_live_bytes(),
+        if (_vo != VerifyOption_G1UseNextMarking) {
+          if (r->max_live_bytes() < not_dead_yet_cl.live_bytes()) {
+            gclog_or_tty->print_cr("["PTR_FORMAT","PTR_FORMAT"] "
+                                   "max_live_bytes "SIZE_FORMAT" "
+                                   "< calculated "SIZE_FORMAT,
+                                   r->bottom(), r->end(),
+                                   r->max_live_bytes(),
                                  not_dead_yet_cl.live_bytes());
-          _failures = true;
+            _failures = true;
+          }
+        } else {
+          // When vo == UseNextMarking we cannot currently do a sanity
+          // check on the live bytes as the calculation has not been
+          // finalized yet.
         }
       }
     }
@@ -3075,10 +3100,10 @@ public:
     return _failures;
   }
 
-  void work(int worker_i) {
+  void work(uint worker_id) {
     HandleMark hm;
     VerifyRegionClosure blk(_allow_dirty, true, _vo);
-    _g1h->heap_region_par_iterate_chunked(&blk, worker_i,
+    _g1h->heap_region_par_iterate_chunked(&blk, worker_id,
                                           _g1h->workers()->active_workers(),
                                           HeapRegion::ParVerifyClaimValue);
     if (blk.failures()) {
@@ -3641,25 +3666,6 @@ G1CollectedHeap::do_collection_pause_at_safepoint(double target_pause_time_ms) {
         }
         perm_gen()->save_marks();
 
-        // We must do this before any possible evacuation that should propagate
-        // marks.
-        if (mark_in_progress()) {
-          double start_time_sec = os::elapsedTime();
-
-          _cm->drainAllSATBBuffers();
-          double finish_mark_ms = (os::elapsedTime() - start_time_sec) * 1000.0;
-          g1_policy()->record_satb_drain_time(finish_mark_ms);
-        }
-        // Record the number of elements currently on the mark stack, so we
-        // only iterate over these.  (Since evacuation may add to the mark
-        // stack, doing more exposes race conditions.)  If no mark is in
-        // progress, this will be zero.
-        _cm->set_oops_do_bound();
-
-        if (mark_in_progress()) {
-          concurrent_mark()->newCSet();
-        }
-
 #if YOUNG_LIST_VERBOSE
         gclog_or_tty->print_cr("\nBefore choosing collection set.\nYoung_list:");
         _young_list->print();
@@ -3668,6 +3674,16 @@ G1CollectedHeap::do_collection_pause_at_safepoint(double target_pause_time_ms) {
 
         g1_policy()->choose_collection_set(target_pause_time_ms);
 
+        _cm->note_start_of_gc();
+        // We should not verify the per-thread SATB buffers given that
+        // we have not filtered them yet (we'll do so during the
+        // GC). We also call this after choose_collection_set() to
+        // ensure that the CSet has been finalized.
+        _cm->verify_no_cset_oops(true  /* verify_stacks */,
+                                 true  /* verify_enqueued_buffers */,
+                                 false /* verify_thread_buffers */,
+                                 true  /* verify_fingers */);
+
         if (_hr_printer.is_active()) {
           HeapRegion* hr = g1_policy()->collection_set();
           while (hr != NULL) {
@@ -3684,16 +3700,6 @@ G1CollectedHeap::do_collection_pause_at_safepoint(double target_pause_time_ms) {
           }
         }
 
-        // We have chosen the complete collection set. If marking is
-        // active then, we clear the region fields of any of the
-        // concurrent marking tasks whose region fields point into
-        // the collection set as these values will become stale. This
-        // will cause the owning marking threads to claim a new region
-        // when marking restarts.
-        if (mark_in_progress()) {
-          concurrent_mark()->reset_active_task_region_fields_in_cset();
-        }
-
 #ifdef ASSERT
         VerifyCSetClosure cl;
         collection_set_iterate(&cl);
@@ -3707,6 +3713,16 @@ G1CollectedHeap::do_collection_pause_at_safepoint(double target_pause_time_ms) {
         // Actually do the work...
         evacuate_collection_set();
 
+        // We do this to mainly verify the per-thread SATB buffers
+        // (which have been filtered by now) since we didn't verify
+        // them earlier. No point in re-checking the stacks / enqueued
+        // buffers given that the CSet has not changed since last time
+        // we checked.
+        _cm->verify_no_cset_oops(false /* verify_stacks */,
+                                 false /* verify_enqueued_buffers */,
+                                 true  /* verify_thread_buffers */,
+                                 true  /* verify_fingers */);
+
         free_collection_set(g1_policy()->collection_set());
         g1_policy()->clear_collection_set();
 
@@ -3775,6 +3791,8 @@ G1CollectedHeap::do_collection_pause_at_safepoint(double target_pause_time_ms) {
           size_t expand_bytes = g1_policy()->expansion_amount();
           if (expand_bytes > 0) {
             size_t bytes_before = capacity();
+            // No need for an ergo verbose message here,
+            // expansion_amount() does this when it returns a value > 0.
             if (!expand(expand_bytes)) {
               // We failed to expand the heap so let's verify that
               // committed/uncommitted amount match the backing store
@@ -3784,6 +3802,14 @@ G1CollectedHeap::do_collection_pause_at_safepoint(double target_pause_time_ms) {
           }
         }
 
+        // We redo the verificaiton but now wrt to the new CSet which
+        // has just got initialized after the previous CSet was freed.
+        _cm->verify_no_cset_oops(true  /* verify_stacks */,
+                                 true  /* verify_enqueued_buffers */,
+                                 true  /* verify_thread_buffers */,
+                                 true  /* verify_fingers */);
+        _cm->note_end_of_gc();
+
         double end_time_sec = os::elapsedTime();
         double pause_time_ms = (end_time_sec - start_time_sec) * MILLIUNITS;
         g1_policy()->record_pause_time_ms(pause_time_ms);
@@ -3831,21 +3857,6 @@ G1CollectedHeap::do_collection_pause_at_safepoint(double target_pause_time_ms) {
         // CM reference discovery will be re-enabled if necessary.
       }
 
-      {
-        size_t expand_bytes = g1_policy()->expansion_amount();
-        if (expand_bytes > 0) {
-          size_t bytes_before = capacity();
-          // No need for an ergo verbose message here,
-          // expansion_amount() does this when it returns a value > 0.
-          if (!expand(expand_bytes)) {
-            // We failed to expand the heap so let's verify that
-            // committed/uncommitted amount match the backing store
-            assert(capacity() == _g1_storage.committed_size(), "committed size mismatch");
-            assert(max_capacity() == _g1_storage.reserved_size(), "reserved size mismatch");
-          }
-        }
-      }
-
       // We should do this after we potentially expand the heap so
       // that all the COMMIT events are generated before the end GC
       // event, and after we retire the GC alloc regions so that all
@@ -3949,6 +3960,8 @@ void G1CollectedHeap::init_gc_alloc_regions() {
     // we allocate to in the region sets. We'll re-add it later, when
     // it's retired again.
     _old_set.remove(retained_region);
+    bool during_im = g1_policy()->during_initial_mark_pause();
+    retained_region->note_start_of_copying(during_im);
     _old_gc_alloc_region.set(retained_region);
     _hr_printer.reuse(retained_region);
   }
@@ -3985,157 +3998,26 @@ void G1CollectedHeap::finalize_for_evac_failure() {
   _evac_failure_scan_stack = NULL;
 }
 
-class UpdateRSetDeferred : public OopsInHeapRegionClosure {
-private:
-  G1CollectedHeap* _g1;
-  DirtyCardQueue *_dcq;
-  CardTableModRefBS* _ct_bs;
-
-public:
-  UpdateRSetDeferred(G1CollectedHeap* g1, DirtyCardQueue* dcq) :
-    _g1(g1), _ct_bs((CardTableModRefBS*)_g1->barrier_set()), _dcq(dcq) {}
-
-  virtual void do_oop(narrowOop* p) { do_oop_work(p); }
-  virtual void do_oop(      oop* p) { do_oop_work(p); }
-  template <class T> void do_oop_work(T* p) {
-    assert(_from->is_in_reserved(p), "paranoia");
-    if (!_from->is_in_reserved(oopDesc::load_decode_heap_oop(p)) &&
-        !_from->is_survivor()) {
-      size_t card_index = _ct_bs->index_for(p);
-      if (_ct_bs->mark_card_deferred(card_index)) {
-        _dcq->enqueue((jbyte*)_ct_bs->byte_for_index(card_index));
-      }
-    }
-  }
-};
-
-class RemoveSelfPointerClosure: public ObjectClosure {
-private:
-  G1CollectedHeap* _g1;
-  ConcurrentMark* _cm;
-  HeapRegion* _hr;
-  size_t _prev_marked_bytes;
-  size_t _next_marked_bytes;
-  OopsInHeapRegionClosure *_cl;
-public:
-  RemoveSelfPointerClosure(G1CollectedHeap* g1, HeapRegion* hr,
-                           OopsInHeapRegionClosure* cl) :
-    _g1(g1), _hr(hr), _cm(_g1->concurrent_mark()),  _prev_marked_bytes(0),
-    _next_marked_bytes(0), _cl(cl) {}
-
-  size_t prev_marked_bytes() { return _prev_marked_bytes; }
-  size_t next_marked_bytes() { return _next_marked_bytes; }
-
-  // <original comment>
-  // The original idea here was to coalesce evacuated and dead objects.
-  // However that caused complications with the block offset table (BOT).
-  // In particular if there were two TLABs, one of them partially refined.
-  // |----- TLAB_1--------|----TLAB_2-~~~(partially refined part)~~~|
-  // The BOT entries of the unrefined part of TLAB_2 point to the start
-  // of TLAB_2. If the last object of the TLAB_1 and the first object
-  // of TLAB_2 are coalesced, then the cards of the unrefined part
-  // would point into middle of the filler object.
-  // The current approach is to not coalesce and leave the BOT contents intact.
-  // </original comment>
-  //
-  // We now reset the BOT when we start the object iteration over the
-  // region and refine its entries for every object we come across. So
-  // the above comment is not really relevant and we should be able
-  // to coalesce dead objects if we want to.
-  void do_object(oop obj) {
-    HeapWord* obj_addr = (HeapWord*) obj;
-    assert(_hr->is_in(obj_addr), "sanity");
-    size_t obj_size = obj->size();
-    _hr->update_bot_for_object(obj_addr, obj_size);
-    if (obj->is_forwarded() && obj->forwardee() == obj) {
-      // The object failed to move.
-      assert(!_g1->is_obj_dead(obj), "We should not be preserving dead objs.");
-      _cm->markPrev(obj);
-      assert(_cm->isPrevMarked(obj), "Should be marked!");
-      _prev_marked_bytes += (obj_size * HeapWordSize);
-      if (_g1->mark_in_progress() && !_g1->is_obj_ill(obj)) {
-        _cm->markAndGrayObjectIfNecessary(obj);
-      }
-      obj->set_mark(markOopDesc::prototype());
-      // While we were processing RSet buffers during the
-      // collection, we actually didn't scan any cards on the
-      // collection set, since we didn't want to update remebered
-      // sets with entries that point into the collection set, given
-      // that live objects fromthe collection set are about to move
-      // and such entries will be stale very soon. This change also
-      // dealt with a reliability issue which involved scanning a
-      // card in the collection set and coming across an array that
-      // was being chunked and looking malformed. The problem is
-      // that, if evacuation fails, we might have remembered set
-      // entries missing given that we skipped cards on the
-      // collection set. So, we'll recreate such entries now.
-      obj->oop_iterate(_cl);
-      assert(_cm->isPrevMarked(obj), "Should be marked!");
-    } else {
-      // The object has been either evacuated or is dead. Fill it with a
-      // dummy object.
-      MemRegion mr((HeapWord*)obj, obj_size);
-      CollectedHeap::fill_with_object(mr);
-      _cm->clearRangeBothMaps(mr);
-    }
-  }
-};
-
 void G1CollectedHeap::remove_self_forwarding_pointers() {
-  UpdateRSetImmediate immediate_update(_g1h->g1_rem_set());
-  DirtyCardQueue dcq(&_g1h->dirty_card_queue_set());
-  UpdateRSetDeferred deferred_update(_g1h, &dcq);
-  OopsInHeapRegionClosure *cl;
-  if (G1DeferredRSUpdate) {
-    cl = &deferred_update;
+  assert(check_cset_heap_region_claim_values(HeapRegion::InitialClaimValue), "sanity");
+  assert(g1_policy()->assertMarkedBytesDataOK(), "Should be!");
+
+  G1ParRemoveSelfForwardPtrsTask rsfp_task(this);
+
+  if (G1CollectedHeap::use_parallel_gc_threads()) {
+    set_par_threads();
+    workers()->run_task(&rsfp_task);
+    set_par_threads(0);
   } else {
-    cl = &immediate_update;
+    rsfp_task.work(0);
   }
-  HeapRegion* cur = g1_policy()->collection_set();
-  while (cur != NULL) {
-    assert(g1_policy()->assertMarkedBytesDataOK(), "Should be!");
-    assert(!cur->isHumongous(), "sanity");
 
-    if (cur->evacuation_failed()) {
-      assert(cur->in_collection_set(), "bad CS");
-      RemoveSelfPointerClosure rspc(_g1h, cur, cl);
+  assert(check_cset_heap_region_claim_values(HeapRegion::ParEvacFailureClaimValue), "sanity");
 
-      // In the common case we make sure that this is done when the
-      // region is freed so that it is "ready-to-go" when it's
-      // re-allocated. However, when evacuation failure happens, a
-      // region will remain in the heap and might ultimately be added
-      // to a CSet in the future. So we have to be careful here and
-      // make sure the region's RSet is ready for parallel iteration
-      // whenever this might be required in the future.
-      cur->rem_set()->reset_for_par_iteration();
-      cur->reset_bot();
-      cl->set_region(cur);
-      cur->object_iterate(&rspc);
+  // Reset the claim values in the regions in the collection set.
+  reset_cset_heap_region_claim_values();
 
-      // A number of manipulations to make the TAMS be the current top,
-      // and the marked bytes be the ones observed in the iteration.
-      if (_g1h->concurrent_mark()->at_least_one_mark_complete()) {
-        // The comments below are the postconditions achieved by the
-        // calls.  Note especially the last such condition, which says that
-        // the count of marked bytes has been properly restored.
-        cur->note_start_of_marking(false);
-        // _next_top_at_mark_start == top, _next_marked_bytes == 0
-        cur->add_to_marked_bytes(rspc.prev_marked_bytes());
-        // _next_marked_bytes == prev_marked_bytes.
-        cur->note_end_of_marking();
-        // _prev_top_at_mark_start == top(),
-        // _prev_marked_bytes == prev_marked_bytes
-      }
-      // If there is no mark in progress, we modified the _next variables
-      // above needlessly, but harmlessly.
-      if (_g1h->mark_in_progress()) {
-        cur->note_start_of_marking(false);
-        // _next_top_at_mark_start == top, _next_marked_bytes == 0
-        // _next_marked_bytes == next_marked_bytes.
-      }
-    }
-    cur = cur->next_in_collection_set();
-  }
+  assert(check_cset_heap_region_claim_values(HeapRegion::InitialClaimValue), "sanity");
   assert(g1_policy()->assertMarkedBytesDataOK(), "Should be!");
 
   // Now restore saved marks, if any.
@@ -4148,6 +4030,7 @@ void G1CollectedHeap::remove_self_forwarding_pointers() {
       markOop m = _preserved_marks_of_objs->at(i);
       obj->set_mark(m);
     }
+
     // Delete the preserved marks growable arrays (allocated on the C heap).
     delete _objs_with_preserved_marks;
     delete _preserved_marks_of_objs;
@@ -4172,8 +4055,7 @@ void G1CollectedHeap::drain_evac_failure_scan_stack() {
 
 oop
 G1CollectedHeap::handle_evacuation_failure_par(OopsInHeapRegionClosure* cl,
-                                               oop old,
-                                               bool should_mark_root) {
+                                               oop old) {
   assert(obj_in_cs(old),
          err_msg("obj: "PTR_FORMAT" should still be in the CSet",
                  (HeapWord*) old));
@@ -4182,15 +4064,6 @@ G1CollectedHeap::handle_evacuation_failure_par(OopsInHeapRegionClosure* cl,
   if (forward_ptr == NULL) {
     // Forward-to-self succeeded.
 
-    // should_mark_root will be true when this routine is called
-    // from a root scanning closure during an initial mark pause.
-    // In this case the thread that succeeds in self-forwarding the
-    // object is also responsible for marking the object.
-    if (should_mark_root) {
-      assert(!oopDesc::is_null(old), "shouldn't be");
-      _cm->grayRoot(old);
-    }
-
     if (_evac_failure_closure != cl) {
       MutexLockerEx x(EvacFailureStack_lock, Mutex::_no_safepoint_check_flag);
       assert(!_drain_in_progress,
@@ -4286,30 +4159,8 @@ HeapWord* G1CollectedHeap::par_allocate_during_gc(GCAllocPurpose purpose,
   return NULL;
 }
 
-#ifndef PRODUCT
-bool GCLabBitMapClosure::do_bit(size_t offset) {
-  HeapWord* addr = _bitmap->offsetToHeapWord(offset);
-  guarantee(_cm->isMarked(oop(addr)), "it should be!");
-  return true;
-}
-#endif // PRODUCT
-
 G1ParGCAllocBuffer::G1ParGCAllocBuffer(size_t gclab_word_size) :
-  ParGCAllocBuffer(gclab_word_size),
-  _should_mark_objects(false),
-  _bitmap(G1CollectedHeap::heap()->reserved_region().start(), gclab_word_size),
-  _retired(false)
-{
-  //_should_mark_objects is set to true when G1ParCopyHelper needs to
-  // mark the forwarded location of an evacuated object.
-  // We set _should_mark_objects to true if marking is active, i.e. when we
-  // need to propagate a mark, or during an initial mark pause, i.e. when we
-  // need to mark objects immediately reachable by the roots.
-  if (G1CollectedHeap::heap()->mark_in_progress() ||
-      G1CollectedHeap::heap()->g1_policy()->during_initial_mark_pause()) {
-    _should_mark_objects = true;
-  }
-}
+  ParGCAllocBuffer(gclab_word_size), _retired(false) { }
 
 G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h, int queue_num)
   : _g1h(g1h),
@@ -4323,8 +4174,7 @@ G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h, int queue_num)
     _tenured_alloc_buffer(g1h->desired_plab_sz(GCAllocForTenured)),
     _age_table(false),
     _strong_roots_time(0), _term_time(0),
-    _alloc_buffer_waste(0), _undo_waste(0)
-{
+    _alloc_buffer_waste(0), _undo_waste(0) {
   // we allocate G1YoungSurvRateNumRegions plus one entries, since
   // we "sacrifice" entry 0 to keep track of surviving bytes for
   // non-young regions (where the age is -1)
@@ -4429,35 +4279,53 @@ void G1ParScanThreadState::trim_queue() {
   } while (!refs()->is_empty());
 }
 
-G1ParClosureSuper::G1ParClosureSuper(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state) :
+G1ParClosureSuper::G1ParClosureSuper(G1CollectedHeap* g1,
+                                     G1ParScanThreadState* par_scan_state) :
   _g1(g1), _g1_rem(_g1->g1_rem_set()), _cm(_g1->concurrent_mark()),
   _par_scan_state(par_scan_state),
   _during_initial_mark(_g1->g1_policy()->during_initial_mark_pause()),
   _mark_in_progress(_g1->mark_in_progress()) { }
 
-template <class T> void G1ParCopyHelper::mark_object(T* p) {
-  // This is called from do_oop_work for objects that are not
-  // in the collection set. Objects in the collection set
-  // are marked after they have been evacuated.
+void G1ParCopyHelper::mark_object(oop obj) {
+#ifdef ASSERT
+  HeapRegion* hr = _g1->heap_region_containing(obj);
+  assert(hr != NULL, "sanity");
+  assert(!hr->in_collection_set(), "should not mark objects in the CSet");
+#endif // ASSERT
 
-  T heap_oop = oopDesc::load_heap_oop(p);
-  if (!oopDesc::is_null(heap_oop)) {
-    oop obj = oopDesc::decode_heap_oop(heap_oop);
-    HeapWord* addr = (HeapWord*)obj;
-    if (_g1->is_in_g1_reserved(addr)) {
-      _cm->grayRoot(oop(addr));
-    }
-  }
+  // We know that the object is not moving so it's safe to read its size.
+  _cm->grayRoot(obj, (size_t) obj->size());
 }
 
-oop G1ParCopyHelper::copy_to_survivor_space(oop old, bool should_mark_root,
-                                                     bool should_mark_copy) {
+void G1ParCopyHelper::mark_forwarded_object(oop from_obj, oop to_obj) {
+#ifdef ASSERT
+  assert(from_obj->is_forwarded(), "from obj should be forwarded");
+  assert(from_obj->forwardee() == to_obj, "to obj should be the forwardee");
+  assert(from_obj != to_obj, "should not be self-forwarded");
+
+  HeapRegion* from_hr = _g1->heap_region_containing(from_obj);
+  assert(from_hr != NULL, "sanity");
+  assert(from_hr->in_collection_set(), "from obj should be in the CSet");
+
+  HeapRegion* to_hr = _g1->heap_region_containing(to_obj);
+  assert(to_hr != NULL, "sanity");
+  assert(!to_hr->in_collection_set(), "should not mark objects in the CSet");
+#endif // ASSERT
+
+  // The object might be in the process of being copied by another
+  // worker so we cannot trust that its to-space image is
+  // well-formed. So we have to read its size from its from-space
+  // image which we know should not be changing.
+  _cm->grayRoot(to_obj, (size_t) from_obj->size());
+}
+
+oop G1ParCopyHelper::copy_to_survivor_space(oop old) {
   size_t    word_sz = old->size();
   HeapRegion* from_region = _g1->heap_region_containing_raw(old);
   // +1 to make the -1 indexes valid...
   int       young_index = from_region->young_index_in_cset()+1;
-  assert( (from_region->is_young() && young_index > 0) ||
-          (!from_region->is_young() && young_index == 0), "invariant" );
+  assert( (from_region->is_young() && young_index >  0) ||
+         (!from_region->is_young() && young_index == 0), "invariant" );
   G1CollectorPolicy* g1p = _g1->g1_policy();
   markOop m = old->mark();
   int age = m->has_displaced_mark_helper() ? m->displaced_mark_helper()->age()
@@ -4471,7 +4339,7 @@ oop G1ParCopyHelper::copy_to_survivor_space(oop old, bool should_mark_root,
     // This will either forward-to-self, or detect that someone else has
     // installed a forwarding pointer.
     OopsInHeapRegionClosure* cl = _par_scan_state->evac_failure_closure();
-    return _g1->handle_evacuation_failure_par(cl, old, should_mark_root);
+    return _g1->handle_evacuation_failure_par(cl, old);
   }
 
   // We're going to allocate linearly, so might as well prefetch ahead.
@@ -4507,28 +4375,14 @@ oop G1ParCopyHelper::copy_to_survivor_space(oop old, bool should_mark_root,
       obj->set_mark(m);
     }
 
-    // Mark the evacuated object or propagate "next" mark bit
-    if (should_mark_copy) {
-      if (!use_local_bitmaps ||
-          !_par_scan_state->alloc_buffer(alloc_purpose)->mark(obj_ptr)) {
-        // if we couldn't mark it on the local bitmap (this happens when
-        // the object was not allocated in the GCLab), we have to bite
-        // the bullet and do the standard parallel mark
-        _cm->markAndGrayObjectIfNecessary(obj);
-      }
-
-      if (_g1->isMarkedNext(old)) {
-        // Unmark the object's old location so that marking
-        // doesn't think the old object is alive.
-        _cm->nextMarkBitMap()->parClear((HeapWord*)old);
-      }
-    }
-
     size_t* surv_young_words = _par_scan_state->surviving_young_words();
     surv_young_words[young_index] += word_sz;
 
     if (obj->is_objArray() && arrayOop(obj)->length() >= ParGCArrayScanChunk) {
-      arrayOop(old)->set_length(0);
+      // We keep track of the next start index in the length field of
+      // the to-space object. The actual length can be found in the
+      // length field of the from-space object.
+      arrayOop(obj)->set_length(0);
       oop* old_p = set_partial_array_mask(old);
       _par_scan_state->push_on_queue(old_p);
     } else {
@@ -4550,61 +4404,24 @@ void G1ParCopyClosure<do_gen_barrier, barrier, do_mark_object>
 ::do_oop_work(T* p) {
   oop obj = oopDesc::load_decode_heap_oop(p);
   assert(barrier != G1BarrierRS || obj != NULL,
-         "Precondition: G1BarrierRS implies obj is nonNull");
-
-  // Marking:
-  // If the object is in the collection set, then the thread
-  // that copies the object should mark, or propagate the
-  // mark to, the evacuated object.
-  // If the object is not in the collection set then we
-  // should call the mark_object() method depending on the
-  // value of the template parameter do_mark_object (which will
-  // be true for root scanning closures during an initial mark
-  // pause).
-  // The mark_object() method first checks whether the object
-  // is marked and, if not, attempts to mark the object.
+         "Precondition: G1BarrierRS implies obj is non-NULL");
 
   // here the null check is implicit in the cset_fast_test() test
   if (_g1->in_cset_fast_test(obj)) {
+    oop forwardee;
     if (obj->is_forwarded()) {
-      oopDesc::encode_store_heap_oop(p, obj->forwardee());
-      // If we are a root scanning closure during an initial
-      // mark pause (i.e. do_mark_object will be true) then
-      // we also need to handle marking of roots in the
-      // event of an evacuation failure. In the event of an
-      // evacuation failure, the object is forwarded to itself
-      // and not copied. For root-scanning closures, the
-      // object would be marked after a successful self-forward
-      // but an object could be pointed to by both a root and non
-      // root location and be self-forwarded by a non-root-scanning
-      // closure. Therefore we also have to attempt to mark the
-      // self-forwarded root object here.
-      if (do_mark_object && obj->forwardee() == obj) {
-        mark_object(p);
-      }
+      forwardee = obj->forwardee();
     } else {
-      // During an initial mark pause, objects that are pointed to
-      // by the roots need to be marked - even in the event of an
-      // evacuation failure. We pass the template parameter
-      // do_mark_object (which is true for root scanning closures
-      // during an initial mark pause) to copy_to_survivor_space
-      // which will pass it on to the evacuation failure handling
-      // code. The thread that successfully self-forwards a root
-      // object to itself is responsible for marking the object.
-      bool should_mark_root = do_mark_object;
-
-      // We need to mark the copied object if we're a root scanning
-      // closure during an initial mark pause (i.e. do_mark_object
-      // will be true), or the object is already marked and we need
-      // to propagate the mark to the evacuated copy.
-      bool should_mark_copy = do_mark_object ||
-                              _during_initial_mark ||
-                              (_mark_in_progress && !_g1->is_obj_ill(obj));
-
-      oop copy_oop = copy_to_survivor_space(obj, should_mark_root,
-                                                 should_mark_copy);
-      oopDesc::encode_store_heap_oop(p, copy_oop);
+      forwardee = copy_to_survivor_space(obj);
     }
+    assert(forwardee != NULL, "forwardee should not be NULL");
+    oopDesc::encode_store_heap_oop(p, forwardee);
+    if (do_mark_object && forwardee != obj) {
+      // If the object is self-forwarded we don't need to explicitly
+      // mark it, the evacuation failure protocol will do so.
+      mark_forwarded_object(obj, forwardee);
+    }
+
     // When scanning the RS, we only care about objs in CS.
     if (barrier == G1BarrierRS) {
       _par_scan_state->update_rs(_from, p, _par_scan_state->queue_num());
@@ -4613,8 +4430,8 @@ void G1ParCopyClosure<do_gen_barrier, barrier, do_mark_object>
     // The object is not in collection set. If we're a root scanning
     // closure during an initial mark pause (i.e. do_mark_object will
     // be true) then attempt to mark the object.
-    if (do_mark_object) {
-      mark_object(p);
+    if (do_mark_object && _g1->is_in_g1_reserved(obj)) {
+      mark_object(obj);
     }
   }
 
@@ -4632,35 +4449,51 @@ template void G1ParCopyClosure<false, G1BarrierEvac, false>::do_oop_work(narrowO
 
 template <class T> void G1ParScanPartialArrayClosure::do_oop_nv(T* p) {
   assert(has_partial_array_mask(p), "invariant");
-  oop old = clear_partial_array_mask(p);
-  assert(old->is_objArray(), "must be obj array");
-  assert(old->is_forwarded(), "must be forwarded");
-  assert(Universe::heap()->is_in_reserved(old), "must be in heap.");
+  oop from_obj = clear_partial_array_mask(p);
 
-  objArrayOop obj = objArrayOop(old->forwardee());
-  assert((void*)old != (void*)old->forwardee(), "self forwarding here?");
-  // Process ParGCArrayScanChunk elements now
-  // and push the remainder back onto queue
-  int start     = arrayOop(old)->length();
-  int end       = obj->length();
-  int remainder = end - start;
-  assert(start <= end, "just checking");
+  assert(Universe::heap()->is_in_reserved(from_obj), "must be in heap.");
+  assert(from_obj->is_objArray(), "must be obj array");
+  objArrayOop from_obj_array = objArrayOop(from_obj);
+  // The from-space object contains the real length.
+  int length                 = from_obj_array->length();
+
+  assert(from_obj->is_forwarded(), "must be forwarded");
+  oop to_obj                 = from_obj->forwardee();
+  assert(from_obj != to_obj, "should not be chunking self-forwarded objects");
+  objArrayOop to_obj_array   = objArrayOop(to_obj);
+  // We keep track of the next start index in the length field of the
+  // to-space object.
+  int next_index             = to_obj_array->length();
+  assert(0 <= next_index && next_index < length,
+         err_msg("invariant, next index: %d, length: %d", next_index, length));
+
+  int start                  = next_index;
+  int end                    = length;
+  int remainder              = end - start;
+  // We'll try not to push a range that's smaller than ParGCArrayScanChunk.
   if (remainder > 2 * ParGCArrayScanChunk) {
-    // Test above combines last partial chunk with a full chunk
     end = start + ParGCArrayScanChunk;
-    arrayOop(old)->set_length(end);
-    // Push remainder.
-    oop* old_p = set_partial_array_mask(old);
-    assert(arrayOop(old)->length() < obj->length(), "Empty push?");
-    _par_scan_state->push_on_queue(old_p);
+    to_obj_array->set_length(end);
+    // Push the remainder before we process the range in case another
+    // worker has run out of things to do and can steal it.
+    oop* from_obj_p = set_partial_array_mask(from_obj);
+    _par_scan_state->push_on_queue(from_obj_p);
   } else {
-    // Restore length so that the heap remains parsable in
-    // case of evacuation failure.
-    arrayOop(old)->set_length(end);
+    assert(length == end, "sanity");
+    // We'll process the final range for this object. Restore the length
+    // so that the heap remains parsable in case of evacuation failure.
+    to_obj_array->set_length(end);
   }
-  _scanner.set_region(_g1->heap_region_containing_raw(obj));
-  // process our set of indices (include header in first chunk)
-  obj->oop_iterate_range(&_scanner, start, end);
+  _scanner.set_region(_g1->heap_region_containing_raw(to_obj));
+  // Process indexes [start,end). It will also process the header
+  // along with the first chunk (i.e., the chunk with start == 0).
+  // Note that at this point the length field of to_obj_array is not
+  // correct given that we are using it to keep track of the next
+  // start index. oop_iterate_range() (thankfully!) ignores the length
+  // field and only relies on the start / end parameters.  It does
+  // however return the size of the object which will be incorrect. So
+  // we have to ignore it even if we wanted to use it.
+  to_obj_array->oop_iterate_range(&_scanner, start, end);
 }
 
 class G1ParEvacuateFollowersClosure : public VoidClosure {
@@ -4725,7 +4558,7 @@ protected:
   G1CollectedHeap*       _g1h;
   RefToScanQueueSet      *_queues;
   ParallelTaskTerminator _terminator;
-  int _n_workers;
+  uint _n_workers;
 
   Mutex _stats_lock;
   Mutex* stats_lock() { return &_stats_lock; }
@@ -4765,18 +4598,18 @@ public:
     _n_workers = active_workers;
   }
 
-  void work(int i) {
-    if (i >= _n_workers) return;  // no work needed this round
+  void work(uint worker_id) {
+    if (worker_id >= _n_workers) return;  // no work needed this round
 
     double start_time_ms = os::elapsedTime() * 1000.0;
-    _g1h->g1_policy()->record_gc_worker_start_time(i, start_time_ms);
+    _g1h->g1_policy()->record_gc_worker_start_time(worker_id, start_time_ms);
 
     ResourceMark rm;
     HandleMark   hm;
 
     ReferenceProcessor*             rp = _g1h->ref_processor_stw();
 
-    G1ParScanThreadState            pss(_g1h, i);
+    G1ParScanThreadState            pss(_g1h, worker_id);
     G1ParScanHeapEvacClosure        scan_evac_cl(_g1h, &pss, rp);
     G1ParScanHeapEvacFailureClosure evac_failure_cl(_g1h, &pss, rp);
     G1ParScanPartialArrayClosure    partial_scan_cl(_g1h, &pss, rp);
@@ -4808,7 +4641,7 @@ public:
                                   scan_root_cl,
                                   &push_heap_rs_cl,
                                   scan_perm_cl,
-                                  i);
+                                  worker_id);
     pss.end_strong_roots();
 
     {
@@ -4817,8 +4650,8 @@ public:
       evac.do_void();
       double elapsed_ms = (os::elapsedTime()-start)*1000.0;
       double term_ms = pss.term_time()*1000.0;
-      _g1h->g1_policy()->record_obj_copy_time(i, elapsed_ms-term_ms);
-      _g1h->g1_policy()->record_termination(i, term_ms, pss.term_attempts());
+      _g1h->g1_policy()->record_obj_copy_time(worker_id, elapsed_ms-term_ms);
+      _g1h->g1_policy()->record_termination(worker_id, term_ms, pss.term_attempts());
     }
     _g1h->g1_policy()->record_thread_age_table(pss.age_table());
     _g1h->update_surviving_young_words(pss.surviving_young_words()+1);
@@ -4828,12 +4661,12 @@ public:
 
     if (ParallelGCVerbose) {
       MutexLocker x(stats_lock());
-      pss.print_termination_stats(i);
+      pss.print_termination_stats(worker_id);
     }
 
     assert(pss.refs()->is_empty(), "should be empty");
     double end_time_ms = os::elapsedTime() * 1000.0;
-    _g1h->g1_policy()->record_gc_worker_end_time(i, end_time_ms);
+    _g1h->g1_policy()->record_gc_worker_end_time(worker_id, end_time_ms);
   }
 };
 
@@ -4893,12 +4726,16 @@ g1_process_strong_roots(bool collecting_perm_gen,
 
   g1_policy()->record_ext_root_scan_time(worker_i, ext_root_time_ms);
 
-  // Scan strong roots in mark stack.
-  if (!_process_strong_tasks->is_task_claimed(G1H_PS_mark_stack_oops_do)) {
-    concurrent_mark()->oops_do(scan_non_heap_roots);
+  // During conc marking we have to filter the per-thread SATB buffers
+  // to make sure we remove any oops into the CSet (which will show up
+  // as implicitly live).
+  if (!_process_strong_tasks->is_task_claimed(G1H_PS_filter_satb_buffers)) {
+    if (mark_in_progress()) {
+      JavaThread::satb_mark_queue_set().filter_thread_buffers();
+    }
   }
-  double mark_stack_scan_ms = (os::elapsedTime() - ext_roots_end) * 1000.0;
-  g1_policy()->record_mark_stack_scan_time(worker_i, mark_stack_scan_ms);
+  double satb_filtering_ms = (os::elapsedTime() - ext_roots_end) * 1000.0;
+  g1_policy()->record_satb_filtering_time(worker_i, satb_filtering_ms);
 
   // Now scan the complement of the collection set.
   if (scan_rs != NULL) {
@@ -5091,14 +4928,14 @@ public:
     _terminator(terminator)
   {}
 
-  virtual void work(int i) {
+  virtual void work(uint worker_id) {
     // The reference processing task executed by a single worker.
     ResourceMark rm;
     HandleMark   hm;
 
     G1STWIsAliveClosure is_alive(_g1h);
 
-    G1ParScanThreadState pss(_g1h, i);
+    G1ParScanThreadState pss(_g1h, worker_id);
 
     G1ParScanHeapEvacClosure        scan_evac_cl(_g1h, &pss, NULL);
     G1ParScanHeapEvacFailureClosure evac_failure_cl(_g1h, &pss, NULL);
@@ -5130,7 +4967,7 @@ public:
     G1ParEvacuateFollowersClosure drain_queue(_g1h, &pss, _task_queues, _terminator);
 
     // Call the reference processing task's work routine.
-    _proc_task.work(i, is_alive, keep_alive, drain_queue);
+    _proc_task.work(worker_id, is_alive, keep_alive, drain_queue);
 
     // Note we cannot assert that the refs array is empty here as not all
     // of the processing tasks (specifically phase2 - pp2_work) execute
@@ -5165,8 +5002,8 @@ public:
     _enq_task(enq_task)
   { }
 
-  virtual void work(int i) {
-    _enq_task.work(i);
+  virtual void work(uint worker_id) {
+    _enq_task.work(worker_id);
   }
 };
 
@@ -5195,7 +5032,7 @@ protected:
   G1CollectedHeap* _g1h;
   RefToScanQueueSet      *_queues;
   ParallelTaskTerminator _terminator;
-  int _n_workers;
+  uint _n_workers;
 
 public:
   G1ParPreserveCMReferentsTask(G1CollectedHeap* g1h,int workers, RefToScanQueueSet *task_queues) :
@@ -5206,11 +5043,11 @@ public:
     _n_workers(workers)
   { }
 
-  void work(int i) {
+  void work(uint worker_id) {
     ResourceMark rm;
     HandleMark   hm;
 
-    G1ParScanThreadState            pss(_g1h, i);
+    G1ParScanThreadState            pss(_g1h, worker_id);
     G1ParScanHeapEvacClosure        scan_evac_cl(_g1h, &pss, NULL);
     G1ParScanHeapEvacFailureClosure evac_failure_cl(_g1h, &pss, NULL);
     G1ParScanPartialArrayClosure    partial_scan_cl(_g1h, &pss, NULL);
@@ -5246,17 +5083,17 @@ public:
 
     ReferenceProcessor* rp = _g1h->ref_processor_cm();
 
-    int limit = ReferenceProcessor::number_of_subclasses_of_ref() * rp->max_num_q();
-    int stride = MIN2(MAX2(_n_workers, 1), limit);
+    uint limit = ReferenceProcessor::number_of_subclasses_of_ref() * rp->max_num_q();
+    uint stride = MIN2(MAX2(_n_workers, 1U), limit);
 
     // limit is set using max_num_q() - which was set using ParallelGCThreads.
     // So this must be true - but assert just in case someone decides to
     // change the worker ids.
-    assert(0 <= i && i < limit, "sanity");
+    assert(0 <= worker_id && worker_id < limit, "sanity");
     assert(!rp->discovery_is_atomic(), "check this code");
 
     // Select discovered lists [i, i+stride, i+2*stride,...,limit)
-    for (int idx = i; idx < limit; idx += stride) {
+    for (uint idx = worker_id; idx < limit; idx += stride) {
       DiscoveredList& ref_list = rp->discovered_refs()[idx];
 
       DiscoveredListIterator iter(ref_list, &keep_alive, &always_alive);
@@ -5310,7 +5147,7 @@ void G1CollectedHeap::process_discovered_references() {
   // referents points to another object which is also referenced by an
   // object discovered by the STW ref processor.
 
-  int active_workers = (G1CollectedHeap::use_parallel_gc_threads() ?
+  uint active_workers = (G1CollectedHeap::use_parallel_gc_threads() ?
                         workers()->active_workers() : 1);
 
   assert(!G1CollectedHeap::use_parallel_gc_threads() ||
@@ -5416,7 +5253,7 @@ void G1CollectedHeap::enqueue_discovered_references() {
   } else {
     // Parallel reference enqueuing
 
-    int active_workers = (ParallelGCThreads > 0 ? workers()->active_workers() : 1);
+    uint active_workers = (ParallelGCThreads > 0 ? workers()->active_workers() : 1);
     assert(active_workers == workers()->active_workers(),
            "Need to reset active_workers");
     assert(rp->num_q() == active_workers, "sanity");
@@ -5439,13 +5276,14 @@ void G1CollectedHeap::enqueue_discovered_references() {
 }
 
 void G1CollectedHeap::evacuate_collection_set() {
+  _expand_heap_after_alloc_failure = true;
   set_evacuation_failed(false);
 
   g1_rem_set()->prepare_for_oops_into_collection_set_do();
   concurrent_g1_refine()->set_use_cache(false);
   concurrent_g1_refine()->clear_hot_cache_claimed_index();
 
-  int n_workers;
+  uint n_workers;
   if (G1CollectedHeap::use_parallel_gc_threads()) {
     n_workers =
       AdaptiveSizePolicy::calc_active_workers(workers()->total_workers(),
@@ -5516,13 +5354,6 @@ void G1CollectedHeap::evacuate_collection_set() {
 
   finalize_for_evac_failure();
 
-  // Must do this before clearing the per-region evac-failure flags
-  // (which is currently done when we free the collection set).
-  // We also only do this if marking is actually in progress and so
-  // have to do this before we set the mark_in_progress flag at the
-  // end of an initial mark pause.
-  concurrent_mark()->complete_marking_in_collection_set();
-
   if (evacuation_failed()) {
     remove_self_forwarding_pointers();
     if (PrintGCDetails) {
@@ -5658,7 +5489,7 @@ public:
     AbstractGangTask("G1 Par Cleanup CT Task"),
     _ct_bs(ct_bs), _g1h(g1h) { }
 
-  void work(int i) {
+  void work(uint worker_id) {
     HeapRegion* r;
     while (r = _g1h->pop_dirty_cards_region()) {
       clear_cards(r);
@@ -6141,7 +5972,7 @@ void G1CollectedHeap::set_par_threads() {
   // Don't change the number of workers.  Use the value previously set
   // in the workgroup.
   assert(G1CollectedHeap::use_parallel_gc_threads(), "shouldn't be here otherwise");
-  int n_workers = workers()->active_workers();
+  uint n_workers = workers()->active_workers();
   assert(UseDynamicNumberOfGCThreads ||
            n_workers == workers()->total_workers(),
       "Otherwise should be using the total number of workers");
@@ -6179,6 +6010,8 @@ HeapRegion* G1CollectedHeap::new_gc_alloc_region(size_t word_size,
       } else {
         _hr_printer.alloc(new_alloc_region, G1HRPrinter::Old);
       }
+      bool during_im = g1_policy()->during_initial_mark_pause();
+      new_alloc_region->note_start_of_copying(during_im);
       return new_alloc_region;
     } else {
       g1_policy()->note_alloc_region_limit_reached(ap);
@@ -6190,7 +6023,8 @@ HeapRegion* G1CollectedHeap::new_gc_alloc_region(size_t word_size,
 void G1CollectedHeap::retire_gc_alloc_region(HeapRegion* alloc_region,
                                              size_t allocated_bytes,
                                              GCAllocPurpose ap) {
-  alloc_region->note_end_of_copying();
+  bool during_im = g1_policy()->during_initial_mark_pause();
+  alloc_region->note_end_of_copying(during_im);
   g1_policy()->record_bytes_copied_during_gc(allocated_bytes);
   if (ap == GCAllocForSurvived) {
     young_list()->add_survivor_region(alloc_region);
diff --git a/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp b/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp
index 8d43e822a56..30b7fedd417 100644
--- a/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -285,6 +285,14 @@ private:
   // Typically, it is not full so we should re-use it during the next GC.
   HeapRegion* _retained_old_gc_alloc_region;
 
+  // It specifies whether we should attempt to expand the heap after a
+  // region allocation failure. If heap expansion fails we set this to
+  // false so that we don't re-attempt the heap expansion (it's likely
+  // that subsequent expansion attempts will also fail if one fails).
+  // Currently, it is only consulted during GC and it's reset at the
+  // start of each GC.
+  bool _expand_heap_after_alloc_failure;
+
   // It resets the mutator alloc region before new allocations can take place.
   void init_mutator_alloc_region();
 
@@ -861,8 +869,7 @@ protected:
   void finalize_for_evac_failure();
 
   // An attempt to evacuate "obj" has failed; take necessary steps.
-  oop handle_evacuation_failure_par(OopsInHeapRegionClosure* cl, oop obj,
-                                    bool should_mark_root);
+  oop handle_evacuation_failure_par(OopsInHeapRegionClosure* cl, oop obj);
   void handle_evacuation_failure_common(oop obj, markOop m);
 
   // ("Weak") Reference processing support.
@@ -954,7 +961,7 @@ protected:
   unsigned int* _worker_cset_start_region_time_stamp;
 
   enum G1H_process_strong_roots_tasks {
-    G1H_PS_mark_stack_oops_do,
+    G1H_PS_filter_satb_buffers,
     G1H_PS_refProcessor_oops_do,
     // Leave this one last.
     G1H_PS_NumElements
@@ -995,7 +1002,7 @@ public:
   // Initialize weak reference processing.
   virtual void ref_processing_init();
 
-  void set_par_threads(int t) {
+  void set_par_threads(uint t) {
     SharedHeap::set_par_threads(t);
     // Done in SharedHeap but oddly there are
     // two _process_strong_tasks's in a G1CollectedHeap
@@ -1298,13 +1305,17 @@ public:
   // chunk.)  For now requires that "doHeapRegion" always returns "false",
   // i.e., that a closure never attempt to abort a traversal.
   void heap_region_par_iterate_chunked(HeapRegionClosure* blk,
-                                       int worker,
-                                       int no_of_par_workers,
+                                       uint worker,
+                                       uint no_of_par_workers,
                                        jint claim_value);
 
   // It resets all the region claim values to the default.
   void reset_heap_region_claim_values();
 
+  // Resets the claim values of regions in the current
+  // collection set to the default.
+  void reset_cset_heap_region_claim_values();
+
 #ifdef ASSERT
   bool check_heap_region_claim_values(jint claim_value);
 
@@ -1740,10 +1751,8 @@ public:
       _gclab_word_size(gclab_word_size),
       _real_start_word(NULL),
       _real_end_word(NULL),
-      _start_word(NULL)
-  {
-    guarantee( size_in_words() >= bitmap_size_in_words(),
-               "just making sure");
+      _start_word(NULL) {
+    guarantee(false, "GCLabBitMap::GCLabBitmap(): don't call this any more");
   }
 
   inline unsigned heapWordToOffset(HeapWord* addr) {
@@ -1797,6 +1806,8 @@ public:
   }
 
   void set_buffer(HeapWord* start) {
+    guarantee(false, "set_buffer(): don't call this any more");
+
     guarantee(use_local_bitmaps, "invariant");
     clear();
 
@@ -1820,6 +1831,8 @@ public:
 #endif // PRODUCT
 
   void retire() {
+    guarantee(false, "retire(): don't call this any more");
+
     guarantee(use_local_bitmaps, "invariant");
     assert(fields_well_formed(), "invariant");
 
@@ -1853,32 +1866,18 @@ public:
 class G1ParGCAllocBuffer: public ParGCAllocBuffer {
 private:
   bool        _retired;
-  bool        _should_mark_objects;
-  GCLabBitMap _bitmap;
 
 public:
   G1ParGCAllocBuffer(size_t gclab_word_size);
 
-  inline bool mark(HeapWord* addr) {
-    guarantee(use_local_bitmaps, "invariant");
-    assert(_should_mark_objects, "invariant");
-    return _bitmap.mark(addr);
-  }
-
-  inline void set_buf(HeapWord* buf) {
-    if (use_local_bitmaps && _should_mark_objects) {
-      _bitmap.set_buffer(buf);
-    }
+  void set_buf(HeapWord* buf) {
     ParGCAllocBuffer::set_buf(buf);
     _retired = false;
   }
 
-  inline void retire(bool end_of_gc, bool retain) {
+  void retire(bool end_of_gc, bool retain) {
     if (_retired)
       return;
-    if (use_local_bitmaps && _should_mark_objects) {
-      _bitmap.retire();
-    }
     ParGCAllocBuffer::retire(end_of_gc, retain);
     _retired = true;
   }
diff --git a/hotspot/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp b/hotspot/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp
index 2e5284b844f..f32db16d402 100644
--- a/hotspot/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -136,7 +136,6 @@ G1CollectorPolicy::G1CollectorPolicy() :
   _stop_world_start(0.0),
   _all_stop_world_times_ms(new NumberSeq()),
   _all_yield_times_ms(new NumberSeq()),
-  _using_new_ratio_calculations(false),
 
   _summary(new Summary()),
 
@@ -230,7 +229,9 @@ G1CollectorPolicy::G1CollectorPolicy() :
   _inc_cset_bytes_used_before(0),
   _inc_cset_max_finger(NULL),
   _inc_cset_recorded_rs_lengths(0),
+  _inc_cset_recorded_rs_lengths_diffs(0),
   _inc_cset_predicted_elapsed_time_ms(0.0),
+  _inc_cset_predicted_elapsed_time_ms_diffs(0.0),
 
 #ifdef _MSC_VER // the use of 'this' below gets a warning, make it go away
 #pragma warning( disable:4355 ) // 'this' : used in base member initializer list
@@ -280,7 +281,7 @@ G1CollectorPolicy::G1CollectorPolicy() :
 
   _par_last_gc_worker_start_times_ms = new double[_parallel_gc_threads];
   _par_last_ext_root_scan_times_ms = new double[_parallel_gc_threads];
-  _par_last_mark_stack_scan_times_ms = new double[_parallel_gc_threads];
+  _par_last_satb_filtering_times_ms = new double[_parallel_gc_threads];
 
   _par_last_update_rs_times_ms = new double[_parallel_gc_threads];
   _par_last_update_rs_processed_buffers = new double[_parallel_gc_threads];
@@ -407,11 +408,7 @@ G1CollectorPolicy::G1CollectorPolicy() :
 
   initialize_all();
   _collectionSetChooser = new CollectionSetChooser();
-}
-
-// Increment "i", mod "len"
-static void inc_mod(int& i, int len) {
-  i++; if (i == len) i = 0;
+  _young_gen_sizer = new G1YoungGenSizer(); // Must be after call to initialize_flags
 }
 
 void G1CollectorPolicy::initialize_flags() {
@@ -423,39 +420,74 @@ void G1CollectorPolicy::initialize_flags() {
   CollectorPolicy::initialize_flags();
 }
 
-// The easiest way to deal with the parsing of the NewSize /
-// MaxNewSize / etc. parameteres is to re-use the code in the
-// TwoGenerationCollectorPolicy class. This is similar to what
-// ParallelScavenge does with its GenerationSizer class (see
-// ParallelScavengeHeap::initialize()). We might change this in the
-// future, but it's a good start.
-class G1YoungGenSizer : public TwoGenerationCollectorPolicy {
-private:
-  size_t size_to_region_num(size_t byte_size) {
-    return MAX2((size_t) 1, byte_size / HeapRegion::GrainBytes);
+G1YoungGenSizer::G1YoungGenSizer() : _sizer_kind(SizerDefaults), _adaptive_size(true) {
+  assert(G1DefaultMinNewGenPercent <= G1DefaultMaxNewGenPercent, "Min larger than max");
+  assert(G1DefaultMinNewGenPercent > 0 && G1DefaultMinNewGenPercent < 100, "Min out of bounds");
+  assert(G1DefaultMaxNewGenPercent > 0 && G1DefaultMaxNewGenPercent < 100, "Max out of bounds");
+
+  if (FLAG_IS_CMDLINE(NewRatio)) {
+    if (FLAG_IS_CMDLINE(NewSize) || FLAG_IS_CMDLINE(MaxNewSize)) {
+      warning("-XX:NewSize and -XX:MaxNewSize override -XX:NewRatio");
+    } else {
+      _sizer_kind = SizerNewRatio;
+      _adaptive_size = false;
+      return;
+    }
   }
 
-public:
-  G1YoungGenSizer() {
-    initialize_flags();
-    initialize_size_info();
+  if (FLAG_IS_CMDLINE(NewSize)) {
+     _min_desired_young_length = MAX2((size_t) 1, NewSize / HeapRegion::GrainBytes);
+    if (FLAG_IS_CMDLINE(MaxNewSize)) {
+      _max_desired_young_length = MAX2((size_t) 1, MaxNewSize / HeapRegion::GrainBytes);
+      _sizer_kind = SizerMaxAndNewSize;
+      _adaptive_size = _min_desired_young_length == _max_desired_young_length;
+    } else {
+      _sizer_kind = SizerNewSizeOnly;
+    }
+  } else if (FLAG_IS_CMDLINE(MaxNewSize)) {
+    _max_desired_young_length = MAX2((size_t) 1, MaxNewSize / HeapRegion::GrainBytes);
+    _sizer_kind = SizerMaxNewSizeOnly;
   }
-  size_t min_young_region_num() {
-    return size_to_region_num(_min_gen0_size);
-  }
-  size_t initial_young_region_num() {
-    return size_to_region_num(_initial_gen0_size);
-  }
-  size_t max_young_region_num() {
-    return size_to_region_num(_max_gen0_size);
-  }
-};
+}
 
-void G1CollectorPolicy::update_young_list_size_using_newratio(size_t number_of_heap_regions) {
-  assert(number_of_heap_regions > 0, "Heap must be initialized");
-  size_t young_size = number_of_heap_regions / (NewRatio + 1);
-  _min_desired_young_length = young_size;
-  _max_desired_young_length = young_size;
+size_t G1YoungGenSizer::calculate_default_min_length(size_t new_number_of_heap_regions) {
+  size_t default_value = (new_number_of_heap_regions * G1DefaultMinNewGenPercent) / 100;
+  return MAX2((size_t)1, default_value);
+}
+
+size_t G1YoungGenSizer::calculate_default_max_length(size_t new_number_of_heap_regions) {
+  size_t default_value = (new_number_of_heap_regions * G1DefaultMaxNewGenPercent) / 100;
+  return MAX2((size_t)1, default_value);
+}
+
+void G1YoungGenSizer::heap_size_changed(size_t new_number_of_heap_regions) {
+  assert(new_number_of_heap_regions > 0, "Heap must be initialized");
+
+  switch (_sizer_kind) {
+    case SizerDefaults:
+      _min_desired_young_length = calculate_default_min_length(new_number_of_heap_regions);
+      _max_desired_young_length = calculate_default_max_length(new_number_of_heap_regions);
+      break;
+    case SizerNewSizeOnly:
+      _max_desired_young_length = calculate_default_max_length(new_number_of_heap_regions);
+      _max_desired_young_length = MAX2(_min_desired_young_length, _max_desired_young_length);
+      break;
+    case SizerMaxNewSizeOnly:
+      _min_desired_young_length = calculate_default_min_length(new_number_of_heap_regions);
+      _min_desired_young_length = MIN2(_min_desired_young_length, _max_desired_young_length);
+      break;
+    case SizerMaxAndNewSize:
+      // Do nothing. Values set on the command line, don't update them at runtime.
+      break;
+    case SizerNewRatio:
+      _min_desired_young_length = new_number_of_heap_regions / (NewRatio + 1);
+      _max_desired_young_length = _min_desired_young_length;
+      break;
+    default:
+      ShouldNotReachHere();
+  }
+
+  assert(_min_desired_young_length <= _max_desired_young_length, "Invalid min/max young gen size values");
 }
 
 void G1CollectorPolicy::init() {
@@ -466,28 +498,10 @@ void G1CollectorPolicy::init() {
 
   initialize_gc_policy_counters();
 
-  G1YoungGenSizer sizer;
-  _min_desired_young_length = sizer.min_young_region_num();
-  _max_desired_young_length = sizer.max_young_region_num();
-
-  if (FLAG_IS_CMDLINE(NewRatio)) {
-    if (FLAG_IS_CMDLINE(NewSize) || FLAG_IS_CMDLINE(MaxNewSize)) {
-      warning("-XX:NewSize and -XX:MaxNewSize override -XX:NewRatio");
-    } else {
-      // Treat NewRatio as a fixed size that is only recalculated when the heap size changes
-      update_young_list_size_using_newratio(_g1->n_regions());
-      _using_new_ratio_calculations = true;
-    }
-  }
-
-  assert(_min_desired_young_length <= _max_desired_young_length, "Invalid min/max young gen size values");
-
-  set_adaptive_young_list_length(_min_desired_young_length < _max_desired_young_length);
   if (adaptive_young_list_length()) {
     _young_list_fixed_length = 0;
   } else {
-    assert(_min_desired_young_length == _max_desired_young_length, "Min and max young size differ");
-    _young_list_fixed_length = _min_desired_young_length;
+    _young_list_fixed_length = _young_gen_sizer->min_desired_young_length();
   }
   _free_regions_at_end_of_collection = _g1->free_regions();
   update_young_list_target_length();
@@ -541,11 +555,7 @@ void G1CollectorPolicy::record_new_heap_size(size_t new_number_of_regions) {
   // smaller than 1.0) we'll get 1.
   _reserve_regions = (size_t) ceil(reserve_regions_d);
 
-  if (_using_new_ratio_calculations) {
-    // -XX:NewRatio was specified so we need to update the
-    // young gen length when the heap size has changed.
-    update_young_list_size_using_newratio(new_number_of_regions);
-  }
+  _young_gen_sizer->heap_size_changed(new_number_of_regions);
 }
 
 size_t G1CollectorPolicy::calculate_young_list_desired_min_length(
@@ -563,14 +573,14 @@ size_t G1CollectorPolicy::calculate_young_list_desired_min_length(
   }
   desired_min_length += base_min_length;
   // make sure we don't go below any user-defined minimum bound
-  return MAX2(_min_desired_young_length, desired_min_length);
+  return MAX2(_young_gen_sizer->min_desired_young_length(), desired_min_length);
 }
 
 size_t G1CollectorPolicy::calculate_young_list_desired_max_length() {
   // Here, we might want to also take into account any additional
   // constraints (i.e., user-defined minimum bound). Currently, we
   // effectively don't set this bound.
-  return _max_desired_young_length;
+  return _young_gen_sizer->max_desired_young_length();
 }
 
 void G1CollectorPolicy::update_young_list_target_length(size_t rs_lengths) {
@@ -895,10 +905,19 @@ void G1CollectorPolicy::record_collection_pause_start(double start_time_sec,
     gclog_or_tty->print(" (%s)", gcs_are_young() ? "young" : "mixed");
   }
 
-  // We only need to do this here as the policy will only be applied
-  // to the GC we're about to start. so, no point is calculating this
-  // every time we calculate / recalculate the target young length.
-  update_survivors_policy();
+  if (!during_initial_mark_pause()) {
+    // We only need to do this here as the policy will only be applied
+    // to the GC we're about to start. so, no point is calculating this
+    // every time we calculate / recalculate the target young length.
+    update_survivors_policy();
+  } else {
+    // The marking phase has a "we only copy implicitly live
+    // objects during marking" invariant. The easiest way to ensure it
+    // holds is not to allocate any survivor regions and tenure all
+    // objects. In the future we might change this and handle survivor
+    // regions specially during marking.
+    tenure_all_objects();
+  }
 
   assert(_g1->used() == _g1->recalculate_used(),
          err_msg("sanity, used: "SIZE_FORMAT" recalculate_used: "SIZE_FORMAT,
@@ -929,7 +948,7 @@ void G1CollectorPolicy::record_collection_pause_start(double start_time_sec,
   for (int i = 0; i < _parallel_gc_threads; ++i) {
     _par_last_gc_worker_start_times_ms[i] = -1234.0;
     _par_last_ext_root_scan_times_ms[i] = -1234.0;
-    _par_last_mark_stack_scan_times_ms[i] = -1234.0;
+    _par_last_satb_filtering_times_ms[i] = -1234.0;
     _par_last_update_rs_times_ms[i] = -1234.0;
     _par_last_update_rs_processed_buffers[i] = -1234.0;
     _par_last_scan_rs_times_ms[i] = -1234.0;
@@ -1217,7 +1236,7 @@ void G1CollectorPolicy::record_collection_pause_end(int no_of_gc_threads) {
   // of the PrintGCDetails output, in the non-parallel case.
 
   double ext_root_scan_time = avg_value(_par_last_ext_root_scan_times_ms);
-  double mark_stack_scan_time = avg_value(_par_last_mark_stack_scan_times_ms);
+  double satb_filtering_time = avg_value(_par_last_satb_filtering_times_ms);
   double update_rs_time = avg_value(_par_last_update_rs_times_ms);
   double update_rs_processed_buffers =
     sum_of_values(_par_last_update_rs_processed_buffers);
@@ -1226,7 +1245,7 @@ void G1CollectorPolicy::record_collection_pause_end(int no_of_gc_threads) {
   double termination_time = avg_value(_par_last_termination_times_ms);
 
   double known_time = ext_root_scan_time +
-                      mark_stack_scan_time +
+                      satb_filtering_time +
                       update_rs_time +
                       scan_rs_time +
                       obj_copy_time;
@@ -1272,7 +1291,7 @@ void G1CollectorPolicy::record_collection_pause_end(int no_of_gc_threads) {
     body_summary->record_satb_drain_time_ms(_cur_satb_drain_time_ms);
 
     body_summary->record_ext_root_scan_time_ms(ext_root_scan_time);
-    body_summary->record_mark_stack_scan_time_ms(mark_stack_scan_time);
+    body_summary->record_satb_filtering_time_ms(satb_filtering_time);
     body_summary->record_update_rs_time_ms(update_rs_time);
     body_summary->record_scan_rs_time_ms(scan_rs_time);
     body_summary->record_obj_copy_time_ms(obj_copy_time);
@@ -1366,16 +1385,12 @@ void G1CollectorPolicy::record_collection_pause_end(int no_of_gc_threads) {
                            (last_pause_included_initial_mark) ? " (initial-mark)" : "",
                            elapsed_ms / 1000.0);
 
-    if (print_marking_info) {
-      print_stats(1, "SATB Drain Time", _cur_satb_drain_time_ms);
-    }
-
     if (parallel) {
       print_stats(1, "Parallel Time", _cur_collection_par_time_ms);
       print_par_stats(2, "GC Worker Start", _par_last_gc_worker_start_times_ms);
       print_par_stats(2, "Ext Root Scanning", _par_last_ext_root_scan_times_ms);
       if (print_marking_info) {
-        print_par_stats(2, "Mark Stack Scanning", _par_last_mark_stack_scan_times_ms);
+        print_par_stats(2, "SATB Filtering", _par_last_satb_filtering_times_ms);
       }
       print_par_stats(2, "Update RS", _par_last_update_rs_times_ms);
       print_par_sizes(3, "Processed Buffers", _par_last_update_rs_processed_buffers);
@@ -1389,7 +1404,7 @@ void G1CollectorPolicy::record_collection_pause_end(int no_of_gc_threads) {
         _par_last_gc_worker_times_ms[i] = _par_last_gc_worker_end_times_ms[i] - _par_last_gc_worker_start_times_ms[i];
 
         double worker_known_time = _par_last_ext_root_scan_times_ms[i] +
-                                   _par_last_mark_stack_scan_times_ms[i] +
+                                   _par_last_satb_filtering_times_ms[i] +
                                    _par_last_update_rs_times_ms[i] +
                                    _par_last_scan_rs_times_ms[i] +
                                    _par_last_obj_copy_times_ms[i] +
@@ -1402,7 +1417,7 @@ void G1CollectorPolicy::record_collection_pause_end(int no_of_gc_threads) {
     } else {
       print_stats(1, "Ext Root Scanning", ext_root_scan_time);
       if (print_marking_info) {
-        print_stats(1, "Mark Stack Scanning", mark_stack_scan_time);
+        print_stats(1, "SATB Filtering", satb_filtering_time);
       }
       print_stats(1, "Update RS", update_rs_time);
       print_stats(2, "Processed Buffers", (int)update_rs_processed_buffers);
@@ -1551,10 +1566,19 @@ void G1CollectorPolicy::record_collection_pause_end(int no_of_gc_threads) {
       }
     }
 
-    // It turns out that, sometimes, _max_rs_lengths can get smaller
-    // than _recorded_rs_lengths which causes rs_length_diff to get
-    // very large and mess up the RSet length predictions. We'll be
-    // defensive until we work out why this happens.
+    // This is defensive. For a while _max_rs_lengths could get
+    // smaller than _recorded_rs_lengths which was causing
+    // rs_length_diff to get very large and mess up the RSet length
+    // predictions. The reason was unsafe concurrent updates to the
+    // _inc_cset_recorded_rs_lengths field which the code below guards
+    // against (see CR 7118202). This bug has now been fixed (see CR
+    // 7119027). However, I'm still worried that
+    // _inc_cset_recorded_rs_lengths might still end up somewhat
+    // inaccurate. The concurrent refinement thread calculates an
+    // RSet's length concurrently with other CR threads updating it
+    // which might cause it to calculate the length incorrectly (if,
+    // say, it's in mid-coarsening). So I'll leave in the defensive
+    // conditional below just in case.
     size_t rs_length_diff = 0;
     if (_max_rs_lengths > _recorded_rs_lengths) {
       rs_length_diff = _max_rs_lengths - _recorded_rs_lengths;
@@ -1964,11 +1988,10 @@ void G1CollectorPolicy::print_summary(PauseSummary* summary) const {
   if (summary->get_total_seq()->num() > 0) {
     print_summary_sd(0, "Evacuation Pauses", summary->get_total_seq());
     if (body_summary != NULL) {
-      print_summary(1, "SATB Drain", body_summary->get_satb_drain_seq());
       if (parallel) {
         print_summary(1, "Parallel Time", body_summary->get_parallel_seq());
         print_summary(2, "Ext Root Scanning", body_summary->get_ext_root_scan_seq());
-        print_summary(2, "Mark Stack Scanning", body_summary->get_mark_stack_scan_seq());
+        print_summary(2, "SATB Filtering", body_summary->get_satb_filtering_seq());
         print_summary(2, "Update RS", body_summary->get_update_rs_seq());
         print_summary(2, "Scan RS", body_summary->get_scan_rs_seq());
         print_summary(2, "Object Copy", body_summary->get_obj_copy_seq());
@@ -1977,7 +2000,7 @@ void G1CollectorPolicy::print_summary(PauseSummary* summary) const {
         {
           NumberSeq* other_parts[] = {
             body_summary->get_ext_root_scan_seq(),
-            body_summary->get_mark_stack_scan_seq(),
+            body_summary->get_satb_filtering_seq(),
             body_summary->get_update_rs_seq(),
             body_summary->get_scan_rs_seq(),
             body_summary->get_obj_copy_seq(),
@@ -1990,7 +2013,7 @@ void G1CollectorPolicy::print_summary(PauseSummary* summary) const {
         }
       } else {
         print_summary(1, "Ext Root Scanning", body_summary->get_ext_root_scan_seq());
-        print_summary(1, "Mark Stack Scanning", body_summary->get_mark_stack_scan_seq());
+        print_summary(1, "SATB Filtering", body_summary->get_satb_filtering_seq());
         print_summary(1, "Update RS", body_summary->get_update_rs_seq());
         print_summary(1, "Scan RS", body_summary->get_scan_rs_seq());
         print_summary(1, "Object Copy", body_summary->get_obj_copy_seq());
@@ -2017,7 +2040,7 @@ void G1CollectorPolicy::print_summary(PauseSummary* summary) const {
             body_summary->get_satb_drain_seq(),
             body_summary->get_update_rs_seq(),
             body_summary->get_ext_root_scan_seq(),
-            body_summary->get_mark_stack_scan_seq(),
+            body_summary->get_satb_filtering_seq(),
             body_summary->get_scan_rs_seq(),
             body_summary->get_obj_copy_seq()
           };
@@ -2321,17 +2344,19 @@ public:
     _g1(G1CollectedHeap::heap())
   {}
 
-  void work(int i) {
-    ParKnownGarbageHRClosure parKnownGarbageCl(_hrSorted, _chunk_size, i);
+  void work(uint worker_id) {
+    ParKnownGarbageHRClosure parKnownGarbageCl(_hrSorted,
+                                               _chunk_size,
+                                               worker_id);
     // Back to zero for the claim value.
-    _g1->heap_region_par_iterate_chunked(&parKnownGarbageCl, i,
+    _g1->heap_region_par_iterate_chunked(&parKnownGarbageCl, worker_id,
                                          _g1->workers()->active_workers(),
                                          HeapRegion::InitialClaimValue);
     jint regions_added = parKnownGarbageCl.marked_regions_added();
     _hrSorted->incNumMarkedHeapRegions(regions_added);
     if (G1PrintParCleanupStats) {
       gclog_or_tty->print_cr("     Thread %d called %d times, added %d regions to list.",
-                 i, parKnownGarbageCl.invokes(), regions_added);
+                 worker_id, parKnownGarbageCl.invokes(), regions_added);
     }
   }
 };
@@ -2412,9 +2437,6 @@ void G1CollectorPolicy::add_old_region_to_cset(HeapRegion* hr) {
   assert(_inc_cset_build_state == Active, "Precondition");
   assert(!hr->is_young(), "non-incremental add of young region");
 
-  if (_g1->mark_in_progress())
-    _g1->concurrent_mark()->registerCSetRegion(hr);
-
   assert(!hr->in_collection_set(), "should not already be in the CSet");
   hr->set_in_collection_set(true);
   hr->set_next_in_collection_set(_collection_set);
@@ -2436,10 +2458,45 @@ void G1CollectorPolicy::start_incremental_cset_building() {
 
   _inc_cset_max_finger = 0;
   _inc_cset_recorded_rs_lengths = 0;
-  _inc_cset_predicted_elapsed_time_ms = 0;
+  _inc_cset_recorded_rs_lengths_diffs = 0;
+  _inc_cset_predicted_elapsed_time_ms = 0.0;
+  _inc_cset_predicted_elapsed_time_ms_diffs = 0.0;
   _inc_cset_build_state = Active;
 }
 
+void G1CollectorPolicy::finalize_incremental_cset_building() {
+  assert(_inc_cset_build_state == Active, "Precondition");
+  assert(SafepointSynchronize::is_at_safepoint(), "should be at a safepoint");
+
+  // The two "main" fields, _inc_cset_recorded_rs_lengths and
+  // _inc_cset_predicted_elapsed_time_ms, are updated by the thread
+  // that adds a new region to the CSet. Further updates by the
+  // concurrent refinement thread that samples the young RSet lengths
+  // are accumulated in the *_diffs fields. Here we add the diffs to
+  // the "main" fields.
+
+  if (_inc_cset_recorded_rs_lengths_diffs >= 0) {
+    _inc_cset_recorded_rs_lengths += _inc_cset_recorded_rs_lengths_diffs;
+  } else {
+    // This is defensive. The diff should in theory be always positive
+    // as RSets can only grow between GCs. However, given that we
+    // sample their size concurrently with other threads updating them
+    // it's possible that we might get the wrong size back, which
+    // could make the calculations somewhat inaccurate.
+    size_t diffs = (size_t) (-_inc_cset_recorded_rs_lengths_diffs);
+    if (_inc_cset_recorded_rs_lengths >= diffs) {
+      _inc_cset_recorded_rs_lengths -= diffs;
+    } else {
+      _inc_cset_recorded_rs_lengths = 0;
+    }
+  }
+  _inc_cset_predicted_elapsed_time_ms +=
+                                     _inc_cset_predicted_elapsed_time_ms_diffs;
+
+  _inc_cset_recorded_rs_lengths_diffs = 0;
+  _inc_cset_predicted_elapsed_time_ms_diffs = 0.0;
+}
+
 void G1CollectorPolicy::add_to_incremental_cset_info(HeapRegion* hr, size_t rs_length) {
   // This routine is used when:
   // * adding survivor regions to the incremental cset at the end of an
@@ -2455,10 +2512,8 @@ void G1CollectorPolicy::add_to_incremental_cset_info(HeapRegion* hr, size_t rs_l
 
   double region_elapsed_time_ms = predict_region_elapsed_time_ms(hr, true);
   size_t used_bytes = hr->used();
-
   _inc_cset_recorded_rs_lengths += rs_length;
   _inc_cset_predicted_elapsed_time_ms += region_elapsed_time_ms;
-
   _inc_cset_bytes_used_before += used_bytes;
 
   // Cache the values we have added to the aggregated informtion
@@ -2469,37 +2524,33 @@ void G1CollectorPolicy::add_to_incremental_cset_info(HeapRegion* hr, size_t rs_l
   hr->set_predicted_elapsed_time_ms(region_elapsed_time_ms);
 }
 
-void G1CollectorPolicy::remove_from_incremental_cset_info(HeapRegion* hr) {
-  // This routine is currently only called as part of the updating of
-  // existing policy information for regions in the incremental cset that
-  // is performed by the concurrent refine thread(s) as part of young list
-  // RSet sampling. Therefore we should not be at a safepoint.
-
-  assert(!SafepointSynchronize::is_at_safepoint(), "should not be at safepoint");
-  assert(hr->is_young(), "it should be");
-
-  size_t used_bytes = hr->used();
-  size_t old_rs_length = hr->recorded_rs_length();
-  double old_elapsed_time_ms = hr->predicted_elapsed_time_ms();
-
-  // Subtract the old recorded/predicted policy information for
-  // the given heap region from the collection set info.
-  _inc_cset_recorded_rs_lengths -= old_rs_length;
-  _inc_cset_predicted_elapsed_time_ms -= old_elapsed_time_ms;
-
-  _inc_cset_bytes_used_before -= used_bytes;
-
-  // Clear the values cached in the heap region
-  hr->set_recorded_rs_length(0);
-  hr->set_predicted_elapsed_time_ms(0);
-}
-
-void G1CollectorPolicy::update_incremental_cset_info(HeapRegion* hr, size_t new_rs_length) {
-  // Update the collection set information that is dependent on the new RS length
+void G1CollectorPolicy::update_incremental_cset_info(HeapRegion* hr,
+                                                     size_t new_rs_length) {
+  // Update the CSet information that is dependent on the new RS length
   assert(hr->is_young(), "Precondition");
+  assert(!SafepointSynchronize::is_at_safepoint(),
+                                               "should not be at a safepoint");
 
-  remove_from_incremental_cset_info(hr);
-  add_to_incremental_cset_info(hr, new_rs_length);
+  // We could have updated _inc_cset_recorded_rs_lengths and
+  // _inc_cset_predicted_elapsed_time_ms directly but we'd need to do
+  // that atomically, as this code is executed by a concurrent
+  // refinement thread, potentially concurrently with a mutator thread
+  // allocating a new region and also updating the same fields. To
+  // avoid the atomic operations we accumulate these updates on two
+  // separate fields (*_diffs) and we'll just add them to the "main"
+  // fields at the start of a GC.
+
+  ssize_t old_rs_length = (ssize_t) hr->recorded_rs_length();
+  ssize_t rs_lengths_diff = (ssize_t) new_rs_length - old_rs_length;
+  _inc_cset_recorded_rs_lengths_diffs += rs_lengths_diff;
+
+  double old_elapsed_time_ms = hr->predicted_elapsed_time_ms();
+  double new_region_elapsed_time_ms = predict_region_elapsed_time_ms(hr, true);
+  double elapsed_ms_diff = new_region_elapsed_time_ms - old_elapsed_time_ms;
+  _inc_cset_predicted_elapsed_time_ms_diffs += elapsed_ms_diff;
+
+  hr->set_recorded_rs_length(new_rs_length);
+  hr->set_predicted_elapsed_time_ms(new_region_elapsed_time_ms);
 }
 
 void G1CollectorPolicy::add_region_to_incremental_cset_common(HeapRegion* hr) {
@@ -2591,6 +2642,7 @@ void G1CollectorPolicy::choose_collection_set(double target_pause_time_ms) {
   double non_young_start_time_sec = os::elapsedTime();
 
   YoungList* young_list = _g1->young_list();
+  finalize_incremental_cset_building();
 
   guarantee(target_pause_time_ms > 0.0,
             err_msg("target_pause_time_ms = %1.6lf should be positive",
@@ -2654,9 +2706,6 @@ void G1CollectorPolicy::choose_collection_set(double target_pause_time_ms) {
   // Clear the fields that point to the survivor list - they are all young now.
   young_list->clear_survivors();
 
-  if (_g1->mark_in_progress())
-    _g1->concurrent_mark()->register_collection_set_finger(_inc_cset_max_finger);
-
   _collection_set = _inc_cset_head;
   _collection_set_bytes_used_before = _inc_cset_bytes_used_before;
   time_remaining_ms -= _inc_cset_predicted_elapsed_time_ms;
diff --git a/hotspot/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp b/hotspot/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp
index 91aef4b2f5e..90660c00162 100644
--- a/hotspot/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -67,7 +67,7 @@ class MainBodySummary: public CHeapObj {
   define_num_seq(satb_drain) // optional
   define_num_seq(parallel) // parallel only
     define_num_seq(ext_root_scan)
-    define_num_seq(mark_stack_scan)
+    define_num_seq(satb_filtering)
     define_num_seq(update_rs)
     define_num_seq(scan_rs)
     define_num_seq(obj_copy)
@@ -83,6 +83,72 @@ public:
   virtual MainBodySummary*    main_body_summary()    { return this; }
 };
 
+// There are three command line options related to the young gen size:
+// NewSize, MaxNewSize and NewRatio (There is also -Xmn, but that is
+// just a short form for NewSize==MaxNewSize). G1 will use its internal
+// heuristics to calculate the actual young gen size, so these options
+// basically only limit the range within which G1 can pick a young gen
+// size. Also, these are general options taking byte sizes. G1 will
+// internally work with a number of regions instead. So, some rounding
+// will occur.
+//
+// If nothing related to the the young gen size is set on the command
+// line we should allow the young gen to be between
+// G1DefaultMinNewGenPercent and G1DefaultMaxNewGenPercent of the
+// heap size. This means that every time the heap size changes the
+// limits for the young gen size will be updated.
+//
+// If only -XX:NewSize is set we should use the specified value as the
+// minimum size for young gen. Still using G1DefaultMaxNewGenPercent
+// of the heap as maximum.
+//
+// If only -XX:MaxNewSize is set we should use the specified value as the
+// maximum size for young gen. Still using G1DefaultMinNewGenPercent
+// of the heap as minimum.
+//
+// If -XX:NewSize and -XX:MaxNewSize are both specified we use these values.
+// No updates when the heap size changes. There is a special case when
+// NewSize==MaxNewSize. This is interpreted as "fixed" and will use a
+// different heuristic for calculating the collection set when we do mixed
+// collection.
+//
+// If only -XX:NewRatio is set we should use the specified ratio of the heap
+// as both min and max. This will be interpreted as "fixed" just like the
+// NewSize==MaxNewSize case above. But we will update the min and max
+// everytime the heap size changes.
+//
+// NewSize and MaxNewSize override NewRatio. So, NewRatio is ignored if it is
+// combined with either NewSize or MaxNewSize. (A warning message is printed.)
+class G1YoungGenSizer : public CHeapObj {
+private:
+  enum SizerKind {
+    SizerDefaults,
+    SizerNewSizeOnly,
+    SizerMaxNewSizeOnly,
+    SizerMaxAndNewSize,
+    SizerNewRatio
+  };
+  SizerKind _sizer_kind;
+  size_t _min_desired_young_length;
+  size_t _max_desired_young_length;
+  bool _adaptive_size;
+  size_t calculate_default_min_length(size_t new_number_of_heap_regions);
+  size_t calculate_default_max_length(size_t new_number_of_heap_regions);
+
+public:
+  G1YoungGenSizer();
+  void heap_size_changed(size_t new_number_of_heap_regions);
+  size_t min_desired_young_length() {
+    return _min_desired_young_length;
+  }
+  size_t max_desired_young_length() {
+    return _max_desired_young_length;
+  }
+  bool adaptive_young_list_length() {
+    return _adaptive_size;
+  }
+};
+
 class G1CollectorPolicy: public CollectorPolicy {
 private:
   // either equal to the number of parallel threads, if ParallelGCThreads
@@ -149,7 +215,7 @@ private:
 
   double* _par_last_gc_worker_start_times_ms;
   double* _par_last_ext_root_scan_times_ms;
-  double* _par_last_mark_stack_scan_times_ms;
+  double* _par_last_satb_filtering_times_ms;
   double* _par_last_update_rs_times_ms;
   double* _par_last_update_rs_processed_buffers;
   double* _par_last_scan_rs_times_ms;
@@ -167,9 +233,6 @@ private:
   // indicates whether we are in young or mixed GC mode
   bool _gcs_are_young;
 
-  // if true, then it tries to dynamically adjust the length of the
-  // young list
-  bool _adaptive_young_list_length;
   size_t _young_list_target_length;
   size_t _young_list_fixed_length;
   size_t _prev_eden_capacity; // used for logging
@@ -227,9 +290,7 @@ private:
 
   TruncatedSeq* _young_gc_eff_seq;
 
-  bool   _using_new_ratio_calculations;
-  size_t _min_desired_young_length; // as set on the command line or default calculations
-  size_t _max_desired_young_length; // as set on the command line or default calculations
+  G1YoungGenSizer* _young_gen_sizer;
 
   size_t _eden_cset_region_length;
   size_t _survivor_cset_region_length;
@@ -588,16 +649,29 @@ private:
   // Used to record the highest end of heap region in collection set
   HeapWord* _inc_cset_max_finger;
 
-  // The RSet lengths recorded for regions in the collection set
-  // (updated by the periodic sampling of the regions in the
-  // young list/collection set).
+  // The RSet lengths recorded for regions in the CSet. It is updated
+  // by the thread that adds a new region to the CSet. We assume that
+  // only one thread can be allocating a new CSet region (currently,
+  // it does so after taking the Heap_lock) hence no need to
+  // synchronize updates to this field.
   size_t _inc_cset_recorded_rs_lengths;
 
-  // The predicted elapsed time it will take to collect the regions
-  // in the collection set (updated by the periodic sampling of the
-  // regions in the young list/collection set).
+  // A concurrent refinement thread periodcially samples the young
+  // region RSets and needs to update _inc_cset_recorded_rs_lengths as
+  // the RSets grow. Instead of having to syncronize updates to that
+  // field we accumulate them in this field and add it to
+  // _inc_cset_recorded_rs_lengths_diffs at the start of a GC.
+  ssize_t _inc_cset_recorded_rs_lengths_diffs;
+
+  // The predicted elapsed time it will take to collect the regions in
+  // the CSet. This is updated by the thread that adds a new region to
+  // the CSet. See the comment for _inc_cset_recorded_rs_lengths about
+  // MT-safety assumptions.
   double _inc_cset_predicted_elapsed_time_ms;
 
+  // See the comment for _inc_cset_recorded_rs_lengths_diffs.
+  double _inc_cset_predicted_elapsed_time_ms_diffs;
+
   // Stash a pointer to the g1 heap.
   G1CollectedHeap* _g1;
 
@@ -682,8 +756,6 @@ private:
   // Count the number of bytes used in the CS.
   void count_CS_bytes_used();
 
-  void update_young_list_size_using_newratio(size_t number_of_heap_regions);
-
 public:
 
   G1CollectorPolicy();
@@ -710,8 +782,6 @@ public:
   // This should be called after the heap is resized.
   void record_new_heap_size(size_t new_number_of_regions);
 
-public:
-
   void init();
 
   // Create jstat counters for the policy.
@@ -771,8 +841,8 @@ public:
     _par_last_ext_root_scan_times_ms[worker_i] = ms;
   }
 
-  void record_mark_stack_scan_time(int worker_i, double ms) {
-    _par_last_mark_stack_scan_times_ms[worker_i] = ms;
+  void record_satb_filtering_time(int worker_i, double ms) {
+    _par_last_satb_filtering_times_ms[worker_i] = ms;
   }
 
   void record_satb_drain_time(double ms) {
@@ -894,6 +964,10 @@ public:
   // Initialize incremental collection set info.
   void start_incremental_cset_building();
 
+  // Perform any final calculations on the incremental CSet fields
+  // before we can use them.
+  void finalize_incremental_cset_building();
+
   void clear_incremental_cset() {
     _inc_cset_head = NULL;
     _inc_cset_tail = NULL;
@@ -902,10 +976,9 @@ public:
   // Stop adding regions to the incremental collection set
   void stop_incremental_cset_building() { _inc_cset_build_state = Inactive; }
 
-  // Add/remove information about hr to the aggregated information
-  // for the incrementally built collection set.
+  // Add information about hr to the aggregated information for the
+  // incrementally built collection set.
   void add_to_incremental_cset_info(HeapRegion* hr, size_t rs_length);
-  void remove_from_incremental_cset_info(HeapRegion* hr);
 
   // Update information about hr in the aggregated information for
   // the incrementally built collection set.
@@ -998,10 +1071,7 @@ public:
   }
 
   bool adaptive_young_list_length() {
-    return _adaptive_young_list_length;
-  }
-  void set_adaptive_young_list_length(bool adaptive_young_list_length) {
-    _adaptive_young_list_length = adaptive_young_list_length;
+    return _young_gen_sizer->adaptive_young_list_length();
   }
 
   inline double get_gc_eff_factor() {
@@ -1076,6 +1146,11 @@ public:
     _survivor_surv_rate_group->stop_adding_regions();
   }
 
+  void tenure_all_objects() {
+    _max_survivor_regions = 0;
+    _tenuring_threshold = 0;
+  }
+
   void record_survivor_regions(size_t      regions,
                                HeapRegion* head,
                                HeapRegion* tail) {
diff --git a/hotspot/src/share/vm/gc_implementation/g1/g1EvacFailure.hpp b/hotspot/src/share/vm/gc_implementation/g1/g1EvacFailure.hpp
new file mode 100644
index 00000000000..37c30431410
--- /dev/null
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1EvacFailure.hpp
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef SHARE_VM_GC_IMPLEMENTATION_G1_G1EVACFAILURE_HPP
+#define SHARE_VM_GC_IMPLEMENTATION_G1_G1EVACFAILURE_HPP
+
+#include "gc_implementation/g1/concurrentMark.inline.hpp"
+#include "gc_implementation/g1/dirtyCardQueue.hpp"
+#include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
+#include "gc_implementation/g1/g1_globals.hpp"
+#include "gc_implementation/g1/g1OopClosures.inline.hpp"
+#include "gc_implementation/g1/heapRegion.hpp"
+#include "gc_implementation/g1/heapRegionRemSet.hpp"
+#include "utilities/workgroup.hpp"
+
+// Closures and tasks associated with any self-forwarding pointers
+// installed as a result of an evacuation failure.
+
+class UpdateRSetDeferred : public OopsInHeapRegionClosure {
+private:
+  G1CollectedHeap* _g1;
+  DirtyCardQueue *_dcq;
+  CardTableModRefBS* _ct_bs;
+
+public:
+  UpdateRSetDeferred(G1CollectedHeap* g1, DirtyCardQueue* dcq) :
+    _g1(g1), _ct_bs((CardTableModRefBS*)_g1->barrier_set()), _dcq(dcq) {}
+
+  virtual void do_oop(narrowOop* p) { do_oop_work(p); }
+  virtual void do_oop(      oop* p) { do_oop_work(p); }
+  template <class T> void do_oop_work(T* p) {
+    assert(_from->is_in_reserved(p), "paranoia");
+    if (!_from->is_in_reserved(oopDesc::load_decode_heap_oop(p)) &&
+        !_from->is_survivor()) {
+      size_t card_index = _ct_bs->index_for(p);
+      if (_ct_bs->mark_card_deferred(card_index)) {
+        _dcq->enqueue((jbyte*)_ct_bs->byte_for_index(card_index));
+      }
+    }
+  }
+};
+
+class RemoveSelfForwardPtrObjClosure: public ObjectClosure {
+private:
+  G1CollectedHeap* _g1;
+  ConcurrentMark* _cm;
+  HeapRegion* _hr;
+  size_t _marked_bytes;
+  OopsInHeapRegionClosure *_update_rset_cl;
+  bool _during_initial_mark;
+  bool _during_conc_mark;
+public:
+  RemoveSelfForwardPtrObjClosure(G1CollectedHeap* g1, ConcurrentMark* cm,
+                                 HeapRegion* hr,
+                                 OopsInHeapRegionClosure* update_rset_cl,
+                                 bool during_initial_mark,
+                                 bool during_conc_mark) :
+    _g1(g1), _cm(cm), _hr(hr), _marked_bytes(0),
+    _update_rset_cl(update_rset_cl),
+    _during_initial_mark(during_initial_mark),
+    _during_conc_mark(during_conc_mark) { }
+
+  size_t marked_bytes() { return _marked_bytes; }
+
+  // <original comment>
+  // The original idea here was to coalesce evacuated and dead objects.
+  // However that caused complications with the block offset table (BOT).
+  // In particular if there were two TLABs, one of them partially refined.
+  // |----- TLAB_1--------|----TLAB_2-~~~(partially refined part)~~~|
+  // The BOT entries of the unrefined part of TLAB_2 point to the start
+  // of TLAB_2. If the last object of the TLAB_1 and the first object
+  // of TLAB_2 are coalesced, then the cards of the unrefined part
+  // would point into middle of the filler object.
+  // The current approach is to not coalesce and leave the BOT contents intact.
+  // </original comment>
+  //
+  // We now reset the BOT when we start the object iteration over the
+  // region and refine its entries for every object we come across. So
+  // the above comment is not really relevant and we should be able
+  // to coalesce dead objects if we want to.
+  void do_object(oop obj) {
+    HeapWord* obj_addr = (HeapWord*) obj;
+    assert(_hr->is_in(obj_addr), "sanity");
+    size_t obj_size = obj->size();
+    _hr->update_bot_for_object(obj_addr, obj_size);
+
+    if (obj->is_forwarded() && obj->forwardee() == obj) {
+      // The object failed to move.
+
+      // We consider all objects that we find self-forwarded to be
+      // live. What we'll do is that we'll update the prev marking
+      // info so that they are all under PTAMS and explicitly marked.
+      _cm->markPrev(obj);
+      if (_during_initial_mark) {
+        // For the next marking info we'll only mark the
+        // self-forwarded objects explicitly if we are during
+        // initial-mark (since, normally, we only mark objects pointed
+        // to by roots if we succeed in copying them). By marking all
+        // self-forwarded objects we ensure that we mark any that are
+        // still pointed to be roots. During concurrent marking, and
+        // after initial-mark, we don't need to mark any objects
+        // explicitly and all objects in the CSet are considered
+        // (implicitly) live. So, we won't mark them explicitly and
+        // we'll leave them over NTAMS.
+        _cm->markNext(obj);
+      }
+      _marked_bytes += (obj_size * HeapWordSize);
+      obj->set_mark(markOopDesc::prototype());
+
+      // While we were processing RSet buffers during the collection,
+      // we actually didn't scan any cards on the collection set,
+      // since we didn't want to update remembered sets with entries
+      // that point into the collection set, given that live objects
+      // from the collection set are about to move and such entries
+      // will be stale very soon.
+      // This change also dealt with a reliability issue which
+      // involved scanning a card in the collection set and coming
+      // across an array that was being chunked and looking malformed.
+      // The problem is that, if evacuation fails, we might have
+      // remembered set entries missing given that we skipped cards on
+      // the collection set. So, we'll recreate such entries now.
+      obj->oop_iterate(_update_rset_cl);
+      assert(_cm->isPrevMarked(obj), "Should be marked!");
+    } else {
+      // The object has been either evacuated or is dead. Fill it with a
+      // dummy object.
+      MemRegion mr((HeapWord*) obj, obj_size);
+      CollectedHeap::fill_with_object(mr);
+    }
+  }
+};
+
+class RemoveSelfForwardPtrHRClosure: public HeapRegionClosure {
+  G1CollectedHeap* _g1h;
+  ConcurrentMark* _cm;
+  OopsInHeapRegionClosure *_update_rset_cl;
+
+public:
+  RemoveSelfForwardPtrHRClosure(G1CollectedHeap* g1h,
+                                OopsInHeapRegionClosure* update_rset_cl) :
+    _g1h(g1h), _update_rset_cl(update_rset_cl),
+    _cm(_g1h->concurrent_mark()) { }
+
+  bool doHeapRegion(HeapRegion *hr) {
+    bool during_initial_mark = _g1h->g1_policy()->during_initial_mark_pause();
+    bool during_conc_mark = _g1h->mark_in_progress();
+
+    assert(!hr->isHumongous(), "sanity");
+    assert(hr->in_collection_set(), "bad CS");
+
+    if (hr->claimHeapRegion(HeapRegion::ParEvacFailureClaimValue)) {
+      if (hr->evacuation_failed()) {
+        RemoveSelfForwardPtrObjClosure rspc(_g1h, _cm, hr, _update_rset_cl,
+                                            during_initial_mark,
+                                            during_conc_mark);
+
+        MemRegion mr(hr->bottom(), hr->end());
+        // We'll recreate the prev marking info so we'll first clear
+        // the prev bitmap range for this region. We never mark any
+        // CSet objects explicitly so the next bitmap range should be
+        // cleared anyway.
+        _cm->clearRangePrevBitmap(mr);
+
+        hr->note_self_forwarding_removal_start(during_initial_mark,
+                                               during_conc_mark);
+
+        // In the common case (i.e. when there is no evacuation
+        // failure) we make sure that the following is done when
+        // the region is freed so that it is "ready-to-go" when it's
+        // re-allocated. However, when evacuation failure happens, a
+        // region will remain in the heap and might ultimately be added
+        // to a CSet in the future. So we have to be careful here and
+        // make sure the region's RSet is ready for parallel iteration
+        // whenever this might be required in the future.
+        hr->rem_set()->reset_for_par_iteration();
+        hr->reset_bot();
+        _update_rset_cl->set_region(hr);
+        hr->object_iterate(&rspc);
+
+        hr->note_self_forwarding_removal_end(during_initial_mark,
+                                             during_conc_mark,
+                                             rspc.marked_bytes());
+      }
+    }
+    return false;
+  }
+};
+
+class G1ParRemoveSelfForwardPtrsTask: public AbstractGangTask {
+protected:
+  G1CollectedHeap* _g1h;
+
+public:
+  G1ParRemoveSelfForwardPtrsTask(G1CollectedHeap* g1h) :
+    AbstractGangTask("G1 Remove Self-forwarding Pointers"),
+    _g1h(g1h) { }
+
+  void work(uint worker_id) {
+    UpdateRSetImmediate immediate_update(_g1h->g1_rem_set());
+    DirtyCardQueue dcq(&_g1h->dirty_card_queue_set());
+    UpdateRSetDeferred deferred_update(_g1h, &dcq);
+
+    OopsInHeapRegionClosure *update_rset_cl = &deferred_update;
+    if (!G1DeferredRSUpdate) {
+      update_rset_cl = &immediate_update;
+    }
+
+    RemoveSelfForwardPtrHRClosure rsfp_cl(_g1h, update_rset_cl);
+
+    HeapRegion* hr = _g1h->start_cset_region_for_worker(worker_id);
+    _g1h->collection_set_iterate_from(hr, &rsfp_cl);
+  }
+};
+
+#endif // SHARE_VM_GC_IMPLEMENTATION_G1_G1EVACFAILURE_HPP
diff --git a/hotspot/src/share/vm/gc_implementation/g1/g1OopClosures.hpp b/hotspot/src/share/vm/gc_implementation/g1/g1OopClosures.hpp
index ace59ead402..fc28d9611b9 100644
--- a/hotspot/src/share/vm/gc_implementation/g1/g1OopClosures.hpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1OopClosures.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -121,17 +121,25 @@ public:
 class G1ParCopyHelper : public G1ParClosureSuper {
   G1ParScanClosure *_scanner;
 protected:
-  template <class T> void mark_object(T* p);
-  oop copy_to_survivor_space(oop obj, bool should_mark_root,
-                                      bool should_mark_copy);
+  // Mark the object if it's not already marked. This is used to mark
+  // objects pointed to by roots that are guaranteed not to move
+  // during the GC (i.e., non-CSet objects). It is MT-safe.
+  void mark_object(oop obj);
+
+  // Mark the object if it's not already marked. This is used to mark
+  // objects pointed to by roots that have been forwarded during a
+  // GC. It is MT-safe.
+  void mark_forwarded_object(oop from_obj, oop to_obj);
+
+  oop copy_to_survivor_space(oop obj);
+
 public:
   G1ParCopyHelper(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state,
                   G1ParScanClosure *scanner) :
     G1ParClosureSuper(g1, par_scan_state), _scanner(scanner) { }
 };
 
-template<bool do_gen_barrier, G1Barrier barrier,
-         bool do_mark_object>
+template <bool do_gen_barrier, G1Barrier barrier, bool do_mark_object>
 class G1ParCopyClosure : public G1ParCopyHelper {
   G1ParScanClosure _scanner;
 
@@ -140,9 +148,8 @@ class G1ParCopyClosure : public G1ParCopyHelper {
 public:
   G1ParCopyClosure(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state,
                    ReferenceProcessor* rp) :
-    _scanner(g1, par_scan_state, rp),
-    G1ParCopyHelper(g1, par_scan_state, &_scanner)
-  {
+      _scanner(g1, par_scan_state, rp),
+      G1ParCopyHelper(g1, par_scan_state, &_scanner) {
     assert(_ref_processor == NULL, "sanity");
   }
 
diff --git a/hotspot/src/share/vm/gc_implementation/g1/g1RemSet.cpp b/hotspot/src/share/vm/gc_implementation/g1/g1RemSet.cpp
index f8d96180de5..4a98258b996 100644
--- a/hotspot/src/share/vm/gc_implementation/g1/g1RemSet.cpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1RemSet.cpp
@@ -558,11 +558,11 @@ void G1RemSet::scrub(BitMap* region_bm, BitMap* card_bm) {
 }
 
 void G1RemSet::scrub_par(BitMap* region_bm, BitMap* card_bm,
-                                int worker_num, int claim_val) {
+                                uint worker_num, int claim_val) {
   ScrubRSClosure scrub_cl(region_bm, card_bm);
   _g1->heap_region_par_iterate_chunked(&scrub_cl,
                                        worker_num,
-                                       (int) n_workers(),
+                                       n_workers(),
                                        claim_val);
 }
 
diff --git a/hotspot/src/share/vm/gc_implementation/g1/g1RemSet.hpp b/hotspot/src/share/vm/gc_implementation/g1/g1RemSet.hpp
index 330134da6f0..79d550ef27d 100644
--- a/hotspot/src/share/vm/gc_implementation/g1/g1RemSet.hpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1RemSet.hpp
@@ -40,7 +40,7 @@ class G1RemSet: public CHeapObj {
 protected:
   G1CollectedHeap* _g1;
   unsigned _conc_refine_cards;
-  size_t n_workers();
+  uint n_workers();
 
 protected:
   enum SomePrivateConstants {
@@ -122,7 +122,7 @@ public:
   // parallel thread id of the current thread, and "claim_val" is the
   // value that should be used to claim heap regions.
   void scrub_par(BitMap* region_bm, BitMap* card_bm,
-                 int worker_num, int claim_val);
+                 uint worker_num, int claim_val);
 
   // Refine the card corresponding to "card_ptr".  If "sts" is non-NULL,
   // join and leave around parts that must be atomic wrt GC.  (NULL means
diff --git a/hotspot/src/share/vm/gc_implementation/g1/g1RemSet.inline.hpp b/hotspot/src/share/vm/gc_implementation/g1/g1RemSet.inline.hpp
index 29b44b2e1b7..7491e3a8509 100644
--- a/hotspot/src/share/vm/gc_implementation/g1/g1RemSet.inline.hpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1RemSet.inline.hpp
@@ -29,7 +29,7 @@
 #include "gc_implementation/g1/heapRegionRemSet.hpp"
 #include "oops/oop.inline.hpp"
 
-inline size_t G1RemSet::n_workers() {
+inline uint G1RemSet::n_workers() {
   if (_g1->workers() != NULL) {
     return _g1->workers()->total_workers();
   } else {
diff --git a/hotspot/src/share/vm/gc_implementation/g1/g1_globals.hpp b/hotspot/src/share/vm/gc_implementation/g1/g1_globals.hpp
index facf4237bff..d1264587841 100644
--- a/hotspot/src/share/vm/gc_implementation/g1/g1_globals.hpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1_globals.hpp
@@ -289,7 +289,15 @@
                                                                             \
   develop(uintx, G1ConcMarkForceOverflow, 0,                                \
           "The number of times we'll force an overflow during "             \
-          "concurrent marking")
+          "concurrent marking")                                             \
+                                                                            \
+  develop(uintx, G1DefaultMinNewGenPercent, 20,                             \
+          "Percentage (0-100) of the heap size to use as minimum "          \
+          "young gen size.")                                                \
+                                                                            \
+  develop(uintx, G1DefaultMaxNewGenPercent, 80,                             \
+          "Percentage (0-100) of the heap size to use as maximum "          \
+          "young gen size.")
 
 G1_FLAGS(DECLARE_DEVELOPER_FLAG, DECLARE_PD_DEVELOPER_FLAG, DECLARE_PRODUCT_FLAG, DECLARE_PD_PRODUCT_FLAG, DECLARE_DIAGNOSTIC_FLAG, DECLARE_EXPERIMENTAL_FLAG, DECLARE_NOTPRODUCT_FLAG, DECLARE_MANAGEABLE_FLAG, DECLARE_PRODUCT_RW_FLAG)
 
diff --git a/hotspot/src/share/vm/gc_implementation/g1/heapRegion.cpp b/hotspot/src/share/vm/gc_implementation/g1/heapRegion.cpp
index 98b7ebf0cbc..336e4cab5a6 100644
--- a/hotspot/src/share/vm/gc_implementation/g1/heapRegion.cpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/heapRegion.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -94,7 +94,8 @@ public:
 #endif // PRODUCT
   }
 
-  template <class T> void do_oop_work(T* p) {
+  template <class T>
+  void do_oop_work(T* p) {
     assert(_containing_obj != NULL, "Precondition");
     assert(!_g1h->is_obj_dead_cond(_containing_obj, _vo),
            "Precondition");
@@ -102,8 +103,10 @@ public:
     if (!oopDesc::is_null(heap_oop)) {
       oop obj = oopDesc::decode_heap_oop_not_null(heap_oop);
       bool failed = false;
-      if (!_g1h->is_in_closed_subset(obj) ||
-          _g1h->is_obj_dead_cond(obj, _vo)) {
+      if (!_g1h->is_in_closed_subset(obj) || _g1h->is_obj_dead_cond(obj, _vo)) {
+        MutexLockerEx x(ParGCRareEvent_lock,
+                        Mutex::_no_safepoint_check_flag);
+
         if (!_failures) {
           gclog_or_tty->print_cr("");
           gclog_or_tty->print_cr("----------");
@@ -133,6 +136,7 @@ public:
           print_object(gclog_or_tty, obj);
         }
         gclog_or_tty->print_cr("----------");
+        gclog_or_tty->flush();
         _failures = true;
         failed = true;
         _n_failures++;
@@ -155,6 +159,9 @@ public:
                                   cv_field == dirty
                                : cv_obj == dirty || cv_field == dirty));
           if (is_bad) {
+            MutexLockerEx x(ParGCRareEvent_lock,
+                            Mutex::_no_safepoint_check_flag);
+
             if (!_failures) {
               gclog_or_tty->print_cr("");
               gclog_or_tty->print_cr("----------");
@@ -174,6 +181,7 @@ public:
             gclog_or_tty->print_cr("Obj head CTE = %d, field CTE = %d.",
                           cv_obj, cv_field);
             gclog_or_tty->print_cr("----------");
+            gclog_or_tty->flush();
             _failures = true;
             if (!failed) _n_failures++;
           }
@@ -567,6 +575,40 @@ void HeapRegion::oop_before_save_marks_iterate(OopClosure* cl) {
   oops_in_mr_iterate(MemRegion(bottom(), saved_mark_word()), cl);
 }
 
+void HeapRegion::note_self_forwarding_removal_start(bool during_initial_mark,
+                                                    bool during_conc_mark) {
+  // We always recreate the prev marking info and we'll explicitly
+  // mark all objects we find to be self-forwarded on the prev
+  // bitmap. So all objects need to be below PTAMS.
+  _prev_top_at_mark_start = top();
+  _prev_marked_bytes = 0;
+
+  if (during_initial_mark) {
+    // During initial-mark, we'll also explicitly mark all objects
+    // we find to be self-forwarded on the next bitmap. So all
+    // objects need to be below NTAMS.
+    _next_top_at_mark_start = top();
+    set_top_at_conc_mark_count(bottom());
+    _next_marked_bytes = 0;
+  } else if (during_conc_mark) {
+    // During concurrent mark, all objects in the CSet (including
+    // the ones we find to be self-forwarded) are implicitly live.
+    // So all objects need to be above NTAMS.
+    _next_top_at_mark_start = bottom();
+    set_top_at_conc_mark_count(bottom());
+    _next_marked_bytes = 0;
+  }
+}
+
+void HeapRegion::note_self_forwarding_removal_end(bool during_initial_mark,
+                                                  bool during_conc_mark,
+                                                  size_t marked_bytes) {
+  assert(0 <= marked_bytes && marked_bytes <= used(),
+         err_msg("marked: "SIZE_FORMAT" used: "SIZE_FORMAT,
+                 marked_bytes, used()));
+  _prev_marked_bytes = marked_bytes;
+}
+
 HeapWord*
 HeapRegion::object_iterate_mem_careful(MemRegion mr,
                                                  ObjectClosure* cl) {
diff --git a/hotspot/src/share/vm/gc_implementation/g1/heapRegion.hpp b/hotspot/src/share/vm/gc_implementation/g1/heapRegion.hpp
index ccd05e0cc7c..ad1599c5ab8 100644
--- a/hotspot/src/share/vm/gc_implementation/g1/heapRegion.hpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/heapRegion.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -373,7 +373,8 @@ class HeapRegion: public G1OffsetTableContigSpace {
     ScrubRemSetClaimValue      = 3,
     ParVerifyClaimValue        = 4,
     RebuildRSClaimValue        = 5,
-    CompleteMarkCSetClaimValue = 6
+    CompleteMarkCSetClaimValue = 6,
+    ParEvacFailureClaimValue   = 7
   };
 
   inline HeapWord* par_allocate_no_bot_updates(size_t word_size) {
@@ -582,37 +583,33 @@ class HeapRegion: public G1OffsetTableContigSpace {
   // that the collector is about to start or has finished (concurrently)
   // marking the heap.
 
-  // Note the start of a marking phase. Record the
-  // start of the unmarked area of the region here.
-  void note_start_of_marking(bool during_initial_mark) {
-    init_top_at_conc_mark_count();
-    _next_marked_bytes = 0;
-    if (during_initial_mark && is_young() && !is_survivor())
-      _next_top_at_mark_start = bottom();
-    else
-      _next_top_at_mark_start = top();
-  }
+  // Notify the region that concurrent marking is starting. Initialize
+  // all fields related to the next marking info.
+  inline void note_start_of_marking();
 
-  // Note the end of a marking phase. Install the start of
-  // the unmarked area that was captured at start of marking.
-  void note_end_of_marking() {
-    _prev_top_at_mark_start = _next_top_at_mark_start;
-    _prev_marked_bytes = _next_marked_bytes;
-    _next_marked_bytes = 0;
+  // Notify the region that concurrent marking has finished. Copy the
+  // (now finalized) next marking info fields into the prev marking
+  // info fields.
+  inline void note_end_of_marking();
 
-    guarantee(_prev_marked_bytes <=
-              (size_t) (prev_top_at_mark_start() - bottom()) * HeapWordSize,
-              "invariant");
-  }
+  // Notify the region that it will be used as to-space during a GC
+  // and we are about to start copying objects into it.
+  inline void note_start_of_copying(bool during_initial_mark);
 
-  // After an evacuation, we need to update _next_top_at_mark_start
-  // to be the current top.  Note this is only valid if we have only
-  // ever evacuated into this region.  If we evacuate, allocate, and
-  // then evacuate we are in deep doodoo.
-  void note_end_of_copying() {
-    assert(top() >= _next_top_at_mark_start, "Increase only");
-    _next_top_at_mark_start = top();
-  }
+  // Notify the region that it ceases being to-space during a GC and
+  // we will not copy objects into it any more.
+  inline void note_end_of_copying(bool during_initial_mark);
+
+  // Notify the region that we are about to start processing
+  // self-forwarded objects during evac failure handling.
+  void note_self_forwarding_removal_start(bool during_initial_mark,
+                                          bool during_conc_mark);
+
+  // Notify the region that we have finished processing self-forwarded
+  // objects during evac failure handling.
+  void note_self_forwarding_removal_end(bool during_initial_mark,
+                                        bool during_conc_mark,
+                                        size_t marked_bytes);
 
   // Returns "false" iff no object in the region was allocated when the
   // last mark phase ended.
diff --git a/hotspot/src/share/vm/gc_implementation/g1/heapRegion.inline.hpp b/hotspot/src/share/vm/gc_implementation/g1/heapRegion.inline.hpp
index 93cad742435..55d05d998fe 100644
--- a/hotspot/src/share/vm/gc_implementation/g1/heapRegion.inline.hpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/heapRegion.inline.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -55,4 +55,71 @@ G1OffsetTableContigSpace::block_start_const(const void* p) const {
   return _offsets.block_start_const(p);
 }
 
+inline void HeapRegion::note_start_of_marking() {
+  init_top_at_conc_mark_count();
+  _next_marked_bytes = 0;
+  _next_top_at_mark_start = top();
+}
+
+inline void HeapRegion::note_end_of_marking() {
+  _prev_top_at_mark_start = _next_top_at_mark_start;
+  _prev_marked_bytes = _next_marked_bytes;
+  _next_marked_bytes = 0;
+
+  assert(_prev_marked_bytes <=
+         (size_t) pointer_delta(prev_top_at_mark_start(), bottom()) *
+         HeapWordSize, "invariant");
+}
+
+inline void HeapRegion::note_start_of_copying(bool during_initial_mark) {
+  if (during_initial_mark) {
+    if (is_survivor()) {
+      assert(false, "should not allocate survivors during IM");
+    } else {
+      // During initial-mark we'll explicitly mark any objects on old
+      // regions that are pointed to by roots. Given that explicit
+      // marks only make sense under NTAMS it'd be nice if we could
+      // check that condition if we wanted to. Given that we don't
+      // know where the top of this region will end up, we simply set
+      // NTAMS to the end of the region so all marks will be below
+      // NTAMS. We'll set it to the actual top when we retire this region.
+      _next_top_at_mark_start = end();
+    }
+  } else {
+    if (is_survivor()) {
+      // This is how we always allocate survivors.
+      assert(_next_top_at_mark_start == bottom(), "invariant");
+    } else {
+      // We could have re-used this old region as to-space over a
+      // couple of GCs since the start of the concurrent marking
+      // cycle. This means that [bottom,NTAMS) will contain objects
+      // copied up to and including initial-mark and [NTAMS, top)
+      // will contain objects copied during the concurrent marking cycle.
+      assert(top() >= _next_top_at_mark_start, "invariant");
+    }
+  }
+}
+
+inline void HeapRegion::note_end_of_copying(bool during_initial_mark) {
+  if (during_initial_mark) {
+    if (is_survivor()) {
+      assert(false, "should not allocate survivors during IM");
+    } else {
+      // See the comment for note_start_of_copying() for the details
+      // on this.
+      assert(_next_top_at_mark_start == end(), "pre-condition");
+      _next_top_at_mark_start = top();
+    }
+  } else {
+    if (is_survivor()) {
+      // This is how we always allocate survivors.
+      assert(_next_top_at_mark_start == bottom(), "invariant");
+    } else {
+      // See the comment for note_start_of_copying() for the details
+      // on this.
+      assert(top() >= _next_top_at_mark_start, "invariant");
+    }
+  }
+}
+
 #endif // SHARE_VM_GC_IMPLEMENTATION_G1_HEAPREGION_INLINE_HPP
diff --git a/hotspot/src/share/vm/gc_implementation/g1/ptrQueue.hpp b/hotspot/src/share/vm/gc_implementation/g1/ptrQueue.hpp
index 55a96fce129..1f6d9b21388 100644
--- a/hotspot/src/share/vm/gc_implementation/g1/ptrQueue.hpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/ptrQueue.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -70,7 +70,7 @@ public:
   // given PtrQueueSet.
   PtrQueue(PtrQueueSet* qset, bool perm = false, bool active = false);
   // Release any contained resources.
-  void flush();
+  virtual void flush();
   // Calls flush() when destroyed.
   ~PtrQueue() { flush(); }
 
diff --git a/hotspot/src/share/vm/gc_implementation/g1/satbQueue.cpp b/hotspot/src/share/vm/gc_implementation/g1/satbQueue.cpp
index ea0c19a89a8..69b01bc69f5 100644
--- a/hotspot/src/share/vm/gc_implementation/g1/satbQueue.cpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/satbQueue.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -31,6 +31,14 @@
 #include "runtime/thread.hpp"
 #include "runtime/vmThread.hpp"
 
+void ObjPtrQueue::flush() {
+  // The buffer might contain refs into the CSet. We have to filter it
+  // first before we flush it, otherwise we might end up with an
+  // enqueued buffer with refs into the CSet which breaks our invariants.
+  filter();
+  PtrQueue::flush();
+}
+
 // This method removes entries from an SATB buffer that will not be
 // useful to the concurrent marking threads. An entry is removed if it
 // satisfies one of the following conditions:
@@ -44,38 +52,27 @@
 //     process it again).
 //
 // The rest of the entries will be retained and are compacted towards
-// the top of the buffer. If with this filtering we clear a large
-// enough chunk of the buffer we can re-use it (instead of enqueueing
-// it) and we can just allow the mutator to carry on executing.
-
-bool ObjPtrQueue::should_enqueue_buffer() {
-  assert(_lock == NULL || _lock->owned_by_self(),
-         "we should have taken the lock before calling this");
-
-  // A value of 0 means "don't filter SATB buffers".
-  if (G1SATBBufferEnqueueingThresholdPercent == 0) {
-    return true;
-  }
+// the top of the buffer. Note that, because we do not allow old
+// regions in the CSet during marking, all objects on the CSet regions
+// are young (eden or survivors) and therefore implicitly live. So any
+// references into the CSet will be removed during filtering.
 
+void ObjPtrQueue::filter() {
   G1CollectedHeap* g1h = G1CollectedHeap::heap();
-
-  // This method should only be called if there is a non-NULL buffer
-  // that is full.
-  assert(_index == 0, "pre-condition");
-  assert(_buf != NULL, "pre-condition");
-
   void** buf = _buf;
   size_t sz = _sz;
 
+  if (buf == NULL) {
+    // nothing to do
+    return;
+  }
+
   // Used for sanity checking at the end of the loop.
   debug_only(size_t entries = 0; size_t retained = 0;)
 
   size_t i = sz;
   size_t new_index = sz;
 
-  // Given that we are expecting _index == 0, we could have changed
-  // the loop condition to (i > 0). But we are using _index for
-  // generality.
   while (i > _index) {
     assert(i > 0, "we should have at least one more entry to process");
     i -= oopSize;
@@ -103,20 +100,56 @@ bool ObjPtrQueue::should_enqueue_buffer() {
       debug_only(retained += 1;)
     }
   }
+
+#ifdef ASSERT
   size_t entries_calc = (sz - _index) / oopSize;
   assert(entries == entries_calc, "the number of entries we counted "
          "should match the number of entries we calculated");
   size_t retained_calc = (sz - new_index) / oopSize;
   assert(retained == retained_calc, "the number of retained entries we counted "
          "should match the number of retained entries we calculated");
-  size_t perc = retained_calc * 100 / entries_calc;
-  bool should_enqueue = perc > (size_t) G1SATBBufferEnqueueingThresholdPercent;
-  _index = new_index;
+#endif // ASSERT
 
+  _index = new_index;
+}
+
+// This method will first apply the above filtering to the buffer. If
+// post-filtering a large enough chunk of the buffer has been cleared
+// we can re-use the buffer (instead of enqueueing it) and we can just
+// allow the mutator to carry on executing using the same buffer
+// instead of replacing it.
+
+bool ObjPtrQueue::should_enqueue_buffer() {
+  assert(_lock == NULL || _lock->owned_by_self(),
+         "we should have taken the lock before calling this");
+
+  // Even if G1SATBBufferEnqueueingThresholdPercent == 0 we have to
+  // filter the buffer given that this will remove any references into
+  // the CSet as we currently assume that no such refs will appear in
+  // enqueued buffers.
+
+  // This method should only be called if there is a non-NULL buffer
+  // that is full.
+  assert(_index == 0, "pre-condition");
+  assert(_buf != NULL, "pre-condition");
+
+  filter();
+
+  size_t sz = _sz;
+  size_t all_entries = sz / oopSize;
+  size_t retained_entries = (sz - _index) / oopSize;
+  size_t perc = retained_entries * 100 / all_entries;
+  bool should_enqueue = perc > (size_t) G1SATBBufferEnqueueingThresholdPercent;
   return should_enqueue;
 }
 
 void ObjPtrQueue::apply_closure(ObjectClosure* cl) {
+  if (_buf != NULL) {
+    apply_closure_to_buffer(cl, _buf, _index, _sz);
+  }
+}
+
+void ObjPtrQueue::apply_closure_and_empty(ObjectClosure* cl) {
   if (_buf != NULL) {
     apply_closure_to_buffer(cl, _buf, _index, _sz);
     _index = _sz;
@@ -135,6 +168,21 @@ void ObjPtrQueue::apply_closure_to_buffer(ObjectClosure* cl,
   }
 }
 
+#ifndef PRODUCT
+// Helpful for debugging
+
+void ObjPtrQueue::print(const char* name) {
+  print(name, _buf, _index, _sz);
+}
+
+void ObjPtrQueue::print(const char* name,
+                        void** buf, size_t index, size_t sz) {
+  gclog_or_tty->print_cr("  SATB BUFFER [%s] buf: "PTR_FORMAT" "
+                         "index: "SIZE_FORMAT" sz: "SIZE_FORMAT,
+                         name, buf, index, sz);
+}
+#endif // PRODUCT
+
 #ifdef ASSERT
 void ObjPtrQueue::verify_oops_in_buffer() {
   if (_buf == NULL) return;
@@ -150,12 +198,9 @@ void ObjPtrQueue::verify_oops_in_buffer() {
 #pragma warning( disable:4355 ) // 'this' : used in base member initializer list
 #endif // _MSC_VER
 
-
 SATBMarkQueueSet::SATBMarkQueueSet() :
-  PtrQueueSet(),
-  _closure(NULL), _par_closures(NULL),
-  _shared_satb_queue(this, true /*perm*/)
-{}
+  PtrQueueSet(), _closure(NULL), _par_closures(NULL),
+  _shared_satb_queue(this, true /*perm*/) { }
 
 void SATBMarkQueueSet::initialize(Monitor* cbl_mon, Mutex* fl_lock,
                                   int process_completed_threshold,
@@ -167,7 +212,6 @@ void SATBMarkQueueSet::initialize(Monitor* cbl_mon, Mutex* fl_lock,
   }
 }
 
-
 void SATBMarkQueueSet::handle_zero_index_for_thread(JavaThread* t) {
   DEBUG_ONLY(t->satb_mark_queue().verify_oops_in_buffer();)
   t->satb_mark_queue().handle_zero_index();
@@ -228,6 +272,13 @@ void SATBMarkQueueSet::set_active_all_threads(bool b,
   }
 }
 
+void SATBMarkQueueSet::filter_thread_buffers() {
+  for(JavaThread* t = Threads::first(); t; t = t->next()) {
+    t->satb_mark_queue().filter();
+  }
+  shared_satb_queue()->filter();
+}
+
 void SATBMarkQueueSet::set_closure(ObjectClosure* closure) {
   _closure = closure;
 }
@@ -239,9 +290,9 @@ void SATBMarkQueueSet::set_par_closure(int i, ObjectClosure* par_closure) {
 
 void SATBMarkQueueSet::iterate_closure_all_threads() {
   for(JavaThread* t = Threads::first(); t; t = t->next()) {
-    t->satb_mark_queue().apply_closure(_closure);
+    t->satb_mark_queue().apply_closure_and_empty(_closure);
   }
-  shared_satb_queue()->apply_closure(_closure);
+  shared_satb_queue()->apply_closure_and_empty(_closure);
 }
 
 void SATBMarkQueueSet::par_iterate_closure_all_threads(int worker) {
@@ -250,7 +301,7 @@ void SATBMarkQueueSet::par_iterate_closure_all_threads(int worker) {
 
   for(JavaThread* t = Threads::first(); t; t = t->next()) {
     if (t->claim_oops_do(true, parity)) {
-      t->satb_mark_queue().apply_closure(_par_closures[worker]);
+      t->satb_mark_queue().apply_closure_and_empty(_par_closures[worker]);
     }
   }
 
@@ -264,7 +315,7 @@ void SATBMarkQueueSet::par_iterate_closure_all_threads(int worker) {
 
   VMThread* vmt = VMThread::vm_thread();
   if (vmt->claim_oops_do(true, parity)) {
-    shared_satb_queue()->apply_closure(_par_closures[worker]);
+    shared_satb_queue()->apply_closure_and_empty(_par_closures[worker]);
   }
 }
 
@@ -292,6 +343,61 @@ bool SATBMarkQueueSet::apply_closure_to_completed_buffer_work(bool par,
   }
 }
 
+void SATBMarkQueueSet::iterate_completed_buffers_read_only(ObjectClosure* cl) {
+  assert(SafepointSynchronize::is_at_safepoint(), "Must be at safepoint.");
+  assert(cl != NULL, "pre-condition");
+
+  BufferNode* nd = _completed_buffers_head;
+  while (nd != NULL) {
+    void** buf = BufferNode::make_buffer_from_node(nd);
+    ObjPtrQueue::apply_closure_to_buffer(cl, buf, 0, _sz);
+    nd = nd->next();
+  }
+}
+
+void SATBMarkQueueSet::iterate_thread_buffers_read_only(ObjectClosure* cl) {
+  assert(SafepointSynchronize::is_at_safepoint(), "Must be at safepoint.");
+  assert(cl != NULL, "pre-condition");
+
+  for (JavaThread* t = Threads::first(); t; t = t->next()) {
+    t->satb_mark_queue().apply_closure(cl);
+  }
+  shared_satb_queue()->apply_closure(cl);
+}
+
+#ifndef PRODUCT
+// Helpful for debugging
+
+#define SATB_PRINTER_BUFFER_SIZE 256
+
+void SATBMarkQueueSet::print_all(const char* msg) {
+  char buffer[SATB_PRINTER_BUFFER_SIZE];
+  assert(SafepointSynchronize::is_at_safepoint(), "Must be at safepoint.");
+
+  gclog_or_tty->cr();
+  gclog_or_tty->print_cr("SATB BUFFERS [%s]", msg);
+
+  BufferNode* nd = _completed_buffers_head;
+  int i = 0;
+  while (nd != NULL) {
+    void** buf = BufferNode::make_buffer_from_node(nd);
+    jio_snprintf(buffer, SATB_PRINTER_BUFFER_SIZE, "Enqueued: %d", i);
+    ObjPtrQueue::print(buffer, buf, 0, _sz);
+    nd = nd->next();
+    i += 1;
+  }
+
+  for (JavaThread* t = Threads::first(); t; t = t->next()) {
+    jio_snprintf(buffer, SATB_PRINTER_BUFFER_SIZE, "Thread: %s", t->name());
+    t->satb_mark_queue().print(buffer);
+  }
+
+  shared_satb_queue()->print("Shared");
+
+  gclog_or_tty->cr();
+}
+#endif // PRODUCT
+
 void SATBMarkQueueSet::abandon_partial_marking() {
   BufferNode* buffers_to_delete = NULL;
   {
@@ -316,5 +422,5 @@ void SATBMarkQueueSet::abandon_partial_marking() {
   for (JavaThread* t = Threads::first(); t; t = t->next()) {
     t->satb_mark_queue().reset();
   }
-  shared_satb_queue()->reset();
+ shared_satb_queue()->reset();
 }
diff --git a/hotspot/src/share/vm/gc_implementation/g1/satbQueue.hpp b/hotspot/src/share/vm/gc_implementation/g1/satbQueue.hpp
index 5a44b1695e1..adce5ff687e 100644
--- a/hotspot/src/share/vm/gc_implementation/g1/satbQueue.hpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/satbQueue.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -29,9 +29,26 @@
 
 class ObjectClosure;
 class JavaThread;
+class SATBMarkQueueSet;
 
 // A ptrQueue whose elements are "oops", pointers to object heads.
 class ObjPtrQueue: public PtrQueue {
+  friend class SATBMarkQueueSet;
+
+private:
+  // Filter out unwanted entries from the buffer.
+  void filter();
+
+  // Apply the closure to all elements.
+  void apply_closure(ObjectClosure* cl);
+
+  // Apply the closure to all elements and empty the buffer;
+  void apply_closure_and_empty(ObjectClosure* cl);
+
+  // Apply the closure to all elements of "buf", down to "index" (inclusive.)
+  static void apply_closure_to_buffer(ObjectClosure* cl,
+                                      void** buf, size_t index, size_t sz);
+
 public:
   ObjPtrQueue(PtrQueueSet* qset, bool perm = false) :
     // SATB queues are only active during marking cycles. We create
@@ -41,23 +58,23 @@ public:
     // field to true. This is done in JavaThread::initialize_queues().
     PtrQueue(qset, perm, false /* active */) { }
 
+  // Overrides PtrQueue::flush() so that it can filter the buffer
+  // before it is flushed.
+  virtual void flush();
+
   // Overrides PtrQueue::should_enqueue_buffer(). See the method's
   // definition for more information.
   virtual bool should_enqueue_buffer();
 
-  // Apply the closure to all elements, and reset the index to make the
-  // buffer empty.
-  void apply_closure(ObjectClosure* cl);
-
-  // Apply the closure to all elements of "buf", down to "index" (inclusive.)
-  static void apply_closure_to_buffer(ObjectClosure* cl,
-                                      void** buf, size_t index, size_t sz);
+#ifndef PRODUCT
+  // Helpful for debugging
+  void print(const char* name);
+  static void print(const char* name, void** buf, size_t index, size_t sz);
+#endif // PRODUCT
 
   void verify_oops_in_buffer() NOT_DEBUG_RETURN;
 };
 
-
-
 class SATBMarkQueueSet: public PtrQueueSet {
   ObjectClosure* _closure;
   ObjectClosure** _par_closures;  // One per ParGCThread.
@@ -88,6 +105,9 @@ public:
   // set itself, has an active value same as expected_active.
   void set_active_all_threads(bool b, bool expected_active);
 
+  // Filter all the currently-active SATB buffers.
+  void filter_thread_buffers();
+
   // Register "blk" as "the closure" for all queues.  Only one such closure
   // is allowed.  The "apply_closure_to_completed_buffer" method will apply
   // this closure to a completed buffer, and "iterate_closure_all_threads"
@@ -98,10 +118,9 @@ public:
   // closures, one for each parallel GC thread.
   void set_par_closure(int i, ObjectClosure* closure);
 
-  // If there is a registered closure for buffers, apply it to all entries
-  // in all currently-active buffers.  This should only be applied at a
-  // safepoint.  (Currently must not be called in parallel; this should
-  // change in the future.)
+  // Apply the registered closure to all entries on each
+  // currently-active buffer and then empty the buffer. It should only
+  // be called serially and at a safepoint.
   void iterate_closure_all_threads();
   // Parallel version of the above.
   void par_iterate_closure_all_threads(int worker);
@@ -117,11 +136,21 @@ public:
     return apply_closure_to_completed_buffer_work(true, worker);
   }
 
+  // Apply the given closure on enqueued and currently-active buffers
+  // respectively. Both methods are read-only, i.e., they do not
+  // modify any of the buffers.
+  void iterate_completed_buffers_read_only(ObjectClosure* cl);
+  void iterate_thread_buffers_read_only(ObjectClosure* cl);
+
+#ifndef PRODUCT
+  // Helpful for debugging
+  void print_all(const char* msg);
+#endif // PRODUCT
+
   ObjPtrQueue* shared_satb_queue() { return &_shared_satb_queue; }
 
   // If a marking is being abandoned, reset any unprocessed log buffers.
   void abandon_partial_marking();
-
 };
 
 #endif // SHARE_VM_GC_IMPLEMENTATION_G1_SATBQUEUE_HPP
diff --git a/hotspot/src/share/vm/gc_implementation/parNew/parCardTableModRefBS.cpp b/hotspot/src/share/vm/gc_implementation/parNew/parCardTableModRefBS.cpp
index 5155ca934a0..25be723aa98 100644
--- a/hotspot/src/share/vm/gc_implementation/parNew/parCardTableModRefBS.cpp
+++ b/hotspot/src/share/vm/gc_implementation/parNew/parCardTableModRefBS.cpp
@@ -56,14 +56,14 @@ void CardTableModRefBS::non_clean_card_iterate_parallel_work(Space* sp, MemRegio
                           lowest_non_clean_base_chunk_index,
                           lowest_non_clean_chunk_size);
 
-  int n_strides = n_threads * ParGCStridesPerThread;
+  uint n_strides = n_threads * ParGCStridesPerThread;
   SequentialSubTasksDone* pst = sp->par_seq_tasks();
   // Sets the condition for completion of the subtask (how many threads
   // need to finish in order to be done).
   pst->set_n_threads(n_threads);
   pst->set_n_tasks(n_strides);
 
-  int stride = 0;
+  uint stride = 0;
   while (!pst->is_task_claimed(/* reference */ stride)) {
     process_stride(sp, mr, stride, n_strides, cl, ct,
                    lowest_non_clean,
diff --git a/hotspot/src/share/vm/gc_implementation/parNew/parNewGeneration.cpp b/hotspot/src/share/vm/gc_implementation/parNew/parNewGeneration.cpp
index 1c20f87c126..91445947c33 100644
--- a/hotspot/src/share/vm/gc_implementation/parNew/parNewGeneration.cpp
+++ b/hotspot/src/share/vm/gc_implementation/parNew/parNewGeneration.cpp
@@ -590,7 +590,7 @@ void ParNewGenTask::set_for_termination(int active_workers) {
 // called after a task is started.  So "i" is based on
 // first-come-first-served.
 
-void ParNewGenTask::work(int i) {
+void ParNewGenTask::work(uint worker_id) {
   GenCollectedHeap* gch = GenCollectedHeap::heap();
   // Since this is being done in a separate thread, need new resource
   // and handle marks.
@@ -601,8 +601,8 @@ void ParNewGenTask::work(int i) {
 
   Generation* old_gen = gch->next_gen(_gen);
 
-  ParScanThreadState& par_scan_state = _state_set->thread_state(i);
-  assert(_state_set->is_valid(i), "Should not have been called");
+  ParScanThreadState& par_scan_state = _state_set->thread_state(worker_id);
+  assert(_state_set->is_valid(worker_id), "Should not have been called");
 
   par_scan_state.set_young_old_boundary(_young_old_boundary);
 
@@ -755,7 +755,7 @@ public:
                          ParScanThreadStateSet& state_set);
 
 private:
-  virtual void work(int i);
+  virtual void work(uint worker_id);
   virtual void set_for_termination(int active_workers) {
     _state_set.terminator()->reset_for_reuse(active_workers);
   }
@@ -781,13 +781,13 @@ ParNewRefProcTaskProxy::ParNewRefProcTaskProxy(
 {
 }
 
-void ParNewRefProcTaskProxy::work(int i)
+void ParNewRefProcTaskProxy::work(uint worker_id)
 {
   ResourceMark rm;
   HandleMark hm;
-  ParScanThreadState& par_scan_state = _state_set.thread_state(i);
+  ParScanThreadState& par_scan_state = _state_set.thread_state(worker_id);
   par_scan_state.set_young_old_boundary(_young_old_boundary);
-  _task.work(i, par_scan_state.is_alive_closure(),
+  _task.work(worker_id, par_scan_state.is_alive_closure(),
              par_scan_state.keep_alive_closure(),
              par_scan_state.evacuate_followers_closure());
 }
@@ -802,9 +802,9 @@ public:
       _task(task)
   { }
 
-  virtual void work(int i)
+  virtual void work(uint worker_id)
   {
-    _task.work(i);
+    _task.work(worker_id);
   }
 };
 
diff --git a/hotspot/src/share/vm/gc_implementation/parNew/parNewGeneration.hpp b/hotspot/src/share/vm/gc_implementation/parNew/parNewGeneration.hpp
index ccd6268a66b..75eac033b06 100644
--- a/hotspot/src/share/vm/gc_implementation/parNew/parNewGeneration.hpp
+++ b/hotspot/src/share/vm/gc_implementation/parNew/parNewGeneration.hpp
@@ -239,7 +239,7 @@ public:
 
   HeapWord* young_old_boundary() { return _young_old_boundary; }
 
-  void work(int i);
+  void work(uint worker_id);
 
   // Reset the terminator in ParScanThreadStateSet for
   // "active_workers" threads.
diff --git a/hotspot/src/share/vm/gc_implementation/shared/mutableNUMASpace.cpp b/hotspot/src/share/vm/gc_implementation/shared/mutableNUMASpace.cpp
index e38556db75f..dd8a76d92ad 100644
--- a/hotspot/src/share/vm/gc_implementation/shared/mutableNUMASpace.cpp
+++ b/hotspot/src/share/vm/gc_implementation/shared/mutableNUMASpace.cpp
@@ -282,7 +282,7 @@ void MutableNUMASpace::bias_region(MemRegion mr, int lgrp_id) {
     // large page can be broken down if we require small pages.
     os::realign_memory((char*)aligned_region.start(), aligned_region.byte_size(), page_size());
     // Then we uncommit the pages in the range.
-    os::free_memory((char*)aligned_region.start(), aligned_region.byte_size());
+    os::free_memory((char*)aligned_region.start(), aligned_region.byte_size(), page_size());
     // And make them local/first-touch biased.
     os::numa_make_local((char*)aligned_region.start(), aligned_region.byte_size(), lgrp_id);
   }
@@ -297,7 +297,7 @@ void MutableNUMASpace::free_region(MemRegion mr) {
     assert((intptr_t)aligned_region.start()     % page_size() == 0 &&
            (intptr_t)aligned_region.byte_size() % page_size() == 0, "Bad alignment");
     assert(region().contains(aligned_region), "Sanity");
-    os::free_memory((char*)aligned_region.start(), aligned_region.byte_size());
+    os::free_memory((char*)aligned_region.start(), aligned_region.byte_size(), page_size());
   }
 }
 
@@ -954,7 +954,7 @@ void MutableNUMASpace::LGRPSpace::scan_pages(size_t page_size, size_t page_count
     if (e != scan_end) {
       if ((page_expected.size != page_size || page_expected.lgrp_id != lgrp_id())
           && page_expected.size != 0) {
-        os::free_memory(s, pointer_delta(e, s, sizeof(char)));
+        os::free_memory(s, pointer_delta(e, s, sizeof(char)), page_size);
       }
       page_expected = page_found;
     }
diff --git a/hotspot/src/share/vm/gc_implementation/shared/mutableSpace.cpp b/hotspot/src/share/vm/gc_implementation/shared/mutableSpace.cpp
index 41f63658252..9725c4a1ebd 100644
--- a/hotspot/src/share/vm/gc_implementation/shared/mutableSpace.cpp
+++ b/hotspot/src/share/vm/gc_implementation/shared/mutableSpace.cpp
@@ -51,7 +51,7 @@ void MutableSpace::numa_setup_pages(MemRegion mr, bool clear_space) {
       size_t size = pointer_delta(end, start, sizeof(char));
       if (clear_space) {
         // Prefer page reallocation to migration.
-        os::free_memory((char*)start, size);
+        os::free_memory((char*)start, size, page_size);
       }
       os::numa_make_global((char*)start, size);
     }
diff --git a/hotspot/src/share/vm/gc_interface/collectedHeap.cpp b/hotspot/src/share/vm/gc_interface/collectedHeap.cpp
index 52c6e4792eb..413248ca4f8 100644
--- a/hotspot/src/share/vm/gc_interface/collectedHeap.cpp
+++ b/hotspot/src/share/vm/gc_interface/collectedHeap.cpp
@@ -478,18 +478,22 @@ oop CollectedHeap::Class_obj_allocate(KlassHandle klass, int size, KlassHandle r
 void CollectedHeap::test_is_in() {
   CollectedHeap* heap = Universe::heap();
 
+  uintptr_t epsilon    = (uintptr_t) MinObjAlignment;
+  uintptr_t heap_start = (uintptr_t) heap->_reserved.start();
+  uintptr_t heap_end   = (uintptr_t) heap->_reserved.end();
+
   // Test that NULL is not in the heap.
   assert(!heap->is_in(NULL), "NULL is unexpectedly in the heap");
 
   // Test that a pointer to before the heap start is reported as outside the heap.
-  assert(heap->_reserved.start() >= (void*)MinObjAlignment, "sanity");
-  void* before_heap = (void*)((intptr_t)heap->_reserved.start() - MinObjAlignment);
+  assert(heap_start >= ((uintptr_t)NULL + epsilon), "sanity");
+  void* before_heap = (void*)(heap_start - epsilon);
   assert(!heap->is_in(before_heap),
       err_msg("before_heap: " PTR_FORMAT " is unexpectedly in the heap", before_heap));
 
   // Test that a pointer to after the heap end is reported as outside the heap.
-  assert(heap->_reserved.end() <= (void*)(uintptr_t(-1) - (uint)MinObjAlignment), "sanity");
-  void* after_heap = (void*)((intptr_t)heap->_reserved.end() + MinObjAlignment);
+  assert(heap_end <= ((uintptr_t)-1 - epsilon), "sanity");
+  void* after_heap = (void*)(heap_end + epsilon);
   assert(!heap->is_in(after_heap),
       err_msg("after_heap: " PTR_FORMAT " is unexpectedly in the heap", after_heap));
 }
diff --git a/hotspot/src/share/vm/gc_interface/collectedHeap.hpp b/hotspot/src/share/vm/gc_interface/collectedHeap.hpp
index bf01c640d80..c54dbbfc203 100644
--- a/hotspot/src/share/vm/gc_interface/collectedHeap.hpp
+++ b/hotspot/src/share/vm/gc_interface/collectedHeap.hpp
@@ -69,7 +69,7 @@ class CollectedHeap : public CHeapObj {
   MemRegion _reserved;
   BarrierSet* _barrier_set;
   bool _is_gc_active;
-  int _n_par_threads;
+  uint _n_par_threads;
 
   unsigned int _total_collections;          // ... started
   unsigned int _total_full_collections;     // ... started
@@ -309,10 +309,10 @@ class CollectedHeap : public CHeapObj {
   GCCause::Cause gc_cause() { return _gc_cause; }
 
   // Number of threads currently working on GC tasks.
-  int n_par_threads() { return _n_par_threads; }
+  uint n_par_threads() { return _n_par_threads; }
 
   // May be overridden to set additional parallelism.
-  virtual void set_par_threads(int t) { _n_par_threads = t; };
+  virtual void set_par_threads(uint t) { _n_par_threads = t; };
 
   // Preload classes into the shared portion of the heap, and then dump
   // that data to a file so that it can be loaded directly by another
diff --git a/hotspot/src/share/vm/memory/dump.cpp b/hotspot/src/share/vm/memory/dump.cpp
index cb66e8608a7..75a0f8b4a60 100644
--- a/hotspot/src/share/vm/memory/dump.cpp
+++ b/hotspot/src/share/vm/memory/dump.cpp
@@ -1402,7 +1402,7 @@ class LinkClassesClosure : public ObjectClosure {
         instanceKlass* ik = (instanceKlass*) k;
         // Link the class to cause the bytecodes to be rewritten and the
         // cpcache to be created.
-        if (ik->get_init_state() < instanceKlass::linked) {
+        if (ik->init_state() < instanceKlass::linked) {
           ik->link_class(THREAD);
           guarantee(!HAS_PENDING_EXCEPTION, "exception in class rewriting");
         }
@@ -1535,7 +1535,7 @@ void GenCollectedHeap::preload_and_dump(TRAPS) {
         // are loaded in order that the related data structures (klass,
         // cpCache, Sting constants) are located together.
 
-        if (ik->get_init_state() < instanceKlass::linked) {
+        if (ik->init_state() < instanceKlass::linked) {
           ik->link_class(THREAD);
           guarantee(!(HAS_PENDING_EXCEPTION), "exception in class rewriting");
         }
diff --git a/hotspot/src/share/vm/memory/genCollectedHeap.cpp b/hotspot/src/share/vm/memory/genCollectedHeap.cpp
index 565aef739e8..b4de7f19eda 100644
--- a/hotspot/src/share/vm/memory/genCollectedHeap.cpp
+++ b/hotspot/src/share/vm/memory/genCollectedHeap.cpp
@@ -703,7 +703,7 @@ HeapWord* GenCollectedHeap::satisfy_failed_allocation(size_t size, bool is_tlab)
   return collector_policy()->satisfy_failed_allocation(size, is_tlab);
 }
 
-void GenCollectedHeap::set_par_threads(int t) {
+void GenCollectedHeap::set_par_threads(uint t) {
   SharedHeap::set_par_threads(t);
   _gen_process_strong_tasks->set_n_threads(t);
 }
diff --git a/hotspot/src/share/vm/memory/genCollectedHeap.hpp b/hotspot/src/share/vm/memory/genCollectedHeap.hpp
index ad424bae3ab..5f35dec41e2 100644
--- a/hotspot/src/share/vm/memory/genCollectedHeap.hpp
+++ b/hotspot/src/share/vm/memory/genCollectedHeap.hpp
@@ -419,8 +419,7 @@ public:
   // asserted to be this type.
   static GenCollectedHeap* heap();
 
-  void set_par_threads(int t);
-
+  void set_par_threads(uint t);
 
   // Invoke the "do_oop" method of one of the closures "not_older_gens"
   // or "older_gens" on root locations for the generation at
diff --git a/hotspot/src/share/vm/memory/referenceProcessor.cpp b/hotspot/src/share/vm/memory/referenceProcessor.cpp
index 1513e982d76..2699ecd0f5d 100644
--- a/hotspot/src/share/vm/memory/referenceProcessor.cpp
+++ b/hotspot/src/share/vm/memory/referenceProcessor.cpp
@@ -88,9 +88,9 @@ void ReferenceProcessor::enable_discovery(bool verify_disabled, bool check_no_re
 
 ReferenceProcessor::ReferenceProcessor(MemRegion span,
                                        bool      mt_processing,
-                                       int       mt_processing_degree,
+                                       uint      mt_processing_degree,
                                        bool      mt_discovery,
-                                       int       mt_discovery_degree,
+                                       uint      mt_discovery_degree,
                                        bool      atomic_discovery,
                                        BoolObjectClosure* is_alive_non_header,
                                        bool      discovered_list_needs_barrier)  :
@@ -105,7 +105,7 @@ ReferenceProcessor::ReferenceProcessor(MemRegion span,
   _span = span;
   _discovery_is_atomic = atomic_discovery;
   _discovery_is_mt     = mt_discovery;
-  _num_q               = MAX2(1, mt_processing_degree);
+  _num_q               = MAX2(1U, mt_processing_degree);
   _max_num_q           = MAX2(_num_q, mt_discovery_degree);
   _discovered_refs     = NEW_C_HEAP_ARRAY(DiscoveredList,
                                           _max_num_q * number_of_subclasses_of_ref());
@@ -118,7 +118,7 @@ ReferenceProcessor::ReferenceProcessor(MemRegion span,
   _discoveredPhantomRefs = &_discoveredFinalRefs[_max_num_q];
 
   // Initialize all entries to NULL
-  for (int i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) {
+  for (uint i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) {
     _discovered_refs[i].set_head(NULL);
     _discovered_refs[i].set_length(0);
   }
@@ -133,7 +133,7 @@ ReferenceProcessor::ReferenceProcessor(MemRegion span,
 #ifndef PRODUCT
 void ReferenceProcessor::verify_no_references_recorded() {
   guarantee(!_discovering_refs, "Discovering refs?");
-  for (int i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) {
+  for (uint i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) {
     guarantee(_discovered_refs[i].is_empty(),
               "Found non-empty discovered list");
   }
@@ -141,7 +141,7 @@ void ReferenceProcessor::verify_no_references_recorded() {
 #endif
 
 void ReferenceProcessor::weak_oops_do(OopClosure* f) {
-  for (int i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) {
+  for (uint i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) {
     if (UseCompressedOops) {
       f->do_oop((narrowOop*)_discovered_refs[i].adr_head());
     } else {
@@ -437,7 +437,7 @@ void ReferenceProcessor::enqueue_discovered_reflists(HeapWord* pending_list_addr
     task_executor->execute(tsk);
   } else {
     // Serial code: call the parent class's implementation
-    for (int i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) {
+    for (uint i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) {
       enqueue_discovered_reflist(_discovered_refs[i], pending_list_addr);
       _discovered_refs[i].set_head(NULL);
       _discovered_refs[i].set_length(0);
@@ -696,7 +696,7 @@ ReferenceProcessor::abandon_partial_discovered_list(DiscoveredList& refs_list) {
 
 void ReferenceProcessor::abandon_partial_discovery() {
   // loop over the lists
-  for (int i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) {
+  for (uint i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) {
     if (TraceReferenceGC && PrintGCDetails && ((i % _max_num_q) == 0)) {
       gclog_or_tty->print_cr("\nAbandoning %s discovered list", list_name(i));
     }
@@ -787,7 +787,7 @@ void ReferenceProcessor::balance_queues(DiscoveredList ref_lists[])
     gclog_or_tty->print_cr("\nBalance ref_lists ");
   }
 
-  for (int i = 0; i < _max_num_q; ++i) {
+  for (uint i = 0; i < _max_num_q; ++i) {
     total_refs += ref_lists[i].length();
     if (TraceReferenceGC && PrintGCDetails) {
       gclog_or_tty->print("%d ", ref_lists[i].length());
@@ -797,8 +797,8 @@ void ReferenceProcessor::balance_queues(DiscoveredList ref_lists[])
     gclog_or_tty->print_cr(" = %d", total_refs);
   }
   size_t avg_refs = total_refs / _num_q + 1;
-  int to_idx = 0;
-  for (int from_idx = 0; from_idx < _max_num_q; from_idx++) {
+  uint to_idx = 0;
+  for (uint from_idx = 0; from_idx < _max_num_q; from_idx++) {
     bool move_all = false;
     if (from_idx >= _num_q) {
       move_all = ref_lists[from_idx].length() > 0;
@@ -857,7 +857,7 @@ void ReferenceProcessor::balance_queues(DiscoveredList ref_lists[])
   }
 #ifdef ASSERT
   size_t balanced_total_refs = 0;
-  for (int i = 0; i < _max_num_q; ++i) {
+  for (uint i = 0; i < _max_num_q; ++i) {
     balanced_total_refs += ref_lists[i].length();
     if (TraceReferenceGC && PrintGCDetails) {
       gclog_or_tty->print("%d ", ref_lists[i].length());
@@ -903,7 +903,7 @@ ReferenceProcessor::process_discovered_reflist(
   }
   if (PrintReferenceGC && PrintGCDetails) {
     size_t total = 0;
-    for (int i = 0; i < _max_num_q; ++i) {
+    for (uint i = 0; i < _max_num_q; ++i) {
       total += refs_lists[i].length();
     }
     gclog_or_tty->print(", %u refs", total);
@@ -919,7 +919,7 @@ ReferenceProcessor::process_discovered_reflist(
       RefProcPhase1Task phase1(*this, refs_lists, policy, true /*marks_oops_alive*/);
       task_executor->execute(phase1);
     } else {
-      for (int i = 0; i < _max_num_q; i++) {
+      for (uint i = 0; i < _max_num_q; i++) {
         process_phase1(refs_lists[i], policy,
                        is_alive, keep_alive, complete_gc);
       }
@@ -935,7 +935,7 @@ ReferenceProcessor::process_discovered_reflist(
     RefProcPhase2Task phase2(*this, refs_lists, !discovery_is_atomic() /*marks_oops_alive*/);
     task_executor->execute(phase2);
   } else {
-    for (int i = 0; i < _max_num_q; i++) {
+    for (uint i = 0; i < _max_num_q; i++) {
       process_phase2(refs_lists[i], is_alive, keep_alive, complete_gc);
     }
   }
@@ -946,7 +946,7 @@ ReferenceProcessor::process_discovered_reflist(
     RefProcPhase3Task phase3(*this, refs_lists, clear_referent, true /*marks_oops_alive*/);
     task_executor->execute(phase3);
   } else {
-    for (int i = 0; i < _max_num_q; i++) {
+    for (uint i = 0; i < _max_num_q; i++) {
       process_phase3(refs_lists[i], clear_referent,
                      is_alive, keep_alive, complete_gc);
     }
@@ -955,7 +955,7 @@ ReferenceProcessor::process_discovered_reflist(
 
 void ReferenceProcessor::clean_up_discovered_references() {
   // loop over the lists
-  for (int i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) {
+  for (uint i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) {
     if (TraceReferenceGC && PrintGCDetails && ((i % _max_num_q) == 0)) {
       gclog_or_tty->print_cr(
         "\nScrubbing %s discovered list of Null referents",
@@ -1000,7 +1000,7 @@ void ReferenceProcessor::clean_up_discovered_reflist(DiscoveredList& refs_list)
 }
 
 inline DiscoveredList* ReferenceProcessor::get_discovered_list(ReferenceType rt) {
-  int id = 0;
+  uint id = 0;
   // Determine the queue index to use for this object.
   if (_discovery_is_mt) {
     // During a multi-threaded discovery phase,
@@ -1282,7 +1282,7 @@ void ReferenceProcessor::preclean_discovered_references(
   {
     TraceTime tt("Preclean SoftReferences", PrintGCDetails && PrintReferenceGC,
               false, gclog_or_tty);
-    for (int i = 0; i < _max_num_q; i++) {
+    for (uint i = 0; i < _max_num_q; i++) {
       if (yield->should_return()) {
         return;
       }
@@ -1295,7 +1295,7 @@ void ReferenceProcessor::preclean_discovered_references(
   {
     TraceTime tt("Preclean WeakReferences", PrintGCDetails && PrintReferenceGC,
               false, gclog_or_tty);
-    for (int i = 0; i < _max_num_q; i++) {
+    for (uint i = 0; i < _max_num_q; i++) {
       if (yield->should_return()) {
         return;
       }
@@ -1308,7 +1308,7 @@ void ReferenceProcessor::preclean_discovered_references(
   {
     TraceTime tt("Preclean FinalReferences", PrintGCDetails && PrintReferenceGC,
               false, gclog_or_tty);
-    for (int i = 0; i < _max_num_q; i++) {
+    for (uint i = 0; i < _max_num_q; i++) {
       if (yield->should_return()) {
         return;
       }
@@ -1321,7 +1321,7 @@ void ReferenceProcessor::preclean_discovered_references(
   {
     TraceTime tt("Preclean PhantomReferences", PrintGCDetails && PrintReferenceGC,
               false, gclog_or_tty);
-    for (int i = 0; i < _max_num_q; i++) {
+    for (uint i = 0; i < _max_num_q; i++) {
       if (yield->should_return()) {
         return;
       }
@@ -1386,7 +1386,7 @@ ReferenceProcessor::preclean_discovered_reflist(DiscoveredList&    refs_list,
   )
 }
 
-const char* ReferenceProcessor::list_name(int i) {
+const char* ReferenceProcessor::list_name(uint i) {
    assert(i >= 0 && i <= _max_num_q * number_of_subclasses_of_ref(),
           "Out of bounds index");
 
@@ -1410,7 +1410,7 @@ void ReferenceProcessor::verify_ok_to_handle_reflists() {
 #ifndef PRODUCT
 void ReferenceProcessor::clear_discovered_references() {
   guarantee(!_discovering_refs, "Discovering refs?");
-  for (int i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) {
+  for (uint i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) {
     clear_discovered_references(_discovered_refs[i]);
   }
 }
diff --git a/hotspot/src/share/vm/memory/referenceProcessor.hpp b/hotspot/src/share/vm/memory/referenceProcessor.hpp
index cbd8bfc203d..bfa3bdffa9e 100644
--- a/hotspot/src/share/vm/memory/referenceProcessor.hpp
+++ b/hotspot/src/share/vm/memory/referenceProcessor.hpp
@@ -231,7 +231,7 @@ class ReferenceProcessor : public CHeapObj {
   bool        _enqueuing_is_done;       // true if all weak references enqueued
   bool        _processing_is_mt;        // true during phases when
                                         // reference processing is MT.
-  int         _next_id;                 // round-robin mod _num_q counter in
+  uint        _next_id;                 // round-robin mod _num_q counter in
                                         // support of work distribution
 
   // For collectors that do not keep GC liveness information
@@ -252,9 +252,9 @@ class ReferenceProcessor : public CHeapObj {
   // The discovered ref lists themselves
 
   // The active MT'ness degree of the queues below
-  int             _num_q;
+  uint             _num_q;
   // The maximum MT'ness degree of the queues below
-  int             _max_num_q;
+  uint             _max_num_q;
 
   // Master array of discovered oops
   DiscoveredList* _discovered_refs;
@@ -268,9 +268,9 @@ class ReferenceProcessor : public CHeapObj {
  public:
   static int number_of_subclasses_of_ref() { return (REF_PHANTOM - REF_OTHER); }
 
-  int num_q()                              { return _num_q; }
-  int max_num_q()                          { return _max_num_q; }
-  void set_active_mt_degree(int v)         { _num_q = v; }
+  uint num_q()                             { return _num_q; }
+  uint max_num_q()                         { return _max_num_q; }
+  void set_active_mt_degree(uint v)        { _num_q = v; }
 
   DiscoveredList* discovered_refs()        { return _discovered_refs; }
 
@@ -368,7 +368,7 @@ class ReferenceProcessor : public CHeapObj {
 
   // Returns the name of the discovered reference list
   // occupying the i / _num_q slot.
-  const char* list_name(int i);
+  const char* list_name(uint i);
 
   void enqueue_discovered_reflists(HeapWord* pending_list_addr, AbstractRefProcTaskExecutor* task_executor);
 
@@ -388,8 +388,8 @@ class ReferenceProcessor : public CHeapObj {
                                    YieldClosure*      yield);
 
   // round-robin mod _num_q (not: _not_ mode _max_num_q)
-  int next_id() {
-    int id = _next_id;
+  uint next_id() {
+    uint id = _next_id;
     if (++_next_id == _num_q) {
       _next_id = 0;
     }
@@ -434,8 +434,8 @@ class ReferenceProcessor : public CHeapObj {
 
   // Default parameters give you a vanilla reference processor.
   ReferenceProcessor(MemRegion span,
-                     bool mt_processing = false, int mt_processing_degree = 1,
-                     bool mt_discovery  = false, int mt_discovery_degree  = 1,
+                     bool mt_processing = false, uint mt_processing_degree = 1,
+                     bool mt_discovery  = false, uint mt_discovery_degree  = 1,
                      bool atomic_discovery = true,
                      BoolObjectClosure* is_alive_non_header = NULL,
                      bool discovered_list_needs_barrier = false);
diff --git a/hotspot/src/share/vm/memory/sharedHeap.cpp b/hotspot/src/share/vm/memory/sharedHeap.cpp
index 8094b6e7a46..707768e5a28 100644
--- a/hotspot/src/share/vm/memory/sharedHeap.cpp
+++ b/hotspot/src/share/vm/memory/sharedHeap.cpp
@@ -94,7 +94,7 @@ bool SharedHeap::heap_lock_held_for_gc() {
              && _thread_holds_heap_lock_for_gc);
 }
 
-void SharedHeap::set_par_threads(int t) {
+void SharedHeap::set_par_threads(uint t) {
   assert(t == 0 || !UseSerialGC, "Cannot have parallel threads");
   _n_par_threads = t;
   _process_strong_tasks->set_n_threads(t);
diff --git a/hotspot/src/share/vm/memory/sharedHeap.hpp b/hotspot/src/share/vm/memory/sharedHeap.hpp
index 8e819c48717..408b0dce8e8 100644
--- a/hotspot/src/share/vm/memory/sharedHeap.hpp
+++ b/hotspot/src/share/vm/memory/sharedHeap.hpp
@@ -287,7 +287,7 @@ public:
 
   // Sets the number of parallel threads that will be doing tasks
   // (such as process strong roots) subsequently.
-  virtual void set_par_threads(int t);
+  virtual void set_par_threads(uint t);
 
   int n_termination();
   void set_n_termination(int t);
diff --git a/hotspot/src/share/vm/oops/arrayKlass.hpp b/hotspot/src/share/vm/oops/arrayKlass.hpp
index cfc06274cdf..13dbaec551d 100644
--- a/hotspot/src/share/vm/oops/arrayKlass.hpp
+++ b/hotspot/src/share/vm/oops/arrayKlass.hpp
@@ -73,7 +73,7 @@ class arrayKlass: public Klass {
   oop* adr_component_mirror()           { return (oop*)&this->_component_mirror;}
 
   // Compiler/Interpreter offset
-  static ByteSize component_mirror_offset() { return byte_offset_of(arrayKlass, _component_mirror); }
+  static ByteSize component_mirror_offset() { return in_ByteSize(sizeof(klassOopDesc) + offset_of(arrayKlass, _component_mirror)); }
 
   virtual klassOop java_super() const;//{ return SystemDictionary::Object_klass(); }
 
diff --git a/hotspot/src/share/vm/oops/instanceKlass.cpp b/hotspot/src/share/vm/oops/instanceKlass.cpp
index 04b3c2ffc32..e20a6ed1a1a 100644
--- a/hotspot/src/share/vm/oops/instanceKlass.cpp
+++ b/hotspot/src/share/vm/oops/instanceKlass.cpp
@@ -208,7 +208,7 @@ void instanceKlass::eager_initialize_impl(instanceKlassHandle this_oop) {
   // abort if someone beat us to the initialization
   if (!this_oop->is_not_initialized()) return;  // note: not equivalent to is_initialized()
 
-  ClassState old_state = this_oop->_init_state;
+  ClassState old_state = this_oop->init_state();
   link_class_impl(this_oop, true, THREAD);
   if (HAS_PENDING_EXCEPTION) {
     CLEAR_PENDING_EXCEPTION;
@@ -2479,7 +2479,7 @@ void instanceKlass::set_init_state(ClassState state) {
   bool good_state = as_klassOop()->is_shared() ? (_init_state <= state)
                                                : (_init_state < state);
   assert(good_state || state == allocated, "illegal state transition");
-  _init_state = state;
+  _init_state = (u1)state;
 }
 #endif
 
diff --git a/hotspot/src/share/vm/oops/instanceKlass.hpp b/hotspot/src/share/vm/oops/instanceKlass.hpp
index a87448a7680..f07f82f4c33 100644
--- a/hotspot/src/share/vm/oops/instanceKlass.hpp
+++ b/hotspot/src/share/vm/oops/instanceKlass.hpp
@@ -227,16 +227,16 @@ class instanceKlass: public Klass {
   // (including inherited fields but after header_size()).
   int             _nonstatic_field_size;
   int             _static_field_size;    // number words used by static fields (oop and non-oop) in this klass
-  int             _static_oop_field_count;// number of static oop fields in this klass
+  u2              _static_oop_field_count;// number of static oop fields in this klass
+  u2              _java_fields_count;    // The number of declared Java fields
   int             _nonstatic_oop_map_size;// size in words of nonstatic oop map blocks
-  int             _java_fields_count;    // The number of declared Java fields
+
   bool            _is_marked_dependent;  // used for marking during flushing and deoptimization
   bool            _rewritten;            // methods rewritten.
   bool            _has_nonstatic_fields; // for sizing with UseCompressedOops
   bool            _should_verify_class;  // allow caching of preverification
   u2              _minor_version;        // minor version number of class file
   u2              _major_version;        // major version number of class file
-  ClassState      _init_state;           // state of class
   Thread*         _init_thread;          // Pointer to current thread doing initialization (to handle recusive initialization)
   int             _vtable_len;           // length of Java vtable (in words)
   int             _itable_len;           // length of Java itable (in words)
@@ -260,6 +260,11 @@ class instanceKlass: public Klass {
   JvmtiCachedClassFieldMap* _jvmti_cached_class_field_map;  // JVMTI: used during heap iteration
   volatile u2     _idnum_allocated_count;         // JNI/JVMTI: increments with the addition of methods, old ids don't change
 
+  // Class states are defined as ClassState (see above).
+  // Place the _init_state here to utilize the unused 2-byte after
+  // _idnum_allocated_count.
+  u1              _init_state;                    // state of class
+
   // embedded Java vtable follows here
   // embedded Java itables follows here
   // embedded static fields follows here
@@ -279,8 +284,8 @@ class instanceKlass: public Klass {
   int static_field_size() const            { return _static_field_size; }
   void set_static_field_size(int size)     { _static_field_size = size; }
 
-  int static_oop_field_count() const        { return _static_oop_field_count; }
-  void set_static_oop_field_count(int size) { _static_oop_field_count = size; }
+  int static_oop_field_count() const       { return (int)_static_oop_field_count; }
+  void set_static_oop_field_count(u2 size) { _static_oop_field_count = size; }
 
   // Java vtable
   int  vtable_length() const               { return _vtable_len; }
@@ -320,14 +325,14 @@ class instanceKlass: public Klass {
   Symbol* field_signature   (int index) const { return field(index)->signature(constants()); }
 
   // Number of Java declared fields
-  int java_fields_count() const           { return _java_fields_count; }
+  int java_fields_count() const           { return (int)_java_fields_count; }
 
   // Number of fields including any injected fields
   int all_fields_count() const            { return _fields->length() / sizeof(FieldInfo::field_slots); }
 
   typeArrayOop fields() const              { return _fields; }
 
-  void set_fields(typeArrayOop f, int java_fields_count) {
+  void set_fields(typeArrayOop f, u2 java_fields_count) {
     oop_store_without_check((oop*) &_fields, (oop) f);
     _java_fields_count = java_fields_count;
   }
@@ -377,7 +382,7 @@ class instanceKlass: public Klass {
   bool is_being_initialized() const        { return _init_state == being_initialized; }
   bool is_in_error_state() const           { return _init_state == initialization_error; }
   bool is_reentrant_initialization(Thread *thread)  { return thread == _init_thread; }
-  int  get_init_state()                    { return _init_state; } // Useful for debugging
+  ClassState  init_state()                 { return (ClassState)_init_state; }
   bool is_rewritten() const                { return _rewritten; }
 
   // defineClass specified verification
@@ -405,7 +410,7 @@ class instanceKlass: public Klass {
   ReferenceType reference_type() const     { return _reference_type; }
   void set_reference_type(ReferenceType t) { _reference_type = t; }
 
-  static int reference_type_offset_in_bytes() { return offset_of(instanceKlass, _reference_type); }
+  static ByteSize reference_type_offset() { return in_ByteSize(sizeof(klassOopDesc) + offset_of(instanceKlass, _reference_type)); }
 
   // find local field, returns true if found
   bool find_local_field(Symbol* name, Symbol* sig, fieldDescriptor* fd) const;
@@ -616,8 +621,8 @@ class instanceKlass: public Klass {
   void set_breakpoints(BreakpointInfo* bps) { _breakpoints = bps; };
 
   // support for stub routines
-  static int init_state_offset_in_bytes()    { return offset_of(instanceKlass, _init_state); }
-  static int init_thread_offset_in_bytes()   { return offset_of(instanceKlass, _init_thread); }
+  static ByteSize init_state_offset()  { return in_ByteSize(sizeof(klassOopDesc) + offset_of(instanceKlass, _init_state)); }
+  static ByteSize init_thread_offset() { return in_ByteSize(sizeof(klassOopDesc) + offset_of(instanceKlass, _init_thread)); }
 
   // subclass/subinterface checks
   bool implements_interface(klassOop k) const;
@@ -754,7 +759,7 @@ private:
 #ifdef ASSERT
   void set_init_state(ClassState state);
 #else
-  void set_init_state(ClassState state) { _init_state = state; }
+  void set_init_state(ClassState state) { _init_state = (u1)state; }
 #endif
   void set_rewritten()                  { _rewritten = true; }
   void set_init_thread(Thread *thread)  { _init_thread = thread; }
diff --git a/hotspot/src/share/vm/oops/klass.cpp b/hotspot/src/share/vm/oops/klass.cpp
index 8541bd5d2b9..39cb26ebbad 100644
--- a/hotspot/src/share/vm/oops/klass.cpp
+++ b/hotspot/src/share/vm/oops/klass.cpp
@@ -144,7 +144,7 @@ klassOop Klass::base_create_klass_oop(KlassHandle& klass, int size,
     }
     kl->set_secondary_supers(NULL);
     oop_store_without_check((oop*) &kl->_primary_supers[0], k);
-    kl->set_super_check_offset(primary_supers_offset_in_bytes() + sizeof(oopDesc));
+    kl->set_super_check_offset(in_bytes(primary_supers_offset()));
   }
 
   kl->set_java_mirror(NULL);
diff --git a/hotspot/src/share/vm/oops/klass.hpp b/hotspot/src/share/vm/oops/klass.hpp
index 407b1ef295a..ae727d9ecee 100644
--- a/hotspot/src/share/vm/oops/klass.hpp
+++ b/hotspot/src/share/vm/oops/klass.hpp
@@ -313,7 +313,7 @@ class Klass : public Klass_vtbl {
   // Can this klass be a primary super?  False for interfaces and arrays of
   // interfaces.  False also for arrays or classes with long super chains.
   bool can_be_primary_super() const {
-    const juint secondary_offset = secondary_super_cache_offset_in_bytes() + sizeof(oopDesc);
+    const juint secondary_offset = in_bytes(secondary_super_cache_offset());
     return super_check_offset() != secondary_offset;
   }
   virtual bool can_be_primary_super_slow() const;
@@ -323,7 +323,7 @@ class Klass : public Klass_vtbl {
     if (!can_be_primary_super()) {
       return primary_super_limit();
     } else {
-      juint d = (super_check_offset() - (primary_supers_offset_in_bytes() + sizeof(oopDesc))) / sizeof(klassOop);
+      juint d = (super_check_offset() - in_bytes(primary_supers_offset())) / sizeof(klassOop);
       assert(d < primary_super_limit(), "oob");
       assert(_primary_supers[d] == as_klassOop(), "proper init");
       return d;
@@ -373,15 +373,15 @@ class Klass : public Klass_vtbl {
   virtual void set_alloc_size(juint n) = 0;
 
   // Compiler support
-  static int super_offset_in_bytes()         { return offset_of(Klass, _super); }
-  static int super_check_offset_offset_in_bytes() { return offset_of(Klass, _super_check_offset); }
-  static int primary_supers_offset_in_bytes(){ return offset_of(Klass, _primary_supers); }
-  static int secondary_super_cache_offset_in_bytes() { return offset_of(Klass, _secondary_super_cache); }
-  static int secondary_supers_offset_in_bytes() { return offset_of(Klass, _secondary_supers); }
-  static int java_mirror_offset_in_bytes()   { return offset_of(Klass, _java_mirror); }
-  static int modifier_flags_offset_in_bytes(){ return offset_of(Klass, _modifier_flags); }
-  static int layout_helper_offset_in_bytes() { return offset_of(Klass, _layout_helper); }
-  static int access_flags_offset_in_bytes()  { return offset_of(Klass, _access_flags); }
+  static ByteSize super_offset()                 { return in_ByteSize(sizeof(klassOopDesc) + offset_of(Klass, _super)); }
+  static ByteSize super_check_offset_offset()    { return in_ByteSize(sizeof(klassOopDesc) + offset_of(Klass, _super_check_offset)); }
+  static ByteSize primary_supers_offset()        { return in_ByteSize(sizeof(klassOopDesc) + offset_of(Klass, _primary_supers)); }
+  static ByteSize secondary_super_cache_offset() { return in_ByteSize(sizeof(klassOopDesc) + offset_of(Klass, _secondary_super_cache)); }
+  static ByteSize secondary_supers_offset()      { return in_ByteSize(sizeof(klassOopDesc) + offset_of(Klass, _secondary_supers)); }
+  static ByteSize java_mirror_offset()           { return in_ByteSize(sizeof(klassOopDesc) + offset_of(Klass, _java_mirror)); }
+  static ByteSize modifier_flags_offset()        { return in_ByteSize(sizeof(klassOopDesc) + offset_of(Klass, _modifier_flags)); }
+  static ByteSize layout_helper_offset()         { return in_ByteSize(sizeof(klassOopDesc) + offset_of(Klass, _layout_helper)); }
+  static ByteSize access_flags_offset()          { return in_ByteSize(sizeof(klassOopDesc) + offset_of(Klass, _access_flags)); }
 
   // Unpacking layout_helper:
   enum {
@@ -478,7 +478,7 @@ class Klass : public Klass_vtbl {
   bool is_subtype_of(klassOop k) const {
     juint    off = k->klass_part()->super_check_offset();
     klassOop sup = *(klassOop*)( (address)as_klassOop() + off );
-    const juint secondary_offset = secondary_super_cache_offset_in_bytes() + sizeof(oopDesc);
+    const juint secondary_offset = in_bytes(secondary_super_cache_offset());
     if (sup == k) {
       return true;
     } else if (off != secondary_offset) {
@@ -674,7 +674,7 @@ class Klass : public Klass_vtbl {
   // are potential problems in setting the bias pattern for
   // JVM-internal oops.
   inline void set_prototype_header(markOop header);
-  static int prototype_header_offset_in_bytes() { return offset_of(Klass, _prototype_header); }
+  static ByteSize prototype_header_offset() { return in_ByteSize(sizeof(klassOopDesc) + offset_of(Klass, _prototype_header)); }
 
   int  biased_lock_revocation_count() const { return (int) _biased_lock_revocation_count; }
   // Atomically increments biased_lock_revocation_count and returns updated value
diff --git a/hotspot/src/share/vm/oops/klassOop.hpp b/hotspot/src/share/vm/oops/klassOop.hpp
index 25dca1d6aaf..f212fc5ba7f 100644
--- a/hotspot/src/share/vm/oops/klassOop.hpp
+++ b/hotspot/src/share/vm/oops/klassOop.hpp
@@ -38,14 +38,8 @@
 
 class klassOopDesc : public oopDesc {
  public:
-  // size operation
-  static int header_size()                       { return sizeof(klassOopDesc)/HeapWordSize; }
-
-  // support for code generation
-  static int klass_part_offset_in_bytes()        { return sizeof(klassOopDesc); }
-
   // returns the Klass part containing dispatching behavior
-  Klass* klass_part() const                      { return (Klass*)((address)this + klass_part_offset_in_bytes()); }
+  Klass* klass_part() const                      { return (Klass*)((address)this + sizeof(klassOopDesc)); }
 
   // Convenience wrapper
   inline oop java_mirror() const;
diff --git a/hotspot/src/share/vm/oops/objArrayKlass.hpp b/hotspot/src/share/vm/oops/objArrayKlass.hpp
index 023f221abf5..44717ec6954 100644
--- a/hotspot/src/share/vm/oops/objArrayKlass.hpp
+++ b/hotspot/src/share/vm/oops/objArrayKlass.hpp
@@ -47,7 +47,7 @@ class objArrayKlass : public arrayKlass {
   oop* bottom_klass_addr()            { return (oop*)&_bottom_klass; }
 
   // Compiler/Interpreter offset
-  static int element_klass_offset_in_bytes() { return offset_of(objArrayKlass, _element_klass); }
+  static ByteSize element_klass_offset() { return in_ByteSize(sizeof(klassOopDesc) + offset_of(objArrayKlass, _element_klass)); }
 
   // Dispatched operation
   bool can_be_primary_super_slow() const;
diff --git a/hotspot/src/share/vm/opto/c2_globals.hpp b/hotspot/src/share/vm/opto/c2_globals.hpp
index e7a0fff72c4..f73dcbde992 100644
--- a/hotspot/src/share/vm/opto/c2_globals.hpp
+++ b/hotspot/src/share/vm/opto/c2_globals.hpp
@@ -426,6 +426,9 @@
   product(bool, EliminateLocks, true,                                       \
           "Coarsen locks when possible")                                    \
                                                                             \
+  product(bool, EliminateNestedLocks, true,                                 \
+          "Eliminate nested locks of the same object when possible")        \
+                                                                            \
   notproduct(bool, PrintLockStatistics, false,                              \
           "Print precise statistics on the dynamic lock usage")             \
                                                                             \
diff --git a/hotspot/src/share/vm/opto/callnode.cpp b/hotspot/src/share/vm/opto/callnode.cpp
index 58d3e3794b4..2811264846e 100644
--- a/hotspot/src/share/vm/opto/callnode.cpp
+++ b/hotspot/src/share/vm/opto/callnode.cpp
@@ -400,10 +400,10 @@ void JVMState::format(PhaseRegAlloc *regalloc, const Node *n, outputStream* st)
       Node *box = mcall->monitor_box(this, i);
       Node *obj = mcall->monitor_obj(this, i);
       if ( OptoReg::is_valid(regalloc->get_reg_first(box)) ) {
-        while( !box->is_BoxLock() )  box = box->in(1);
+        box = BoxLockNode::box_node(box);
         format_helper( regalloc, st, box, "MON-BOX[", i, &scobjs );
       } else {
-        OptoReg::Name box_reg = BoxLockNode::stack_slot(box);
+        OptoReg::Name box_reg = BoxLockNode::reg(box);
         st->print(" MON-BOX%d=%s+%d",
                    i,
                    OptoReg::regname(OptoReg::c_frame_pointer),
@@ -411,8 +411,7 @@ void JVMState::format(PhaseRegAlloc *regalloc, const Node *n, outputStream* st)
       }
       const char* obj_msg = "MON-OBJ[";
       if (EliminateLocks) {
-        while( !box->is_BoxLock() )  box = box->in(1);
-        if (box->as_BoxLock()->is_eliminated())
+        if (BoxLockNode::box_node(box)->is_eliminated())
           obj_msg = "MON-OBJ(LOCK ELIMINATED)[";
       }
       format_helper( regalloc, st, obj, obj_msg, i, &scobjs );
@@ -1387,8 +1386,9 @@ bool AbstractLockNode::find_matching_unlock(const Node* ctrl, LockNode* lock,
     Node *n = ctrl_proj->in(0);
     if (n != NULL && n->is_Unlock()) {
       UnlockNode *unlock = n->as_Unlock();
-      if ((lock->obj_node() == unlock->obj_node()) &&
-          (lock->box_node() == unlock->box_node()) && !unlock->is_eliminated()) {
+      if (lock->obj_node()->eqv_uncast(unlock->obj_node()) &&
+          BoxLockNode::same_slot(lock->box_node(), unlock->box_node()) &&
+          !unlock->is_eliminated()) {
         lock_ops.append(unlock);
         return true;
       }
@@ -1431,8 +1431,8 @@ LockNode *AbstractLockNode::find_matching_lock(UnlockNode* unlock) {
   }
   if (ctrl->is_Lock()) {
     LockNode *lock = ctrl->as_Lock();
-    if ((lock->obj_node() == unlock->obj_node()) &&
-            (lock->box_node() == unlock->box_node())) {
+    if (lock->obj_node()->eqv_uncast(unlock->obj_node()) &&
+        BoxLockNode::same_slot(lock->box_node(), unlock->box_node())) {
       lock_result = lock;
     }
   }
@@ -1462,8 +1462,9 @@ bool AbstractLockNode::find_lock_and_unlock_through_if(Node* node, LockNode* loc
       }
       if (lock1_node != NULL && lock1_node->is_Lock()) {
         LockNode *lock1 = lock1_node->as_Lock();
-        if ((lock->obj_node() == lock1->obj_node()) &&
-            (lock->box_node() == lock1->box_node()) && !lock1->is_eliminated()) {
+        if (lock->obj_node()->eqv_uncast(lock1->obj_node()) &&
+            BoxLockNode::same_slot(lock->box_node(), lock1->box_node()) &&
+            !lock1->is_eliminated()) {
           lock_ops.append(lock1);
           return true;
         }
@@ -1507,19 +1508,16 @@ bool AbstractLockNode::find_unlocks_for_region(const RegionNode* region, LockNod
 void AbstractLockNode::create_lock_counter(JVMState* state) {
   _counter = OptoRuntime::new_named_counter(state, NamedCounter::LockCounter);
 }
-#endif
 
-void AbstractLockNode::set_eliminated() {
-  _eliminate = true;
-#ifndef PRODUCT
+void AbstractLockNode::set_eliminated_lock_counter() {
   if (_counter) {
     // Update the counter to indicate that this lock was eliminated.
     // The counter update code will stay around even though the
     // optimizer will eliminate the lock operation itself.
     _counter->set_tag(NamedCounter::EliminatedLockCounter);
   }
-#endif
 }
+#endif
 
 //=============================================================================
 Node *LockNode::Ideal(PhaseGVN *phase, bool can_reshape) {
@@ -1535,7 +1533,7 @@ Node *LockNode::Ideal(PhaseGVN *phase, bool can_reshape) {
   // prevents macro expansion from expanding the lock.  Since we don't
   // modify the graph, the value returned from this function is the
   // one computed above.
-  if (can_reshape && EliminateLocks && (!is_eliminated() || is_coarsened())) {
+  if (can_reshape && EliminateLocks && !is_non_esc_obj()) {
     //
     // If we are locking an unescaped object, the lock/unlock is unnecessary
     //
@@ -1544,16 +1542,11 @@ Node *LockNode::Ideal(PhaseGVN *phase, bool can_reshape) {
     if (cgr != NULL)
       es = cgr->escape_state(obj_node());
     if (es != PointsToNode::UnknownEscape && es != PointsToNode::GlobalEscape) {
-      if (!is_eliminated()) {
-        // Mark it eliminated to update any counters
-        this->set_eliminated();
-      } else {
-        assert(is_coarsened(), "sanity");
-        // The lock could be marked eliminated by lock coarsening
-        // code during first IGVN before EA. Clear coarsened flag
-        // to eliminate all associated locks/unlocks.
-        this->clear_coarsened();
-      }
+      assert(!is_eliminated() || is_coarsened(), "sanity");
+      // The lock could be marked eliminated by lock coarsening
+      // code during first IGVN before EA. Replace coarsened flag
+      // to eliminate all associated locks/unlocks.
+      this->set_non_esc_obj();
       return result;
     }
 
@@ -1613,8 +1606,7 @@ Node *LockNode::Ideal(PhaseGVN *phase, bool can_reshape) {
         for (int i = 0; i < lock_ops.length(); i++) {
           AbstractLockNode* lock = lock_ops.at(i);
 
-          // Mark it eliminated to update any counters
-          lock->set_eliminated();
+          // Mark it eliminated by coarsening and update any counters
           lock->set_coarsened();
         }
       } else if (ctrl->is_Region() &&
@@ -1631,6 +1623,40 @@ Node *LockNode::Ideal(PhaseGVN *phase, bool can_reshape) {
   return result;
 }
 
+//=============================================================================
+bool LockNode::is_nested_lock_region() {
+  BoxLockNode* box = box_node()->as_BoxLock();
+  int stk_slot = box->stack_slot();
+  if (stk_slot <= 0)
+    return false; // External lock or it is not Box (Phi node).
+
+  // Ignore complex cases: merged locks or multiple locks.
+  Node* obj = obj_node();
+  LockNode* unique_lock = NULL;
+  if (!box->is_simple_lock_region(&unique_lock, obj) ||
+      (unique_lock != this)) {
+    return false;
+  }
+
+  // Look for external lock for the same object.
+  SafePointNode* sfn = this->as_SafePoint();
+  JVMState* youngest_jvms = sfn->jvms();
+  int max_depth = youngest_jvms->depth();
+  for (int depth = 1; depth <= max_depth; depth++) {
+    JVMState* jvms = youngest_jvms->of_depth(depth);
+    int num_mon  = jvms->nof_monitors();
+    // Loop over monitors
+    for (int idx = 0; idx < num_mon; idx++) {
+      Node* obj_node = sfn->monitor_obj(jvms, idx);
+      BoxLockNode* box_node = sfn->monitor_box(jvms, idx)->as_BoxLock();
+      if ((box_node->stack_slot() < stk_slot) && obj_node->eqv_uncast(obj)) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
 //=============================================================================
 uint UnlockNode::size_of() const { return sizeof(*this); }
 
@@ -1649,7 +1675,7 @@ Node *UnlockNode::Ideal(PhaseGVN *phase, bool can_reshape) {
   // modify the graph, the value returned from this function is the
   // one computed above.
   // Escape state is defined after Parse phase.
-  if (can_reshape && EliminateLocks && (!is_eliminated() || is_coarsened())) {
+  if (can_reshape && EliminateLocks && !is_non_esc_obj()) {
     //
     // If we are unlocking an unescaped object, the lock/unlock is unnecessary.
     //
@@ -1658,16 +1684,11 @@ Node *UnlockNode::Ideal(PhaseGVN *phase, bool can_reshape) {
     if (cgr != NULL)
       es = cgr->escape_state(obj_node());
     if (es != PointsToNode::UnknownEscape && es != PointsToNode::GlobalEscape) {
-      if (!is_eliminated()) {
-        // Mark it eliminated to update any counters
-        this->set_eliminated();
-      } else {
-        assert(is_coarsened(), "sanity");
-        // The lock could be marked eliminated by lock coarsening
-        // code during first IGVN before EA. Clear coarsened flag
-        // to eliminate all associated locks/unlocks.
-        this->clear_coarsened();
-      }
+      assert(!is_eliminated() || is_coarsened(), "sanity");
+      // The lock could be marked eliminated by lock coarsening
+      // code during first IGVN before EA. Replace coarsened flag
+      // to eliminate all associated locks/unlocks.
+      this->set_non_esc_obj();
     }
   }
   return result;
diff --git a/hotspot/src/share/vm/opto/callnode.hpp b/hotspot/src/share/vm/opto/callnode.hpp
index 9e9b3426099..fa9af0cb9a3 100644
--- a/hotspot/src/share/vm/opto/callnode.hpp
+++ b/hotspot/src/share/vm/opto/callnode.hpp
@@ -791,6 +791,10 @@ public:
   // are defined in graphKit.cpp, which sets up the bidirectional relation.)
   InitializeNode* initialization();
 
+  // Return the corresponding storestore barrier (or null if none).
+  // Walks out edges to find it...
+  MemBarStoreStoreNode* storestore();
+
   // Convenience for initialization->maybe_set_complete(phase)
   bool maybe_set_complete(PhaseGVN* phase);
 };
@@ -836,8 +840,12 @@ public:
 //------------------------------AbstractLockNode-----------------------------------
 class AbstractLockNode: public CallNode {
 private:
-  bool _eliminate;    // indicates this lock can be safely eliminated
-  bool _coarsened;    // indicates this lock was coarsened
+  enum {
+    Regular = 0,  // Normal lock
+    NonEscObj,    // Lock is used for non escaping object
+    Coarsened,    // Lock was coarsened
+    Nested        // Nested lock
+  } _kind;
 #ifndef PRODUCT
   NamedCounter* _counter;
 #endif
@@ -854,12 +862,13 @@ protected:
                                GrowableArray<AbstractLockNode*> &lock_ops);
   LockNode *find_matching_lock(UnlockNode* unlock);
 
+  // Update the counter to indicate that this lock was eliminated.
+  void set_eliminated_lock_counter() PRODUCT_RETURN;
 
 public:
   AbstractLockNode(const TypeFunc *tf)
     : CallNode(tf, NULL, TypeRawPtr::BOTTOM),
-      _coarsened(false),
-      _eliminate(false)
+      _kind(Regular)
   {
 #ifndef PRODUCT
     _counter = NULL;
@@ -869,20 +878,23 @@ public:
   Node *   obj_node() const       {return in(TypeFunc::Parms + 0); }
   Node *   box_node() const       {return in(TypeFunc::Parms + 1); }
   Node *   fastlock_node() const  {return in(TypeFunc::Parms + 2); }
+  void     set_box_node(Node* box) { set_req(TypeFunc::Parms + 1, box); }
+
   const Type *sub(const Type *t1, const Type *t2) const { return TypeInt::CC;}
 
   virtual uint size_of() const { return sizeof(*this); }
 
-  bool is_eliminated()         {return _eliminate; }
-  // mark node as eliminated and update the counter if there is one
-  void set_eliminated();
+  bool is_eliminated()  const { return (_kind != Regular); }
+  bool is_non_esc_obj() const { return (_kind == NonEscObj); }
+  bool is_coarsened()   const { return (_kind == Coarsened); }
+  bool is_nested()      const { return (_kind == Nested); }
 
-  bool is_coarsened()  { return _coarsened; }
-  void set_coarsened() { _coarsened = true; }
-  void clear_coarsened() { _coarsened = false; }
+  void set_non_esc_obj() { _kind = NonEscObj; set_eliminated_lock_counter(); }
+  void set_coarsened()   { _kind = Coarsened; set_eliminated_lock_counter(); }
+  void set_nested()      { _kind = Nested; set_eliminated_lock_counter(); }
 
   // locking does not modify its arguments
-  virtual bool        may_modify(const TypePtr *addr_t, PhaseTransform *phase){ return false;}
+  virtual bool may_modify(const TypePtr *addr_t, PhaseTransform *phase){ return false;}
 
 #ifndef PRODUCT
   void create_lock_counter(JVMState* s);
@@ -932,6 +944,8 @@ public:
   virtual void  clone_jvms() {
     set_jvms(jvms()->clone_deep(Compile::current()));
   }
+
+  bool is_nested_lock_region(); // Is this Lock nested?
 };
 
 //------------------------------Unlock---------------------------------------
diff --git a/hotspot/src/share/vm/opto/cfgnode.cpp b/hotspot/src/share/vm/opto/cfgnode.cpp
index c2f35afab84..4f01e85484e 100644
--- a/hotspot/src/share/vm/opto/cfgnode.cpp
+++ b/hotspot/src/share/vm/opto/cfgnode.cpp
@@ -1597,7 +1597,7 @@ Node *PhiNode::Ideal(PhaseGVN *phase, bool can_reshape) {
       bool is_loop = (r->is_Loop() && r->req() == 3);
       // Then, check if there is a data loop when phi references itself directly
       // or through other data nodes.
-      if (is_loop && !phase->eqv_uncast(uin, in(LoopNode::EntryControl)) ||
+      if (is_loop && !uin->eqv_uncast(in(LoopNode::EntryControl)) ||
          !is_loop && is_unsafe_data_reference(uin)) {
         // Break this data loop to avoid creation of a dead loop.
         if (can_reshape) {
diff --git a/hotspot/src/share/vm/opto/chaitin.hpp b/hotspot/src/share/vm/opto/chaitin.hpp
index 999a7922709..1e6be63c452 100644
--- a/hotspot/src/share/vm/opto/chaitin.hpp
+++ b/hotspot/src/share/vm/opto/chaitin.hpp
@@ -485,7 +485,11 @@ private:
     return yank_if_dead(old, current_block, &value, &regnd);
   }
 
-  int yank_if_dead( Node *old, Block *current_block, Node_List *value, Node_List *regnd );
+  int yank_if_dead( Node *old, Block *current_block, Node_List *value, Node_List *regnd ) {
+    return yank_if_dead_recurse(old, old, current_block, value, regnd);
+  }
+  int yank_if_dead_recurse(Node *old, Node *orig_old, Block *current_block,
+                           Node_List *value, Node_List *regnd);
   int yank( Node *old, Block *current_block, Node_List *value, Node_List *regnd );
   int elide_copy( Node *n, int k, Block *current_block, Node_List &value, Node_List &regnd, bool can_change_regs );
   int use_prior_register( Node *copy, uint idx, Node *def, Block *current_block, Node_List &value, Node_List &regnd );
diff --git a/hotspot/src/share/vm/opto/classes.hpp b/hotspot/src/share/vm/opto/classes.hpp
index dbece7611c1..19ade906999 100644
--- a/hotspot/src/share/vm/opto/classes.hpp
+++ b/hotspot/src/share/vm/opto/classes.hpp
@@ -166,6 +166,7 @@ macro(MemBarCPUOrder)
 macro(MemBarRelease)
 macro(MemBarReleaseLock)
 macro(MemBarVolatile)
+macro(MemBarStoreStore)
 macro(MergeMem)
 macro(MinI)
 macro(ModD)
diff --git a/hotspot/src/share/vm/opto/compile.cpp b/hotspot/src/share/vm/opto/compile.cpp
index 721d1eedc60..2b9051c9032 100644
--- a/hotspot/src/share/vm/opto/compile.cpp
+++ b/hotspot/src/share/vm/opto/compile.cpp
@@ -1282,12 +1282,11 @@ const TypePtr *Compile::flatten_alias_type( const TypePtr *tj ) const {
   if( tk ) {
     // If we are referencing a field within a Klass, we need
     // to assume the worst case of an Object.  Both exact and
-    // inexact types must flatten to the same alias class.
-    // Since the flattened result for a klass is defined to be
-    // precisely java.lang.Object, use a constant ptr.
+    // inexact types must flatten to the same alias class so
+    // use NotNull as the PTR.
     if ( offset == Type::OffsetBot || (offset >= 0 && (size_t)offset < sizeof(Klass)) ) {
 
-      tj = tk = TypeKlassPtr::make(TypePtr::Constant,
+      tj = tk = TypeKlassPtr::make(TypePtr::NotNull,
                                    TypeKlassPtr::OBJECT->klass(),
                                    offset);
     }
@@ -1307,10 +1306,12 @@ const TypePtr *Compile::flatten_alias_type( const TypePtr *tj ) const {
     // these 2 disparate memories into the same alias class.  Since the
     // primary supertype array is read-only, there's no chance of confusion
     // where we bypass an array load and an array store.
-    uint off2 = offset - Klass::primary_supers_offset_in_bytes();
-    if( offset == Type::OffsetBot ||
-        off2 < Klass::primary_super_limit()*wordSize ) {
-      offset = sizeof(oopDesc) +Klass::secondary_super_cache_offset_in_bytes();
+    int primary_supers_offset = in_bytes(Klass::primary_supers_offset());
+    if (offset == Type::OffsetBot ||
+        (offset >= primary_supers_offset &&
+         offset < (int)(primary_supers_offset + Klass::primary_super_limit() * wordSize)) ||
+        offset == (int)in_bytes(Klass::secondary_super_cache_offset())) {
+      offset = in_bytes(Klass::secondary_super_cache_offset());
       tj = tk = TypeKlassPtr::make( TypePtr::NotNull, tk->klass(), offset );
     }
   }
@@ -1489,13 +1490,13 @@ Compile::AliasType* Compile::find_alias_type(const TypePtr* adr_type, bool no_cr
         alias_type(idx)->set_rewritable(false);
     }
     if (flat->isa_klassptr()) {
-      if (flat->offset() == Klass::super_check_offset_offset_in_bytes() + (int)sizeof(oopDesc))
+      if (flat->offset() == in_bytes(Klass::super_check_offset_offset()))
         alias_type(idx)->set_rewritable(false);
-      if (flat->offset() == Klass::modifier_flags_offset_in_bytes() + (int)sizeof(oopDesc))
+      if (flat->offset() == in_bytes(Klass::modifier_flags_offset()))
         alias_type(idx)->set_rewritable(false);
-      if (flat->offset() == Klass::access_flags_offset_in_bytes() + (int)sizeof(oopDesc))
+      if (flat->offset() == in_bytes(Klass::access_flags_offset()))
         alias_type(idx)->set_rewritable(false);
-      if (flat->offset() == Klass::java_mirror_offset_in_bytes() + (int)sizeof(oopDesc))
+      if (flat->offset() == in_bytes(Klass::java_mirror_offset()))
         alias_type(idx)->set_rewritable(false);
     }
     // %%% (We would like to finalize JavaThread::threadObj_offset(),
@@ -2521,7 +2522,7 @@ static void final_graph_reshaping_impl( Node *n, Final_Reshape_Counts &frc ) {
             break;
           }
         }
-        assert(p != NULL, "must be found");
+        assert(proj != NULL, "must be found");
         p->subsume_by(proj);
       }
     }
diff --git a/hotspot/src/share/vm/opto/escape.cpp b/hotspot/src/share/vm/opto/escape.cpp
index 22ac9a7ec6f..514a7acbb40 100644
--- a/hotspot/src/share/vm/opto/escape.cpp
+++ b/hotspot/src/share/vm/opto/escape.cpp
@@ -1595,6 +1595,7 @@ bool ConnectionGraph::compute_escape() {
   GrowableArray<Node*> alloc_worklist;
   GrowableArray<Node*> addp_worklist;
   GrowableArray<Node*> ptr_cmp_worklist;
+  GrowableArray<Node*> storestore_worklist;
   PhaseGVN* igvn = _igvn;
 
   // Push all useful nodes onto CG list and set their type.
@@ -1618,6 +1619,11 @@ bool ConnectionGraph::compute_escape() {
                (n->Opcode() == Op_CmpP || n->Opcode() == Op_CmpN)) {
       // Compare pointers nodes
       ptr_cmp_worklist.append(n);
+    } else if (n->is_MemBarStoreStore()) {
+      // Collect all MemBarStoreStore nodes so that depending on the
+      // escape status of the associated Allocate node some of them
+      // may be eliminated.
+      storestore_worklist.append(n);
     }
     for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
       Node* m = n->fast_out(i);   // Get user
@@ -1724,11 +1730,20 @@ bool ConnectionGraph::compute_escape() {
   uint alloc_length = alloc_worklist.length();
   for (uint next = 0; next < alloc_length; ++next) {
     Node* n = alloc_worklist.at(next);
-    if (ptnode_adr(n->_idx)->escape_state() == PointsToNode::NoEscape) {
+    PointsToNode::EscapeState es = ptnode_adr(n->_idx)->escape_state();
+    if (es == PointsToNode::NoEscape) {
       has_non_escaping_obj = true;
       if (n->is_Allocate()) {
         find_init_values(n, &visited, igvn);
+        // The object allocated by this Allocate node will never be
+        // seen by an other thread. Mark it so that when it is
+        // expanded no MemBarStoreStore is added.
+        n->as_Allocate()->initialization()->set_does_not_escape();
       }
+    } else if ((es == PointsToNode::ArgEscape) && n->is_Allocate()) {
+      // Same as above. Mark this Allocate node so that when it is
+      // expanded no MemBarStoreStore is added.
+      n->as_Allocate()->initialization()->set_does_not_escape();
     }
   }
 
@@ -1827,20 +1842,15 @@ bool ConnectionGraph::compute_escape() {
       Node *n = C->macro_node(i);
       if (n->is_AbstractLock()) { // Lock and Unlock nodes
         AbstractLockNode* alock = n->as_AbstractLock();
-        if (!alock->is_eliminated() || alock->is_coarsened()) {
+        if (!alock->is_non_esc_obj()) {
           PointsToNode::EscapeState es = escape_state(alock->obj_node());
           assert(es != PointsToNode::UnknownEscape, "should know");
           if (es != PointsToNode::UnknownEscape && es != PointsToNode::GlobalEscape) {
-            if (!alock->is_eliminated()) {
-              // Mark it eliminated to update any counters
-              alock->set_eliminated();
-            } else {
-              // The lock could be marked eliminated by lock coarsening
-              // code during first IGVN before EA. Clear coarsened flag
-              // to eliminate all associated locks/unlocks and relock
-              // during deoptimization.
-              alock->clear_coarsened();
-            }
+            assert(!alock->is_eliminated() || alock->is_coarsened(), "sanity");
+            // The lock could be marked eliminated by lock coarsening
+            // code during first IGVN before EA. Replace coarsened flag
+            // to eliminate all associated locks/unlocks.
+            alock->set_non_esc_obj();
           }
         }
       }
@@ -1874,6 +1884,25 @@ bool ConnectionGraph::compute_escape() {
       igvn->hash_delete(_pcmp_eq);
   }
 
+  // For MemBarStoreStore nodes added in library_call.cpp, check
+  // escape status of associated AllocateNode and optimize out
+  // MemBarStoreStore node if the allocated object never escapes.
+  while (storestore_worklist.length() != 0) {
+    Node *n = storestore_worklist.pop();
+    MemBarStoreStoreNode *storestore = n ->as_MemBarStoreStore();
+    Node *alloc = storestore->in(MemBarNode::Precedent)->in(0);
+    assert (alloc->is_Allocate(), "storestore should point to AllocateNode");
+    PointsToNode::EscapeState es = ptnode_adr(alloc->_idx)->escape_state();
+    if (es == PointsToNode::NoEscape || es == PointsToNode::ArgEscape) {
+      MemBarNode* mb = MemBarNode::make(C, Op_MemBarCPUOrder, Compile::AliasIdxBot);
+      mb->init_req(TypeFunc::Memory, storestore->in(TypeFunc::Memory));
+      mb->init_req(TypeFunc::Control, storestore->in(TypeFunc::Control));
+
+      _igvn->register_new_node_with_optimizer(mb);
+      _igvn->replace_node(storestore, mb);
+    }
+  }
+
 #ifndef PRODUCT
   if (PrintEscapeAnalysis) {
     dump(); // Dump ConnectionGraph
diff --git a/hotspot/src/share/vm/opto/graphKit.cpp b/hotspot/src/share/vm/opto/graphKit.cpp
index 49717298a14..5658960799d 100644
--- a/hotspot/src/share/vm/opto/graphKit.cpp
+++ b/hotspot/src/share/vm/opto/graphKit.cpp
@@ -2304,9 +2304,9 @@ Node* GraphKit::gen_subtype_check(Node* subklass, Node* superklass) {
   // will always succeed.  We could leave a dependency behind to ensure this.
 
   // First load the super-klass's check-offset
-  Node *p1 = basic_plus_adr( superklass, superklass, sizeof(oopDesc) + Klass::super_check_offset_offset_in_bytes() );
+  Node *p1 = basic_plus_adr( superklass, superklass, in_bytes(Klass::super_check_offset_offset()) );
   Node *chk_off = _gvn.transform( new (C, 3) LoadINode( NULL, memory(p1), p1, _gvn.type(p1)->is_ptr() ) );
-  int cacheoff_con = sizeof(oopDesc) + Klass::secondary_super_cache_offset_in_bytes();
+  int cacheoff_con = in_bytes(Klass::secondary_super_cache_offset());
   bool might_be_cache = (find_int_con(chk_off, cacheoff_con) == cacheoff_con);
 
   // Load from the sub-klass's super-class display list, or a 1-word cache of
@@ -2934,7 +2934,7 @@ Node* GraphKit::get_layout_helper(Node* klass_node, jint& constant_value) {
     }
   }
   constant_value = Klass::_lh_neutral_value;  // put in a known value
-  Node* lhp = basic_plus_adr(klass_node, klass_node, Klass::layout_helper_offset_in_bytes() + sizeof(oopDesc));
+  Node* lhp = basic_plus_adr(klass_node, klass_node, in_bytes(Klass::layout_helper_offset()));
   return make_load(NULL, lhp, TypeInt::INT, T_INT);
 }
 
@@ -3337,6 +3337,19 @@ InitializeNode* AllocateNode::initialization() {
   return NULL;
 }
 
+// Trace Allocate -> Proj[Parm] -> MemBarStoreStore
+MemBarStoreStoreNode* AllocateNode::storestore() {
+  ProjNode* rawoop = proj_out(AllocateNode::RawAddress);
+  if (rawoop == NULL)  return NULL;
+  for (DUIterator_Fast imax, i = rawoop->fast_outs(imax); i < imax; i++) {
+    Node* storestore = rawoop->fast_out(i);
+    if (storestore->is_MemBarStoreStore()) {
+      return storestore->as_MemBarStoreStore();
+    }
+  }
+  return NULL;
+}
+
 //----------------------------- loop predicates ---------------------------
 
 //------------------------------add_predicate_impl----------------------------
diff --git a/hotspot/src/share/vm/opto/library_call.cpp b/hotspot/src/share/vm/opto/library_call.cpp
index d5ad00e16e0..ebc3a2dd9f4 100644
--- a/hotspot/src/share/vm/opto/library_call.cpp
+++ b/hotspot/src/share/vm/opto/library_call.cpp
@@ -819,7 +819,7 @@ inline Node* LibraryCallKit::generate_limit_guard(Node* offset,
   if (stopped())
     return NULL;                // already stopped
   bool zero_offset = _gvn.type(offset) == TypeInt::ZERO;
-  if (zero_offset && _gvn.eqv_uncast(subseq_length, array_length))
+  if (zero_offset && subseq_length->eqv_uncast(array_length))
     return NULL;                // common case of whole-array copy
   Node* last = subseq_length;
   if (!zero_offset)             // last += offset
@@ -2165,8 +2165,7 @@ void LibraryCallKit::insert_g1_pre_barrier(Node* base_oop, Node* offset, Node* p
   IdealKit ideal(this);
 #define __ ideal.
 
-  const int reference_type_offset = instanceKlass::reference_type_offset_in_bytes() +
-                                        sizeof(oopDesc);
+  const int reference_type_offset = in_bytes(instanceKlass::reference_type_offset());
 
   Node* referent_off = __ ConX(java_lang_ref_Reference::referent_offset);
 
@@ -2806,8 +2805,10 @@ bool LibraryCallKit::inline_unsafe_allocate() {
   // Note:  The argument might still be an illegal value like
   // Serializable.class or Object[].class.   The runtime will handle it.
   // But we must make an explicit check for initialization.
-  Node* insp = basic_plus_adr(kls, instanceKlass::init_state_offset_in_bytes() + sizeof(oopDesc));
-  Node* inst = make_load(NULL, insp, TypeInt::INT, T_INT);
+  Node* insp = basic_plus_adr(kls, in_bytes(instanceKlass::init_state_offset()));
+  // Use T_BOOLEAN for instanceKlass::_init_state so the compiler
+  // can generate code to load it as unsigned byte.
+  Node* inst = make_load(NULL, insp, TypeInt::UBYTE, T_BOOLEAN);
   Node* bits = intcon(instanceKlass::fully_initialized);
   Node* test = _gvn.transform( new (C, 3) SubINode(inst, bits) );
   // The 'test' is non-zero if we need to take a slow path.
@@ -2954,7 +2955,7 @@ bool LibraryCallKit::inline_native_isInterrupted() {
 //---------------------------load_mirror_from_klass----------------------------
 // Given a klass oop, load its java mirror (a java.lang.Class oop).
 Node* LibraryCallKit::load_mirror_from_klass(Node* klass) {
-  Node* p = basic_plus_adr(klass, Klass::java_mirror_offset_in_bytes() + sizeof(oopDesc));
+  Node* p = basic_plus_adr(klass, in_bytes(Klass::java_mirror_offset()));
   return make_load(NULL, p, TypeInstPtr::MIRROR, T_OBJECT);
 }
 
@@ -2994,7 +2995,7 @@ Node* LibraryCallKit::load_klass_from_mirror_common(Node* mirror,
 Node* LibraryCallKit::generate_access_flags_guard(Node* kls, int modifier_mask, int modifier_bits, RegionNode* region) {
   // Branch around if the given klass has the given modifier bit set.
   // Like generate_guard, adds a new path onto the region.
-  Node* modp = basic_plus_adr(kls, Klass::access_flags_offset_in_bytes() + sizeof(oopDesc));
+  Node* modp = basic_plus_adr(kls, in_bytes(Klass::access_flags_offset()));
   Node* mods = make_load(NULL, modp, TypeInt::INT, T_INT);
   Node* mask = intcon(modifier_mask);
   Node* bits = intcon(modifier_bits);
@@ -3115,7 +3116,7 @@ bool LibraryCallKit::inline_native_Class_query(vmIntrinsics::ID id) {
     break;
 
   case vmIntrinsics::_getModifiers:
-    p = basic_plus_adr(kls, Klass::modifier_flags_offset_in_bytes() + sizeof(oopDesc));
+    p = basic_plus_adr(kls, in_bytes(Klass::modifier_flags_offset()));
     query_value = make_load(NULL, p, TypeInt::INT, T_INT);
     break;
 
@@ -3155,7 +3156,7 @@ bool LibraryCallKit::inline_native_Class_query(vmIntrinsics::ID id) {
       // A guard was added.  If the guard is taken, it was an array.
       phi->add_req(makecon(TypeInstPtr::make(env()->Object_klass()->java_mirror())));
     // If we fall through, it's a plain class.  Get its _super.
-    p = basic_plus_adr(kls, Klass::super_offset_in_bytes() + sizeof(oopDesc));
+    p = basic_plus_adr(kls, in_bytes(Klass::super_offset()));
     kls = _gvn.transform( LoadKlassNode::make(_gvn, immutable_memory(), p, TypeRawPtr::BOTTOM, TypeKlassPtr::OBJECT_OR_NULL) );
     null_ctl = top();
     kls = null_check_oop(kls, &null_ctl);
@@ -3173,7 +3174,7 @@ bool LibraryCallKit::inline_native_Class_query(vmIntrinsics::ID id) {
     if (generate_array_guard(kls, region) != NULL) {
       // Be sure to pin the oop load to the guard edge just created:
       Node* is_array_ctrl = region->in(region->req()-1);
-      Node* cma = basic_plus_adr(kls, in_bytes(arrayKlass::component_mirror_offset()) + sizeof(oopDesc));
+      Node* cma = basic_plus_adr(kls, in_bytes(arrayKlass::component_mirror_offset()));
       Node* cmo = make_load(is_array_ctrl, cma, TypeInstPtr::MIRROR, T_OBJECT);
       phi->add_req(cmo);
     }
@@ -3181,7 +3182,7 @@ bool LibraryCallKit::inline_native_Class_query(vmIntrinsics::ID id) {
     break;
 
   case vmIntrinsics::_getClassAccessFlags:
-    p = basic_plus_adr(kls, Klass::access_flags_offset_in_bytes() + sizeof(oopDesc));
+    p = basic_plus_adr(kls, in_bytes(Klass::access_flags_offset()));
     query_value = make_load(NULL, p, TypeInt::INT, T_INT);
     break;
 
@@ -4194,12 +4195,17 @@ void LibraryCallKit::copy_to_clone(Node* obj, Node* alloc_obj, Node* obj_size, b
   Node* raw_obj = alloc_obj->in(1);
   assert(alloc_obj->is_CheckCastPP() && raw_obj->is_Proj() && raw_obj->in(0)->is_Allocate(), "");
 
+  AllocateNode* alloc = NULL;
   if (ReduceBulkZeroing) {
     // We will be completely responsible for initializing this object -
     // mark Initialize node as complete.
-    AllocateNode* alloc = AllocateNode::Ideal_allocation(alloc_obj, &_gvn);
+    alloc = AllocateNode::Ideal_allocation(alloc_obj, &_gvn);
     // The object was just allocated - there should be no any stores!
     guarantee(alloc != NULL && alloc->maybe_set_complete(&_gvn), "");
+    // Mark as complete_with_arraycopy so that on AllocateNode
+    // expansion, we know this AllocateNode is initialized by an array
+    // copy and a StoreStore barrier exists after the array copy.
+    alloc->initialization()->set_complete_with_arraycopy();
   }
 
   // Copy the fastest available way.
@@ -4261,7 +4267,18 @@ void LibraryCallKit::copy_to_clone(Node* obj, Node* alloc_obj, Node* obj_size, b
   }
 
   // Do not let reads from the cloned object float above the arraycopy.
-  insert_mem_bar(Op_MemBarCPUOrder);
+  if (alloc != NULL) {
+    // Do not let stores that initialize this object be reordered with
+    // a subsequent store that would make this object accessible by
+    // other threads.
+    // Record what AllocateNode this StoreStore protects so that
+    // escape analysis can go from the MemBarStoreStoreNode to the
+    // AllocateNode and eliminate the MemBarStoreStoreNode if possible
+    // based on the escape status of the AllocateNode.
+    insert_mem_bar(Op_MemBarStoreStore, alloc->proj_out(AllocateNode::RawAddress));
+  } else {
+    insert_mem_bar(Op_MemBarCPUOrder);
+  }
 }
 
 //------------------------inline_native_clone----------------------------
@@ -4650,7 +4667,7 @@ LibraryCallKit::generate_arraycopy(const TypePtr* adr_type,
   if (ReduceBulkZeroing
       && !ZeroTLAB              // pointless if already zeroed
       && basic_elem_type != T_CONFLICT // avoid corner case
-      && !_gvn.eqv_uncast(src, dest)
+      && !src->eqv_uncast(dest)
       && ((alloc = tightly_coupled_allocation(dest, slow_region))
           != NULL)
       && _gvn.find_int_con(alloc->in(AllocateNode::ALength), 1) > 0
@@ -4728,7 +4745,7 @@ LibraryCallKit::generate_arraycopy(const TypePtr* adr_type,
     // copy_length is 0.
     if (!stopped() && dest_uninitialized) {
       Node* dest_length = alloc->in(AllocateNode::ALength);
-      if (_gvn.eqv_uncast(copy_length, dest_length)
+      if (copy_length->eqv_uncast(dest_length)
           || _gvn.find_int_con(dest_length, 1) <= 0) {
         // There is no zeroing to do. No need for a secondary raw memory barrier.
       } else {
@@ -4774,7 +4791,7 @@ LibraryCallKit::generate_arraycopy(const TypePtr* adr_type,
     // with its attendant messy index arithmetic, and upgrade
     // the copy to a more hardware-friendly word size of 64 bits.
     Node* tail_ctl = NULL;
-    if (!stopped() && !_gvn.eqv_uncast(dest_tail, dest_length)) {
+    if (!stopped() && !dest_tail->eqv_uncast(dest_length)) {
       Node* cmp_lt   = _gvn.transform( new(C,3) CmpINode(dest_tail, dest_length) );
       Node* bol_lt   = _gvn.transform( new(C,2) BoolNode(cmp_lt, BoolTest::lt) );
       tail_ctl = generate_slow_guard(bol_lt, NULL);
@@ -4857,7 +4874,7 @@ LibraryCallKit::generate_arraycopy(const TypePtr* adr_type,
       PreserveJVMState pjvms(this);
       set_control(not_subtype_ctrl);
       // (At this point we can assume disjoint_bases, since types differ.)
-      int ek_offset = objArrayKlass::element_klass_offset_in_bytes() + sizeof(oopDesc);
+      int ek_offset = in_bytes(objArrayKlass::element_klass_offset());
       Node* p1 = basic_plus_adr(dest_klass, ek_offset);
       Node* n1 = LoadKlassNode::make(_gvn, immutable_memory(), p1, TypeRawPtr::BOTTOM);
       Node* dest_elem_klass = _gvn.transform(n1);
@@ -5004,7 +5021,16 @@ LibraryCallKit::generate_arraycopy(const TypePtr* adr_type,
   // the membar also.
   //
   // Do not let reads from the cloned object float above the arraycopy.
-  if (InsertMemBarAfterArraycopy || alloc != NULL)
+  if (alloc != NULL) {
+    // Do not let stores that initialize this object be reordered with
+    // a subsequent store that would make this object accessible by
+    // other threads.
+    // Record what AllocateNode this StoreStore protects so that
+    // escape analysis can go from the MemBarStoreStoreNode to the
+    // AllocateNode and eliminate the MemBarStoreStoreNode if possible
+    // based on the escape status of the AllocateNode.
+    insert_mem_bar(Op_MemBarStoreStore, alloc->proj_out(AllocateNode::RawAddress));
+  } else if (InsertMemBarAfterArraycopy)
     insert_mem_bar(Op_MemBarCPUOrder);
 }
 
@@ -5308,7 +5334,7 @@ LibraryCallKit::generate_checkcast_arraycopy(const TypePtr* adr_type,
   // for the target array.  This is an optimistic check.  It will
   // look in each non-null element's class, at the desired klass's
   // super_check_offset, for the desired klass.
-  int sco_offset = Klass::super_check_offset_offset_in_bytes() + sizeof(oopDesc);
+  int sco_offset = in_bytes(Klass::super_check_offset_offset());
   Node* p3 = basic_plus_adr(dest_elem_klass, sco_offset);
   Node* n3 = new(C, 3) LoadINode(NULL, memory(p3), p3, _gvn.type(p3)->is_ptr());
   Node* check_offset = ConvI2X(_gvn.transform(n3));
diff --git a/hotspot/src/share/vm/opto/locknode.cpp b/hotspot/src/share/vm/opto/locknode.cpp
index 23a594e134a..4147ed33dc7 100644
--- a/hotspot/src/share/vm/opto/locknode.cpp
+++ b/hotspot/src/share/vm/opto/locknode.cpp
@@ -49,18 +49,22 @@ BoxLockNode::BoxLockNode( int slot ) : Node( Compile::current()->root() ),
 
 //-----------------------------hash--------------------------------------------
 uint BoxLockNode::hash() const {
+  if (EliminateNestedLocks)
+    return NO_HASH; // Each locked region has own BoxLock node
   return Node::hash() + _slot + (_is_eliminated ? Compile::current()->fixed_slots() : 0);
 }
 
 //------------------------------cmp--------------------------------------------
 uint BoxLockNode::cmp( const Node &n ) const {
+  if (EliminateNestedLocks)
+    return (&n == this); // Always fail except on self
   const BoxLockNode &bn = (const BoxLockNode &)n;
   return bn._slot == _slot && bn._is_eliminated == _is_eliminated;
 }
 
-OptoReg::Name BoxLockNode::stack_slot(Node* box_node) {
-  // Chase down the BoxNode
-  while (!box_node->is_BoxLock()) {
+BoxLockNode* BoxLockNode::box_node(Node* box) {
+  // Chase down the BoxNode after RA which may spill box nodes.
+  while (!box->is_BoxLock()) {
     //    if (box_node->is_SpillCopy()) {
     //      Node *m = box_node->in(1);
     //      if (m->is_Mach() && m->as_Mach()->ideal_Opcode() == Op_StoreP) {
@@ -68,10 +72,64 @@ OptoReg::Name BoxLockNode::stack_slot(Node* box_node) {
     //        continue;
     //      }
     //    }
-    assert(box_node->is_SpillCopy() || box_node->is_Phi(), "Bad spill of Lock.");
-    box_node = box_node->in(1);
+    assert(box->is_SpillCopy() || box->is_Phi(), "Bad spill of Lock.");
+    // Only BoxLock nodes with the same stack slot are merged.
+    // So it is enough to trace one path to find the slot value.
+    box = box->in(1);
   }
-  return box_node->in_RegMask(0).find_first_elem();
+  return box->as_BoxLock();
+}
+
+OptoReg::Name BoxLockNode::reg(Node* box) {
+  return box_node(box)->in_RegMask(0).find_first_elem();
+}
+
+// Is BoxLock node used for one simple lock region (same box and obj)?
+bool BoxLockNode::is_simple_lock_region(LockNode** unique_lock, Node* obj) {
+  LockNode* lock = NULL;
+  bool has_one_lock = false;
+  for (uint i = 0; i < this->outcnt(); i++) {
+    Node* n = this->raw_out(i);
+    assert(!n->is_Phi(), "should not merge BoxLock nodes");
+    if (n->is_AbstractLock()) {
+      AbstractLockNode* alock = n->as_AbstractLock();
+      // Check lock's box since box could be referenced by Lock's debug info.
+      if (alock->box_node() == this) {
+        if (alock->obj_node()->eqv_uncast(obj)) {
+          if ((unique_lock != NULL) && alock->is_Lock()) {
+            if (lock == NULL) {
+              lock = alock->as_Lock();
+              has_one_lock = true;
+            } else if (lock != alock->as_Lock()) {
+              has_one_lock = false;
+            }
+          }
+        } else {
+          return false; // Different objects
+        }
+      }
+    }
+  }
+#ifdef ASSERT
+  // Verify that FastLock and Safepoint reference only this lock region.
+  for (uint i = 0; i < this->outcnt(); i++) {
+    Node* n = this->raw_out(i);
+    if (n->is_FastLock()) {
+      FastLockNode* flock = n->as_FastLock();
+      assert((flock->box_node() == this) && flock->obj_node()->eqv_uncast(obj),"");
+    }
+    // Don't check monitor info in safepoints since the referenced object could
+    // be different from the locked object. It could be Phi node of different
+    // cast nodes which point to this locked object.
+    // We assume that no other objects could be referenced in monitor info
+    // associated with this BoxLock node because all associated locks and
+    // unlocks are reference only this one object.
+  }
+#endif
+  if (unique_lock != NULL && has_one_lock) {
+    *unique_lock = lock;
+  }
+  return true;
 }
 
 //=============================================================================
diff --git a/hotspot/src/share/vm/opto/locknode.hpp b/hotspot/src/share/vm/opto/locknode.hpp
index 05630e22d4a..91b99bc0052 100644
--- a/hotspot/src/share/vm/opto/locknode.hpp
+++ b/hotspot/src/share/vm/opto/locknode.hpp
@@ -49,11 +49,11 @@
 
 //------------------------------BoxLockNode------------------------------------
 class BoxLockNode : public Node {
-public:
-  const int _slot;
-  RegMask   _inmask;
-  bool _is_eliminated;    // indicates this lock was safely eliminated
+  const int     _slot; // stack slot
+  RegMask     _inmask; // OptoReg corresponding to stack slot
+  bool _is_eliminated; // Associated locks were safely eliminated
 
+public:
   BoxLockNode( int lock );
   virtual int Opcode() const;
   virtual void emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const;
@@ -66,11 +66,19 @@ public:
   virtual const class Type *bottom_type() const { return TypeRawPtr::BOTTOM; }
   virtual uint ideal_reg() const { return Op_RegP; }
 
-  static OptoReg::Name stack_slot(Node* box_node);
+  static OptoReg::Name reg(Node* box_node);
+  static BoxLockNode* box_node(Node* box_node);
+  static bool same_slot(Node* box1, Node* box2) {
+    return box1->as_BoxLock()->_slot == box2->as_BoxLock()->_slot;
+  }
+  int stack_slot() const { return _slot; }
 
-  bool is_eliminated()  { return _is_eliminated; }
+  bool is_eliminated() const { return _is_eliminated; }
   // mark lock as eliminated.
-  void set_eliminated() { _is_eliminated = true; }
+  void set_eliminated()      { _is_eliminated = true; }
+
+  // Is BoxLock node used for one simple lock region?
+  bool is_simple_lock_region(LockNode** unique_lock, Node* obj);
 
 #ifndef PRODUCT
   virtual void format( PhaseRegAlloc *, outputStream *st ) const;
@@ -91,6 +99,7 @@ public:
   }
   Node* obj_node() const { return in(1); }
   Node* box_node() const { return in(2); }
+  void  set_box_node(Node* box) { set_req(2, box); }
 
   // FastLock and FastUnlockNode do not hash, we need one for each correspoding
   // LockNode/UnLockNode to avoid creating Phi's.
diff --git a/hotspot/src/share/vm/opto/loopnode.cpp b/hotspot/src/share/vm/opto/loopnode.cpp
index e9c471a77cb..6c7b8edb8b9 100644
--- a/hotspot/src/share/vm/opto/loopnode.cpp
+++ b/hotspot/src/share/vm/opto/loopnode.cpp
@@ -3278,16 +3278,7 @@ void PhaseIdealLoop::build_loop_late_post( Node *n ) {
 #ifdef ASSERT
     if (legal->is_Start() && !early->is_Root()) {
       // Bad graph. Print idom path and fail.
-      tty->print_cr( "Bad graph detected in build_loop_late");
-      tty->print("n: ");n->dump(); tty->cr();
-      tty->print("early: ");early->dump(); tty->cr();
-      int ct = 0;
-      Node *dbg_legal = LCA;
-      while(!dbg_legal->is_Start() && ct < 100) {
-        tty->print("idom[%d] ",ct); dbg_legal->dump(); tty->cr();
-        ct++;
-        dbg_legal = idom(dbg_legal);
-      }
+      dump_bad_graph(n, early, LCA);
       assert(false, "Bad graph detected in build_loop_late");
     }
 #endif
@@ -3337,6 +3328,88 @@ void PhaseIdealLoop::build_loop_late_post( Node *n ) {
     chosen_loop->_body.push(n);// Collect inner loops
 }
 
+#ifdef ASSERT
+void PhaseIdealLoop::dump_bad_graph(Node* n, Node* early, Node* LCA) {
+  tty->print_cr( "Bad graph detected in build_loop_late");
+  tty->print("n: "); n->dump();
+  tty->print("early(n): "); early->dump();
+  if (n->in(0) != NULL  && !n->in(0)->is_top() &&
+      n->in(0) != early && !n->in(0)->is_Root()) {
+    tty->print("n->in(0): "); n->in(0)->dump();
+  }
+  for (uint i = 1; i < n->req(); i++) {
+    Node* in1 = n->in(i);
+    if (in1 != NULL && in1 != n && !in1->is_top()) {
+      tty->print("n->in(%d): ", i); in1->dump();
+      Node* in1_early = get_ctrl(in1);
+      tty->print("early(n->in(%d)): ", i); in1_early->dump();
+      if (in1->in(0) != NULL     && !in1->in(0)->is_top() &&
+          in1->in(0) != in1_early && !in1->in(0)->is_Root()) {
+        tty->print("n->in(%d)->in(0): ", i); in1->in(0)->dump();
+      }
+      for (uint j = 1; j < in1->req(); j++) {
+        Node* in2 = in1->in(j);
+        if (in2 != NULL && in2 != n && in2 != in1 && !in2->is_top()) {
+          tty->print("n->in(%d)->in(%d): ", i, j); in2->dump();
+          Node* in2_early = get_ctrl(in2);
+          tty->print("early(n->in(%d)->in(%d)): ", i, j); in2_early->dump();
+          if (in2->in(0) != NULL     && !in2->in(0)->is_top() &&
+              in2->in(0) != in2_early && !in2->in(0)->is_Root()) {
+            tty->print("n->in(%d)->in(%d)->in(0): ", i, j); in2->in(0)->dump();
+          }
+        }
+      }
+    }
+  }
+  tty->cr();
+  tty->print("LCA(n): "); LCA->dump();
+  for (uint i = 0; i < n->outcnt(); i++) {
+    Node* u1 = n->raw_out(i);
+    if (u1 == n)
+      continue;
+    tty->print("n->out(%d): ", i); u1->dump();
+    if (u1->is_CFG()) {
+      for (uint j = 0; j < u1->outcnt(); j++) {
+        Node* u2 = u1->raw_out(j);
+        if (u2 != u1 && u2 != n && u2->is_CFG()) {
+          tty->print("n->out(%d)->out(%d): ", i, j); u2->dump();
+        }
+      }
+    } else {
+      Node* u1_later = get_ctrl(u1);
+      tty->print("later(n->out(%d)): ", i); u1_later->dump();
+      if (u1->in(0) != NULL     && !u1->in(0)->is_top() &&
+          u1->in(0) != u1_later && !u1->in(0)->is_Root()) {
+        tty->print("n->out(%d)->in(0): ", i); u1->in(0)->dump();
+      }
+      for (uint j = 0; j < u1->outcnt(); j++) {
+        Node* u2 = u1->raw_out(j);
+        if (u2 == n || u2 == u1)
+          continue;
+        tty->print("n->out(%d)->out(%d): ", i, j); u2->dump();
+        if (!u2->is_CFG()) {
+          Node* u2_later = get_ctrl(u2);
+          tty->print("later(n->out(%d)->out(%d)): ", i, j); u2_later->dump();
+          if (u2->in(0) != NULL     && !u2->in(0)->is_top() &&
+              u2->in(0) != u2_later && !u2->in(0)->is_Root()) {
+            tty->print("n->out(%d)->in(0): ", i); u2->in(0)->dump();
+          }
+        }
+      }
+    }
+  }
+  tty->cr();
+  int ct = 0;
+  Node *dbg_legal = LCA;
+  while(!dbg_legal->is_Start() && ct < 100) {
+    tty->print("idom[%d] ",ct); dbg_legal->dump();
+    ct++;
+    dbg_legal = idom(dbg_legal);
+  }
+  tty->cr();
+}
+#endif
+
 #ifndef PRODUCT
 //------------------------------dump-------------------------------------------
 void PhaseIdealLoop::dump( ) const {
diff --git a/hotspot/src/share/vm/opto/loopnode.hpp b/hotspot/src/share/vm/opto/loopnode.hpp
index 9b4c38a5f5d..d2ea55610b8 100644
--- a/hotspot/src/share/vm/opto/loopnode.hpp
+++ b/hotspot/src/share/vm/opto/loopnode.hpp
@@ -1040,6 +1040,10 @@ public:
   bool created_loop_node()     { return _created_loop_node; }
   void register_new_node( Node *n, Node *blk );
 
+#ifdef ASSERT
+void dump_bad_graph(Node* n, Node* early, Node* LCA);
+#endif
+
 #ifndef PRODUCT
   void dump( ) const;
   void dump( IdealLoopTree *loop, uint rpo_idx, Node_List &rpo_list ) const;
diff --git a/hotspot/src/share/vm/opto/loopopts.cpp b/hotspot/src/share/vm/opto/loopopts.cpp
index 93c181973a4..e93fb4d088d 100644
--- a/hotspot/src/share/vm/opto/loopopts.cpp
+++ b/hotspot/src/share/vm/opto/loopopts.cpp
@@ -819,6 +819,8 @@ void PhaseIdealLoop::split_if_with_blocks_post( Node *n ) {
     if( iff->is_If() ) {        // Classic split-if?
       if( iff->in(0) != n_ctrl ) return; // Compare must be in same blk as if
     } else if (iff->is_CMove()) { // Trying to split-up a CMOVE
+      // Can't split CMove with different control edge.
+      if (iff->in(0) != NULL && iff->in(0) != n_ctrl ) return;
       if( get_ctrl(iff->in(2)) == n_ctrl ||
           get_ctrl(iff->in(3)) == n_ctrl )
         return;                 // Inputs not yet split-up
@@ -937,7 +939,7 @@ void PhaseIdealLoop::split_if_with_blocks_post( Node *n ) {
       }
       bool did_break = (i < imax);  // Did we break out of the previous loop?
       if (!did_break && n->outcnt() > 1) { // All uses in outer loops!
-        Node *late_load_ctrl;
+        Node *late_load_ctrl = NULL;
         if (n->is_Load()) {
           // If n is a load, get and save the result from get_late_ctrl(),
           // to be later used in calculating the control for n's clones.
diff --git a/hotspot/src/share/vm/opto/macro.cpp b/hotspot/src/share/vm/opto/macro.cpp
index 343544586da..adcf8e48f55 100644
--- a/hotspot/src/share/vm/opto/macro.cpp
+++ b/hotspot/src/share/vm/opto/macro.cpp
@@ -1088,6 +1088,12 @@ void PhaseMacroExpand::expand_allocate_common(
   Node* klass_node        = alloc->in(AllocateNode::KlassNode);
   Node* initial_slow_test = alloc->in(AllocateNode::InitialTest);
 
+  Node* storestore = alloc->storestore();
+  if (storestore != NULL) {
+    // Break this link that is no longer useful and confuses register allocation
+    storestore->set_req(MemBarNode::Precedent, top());
+  }
+
   assert(ctrl != NULL, "must have control");
   // We need a Region and corresponding Phi's to merge the slow-path and fast-path results.
   // they will not be used if "always_slow" is set
@@ -1289,10 +1295,66 @@ void PhaseMacroExpand::expand_allocate_common(
                                    0, new_alloc_bytes, T_LONG);
     }
 
+    InitializeNode* init = alloc->initialization();
     fast_oop_rawmem = initialize_object(alloc,
                                         fast_oop_ctrl, fast_oop_rawmem, fast_oop,
                                         klass_node, length, size_in_bytes);
 
+    // If initialization is performed by an array copy, any required
+    // MemBarStoreStore was already added. If the object does not
+    // escape no need for a MemBarStoreStore. Otherwise we need a
+    // MemBarStoreStore so that stores that initialize this object
+    // can't be reordered with a subsequent store that makes this
+    // object accessible by other threads.
+    if (init == NULL || (!init->is_complete_with_arraycopy() && !init->does_not_escape())) {
+      if (init == NULL || init->req() < InitializeNode::RawStores) {
+        // No InitializeNode or no stores captured by zeroing
+        // elimination. Simply add the MemBarStoreStore after object
+        // initialization.
+        MemBarNode* mb = MemBarNode::make(C, Op_MemBarStoreStore, Compile::AliasIdxBot, fast_oop_rawmem);
+        transform_later(mb);
+
+        mb->init_req(TypeFunc::Memory, fast_oop_rawmem);
+        mb->init_req(TypeFunc::Control, fast_oop_ctrl);
+        fast_oop_ctrl = new (C, 1) ProjNode(mb,TypeFunc::Control);
+        transform_later(fast_oop_ctrl);
+        fast_oop_rawmem = new (C, 1) ProjNode(mb,TypeFunc::Memory);
+        transform_later(fast_oop_rawmem);
+      } else {
+        // Add the MemBarStoreStore after the InitializeNode so that
+        // all stores performing the initialization that were moved
+        // before the InitializeNode happen before the storestore
+        // barrier.
+
+        Node* init_ctrl = init->proj_out(TypeFunc::Control);
+        Node* init_mem = init->proj_out(TypeFunc::Memory);
+
+        MemBarNode* mb = MemBarNode::make(C, Op_MemBarStoreStore, Compile::AliasIdxBot);
+        transform_later(mb);
+
+        Node* ctrl = new (C, 1) ProjNode(init,TypeFunc::Control);
+        transform_later(ctrl);
+        Node* mem = new (C, 1) ProjNode(init,TypeFunc::Memory);
+        transform_later(mem);
+
+        // The MemBarStoreStore depends on control and memory coming
+        // from the InitializeNode
+        mb->init_req(TypeFunc::Memory, mem);
+        mb->init_req(TypeFunc::Control, ctrl);
+
+        ctrl = new (C, 1) ProjNode(mb,TypeFunc::Control);
+        transform_later(ctrl);
+        mem = new (C, 1) ProjNode(mb,TypeFunc::Memory);
+        transform_later(mem);
+
+        // All nodes that depended on the InitializeNode for control
+        // and memory must now depend on the MemBarNode that itself
+        // depends on the InitializeNode
+        _igvn.replace_node(init_ctrl, ctrl);
+        _igvn.replace_node(init_mem, mem);
+      }
+    }
+
     if (C->env()->dtrace_extended_probes()) {
       // Slow-path call
       int size = TypeFunc::Parms + 2;
@@ -1326,6 +1388,7 @@ void PhaseMacroExpand::expand_allocate_common(
     result_phi_rawmem->init_req(fast_result_path, fast_oop_rawmem);
   } else {
     slow_region = ctrl;
+    result_phi_i_o = i_o; // Rename it to use in the following code.
   }
 
   // Generate slow-path call
@@ -1350,6 +1413,10 @@ void PhaseMacroExpand::expand_allocate_common(
   copy_call_debug_info((CallNode *) alloc,  call);
   if (!always_slow) {
     call->set_cnt(PROB_UNLIKELY_MAG(4));  // Same effect as RC_UNCOMMON.
+  } else {
+    // Hook i_o projection to avoid its elimination during allocation
+    // replacement (when only a slow call is generated).
+    call->set_req(TypeFunc::I_O, result_phi_i_o);
   }
   _igvn.replace_node(alloc, call);
   transform_later(call);
@@ -1366,8 +1433,10 @@ void PhaseMacroExpand::expand_allocate_common(
   //
   extract_call_projections(call);
 
-  // An allocate node has separate memory projections for the uses on the control and i_o paths
-  // Replace uses of the control memory projection with result_phi_rawmem (unless we are only generating a slow call)
+  // An allocate node has separate memory projections for the uses on
+  // the control and i_o paths. Replace the control memory projection with
+  // result_phi_rawmem (unless we are only generating a slow call when
+  // both memory projections are combined)
   if (!always_slow && _memproj_fallthrough != NULL) {
     for (DUIterator_Fast imax, i = _memproj_fallthrough->fast_outs(imax); i < imax; i++) {
       Node *use = _memproj_fallthrough->fast_out(i);
@@ -1378,8 +1447,8 @@ void PhaseMacroExpand::expand_allocate_common(
       --i;
     }
   }
-  // Now change uses of _memproj_catchall to use _memproj_fallthrough and delete _memproj_catchall so
-  // we end up with a call that has only 1 memory projection
+  // Now change uses of _memproj_catchall to use _memproj_fallthrough and delete
+  // _memproj_catchall so we end up with a call that has only 1 memory projection.
   if (_memproj_catchall != NULL ) {
     if (_memproj_fallthrough == NULL) {
       _memproj_fallthrough = new (C, 1) ProjNode(call, TypeFunc::Memory);
@@ -1393,17 +1462,18 @@ void PhaseMacroExpand::expand_allocate_common(
       // back up iterator
       --i;
     }
+    assert(_memproj_catchall->outcnt() == 0, "all uses must be deleted");
+    _igvn.remove_dead_node(_memproj_catchall);
   }
 
-  // An allocate node has separate i_o projections for the uses on the control and i_o paths
-  // Replace uses of the control i_o projection with result_phi_i_o (unless we are only generating a slow call)
-  if (_ioproj_fallthrough == NULL) {
-    _ioproj_fallthrough = new (C, 1) ProjNode(call, TypeFunc::I_O);
-    transform_later(_ioproj_fallthrough);
-  } else if (!always_slow) {
+  // An allocate node has separate i_o projections for the uses on the control
+  // and i_o paths. Always replace the control i_o projection with result i_o
+  // otherwise incoming i_o become dead when only a slow call is generated
+  // (it is different from memory projections where both projections are
+  // combined in such case).
+  if (_ioproj_fallthrough != NULL) {
     for (DUIterator_Fast imax, i = _ioproj_fallthrough->fast_outs(imax); i < imax; i++) {
       Node *use = _ioproj_fallthrough->fast_out(i);
-
       _igvn.hash_delete(use);
       imax -= replace_input(use, _ioproj_fallthrough, result_phi_i_o);
       _igvn._worklist.push(use);
@@ -1411,9 +1481,13 @@ void PhaseMacroExpand::expand_allocate_common(
       --i;
     }
   }
-  // Now change uses of _ioproj_catchall to use _ioproj_fallthrough and delete _ioproj_catchall so
-  // we end up with a call that has only 1 control projection
+  // Now change uses of _ioproj_catchall to use _ioproj_fallthrough and delete
+  // _ioproj_catchall so we end up with a call that has only 1 i_o projection.
   if (_ioproj_catchall != NULL ) {
+    if (_ioproj_fallthrough == NULL) {
+      _ioproj_fallthrough = new (C, 1) ProjNode(call, TypeFunc::I_O);
+      transform_later(_ioproj_fallthrough);
+    }
     for (DUIterator_Fast imax, i = _ioproj_catchall->fast_outs(imax); i < imax; i++) {
       Node *use = _ioproj_catchall->fast_out(i);
       _igvn.hash_delete(use);
@@ -1422,11 +1496,25 @@ void PhaseMacroExpand::expand_allocate_common(
       // back up iterator
       --i;
     }
+    assert(_ioproj_catchall->outcnt() == 0, "all uses must be deleted");
+    _igvn.remove_dead_node(_ioproj_catchall);
   }
 
   // if we generated only a slow call, we are done
-  if (always_slow)
+  if (always_slow) {
+    // Now we can unhook i_o.
+    if (result_phi_i_o->outcnt() > 1) {
+      call->set_req(TypeFunc::I_O, top());
+    } else {
+      assert(result_phi_i_o->unique_ctrl_out() == call, "");
+      // Case of new array with negative size known during compilation.
+      // AllocateArrayNode::Ideal() optimization disconnect unreachable
+      // following code since call to runtime will throw exception.
+      // As result there will be no users of i_o after the call.
+      // Leave i_o attached to this call to avoid problems in preceding graph.
+    }
     return;
+  }
 
 
   if (_fallthroughcatchproj != NULL) {
@@ -1470,7 +1558,7 @@ PhaseMacroExpand::initialize_object(AllocateNode* alloc,
   Node* mark_node = NULL;
   // For now only enable fast locking for non-array types
   if (UseBiasedLocking && (length == NULL)) {
-    mark_node = make_load(control, rawmem, klass_node, Klass::prototype_header_offset_in_bytes() + sizeof(oopDesc), TypeRawPtr::BOTTOM, T_ADDRESS);
+    mark_node = make_load(control, rawmem, klass_node, in_bytes(Klass::prototype_header_offset()), TypeRawPtr::BOTTOM, T_ADDRESS);
   } else {
     mark_node = makecon(TypeRawPtr::make((address)markOopDesc::prototype()));
   }
@@ -1701,7 +1789,8 @@ void PhaseMacroExpand::expand_allocate_array(AllocateArrayNode *alloc) {
                          slow_call_address);
 }
 
-//-----------------------mark_eliminated_locking_nodes-----------------------
+//-------------------mark_eliminated_box----------------------------------
+//
 // During EA obj may point to several objects but after few ideal graph
 // transformations (CCP) it may point to only one non escaping object
 // (but still using phi), corresponding locks and unlocks will be marked
@@ -1712,62 +1801,148 @@ void PhaseMacroExpand::expand_allocate_array(AllocateArrayNode *alloc) {
 // marked for elimination since new obj has no escape information.
 // Mark all associated (same box and obj) lock and unlock nodes for
 // elimination if some of them marked already.
-void PhaseMacroExpand::mark_eliminated_locking_nodes(AbstractLockNode *alock) {
-  if (!alock->is_eliminated()) {
+void PhaseMacroExpand::mark_eliminated_box(Node* oldbox, Node* obj) {
+  if (oldbox->as_BoxLock()->is_eliminated())
+    return; // This BoxLock node was processed already.
+
+  // New implementation (EliminateNestedLocks) has separate BoxLock
+  // node for each locked region so mark all associated locks/unlocks as
+  // eliminated even if different objects are referenced in one locked region
+  // (for example, OSR compilation of nested loop inside locked scope).
+  if (EliminateNestedLocks ||
+      oldbox->as_BoxLock()->is_simple_lock_region(NULL, obj)) {
+    // Box is used only in one lock region. Mark this box as eliminated.
+    _igvn.hash_delete(oldbox);
+    oldbox->as_BoxLock()->set_eliminated(); // This changes box's hash value
+    _igvn.hash_insert(oldbox);
+
+    for (uint i = 0; i < oldbox->outcnt(); i++) {
+      Node* u = oldbox->raw_out(i);
+      if (u->is_AbstractLock() && !u->as_AbstractLock()->is_non_esc_obj()) {
+        AbstractLockNode* alock = u->as_AbstractLock();
+        // Check lock's box since box could be referenced by Lock's debug info.
+        if (alock->box_node() == oldbox) {
+          // Mark eliminated all related locks and unlocks.
+          alock->set_non_esc_obj();
+        }
+      }
+    }
     return;
   }
-  if (!alock->is_coarsened()) { // Eliminated by EA
-      // Create new "eliminated" BoxLock node and use it
-      // in monitor debug info for the same object.
-      BoxLockNode* oldbox = alock->box_node()->as_BoxLock();
-      Node* obj = alock->obj_node();
-      if (!oldbox->is_eliminated()) {
-        BoxLockNode* newbox = oldbox->clone()->as_BoxLock();
+
+  // Create new "eliminated" BoxLock node and use it in monitor debug info
+  // instead of oldbox for the same object.
+  BoxLockNode* newbox = oldbox->clone()->as_BoxLock();
+
+  // Note: BoxLock node is marked eliminated only here and it is used
+  // to indicate that all associated lock and unlock nodes are marked
+  // for elimination.
+  newbox->set_eliminated();
+  transform_later(newbox);
+
+  // Replace old box node with new box for all users of the same object.
+  for (uint i = 0; i < oldbox->outcnt();) {
+    bool next_edge = true;
+
+    Node* u = oldbox->raw_out(i);
+    if (u->is_AbstractLock()) {
+      AbstractLockNode* alock = u->as_AbstractLock();
+      if (alock->box_node() == oldbox && alock->obj_node()->eqv_uncast(obj)) {
+        // Replace Box and mark eliminated all related locks and unlocks.
+        alock->set_non_esc_obj();
+        _igvn.hash_delete(alock);
+        alock->set_box_node(newbox);
+        _igvn._worklist.push(alock);
+        next_edge = false;
+      }
+    }
+    if (u->is_FastLock() && u->as_FastLock()->obj_node()->eqv_uncast(obj)) {
+      FastLockNode* flock = u->as_FastLock();
+      assert(flock->box_node() == oldbox, "sanity");
+      _igvn.hash_delete(flock);
+      flock->set_box_node(newbox);
+      _igvn._worklist.push(flock);
+      next_edge = false;
+    }
+
+    // Replace old box in monitor debug info.
+    if (u->is_SafePoint() && u->as_SafePoint()->jvms()) {
+      SafePointNode* sfn = u->as_SafePoint();
+      JVMState* youngest_jvms = sfn->jvms();
+      int max_depth = youngest_jvms->depth();
+      for (int depth = 1; depth <= max_depth; depth++) {
+        JVMState* jvms = youngest_jvms->of_depth(depth);
+        int num_mon  = jvms->nof_monitors();
+        // Loop over monitors
+        for (int idx = 0; idx < num_mon; idx++) {
+          Node* obj_node = sfn->monitor_obj(jvms, idx);
+          Node* box_node = sfn->monitor_box(jvms, idx);
+          if (box_node == oldbox && obj_node->eqv_uncast(obj)) {
+            int j = jvms->monitor_box_offset(idx);
+            _igvn.hash_delete(u);
+            u->set_req(j, newbox);
+            _igvn._worklist.push(u);
+            next_edge = false;
+          }
+        }
+      }
+    }
+    if (next_edge) i++;
+  }
+}
+
+//-----------------------mark_eliminated_locking_nodes-----------------------
+void PhaseMacroExpand::mark_eliminated_locking_nodes(AbstractLockNode *alock) {
+  if (EliminateNestedLocks) {
+    if (alock->is_nested()) {
+       assert(alock->box_node()->as_BoxLock()->is_eliminated(), "sanity");
+       return;
+    } else if (!alock->is_non_esc_obj()) { // Not eliminated or coarsened
+      // Only Lock node has JVMState needed here.
+      if (alock->jvms() != NULL && alock->as_Lock()->is_nested_lock_region()) {
+        // Mark eliminated related nested locks and unlocks.
+        Node* obj = alock->obj_node();
+        BoxLockNode* box_node = alock->box_node()->as_BoxLock();
+        assert(!box_node->is_eliminated(), "should not be marked yet");
         // Note: BoxLock node is marked eliminated only here
         // and it is used to indicate that all associated lock
         // and unlock nodes are marked for elimination.
-        newbox->set_eliminated();
-        transform_later(newbox);
-        // Replace old box node with new box for all users
-        // of the same object.
-        for (uint i = 0; i < oldbox->outcnt();) {
-
-          bool next_edge = true;
-          Node* u = oldbox->raw_out(i);
-          if (u->is_AbstractLock() &&
-              u->as_AbstractLock()->obj_node() == obj &&
-              u->as_AbstractLock()->box_node() == oldbox) {
-            // Mark all associated locks and unlocks.
-            u->as_AbstractLock()->set_eliminated();
-            _igvn.hash_delete(u);
-            u->set_req(TypeFunc::Parms + 1, newbox);
-            next_edge = false;
+        box_node->set_eliminated(); // Box's hash is always NO_HASH here
+        for (uint i = 0; i < box_node->outcnt(); i++) {
+          Node* u = box_node->raw_out(i);
+          if (u->is_AbstractLock()) {
+            alock = u->as_AbstractLock();
+            if (alock->box_node() == box_node) {
+              // Verify that this Box is referenced only by related locks.
+              assert(alock->obj_node()->eqv_uncast(obj), "");
+              // Mark all related locks and unlocks.
+              alock->set_nested();
+            }
           }
-          // Replace old box in monitor debug info.
-          if (u->is_SafePoint() && u->as_SafePoint()->jvms()) {
-            SafePointNode* sfn = u->as_SafePoint();
-            JVMState* youngest_jvms = sfn->jvms();
-            int max_depth = youngest_jvms->depth();
-            for (int depth = 1; depth <= max_depth; depth++) {
-              JVMState* jvms = youngest_jvms->of_depth(depth);
-              int num_mon  = jvms->nof_monitors();
-              // Loop over monitors
-              for (int idx = 0; idx < num_mon; idx++) {
-                Node* obj_node = sfn->monitor_obj(jvms, idx);
-                Node* box_node = sfn->monitor_box(jvms, idx);
-                if (box_node == oldbox && obj_node == obj) {
-                  int j = jvms->monitor_box_offset(idx);
-                  _igvn.hash_delete(u);
-                  u->set_req(j, newbox);
-                  next_edge = false;
-                }
-              } // for (int idx = 0;
-            } // for (int depth = 1;
-          } // if (u->is_SafePoint()
-          if (next_edge) i++;
-        } // for (uint i = 0; i < oldbox->outcnt();)
-      } // if (!oldbox->is_eliminated())
-  } // if (!alock->is_coarsened())
+        }
+      }
+      return;
+    }
+    // Process locks for non escaping object
+    assert(alock->is_non_esc_obj(), "");
+  } // EliminateNestedLocks
+
+  if (alock->is_non_esc_obj()) { // Lock is used for non escaping object
+    // Look for all locks of this object and mark them and
+    // corresponding BoxLock nodes as eliminated.
+    Node* obj = alock->obj_node();
+    for (uint j = 0; j < obj->outcnt(); j++) {
+      Node* o = obj->raw_out(j);
+      if (o->is_AbstractLock() &&
+          o->as_AbstractLock()->obj_node()->eqv_uncast(obj)) {
+        alock = o->as_AbstractLock();
+        Node* box = alock->box_node();
+        // Replace old box node with new eliminated box for all users
+        // of the same object and mark related locks as eliminated.
+        mark_eliminated_box(box, obj);
+      }
+    }
+  }
 }
 
 // we have determined that this lock/unlock can be eliminated, we simply
@@ -1782,7 +1957,7 @@ bool PhaseMacroExpand::eliminate_locking_node(AbstractLockNode *alock) {
     return false;
   }
 #ifdef ASSERT
-  if (alock->is_Lock() && !alock->is_coarsened()) {
+  if (!alock->is_coarsened()) {
     // Check that new "eliminated" BoxLock node is created.
     BoxLockNode* oldbox = alock->box_node()->as_BoxLock();
     assert(oldbox->is_eliminated(), "should be done already");
@@ -1874,6 +2049,8 @@ void PhaseMacroExpand::expand_lock_node(LockNode *lock) {
   Node* box = lock->box_node();
   Node* flock = lock->fastlock_node();
 
+  assert(!box->as_BoxLock()->is_eliminated(), "sanity");
+
   // Make the merge point
   Node *region;
   Node *mem_phi;
@@ -1958,7 +2135,7 @@ void PhaseMacroExpand::expand_lock_node(LockNode *lock) {
 #endif
       klass_node->init_req(0, ctrl);
     }
-    Node *proto_node = make_load(ctrl, mem, klass_node, Klass::prototype_header_offset_in_bytes() + sizeof(oopDesc), TypeX_X, TypeX_X->basic_type());
+    Node *proto_node = make_load(ctrl, mem, klass_node, in_bytes(Klass::prototype_header_offset()), TypeX_X, TypeX_X->basic_type());
 
     Node* thread = transform_later(new (C, 1) ThreadLocalNode());
     Node* cast_thread = transform_later(new (C, 2) CastP2XNode(ctrl, thread));
@@ -2108,6 +2285,8 @@ void PhaseMacroExpand::expand_unlock_node(UnlockNode *unlock) {
   Node* obj = unlock->obj_node();
   Node* box = unlock->box_node();
 
+  assert(!box->as_BoxLock()->is_eliminated(), "sanity");
+
   // No need for a null check on unlock
 
   // Make the merge point
diff --git a/hotspot/src/share/vm/opto/macro.hpp b/hotspot/src/share/vm/opto/macro.hpp
index 7f0080afe7f..f521e3d35a2 100644
--- a/hotspot/src/share/vm/opto/macro.hpp
+++ b/hotspot/src/share/vm/opto/macro.hpp
@@ -92,6 +92,7 @@ private:
   void process_users_of_allocation(AllocateNode *alloc);
 
   void eliminate_card_mark(Node *cm);
+  void mark_eliminated_box(Node* box, Node* obj);
   void mark_eliminated_locking_nodes(AbstractLockNode *alock);
   bool eliminate_locking_node(AbstractLockNode *alock);
   void expand_lock_node(LockNode *lock);
diff --git a/hotspot/src/share/vm/opto/matcher.cpp b/hotspot/src/share/vm/opto/matcher.cpp
index 1200d2541c7..397385670d2 100644
--- a/hotspot/src/share/vm/opto/matcher.cpp
+++ b/hotspot/src/share/vm/opto/matcher.cpp
@@ -1365,31 +1365,36 @@ static bool match_into_reg( const Node *n, Node *m, Node *control, int i, bool s
 
   const Type *t = m->bottom_type();
 
-  if( t->singleton() ) {
+  if (t->singleton()) {
     // Never force constants into registers.  Allow them to match as
     // constants or registers.  Copies of the same value will share
     // the same register.  See find_shared_node.
     return false;
   } else {                      // Not a constant
     // Stop recursion if they have different Controls.
-    // Slot 0 of constants is not really a Control.
-    if( control && m->in(0) && control != m->in(0) ) {
+    Node* m_control = m->in(0);
+    // Control of load's memory can post-dominates load's control.
+    // So use it since load can't float above its memory.
+    Node* mem_control = (m->is_Load()) ? m->in(MemNode::Memory)->in(0) : NULL;
+    if (control && m_control && control != m_control && control != mem_control) {
 
       // Actually, we can live with the most conservative control we
       // find, if it post-dominates the others.  This allows us to
       // pick up load/op/store trees where the load can float a little
       // above the store.
       Node *x = control;
-      const uint max_scan = 6;   // Arbitrary scan cutoff
+      const uint max_scan = 6;  // Arbitrary scan cutoff
       uint j;
-      for( j=0; j<max_scan; j++ ) {
-        if( x->is_Region() )    // Bail out at merge points
+      for (j=0; j<max_scan; j++) {
+        if (x->is_Region())     // Bail out at merge points
           return true;
         x = x->in(0);
-        if( x == m->in(0) )     // Does 'control' post-dominate
+        if (x == m_control)     // Does 'control' post-dominate
           break;                // m->in(0)?  If so, we can use it
+        if (x == mem_control)   // Does 'control' post-dominate
+          break;                // mem_control?  If so, we can use it
       }
-      if( j == max_scan )       // No post-domination before scan end?
+      if (j == max_scan)        // No post-domination before scan end?
         return true;            // Then break the match tree up
     }
     if (m->is_DecodeN() && Matcher::narrow_oop_use_complex_address()) {
diff --git a/hotspot/src/share/vm/opto/memnode.cpp b/hotspot/src/share/vm/opto/memnode.cpp
index 722935a3fd3..bd144751d42 100644
--- a/hotspot/src/share/vm/opto/memnode.cpp
+++ b/hotspot/src/share/vm/opto/memnode.cpp
@@ -1473,19 +1473,19 @@ Node *LoadNode::Ideal(PhaseGVN *phase, bool can_reshape) {
 const Type*
 LoadNode::load_array_final_field(const TypeKlassPtr *tkls,
                                  ciKlass* klass) const {
-  if (tkls->offset() == Klass::modifier_flags_offset_in_bytes() + (int)sizeof(oopDesc)) {
+  if (tkls->offset() == in_bytes(Klass::modifier_flags_offset())) {
     // The field is Klass::_modifier_flags.  Return its (constant) value.
     // (Folds up the 2nd indirection in aClassConstant.getModifiers().)
     assert(this->Opcode() == Op_LoadI, "must load an int from _modifier_flags");
     return TypeInt::make(klass->modifier_flags());
   }
-  if (tkls->offset() == Klass::access_flags_offset_in_bytes() + (int)sizeof(oopDesc)) {
+  if (tkls->offset() == in_bytes(Klass::access_flags_offset())) {
     // The field is Klass::_access_flags.  Return its (constant) value.
     // (Folds up the 2nd indirection in Reflection.getClassAccessFlags(aClassConstant).)
     assert(this->Opcode() == Op_LoadI, "must load an int from _access_flags");
     return TypeInt::make(klass->access_flags());
   }
-  if (tkls->offset() == Klass::layout_helper_offset_in_bytes() + (int)sizeof(oopDesc)) {
+  if (tkls->offset() == in_bytes(Klass::layout_helper_offset())) {
     // The field is Klass::_layout_helper.  Return its constant value if known.
     assert(this->Opcode() == Op_LoadI, "must load an int from _layout_helper");
     return TypeInt::make(klass->layout_helper());
@@ -1636,14 +1636,14 @@ const Type *LoadNode::Value( PhaseTransform *phase ) const {
       // We are loading a field from a Klass metaobject whose identity
       // is known at compile time (the type is "exact" or "precise").
       // Check for fields we know are maintained as constants by the VM.
-      if (tkls->offset() == Klass::super_check_offset_offset_in_bytes() + (int)sizeof(oopDesc)) {
+      if (tkls->offset() == in_bytes(Klass::super_check_offset_offset())) {
         // The field is Klass::_super_check_offset.  Return its (constant) value.
         // (Folds up type checking code.)
         assert(Opcode() == Op_LoadI, "must load an int from _super_check_offset");
         return TypeInt::make(klass->super_check_offset());
       }
       // Compute index into primary_supers array
-      juint depth = (tkls->offset() - (Klass::primary_supers_offset_in_bytes() + (int)sizeof(oopDesc))) / sizeof(klassOop);
+      juint depth = (tkls->offset() - in_bytes(Klass::primary_supers_offset())) / sizeof(klassOop);
       // Check for overflowing; use unsigned compare to handle the negative case.
       if( depth < ciKlass::primary_super_limit() ) {
         // The field is an element of Klass::_primary_supers.  Return its (constant) value.
@@ -1654,14 +1654,14 @@ const Type *LoadNode::Value( PhaseTransform *phase ) const {
       }
       const Type* aift = load_array_final_field(tkls, klass);
       if (aift != NULL)  return aift;
-      if (tkls->offset() == in_bytes(arrayKlass::component_mirror_offset()) + (int)sizeof(oopDesc)
+      if (tkls->offset() == in_bytes(arrayKlass::component_mirror_offset())
           && klass->is_array_klass()) {
         // The field is arrayKlass::_component_mirror.  Return its (constant) value.
         // (Folds up aClassConstant.getComponentType, common in Arrays.copyOf.)
         assert(Opcode() == Op_LoadP, "must load an oop from _component_mirror");
         return TypeInstPtr::make(klass->as_array_klass()->component_mirror());
       }
-      if (tkls->offset() == Klass::java_mirror_offset_in_bytes() + (int)sizeof(oopDesc)) {
+      if (tkls->offset() == in_bytes(Klass::java_mirror_offset())) {
         // The field is Klass::_java_mirror.  Return its (constant) value.
         // (Folds up the 2nd indirection in anObjConstant.getClass().)
         assert(Opcode() == Op_LoadP, "must load an oop from _java_mirror");
@@ -1679,7 +1679,7 @@ const Type *LoadNode::Value( PhaseTransform *phase ) const {
       if( inner->is_instance_klass() &&
           !inner->as_instance_klass()->flags().is_interface() ) {
         // Compute index into primary_supers array
-        juint depth = (tkls->offset() - (Klass::primary_supers_offset_in_bytes() + (int)sizeof(oopDesc))) / sizeof(klassOop);
+        juint depth = (tkls->offset() - in_bytes(Klass::primary_supers_offset())) / sizeof(klassOop);
         // Check for overflowing; use unsigned compare to handle the negative case.
         if( depth < ciKlass::primary_super_limit() &&
             depth <= klass->super_depth() ) { // allow self-depth checks to handle self-check case
@@ -1695,7 +1695,7 @@ const Type *LoadNode::Value( PhaseTransform *phase ) const {
     // If the type is enough to determine that the thing is not an array,
     // we can give the layout_helper a positive interval type.
     // This will help short-circuit some reflective code.
-    if (tkls->offset() == Klass::layout_helper_offset_in_bytes() + (int)sizeof(oopDesc)
+    if (tkls->offset() == in_bytes(Klass::layout_helper_offset())
         && !klass->is_array_klass() // not directly typed as an array
         && !klass->is_interface()  // specifically not Serializable & Cloneable
         && !klass->is_java_lang_Object()   // not the supertype of all T[]
@@ -1938,7 +1938,7 @@ const Type *LoadNode::klass_value_common( PhaseTransform *phase ) const {
     if( !klass->is_loaded() )
       return _type;             // Bail out if not loaded
     if( klass->is_obj_array_klass() &&
-        (uint)tkls->offset() == objArrayKlass::element_klass_offset_in_bytes() + sizeof(oopDesc)) {
+        tkls->offset() == in_bytes(objArrayKlass::element_klass_offset())) {
       ciKlass* elem = klass->as_obj_array_klass()->element_klass();
       // // Always returning precise element type is incorrect,
       // // e.g., element type could be object and array may contain strings
@@ -1949,7 +1949,7 @@ const Type *LoadNode::klass_value_common( PhaseTransform *phase ) const {
       return TypeKlassPtr::make(tkls->ptr(), elem, 0/*offset*/);
     }
     if( klass->is_instance_klass() && tkls->klass_is_exact() &&
-        (uint)tkls->offset() == Klass::super_offset_in_bytes() + sizeof(oopDesc)) {
+        tkls->offset() == in_bytes(Klass::super_offset())) {
       ciKlass* sup = klass->as_instance_klass()->super();
       // The field is Klass::_super.  Return its (constant) value.
       // (Folds up the 2nd indirection in aClassConstant.getSuperClass().)
@@ -2013,11 +2013,11 @@ Node* LoadNode::klass_identity_common(PhaseTransform *phase ) {
               tkls->klass()->is_array_klass())
           && adr2->is_AddP()
           ) {
-        int mirror_field = Klass::java_mirror_offset_in_bytes();
+        int mirror_field = in_bytes(Klass::java_mirror_offset());
         if (offset == java_lang_Class::array_klass_offset_in_bytes()) {
           mirror_field = in_bytes(arrayKlass::component_mirror_offset());
         }
-        if (tkls->offset() == mirror_field + (int)sizeof(oopDesc)) {
+        if (tkls->offset() == mirror_field) {
           return adr2->in(AddPNode::Base);
         }
       }
@@ -2201,7 +2201,7 @@ Node *StoreNode::Ideal(PhaseGVN *phase, bool can_reshape) {
   // unsafe if I have intervening uses...  Also disallowed for StoreCM
   // since they must follow each StoreP operation.  Redundant StoreCMs
   // are eliminated just before matching in final_graph_reshape.
-  if (mem->is_Store() && phase->eqv_uncast(mem->in(MemNode::Address), address) &&
+  if (mem->is_Store() && mem->in(MemNode::Address)->eqv_uncast(address) &&
       mem->Opcode() != Op_StoreCM) {
     // Looking at a dead closed cycle of memory?
     assert(mem != mem->in(MemNode::Memory), "dead loop in StoreNode::Ideal");
@@ -2274,16 +2274,16 @@ Node *StoreNode::Identity( PhaseTransform *phase ) {
 
   // Load then Store?  Then the Store is useless
   if (val->is_Load() &&
-      phase->eqv_uncast( val->in(MemNode::Address), adr ) &&
-      phase->eqv_uncast( val->in(MemNode::Memory ), mem ) &&
+      val->in(MemNode::Address)->eqv_uncast(adr) &&
+      val->in(MemNode::Memory )->eqv_uncast(mem) &&
       val->as_Load()->store_Opcode() == Opcode()) {
     return mem;
   }
 
   // Two stores in a row of the same value?
   if (mem->is_Store() &&
-      phase->eqv_uncast( mem->in(MemNode::Address), adr ) &&
-      phase->eqv_uncast( mem->in(MemNode::ValueIn), val ) &&
+      mem->in(MemNode::Address)->eqv_uncast(adr) &&
+      mem->in(MemNode::ValueIn)->eqv_uncast(val) &&
       mem->Opcode() == Opcode()) {
     return mem;
   }
@@ -2721,6 +2721,7 @@ MemBarNode* MemBarNode::make(Compile* C, int opcode, int atp, Node* pn) {
   case Op_MemBarVolatile:  return new(C, len) MemBarVolatileNode(C, atp, pn);
   case Op_MemBarCPUOrder:  return new(C, len) MemBarCPUOrderNode(C, atp, pn);
   case Op_Initialize:      return new(C, len) InitializeNode(C,     atp, pn);
+  case Op_MemBarStoreStore: return new(C, len) MemBarStoreStoreNode(C,  atp, pn);
   default:                 ShouldNotReachHere(); return NULL;
   }
 }
@@ -2870,7 +2871,7 @@ Node *MemBarNode::match( const ProjNode *proj, const Matcher *m ) {
 
 //---------------------------InitializeNode------------------------------------
 InitializeNode::InitializeNode(Compile* C, int adr_type, Node* rawoop)
-  : _is_complete(Incomplete),
+  : _is_complete(Incomplete), _does_not_escape(false),
     MemBarNode(C, adr_type, rawoop)
 {
   init_class_id(Class_Initialize);
diff --git a/hotspot/src/share/vm/opto/memnode.hpp b/hotspot/src/share/vm/opto/memnode.hpp
index 01c149c7a97..f15d4986bd0 100644
--- a/hotspot/src/share/vm/opto/memnode.hpp
+++ b/hotspot/src/share/vm/opto/memnode.hpp
@@ -918,6 +918,15 @@ public:
   virtual int Opcode() const;
 };
 
+class MemBarStoreStoreNode: public MemBarNode {
+public:
+  MemBarStoreStoreNode(Compile* C, int alias_idx, Node* precedent)
+    : MemBarNode(C, alias_idx, precedent) {
+    init_class_id(Class_MemBarStoreStore);
+  }
+  virtual int Opcode() const;
+};
+
 // Ordering between a volatile store and a following volatile load.
 // Requires multi-CPU visibility?
 class MemBarVolatileNode: public MemBarNode {
@@ -950,6 +959,8 @@ class InitializeNode: public MemBarNode {
   };
   int _is_complete;
 
+  bool _does_not_escape;
+
 public:
   enum {
     Control    = TypeFunc::Control,
@@ -989,6 +1000,9 @@ public:
   void set_complete(PhaseGVN* phase);
   void set_complete_with_arraycopy() { _is_complete = Complete | WithArraycopy; }
 
+  bool does_not_escape() { return _does_not_escape; }
+  void set_does_not_escape() { _does_not_escape = true; }
+
 #ifdef ASSERT
   // ensure all non-degenerate stores are ordered and non-overlapping
   bool stores_are_sane(PhaseTransform* phase);
diff --git a/hotspot/src/share/vm/opto/node.cpp b/hotspot/src/share/vm/opto/node.cpp
index 1400e3d3aae..4bd752feda5 100644
--- a/hotspot/src/share/vm/opto/node.cpp
+++ b/hotspot/src/share/vm/opto/node.cpp
@@ -833,8 +833,20 @@ Node* Node::uncast() const {
 
 //---------------------------uncast_helper-------------------------------------
 Node* Node::uncast_helper(const Node* p) {
-  uint max_depth = 3;
-  for (uint i = 0; i < max_depth; i++) {
+#ifdef ASSERT
+  uint depth_count = 0;
+  const Node* orig_p = p;
+#endif
+
+  while (true) {
+#ifdef ASSERT
+    if (depth_count >= K) {
+      orig_p->dump(4);
+      if (p != orig_p)
+        p->dump(1);
+    }
+    assert(depth_count++ < K, "infinite loop in Node::uncast_helper");
+#endif
     if (p == NULL || p->req() != 2) {
       break;
     } else if (p->is_ConstraintCast()) {
diff --git a/hotspot/src/share/vm/opto/node.hpp b/hotspot/src/share/vm/opto/node.hpp
index e10cad4720e..5ddae2f0b19 100644
--- a/hotspot/src/share/vm/opto/node.hpp
+++ b/hotspot/src/share/vm/opto/node.hpp
@@ -97,6 +97,7 @@ class MachSpillCopyNode;
 class MachTempNode;
 class Matcher;
 class MemBarNode;
+class MemBarStoreStoreNode;
 class MemNode;
 class MergeMemNode;
 class MultiNode;
@@ -428,6 +429,10 @@ protected:
 
   // Strip away casting.  (It is depth-limited.)
   Node* uncast() const;
+  // Return whether two Nodes are equivalent, after stripping casting.
+  bool eqv_uncast(const Node* n) const {
+    return (this->uncast() == n->uncast());
+  }
 
 private:
   static Node* uncast_helper(const Node* n);
@@ -564,7 +569,8 @@ public:
         DEFINE_CLASS_ID(NeverBranch, MultiBranch, 2)
       DEFINE_CLASS_ID(Start,       Multi, 2)
       DEFINE_CLASS_ID(MemBar,      Multi, 3)
-        DEFINE_CLASS_ID(Initialize,    MemBar, 0)
+        DEFINE_CLASS_ID(Initialize,       MemBar, 0)
+        DEFINE_CLASS_ID(MemBarStoreStore, MemBar, 1)
 
     DEFINE_CLASS_ID(Mach,  Node, 1)
       DEFINE_CLASS_ID(MachReturn, Mach, 0)
@@ -744,6 +750,7 @@ public:
   DEFINE_CLASS_QUERY(MachTemp)
   DEFINE_CLASS_QUERY(Mem)
   DEFINE_CLASS_QUERY(MemBar)
+  DEFINE_CLASS_QUERY(MemBarStoreStore)
   DEFINE_CLASS_QUERY(MergeMem)
   DEFINE_CLASS_QUERY(Multi)
   DEFINE_CLASS_QUERY(MultiBranch)
diff --git a/hotspot/src/share/vm/opto/output.cpp b/hotspot/src/share/vm/opto/output.cpp
index d9317d6af35..f365353e740 100644
--- a/hotspot/src/share/vm/opto/output.cpp
+++ b/hotspot/src/share/vm/opto/output.cpp
@@ -924,10 +924,10 @@ void Compile::Process_OopMap_Node(MachNode *mach, int current_offset) {
         scval = new ConstantOopWriteValue(tp->is_oopptr()->const_oop()->constant_encoding());
       }
 
-      OptoReg::Name box_reg = BoxLockNode::stack_slot(box_node);
+      OptoReg::Name box_reg = BoxLockNode::reg(box_node);
       Location basic_lock = Location::new_stk_loc(Location::normal,_regalloc->reg2offset(box_reg));
-      while( !box_node->is_BoxLock() )  box_node = box_node->in(1);
-      monarray->append(new MonitorValue(scval, basic_lock, box_node->as_BoxLock()->is_eliminated()));
+      bool eliminated = (box_node->is_BoxLock() && box_node->as_BoxLock()->is_eliminated());
+      monarray->append(new MonitorValue(scval, basic_lock, eliminated));
     }
 
     // We dump the object pool first, since deoptimization reads it in first.
diff --git a/hotspot/src/share/vm/opto/parse1.cpp b/hotspot/src/share/vm/opto/parse1.cpp
index 06ef91a49a6..99bdf93b933 100644
--- a/hotspot/src/share/vm/opto/parse1.cpp
+++ b/hotspot/src/share/vm/opto/parse1.cpp
@@ -1604,7 +1604,16 @@ void Parse::merge_common(Parse::Block* target, int pnum) {
           continue;
         default:                // All normal stuff
           if (phi == NULL) {
-            if (!check_elide_phi || !target->can_elide_SEL_phi(j)) {
+            const JVMState* jvms = map()->jvms();
+            if (EliminateNestedLocks &&
+                jvms->is_mon(j) && jvms->is_monitor_box(j)) {
+              // BoxLock nodes are not commoning.
+              // Use old BoxLock node as merged box.
+              assert(newin->jvms()->is_monitor_box(j), "sanity");
+              // This assert also tests that nodes are BoxLock.
+              assert(BoxLockNode::same_slot(n, m), "sanity");
+              C->gvn_replace_by(n, m);
+            } else if (!check_elide_phi || !target->can_elide_SEL_phi(j)) {
               phi = ensure_phi(j, nophi);
             }
           }
@@ -1911,7 +1920,7 @@ void Parse::call_register_finalizer() {
   Node* klass_addr = basic_plus_adr( receiver, receiver, oopDesc::klass_offset_in_bytes() );
   Node* klass = _gvn.transform( LoadKlassNode::make(_gvn, immutable_memory(), klass_addr, TypeInstPtr::KLASS) );
 
-  Node* access_flags_addr = basic_plus_adr(klass, klass, Klass::access_flags_offset_in_bytes() + sizeof(oopDesc));
+  Node* access_flags_addr = basic_plus_adr(klass, klass, in_bytes(Klass::access_flags_offset()));
   Node* access_flags = make_load(NULL, access_flags_addr, TypeInt::INT, T_INT);
 
   Node* mask  = _gvn.transform(new (C, 3) AndINode(access_flags, intcon(JVM_ACC_HAS_FINALIZER)));
diff --git a/hotspot/src/share/vm/opto/parseHelper.cpp b/hotspot/src/share/vm/opto/parseHelper.cpp
index 5a9d7e146f6..9ebc716b548 100644
--- a/hotspot/src/share/vm/opto/parseHelper.cpp
+++ b/hotspot/src/share/vm/opto/parseHelper.cpp
@@ -200,7 +200,7 @@ void Parse::array_store_check() {
   // Come here for polymorphic array klasses
 
   // Extract the array element class
-  int element_klass_offset = objArrayKlass::element_klass_offset_in_bytes() + sizeof(oopDesc);
+  int element_klass_offset = in_bytes(objArrayKlass::element_klass_offset());
   Node *p2 = basic_plus_adr(array_klass, array_klass, element_klass_offset);
   Node *a_e_klass = _gvn.transform( LoadKlassNode::make(_gvn, immutable_memory(), p2, tak) );
 
@@ -220,7 +220,7 @@ void Parse::emit_guard_for_new(ciInstanceKlass* klass) {
   _gvn.set_type(merge, Type::CONTROL);
   Node* kls = makecon(TypeKlassPtr::make(klass));
 
-  Node* init_thread_offset = _gvn.MakeConX(instanceKlass::init_thread_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes());
+  Node* init_thread_offset = _gvn.MakeConX(in_bytes(instanceKlass::init_thread_offset()));
   Node* adr_node = basic_plus_adr(kls, kls, init_thread_offset);
   Node* init_thread = make_load(NULL, adr_node, TypeRawPtr::BOTTOM, T_ADDRESS);
   Node *tst   = Bool( CmpP( init_thread, cur_thread), BoolTest::eq);
@@ -228,9 +228,11 @@ void Parse::emit_guard_for_new(ciInstanceKlass* klass) {
   set_control(IfTrue(iff));
   merge->set_req(1, IfFalse(iff));
 
-  Node* init_state_offset = _gvn.MakeConX(instanceKlass::init_state_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes());
+  Node* init_state_offset = _gvn.MakeConX(in_bytes(instanceKlass::init_state_offset()));
   adr_node = basic_plus_adr(kls, kls, init_state_offset);
-  Node* init_state = make_load(NULL, adr_node, TypeInt::INT, T_INT);
+  // Use T_BOOLEAN for instanceKlass::_init_state so the compiler
+  // can generate code to load it as unsigned byte.
+  Node* init_state = make_load(NULL, adr_node, TypeInt::UBYTE, T_BOOLEAN);
   Node* being_init = _gvn.intcon(instanceKlass::being_initialized);
   tst   = Bool( CmpI( init_state, being_init), BoolTest::eq);
   iff = create_and_map_if(control(), tst, PROB_ALWAYS, COUNT_UNKNOWN);
diff --git a/hotspot/src/share/vm/opto/phaseX.hpp b/hotspot/src/share/vm/opto/phaseX.hpp
index 162716dfa53..ef5bbb55fc3 100644
--- a/hotspot/src/share/vm/opto/phaseX.hpp
+++ b/hotspot/src/share/vm/opto/phaseX.hpp
@@ -256,11 +256,6 @@ public:
   // For pessimistic optimizations this is simply pointer equivalence.
   bool eqv(const Node* n1, const Node* n2) const { return n1 == n2; }
 
-  // Return whether two Nodes are equivalent, after stripping casting.
-  bool eqv_uncast(const Node* n1, const Node* n2) const {
-    return eqv(n1->uncast(), n2->uncast());
-  }
-
   // For pessimistic passes, the return type must monotonically narrow.
   // For optimistic  passes, the return type must monotonically widen.
   // It is possible to get into a "death march" in either type of pass,
diff --git a/hotspot/src/share/vm/opto/postaloc.cpp b/hotspot/src/share/vm/opto/postaloc.cpp
index 603264acc76..1a7553bc5cd 100644
--- a/hotspot/src/share/vm/opto/postaloc.cpp
+++ b/hotspot/src/share/vm/opto/postaloc.cpp
@@ -89,32 +89,62 @@ int PhaseChaitin::yank( Node *old, Block *current_block, Node_List *value, Node_
   return blk_adjust;
 }
 
+#ifdef ASSERT
+static bool expected_yanked_node(Node *old, Node *orig_old) {
+  // This code is expected only next original nodes:
+  // - load from constant table node which may have next data input nodes:
+  //     MachConstantBase, Phi, MachTemp, MachSpillCopy
+  // - load constant node which may have next data input nodes:
+  //     MachTemp, MachSpillCopy
+  // - MachSpillCopy
+  // - MachProj and Copy dead nodes
+  if (old->is_MachSpillCopy()) {
+    return true;
+  } else if (old->is_Con()) {
+    return true;
+  } else if (old->is_MachProj()) { // Dead kills projection of Con node
+    return (old == orig_old);
+  } else if (old->is_Copy()) {     // Dead copy of a callee-save value
+    return (old == orig_old);
+  } else if (old->is_MachTemp()) {
+    return orig_old->is_Con();
+  } else if (old->is_Phi() || old->is_MachConstantBase()) {
+    return (orig_old->is_Con() && orig_old->is_MachConstant());
+  }
+  return false;
+}
+#endif
+
 //------------------------------yank_if_dead-----------------------------------
-// Removed an edge from 'old'.  Yank if dead.  Return adjustment counts to
+// Removed edges from 'old'.  Yank if dead.  Return adjustment counts to
 // iterators in the current block.
-int PhaseChaitin::yank_if_dead( Node *old, Block *current_block, Node_List *value, Node_List *regnd ) {
+int PhaseChaitin::yank_if_dead_recurse(Node *old, Node *orig_old, Block *current_block,
+                                       Node_List *value, Node_List *regnd) {
   int blk_adjust=0;
-  while (old->outcnt() == 0 && old != C->top()) {
+  if (old->outcnt() == 0 && old != C->top()) {
+#ifdef ASSERT
+    if (!expected_yanked_node(old, orig_old)) {
+      tty->print_cr("==============================================");
+      tty->print_cr("orig_old:");
+      orig_old->dump();
+      tty->print_cr("old:");
+      old->dump();
+      assert(false, "unexpected yanked node");
+    }
+    if (old->is_Con())
+      orig_old = old; // Reset to satisfy expected nodes checks.
+#endif
     blk_adjust += yank(old, current_block, value, regnd);
 
-    Node *tmp = NULL;
     for (uint i = 1; i < old->req(); i++) {
-      if (old->in(i)->is_MachTemp()) {
-        // handle TEMP inputs
-        Node* machtmp = old->in(i);
-        if (machtmp->outcnt() == 1) {
-          assert(machtmp->unique_out() == old, "sanity");
-          blk_adjust += yank(machtmp, current_block, value, regnd);
-          machtmp->disconnect_inputs(NULL);
-        }
-      } else {
-        assert(tmp == NULL, "can't handle more non MachTemp inputs");
-        tmp = old->in(i);
+      Node* n = old->in(i);
+      if (n != NULL) {
+        old->set_req(i, NULL);
+        blk_adjust += yank_if_dead_recurse(n, orig_old, current_block, value, regnd);
       }
     }
+    // Disconnect control and remove precedence edges if any exist
     old->disconnect_inputs(NULL);
-    if( !tmp ) break;
-    old = tmp;
   }
   return blk_adjust;
 }
diff --git a/hotspot/src/share/vm/opto/subnode.cpp b/hotspot/src/share/vm/opto/subnode.cpp
index a51c5760eee..51212cff57f 100644
--- a/hotspot/src/share/vm/opto/subnode.cpp
+++ b/hotspot/src/share/vm/opto/subnode.cpp
@@ -91,7 +91,7 @@ const Type *SubNode::Value( PhaseTransform *phase ) const {
 
   // Not correct for SubFnode and AddFNode (must check for infinity)
   // Equal?  Subtract is zero
-  if (phase->eqv_uncast(in1, in2))  return add_id();
+  if (in1->eqv_uncast(in2))  return add_id();
 
   // Either input is BOTTOM ==> the result is the local BOTTOM
   if( t1 == Type::BOTTOM || t2 == Type::BOTTOM )
diff --git a/hotspot/src/share/vm/prims/jvmtiEnv.cpp b/hotspot/src/share/vm/prims/jvmtiEnv.cpp
index 60849150162..46519bb5796 100644
--- a/hotspot/src/share/vm/prims/jvmtiEnv.cpp
+++ b/hotspot/src/share/vm/prims/jvmtiEnv.cpp
@@ -267,7 +267,10 @@ JvmtiEnv::RetransformClasses(jint class_count, const jclass* classes) {
 
     instanceKlassHandle ikh(current_thread, k_oop);
     if (ikh->get_cached_class_file_bytes() == NULL) {
-      // not cached, we need to reconstitute the class file from VM representation
+      // Not cached, we need to reconstitute the class file from the
+      // VM representation. We don't attach the reconstituted class
+      // bytes to the instanceKlass here because they have not been
+      // validated and we're not at a safepoint.
       constantPoolHandle  constants(current_thread, ikh->constants());
       ObjectLocker ol(constants, current_thread);    // lock constant pool while we query it
 
diff --git a/hotspot/src/share/vm/prims/jvmtiExport.cpp b/hotspot/src/share/vm/prims/jvmtiExport.cpp
index 181716d4cb0..5325073b620 100644
--- a/hotspot/src/share/vm/prims/jvmtiExport.cpp
+++ b/hotspot/src/share/vm/prims/jvmtiExport.cpp
@@ -538,8 +538,6 @@ class JvmtiClassFileLoadHookPoster : public StackObj {
     _curr_env = NULL;
     _cached_length_ptr = cached_length_ptr;
     _cached_data_ptr = cached_data_ptr;
-    *_cached_length_ptr = 0;
-    *_cached_data_ptr = NULL;
 
     _state = _thread->jvmti_thread_state();
     if (_state != NULL) {
diff --git a/hotspot/src/share/vm/prims/jvmtiRedefineClasses.cpp b/hotspot/src/share/vm/prims/jvmtiRedefineClasses.cpp
index 10a478bd5eb..295ed86645a 100644
--- a/hotspot/src/share/vm/prims/jvmtiRedefineClasses.cpp
+++ b/hotspot/src/share/vm/prims/jvmtiRedefineClasses.cpp
@@ -854,8 +854,9 @@ jvmtiError VM_RedefineClasses::load_new_class_versions(TRAPS) {
 
     // RC_TRACE_WITH_THREAD macro has an embedded ResourceMark
     RC_TRACE_WITH_THREAD(0x00000001, THREAD,
-      ("loading name=%s (avail_mem=" UINT64_FORMAT "K)",
-      the_class->external_name(), os::available_memory() >> 10));
+      ("loading name=%s kind=%d (avail_mem=" UINT64_FORMAT "K)",
+      the_class->external_name(), _class_load_kind,
+      os::available_memory() >> 10));
 
     ClassFileStream st((u1*) _class_defs[i].class_bytes,
       _class_defs[i].class_byte_count, (char *)"__VM_RedefineClasses__");
@@ -3205,8 +3206,20 @@ void VM_RedefineClasses::redefine_single_class(jclass the_jclass,
   // with them was cached on the scratch class, move to the_class.
   // Note: we still want to do this if nothing needed caching since it
   // should get cleared in the_class too.
-  the_class->set_cached_class_file(scratch_class->get_cached_class_file_bytes(),
-                                   scratch_class->get_cached_class_file_len());
+  if (the_class->get_cached_class_file_bytes() == 0) {
+    // the_class doesn't have a cache yet so copy it
+    the_class->set_cached_class_file(
+      scratch_class->get_cached_class_file_bytes(),
+      scratch_class->get_cached_class_file_len());
+  }
+#ifndef PRODUCT
+  else {
+    assert(the_class->get_cached_class_file_bytes() ==
+      scratch_class->get_cached_class_file_bytes(), "cache ptrs must match");
+    assert(the_class->get_cached_class_file_len() ==
+      scratch_class->get_cached_class_file_len(), "cache lens must match");
+  }
+#endif
 
   // Replace inner_classes
   typeArrayOop old_inner_classes = the_class->inner_classes();
diff --git a/hotspot/src/share/vm/runtime/advancedThresholdPolicy.cpp b/hotspot/src/share/vm/runtime/advancedThresholdPolicy.cpp
index 758bf5ce505..a7dab0b651b 100644
--- a/hotspot/src/share/vm/runtime/advancedThresholdPolicy.cpp
+++ b/hotspot/src/share/vm/runtime/advancedThresholdPolicy.cpp
@@ -156,20 +156,19 @@ bool AdvancedThresholdPolicy::is_method_profiled(methodOop method) {
 // Called with the queue locked and with at least one element
 CompileTask* AdvancedThresholdPolicy::select_task(CompileQueue* compile_queue) {
   CompileTask *max_task = NULL;
-  methodOop max_method;
+  methodHandle max_method;
   jlong t = os::javaTimeMillis();
   // Iterate through the queue and find a method with a maximum rate.
   for (CompileTask* task = compile_queue->first(); task != NULL;) {
     CompileTask* next_task = task->next();
-    methodOop method = (methodOop)JNIHandles::resolve(task->method_handle());
-    methodDataOop mdo = method->method_data();
-    update_rate(t, method);
+    methodHandle method = (methodOop)JNIHandles::resolve(task->method_handle());
+    update_rate(t, method());
     if (max_task == NULL) {
       max_task = task;
       max_method = method;
     } else {
       // If a method has been stale for some time, remove it from the queue.
-      if (is_stale(t, TieredCompileTaskTimeout, method) && !is_old(method)) {
+      if (is_stale(t, TieredCompileTaskTimeout, method()) && !is_old(method())) {
         if (PrintTieredEvents) {
           print_event(REMOVE_FROM_QUEUE, method, method, task->osr_bci(), (CompLevel)task->comp_level());
         }
@@ -181,7 +180,7 @@ CompileTask* AdvancedThresholdPolicy::select_task(CompileQueue* compile_queue) {
       }
 
       // Select a method with a higher rate
-      if (compare_methods(method, max_method)) {
+      if (compare_methods(method(), max_method())) {
         max_task = task;
         max_method = method;
       }
@@ -190,7 +189,7 @@ CompileTask* AdvancedThresholdPolicy::select_task(CompileQueue* compile_queue) {
   }
 
   if (max_task->comp_level() == CompLevel_full_profile && TieredStopAtLevel > CompLevel_full_profile
-      && is_method_profiled(max_method)) {
+      && is_method_profiled(max_method())) {
     max_task->set_comp_level(CompLevel_limited_profile);
     if (PrintTieredEvents) {
       print_event(UPDATE_IN_QUEUE, max_method, max_method, max_task->osr_bci(), (CompLevel)max_task->comp_level());
diff --git a/hotspot/src/share/vm/runtime/arguments.cpp b/hotspot/src/share/vm/runtime/arguments.cpp
index 89f89128366..1072daf6b04 100644
--- a/hotspot/src/share/vm/runtime/arguments.cpp
+++ b/hotspot/src/share/vm/runtime/arguments.cpp
@@ -1000,6 +1000,13 @@ void Arguments::set_mode_flags(Mode mode) {
     UseInterpreter           = false;
     BackgroundCompilation    = false;
     ClipInlining             = false;
+    // Be much more aggressive in tiered mode with -Xcomp and exercise C2 more.
+    // We will first compile a level 3 version (C1 with full profiling), then do one invocation of it and
+    // compile a level 4 (C2) and then continue executing it.
+    if (TieredCompilation) {
+      Tier3InvokeNotifyFreqLog = 0;
+      Tier4InvocationThreshold = 0;
+    }
     break;
   }
 }
@@ -2323,7 +2330,7 @@ jint Arguments::parse_each_vm_init_arg(const JavaVMInitArgs* args,
 #ifndef PRODUCT
     // -Xprintflags
     } else if (match_option(option, "-Xprintflags", &tail)) {
-      CommandLineFlags::printFlags();
+      CommandLineFlags::printFlags(tty, false);
       vm_exit(0);
 #endif
     // -D
@@ -2971,13 +2978,13 @@ jint Arguments::parse(const JavaVMInitArgs* args) {
       IgnoreUnrecognizedVMOptions = false;
     }
     if (match_option(option, "-XX:+PrintFlagsInitial", &tail)) {
-      CommandLineFlags::printFlags();
+      CommandLineFlags::printFlags(tty, false);
       vm_exit(0);
     }
 
 #ifndef PRODUCT
     if (match_option(option, "-XX:+PrintFlagsWithComments", &tail)) {
-      CommandLineFlags::printFlags(true);
+      CommandLineFlags::printFlags(tty, true);
       vm_exit(0);
     }
 #endif
@@ -3153,6 +3160,9 @@ jint Arguments::parse(const JavaVMInitArgs* args) {
   if (!UseBiasedLocking || EmitSync != 0) {
     UseOptoBiasInlining = false;
   }
+  if (!EliminateLocks) {
+    EliminateNestedLocks = false;
+  }
 #endif
 
   if (PrintAssembly && FLAG_IS_DEFAULT(DebugNonSafepoints)) {
@@ -3170,7 +3180,7 @@ jint Arguments::parse(const JavaVMInitArgs* args) {
 #endif
 
   if (PrintCommandLineFlags) {
-    CommandLineFlags::printSetFlags();
+    CommandLineFlags::printSetFlags(tty);
   }
 
   // Apply CPU specific policy for the BiasedLocking
diff --git a/hotspot/src/share/vm/runtime/deoptimization.cpp b/hotspot/src/share/vm/runtime/deoptimization.cpp
index 05771c31180..44eeee8b9f3 100644
--- a/hotspot/src/share/vm/runtime/deoptimization.cpp
+++ b/hotspot/src/share/vm/runtime/deoptimization.cpp
@@ -211,7 +211,7 @@ Deoptimization::UnrollBlock* Deoptimization::fetch_unroll_info_helper(JavaThread
 #ifdef COMPILER2
   // Reallocate the non-escaping objects and restore their fields. Then
   // relock objects if synchronization on them was eliminated.
-  if (DoEscapeAnalysis) {
+  if (DoEscapeAnalysis || EliminateNestedLocks) {
     if (EliminateAllocations) {
       assert (chunk->at(0)->scope() != NULL,"expect only compiled java frames");
       GrowableArray<ScopeValue*>* objects = chunk->at(0)->scope()->objects();
diff --git a/hotspot/src/share/vm/runtime/globals.cpp b/hotspot/src/share/vm/runtime/globals.cpp
index 12efebcfcf4..13ce54fb42b 100644
--- a/hotspot/src/share/vm/runtime/globals.cpp
+++ b/hotspot/src/share/vm/runtime/globals.cpp
@@ -488,7 +488,7 @@ extern "C" {
   }
 }
 
-void CommandLineFlags::printSetFlags() {
+void CommandLineFlags::printSetFlags(outputStream* out) {
   // Print which flags were set on the command line
   // note: this method is called before the thread structure is in place
   //       which means resource allocation cannot be used.
@@ -507,11 +507,11 @@ void CommandLineFlags::printSetFlags() {
   // Print
   for (int i = 0; i < length; i++) {
     if (array[i]->origin /* naked field! */) {
-      array[i]->print_as_flag(tty);
-      tty->print(" ");
+      array[i]->print_as_flag(out);
+      out->print(" ");
     }
   }
-  tty->cr();
+  out->cr();
   FREE_C_HEAP_ARRAY(Flag*, array);
 }
 
@@ -524,7 +524,7 @@ void CommandLineFlags::verify() {
 
 #endif // PRODUCT
 
-void CommandLineFlags::printFlags(bool withComments) {
+void CommandLineFlags::printFlags(outputStream* out, bool withComments) {
   // Print the flags sorted by name
   // note: this method is called before the thread structure is in place
   //       which means resource allocation cannot be used.
@@ -541,10 +541,10 @@ void CommandLineFlags::printFlags(bool withComments) {
   qsort(array, length, sizeof(Flag*), compare_flags);
 
   // Print
-  tty->print_cr("[Global flags]");
+  out->print_cr("[Global flags]");
   for (int i = 0; i < length; i++) {
     if (array[i]->is_unlocked()) {
-      array[i]->print_on(tty, withComments);
+      array[i]->print_on(out, withComments);
     }
   }
   FREE_C_HEAP_ARRAY(Flag*, array);
diff --git a/hotspot/src/share/vm/runtime/globals.hpp b/hotspot/src/share/vm/runtime/globals.hpp
index e583295ede2..02311e215a9 100644
--- a/hotspot/src/share/vm/runtime/globals.hpp
+++ b/hotspot/src/share/vm/runtime/globals.hpp
@@ -326,9 +326,9 @@ class CommandLineFlags {
 
   // Returns false if name is not a command line flag.
   static bool wasSetOnCmdline(const char* name, bool* value);
-  static void printSetFlags();
+  static void printSetFlags(outputStream* out);
 
-  static void printFlags(bool withComments = false );
+  static void printFlags(outputStream* out, bool withComments);
 
   static void verify() PRODUCT_RETURN;
 };
@@ -527,6 +527,9 @@ class CommandLineFlags {
   product(intx, UseSSE, 99,                                                 \
           "Highest supported SSE instructions set on x86/x64")              \
                                                                             \
+  product(intx, UseAVX, 99,                                                 \
+          "Highest supported AVX instructions set on x86/x64")              \
+                                                                            \
   product(intx, UseVIS, 99,                                                 \
           "Highest supported VIS instructions set on Sparc")                \
                                                                             \
@@ -1553,7 +1556,7 @@ class CommandLineFlags {
   product(uintx, ParGCDesiredObjsFromOverflowList, 20,                      \
           "The desired number of objects to claim from the overflow list")  \
                                                                             \
-  diagnostic(intx, ParGCStridesPerThread, 2,                                \
+  diagnostic(uintx, ParGCStridesPerThread, 2,                               \
           "The number of strides per worker thread that we divide up the "  \
           "card table scanning work into")                                  \
                                                                             \
diff --git a/hotspot/src/share/vm/runtime/init.cpp b/hotspot/src/share/vm/runtime/init.cpp
index 1f09c368365..4176cd82d54 100644
--- a/hotspot/src/share/vm/runtime/init.cpp
+++ b/hotspot/src/share/vm/runtime/init.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -47,15 +47,16 @@ void bytecodes_init();
 void classLoader_init();
 void codeCache_init();
 void VM_Version_init();
+void os_init_globals();        // depends on VM_Version_init, before universe_init
 void stubRoutines_init1();
-jint universe_init();  // dependent on codeCache_init and stubRoutines_init
-void interpreter_init();  // before any methods loaded
-void invocationCounter_init();  // before any methods loaded
+jint universe_init();          // depends on codeCache_init and stubRoutines_init
+void interpreter_init();       // before any methods loaded
+void invocationCounter_init(); // before any methods loaded
 void marksweep_init();
 void accessFlags_init();
 void templateTable_init();
 void InterfaceSupport_init();
-void universe2_init();  // dependent on codeCache_init and stubRoutines_init
+void universe2_init();  // dependent on codeCache_init and stubRoutines_init, loads primordial classes
 void referenceProcessor_init();
 void jni_handles_init();
 void vmStructs_init();
@@ -94,8 +95,10 @@ jint init_globals() {
   classLoader_init();
   codeCache_init();
   VM_Version_init();
+  os_init_globals();
   stubRoutines_init1();
-  jint status = universe_init();  // dependent on codeCache_init and stubRoutines_init
+  jint status = universe_init();  // dependent on codeCache_init and
+                                  // stubRoutines_init1
   if (status != JNI_OK)
     return status;
 
@@ -106,7 +109,7 @@ jint init_globals() {
   templateTable_init();
   InterfaceSupport_init();
   SharedRuntime::generate_stubs();
-  universe2_init();  // dependent on codeCache_init and stubRoutines_init
+  universe2_init();  // dependent on codeCache_init and stubRoutines_init1
   referenceProcessor_init();
   jni_handles_init();
 #ifndef VM_STRUCTS_KERNEL
@@ -122,7 +125,7 @@ jint init_globals() {
   if (!universe_post_init()) {
     return JNI_ERR;
   }
-  javaClasses_init();  // must happen after vtable initialization
+  javaClasses_init();   // must happen after vtable initialization
   stubRoutines_init2(); // note: StubRoutines need 2-phase init
 
   // Although we'd like to, we can't easily do a heap verify
@@ -137,7 +140,7 @@ jint init_globals() {
   // All the flags that get adjusted by VM_Version_init and os::init_2
   // have been set so dump the flags now.
   if (PrintFlagsFinal) {
-    CommandLineFlags::printFlags();
+    CommandLineFlags::printFlags(tty, false);
   }
 
   return JNI_OK;
diff --git a/hotspot/src/share/vm/runtime/mutex.cpp b/hotspot/src/share/vm/runtime/mutex.cpp
index 80d3d874273..dddd27cc2fa 100644
--- a/hotspot/src/share/vm/runtime/mutex.cpp
+++ b/hotspot/src/share/vm/runtime/mutex.cpp
@@ -527,7 +527,21 @@ void Monitor::ILock (Thread * Self) {
 
 void Monitor::IUnlock (bool RelaxAssert) {
   assert (ILocked(), "invariant") ;
-  _LockWord.Bytes[_LSBINDEX] = 0 ;       // drop outer lock
+  // Conceptually we need a MEMBAR #storestore|#loadstore barrier or fence immediately
+  // before the store that releases the lock.  Crucially, all the stores and loads in the
+  // critical section must be globally visible before the store of 0 into the lock-word
+  // that releases the lock becomes globally visible.  That is, memory accesses in the
+  // critical section should not be allowed to bypass or overtake the following ST that
+  // releases the lock.  As such, to prevent accesses within the critical section
+  // from "leaking" out, we need a release fence between the critical section and the
+  // store that releases the lock.  In practice that release barrier is elided on
+  // platforms with strong memory models such as TSO.
+  //
+  // Note that the OrderAccess::storeload() fence that appears after unlock store
+  // provides for progress conditions and succession and is _not related to exclusion
+  // safety or lock release consistency.
+  OrderAccess::release_store(&_LockWord.Bytes[_LSBINDEX], 0); // drop outer lock
+
   OrderAccess::storeload ();
   ParkEvent * const w = _OnDeck ;
   assert (RelaxAssert || w != Thread::current()->_MutexEvent, "invariant") ;
diff --git a/hotspot/src/share/vm/runtime/os.cpp b/hotspot/src/share/vm/runtime/os.cpp
index 583d32d8ab8..773948971cd 100644
--- a/hotspot/src/share/vm/runtime/os.cpp
+++ b/hotspot/src/share/vm/runtime/os.cpp
@@ -82,6 +82,12 @@ julong os::num_frees = 0;           // # of calls to free
 julong os::free_bytes = 0;          // # of bytes freed
 #endif
 
+void os_init_globals() {
+  // Called from init_globals().
+  // See Threads::create_vm() in thread.cpp, and init.cpp.
+  os::init_globals();
+}
+
 // Fill in buffer with current local time as an ISO-8601 string.
 // E.g., yyyy-mm-ddThh:mm:ss-zzzz.
 // Returns buffer, or NULL if it failed.
diff --git a/hotspot/src/share/vm/runtime/os.hpp b/hotspot/src/share/vm/runtime/os.hpp
index 0ea0ae93fb4..85a85a3776a 100644
--- a/hotspot/src/share/vm/runtime/os.hpp
+++ b/hotspot/src/share/vm/runtime/os.hpp
@@ -99,9 +99,11 @@ class os: AllStatic {
   }
 
  public:
-
   static void init(void);                      // Called before command line parsing
   static jint init_2(void);                    // Called after command line parsing
+  static void init_globals(void) {             // Called from init_globals() in init.cpp
+    init_globals_ext();
+  }
   static void init_3(void);                    // Called at the end of vm init
 
   // File names are case-insensitive on windows only
@@ -256,7 +258,7 @@ class os: AllStatic {
                              char *addr, size_t bytes, bool read_only,
                              bool allow_exec);
   static bool   unmap_memory(char *addr, size_t bytes);
-  static void   free_memory(char *addr, size_t bytes);
+  static void   free_memory(char *addr, size_t bytes, size_t alignment_hint);
   static void   realign_memory(char *addr, size_t bytes, size_t alignment_hint);
 
   // NUMA-specific interface
@@ -500,6 +502,7 @@ class os: AllStatic {
 
   static void print_location(outputStream* st, intptr_t x, bool verbose = false);
   static size_t lasterror(char *buf, size_t len);
+  static int get_last_error();
 
   // Determines whether the calling process is being debugged by a user-mode debugger.
   static bool is_debugger_attached();
@@ -671,6 +674,11 @@ class os: AllStatic {
   // rest of line is skipped. Returns number of bytes read or -1 on EOF
   static int get_line_chars(int fd, char *buf, const size_t bsize);
 
+  // Extensions
+#include "runtime/os_ext.hpp"
+
+ public:
+
   // Platform dependent stuff
 #ifdef TARGET_OS_FAMILY_linux
 # include "os_linux.hpp"
@@ -715,6 +723,7 @@ class os: AllStatic {
 # include "os_bsd_zero.hpp"
 #endif
 
+ public:
   // debugging support (mostly used by debug.cpp but also fatal error handler)
   static bool find(address pc, outputStream* st = tty); // OS specific function to make sense out of an address
 
diff --git a/hotspot/src/share/vm/runtime/os_ext.hpp b/hotspot/src/share/vm/runtime/os_ext.hpp
new file mode 100644
index 00000000000..6b610528e05
--- /dev/null
+++ b/hotspot/src/share/vm/runtime/os_ext.hpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2011 Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef SHARE_VM_RUNTIME_OS_EXT_HPP
+#define SHARE_VM_RUNTIME_OS_EXT_HPP
+
+ public:
+  static void init_globals_ext() {} // Run from init_globals().
+                                    // See os.hpp/cpp and init.cpp.
+
+ private:
+
+#endif // SHARE_VM_RUNTIME_OS_EXT_HPP
diff --git a/hotspot/src/share/vm/runtime/vmStructs.cpp b/hotspot/src/share/vm/runtime/vmStructs.cpp
index 00fe3582e67..eb765f0cf62 100644
--- a/hotspot/src/share/vm/runtime/vmStructs.cpp
+++ b/hotspot/src/share/vm/runtime/vmStructs.cpp
@@ -295,7 +295,7 @@ static inline uint64_t cast_uint64_t(size_t x)
   nonstatic_field(instanceKlass,               _nof_implementors,                             int)                                   \
   nonstatic_field(instanceKlass,               _implementors[0],                              klassOop)                              \
   nonstatic_field(instanceKlass,               _fields,                                       typeArrayOop)                          \
-  nonstatic_field(instanceKlass,               _java_fields_count,                             int)                                   \
+  nonstatic_field(instanceKlass,               _java_fields_count,                            u2)                                    \
   nonstatic_field(instanceKlass,               _constants,                                    constantPoolOop)                       \
   nonstatic_field(instanceKlass,               _class_loader,                                 oop)                                   \
   nonstatic_field(instanceKlass,               _protection_domain,                            oop)                                   \
@@ -305,12 +305,12 @@ static inline uint64_t cast_uint64_t(size_t x)
   nonstatic_field(instanceKlass,               _inner_classes,                                typeArrayOop)                          \
   nonstatic_field(instanceKlass,               _nonstatic_field_size,                         int)                                   \
   nonstatic_field(instanceKlass,               _static_field_size,                            int)                                   \
-  nonstatic_field(instanceKlass,               _static_oop_field_count,                       int)                                   \
+  nonstatic_field(instanceKlass,               _static_oop_field_count,                       u2)                                   \
   nonstatic_field(instanceKlass,               _nonstatic_oop_map_size,                       int)                                   \
   nonstatic_field(instanceKlass,               _is_marked_dependent,                          bool)                                  \
   nonstatic_field(instanceKlass,               _minor_version,                                u2)                                    \
   nonstatic_field(instanceKlass,               _major_version,                                u2)                                    \
-  nonstatic_field(instanceKlass,               _init_state,                                   instanceKlass::ClassState)             \
+  nonstatic_field(instanceKlass,               _init_state,                                   u1)                                    \
   nonstatic_field(instanceKlass,               _init_thread,                                  Thread*)                               \
   nonstatic_field(instanceKlass,               _vtable_len,                                   int)                                   \
   nonstatic_field(instanceKlass,               _itable_len,                                   int)                                   \
@@ -1362,6 +1362,7 @@ static inline uint64_t cast_uint64_t(size_t x)
   /* The compiler thinks this is a different type than */                 \
   /* unsigned short on Win32 */                                           \
   declare_unsigned_integer_type(u2)                                       \
+  declare_unsigned_integer_type(u1)                                       \
   declare_unsigned_integer_type(unsigned)                                 \
                                                                           \
   /*****************************/                                         \
diff --git a/hotspot/src/share/vm/services/attachListener.cpp b/hotspot/src/share/vm/services/attachListener.cpp
index 3182081ce0d..3753a64cc95 100644
--- a/hotspot/src/share/vm/services/attachListener.cpp
+++ b/hotspot/src/share/vm/services/attachListener.cpp
@@ -99,6 +99,7 @@ static jint get_properties(AttachOperation* op, outputStream* out, Symbol* seria
 }
 
 // Implementation of "properties" command.
+// See also: PrintSystemPropertiesDCmd class
 static jint get_system_properties(AttachOperation* op, outputStream* out) {
   return get_properties(op, out, vmSymbols::serializePropertiesToByteArray_name());
 }
@@ -127,6 +128,7 @@ static jint data_dump(AttachOperation* op, outputStream* out) {
 }
 
 // Implementation of "threaddump" command - essentially a remote ctrl-break
+// See also: ThreadDumpDCmd class
 //
 static jint thread_dump(AttachOperation* op, outputStream* out) {
   bool print_concurrent_locks = false;
@@ -158,6 +160,7 @@ static jint jcmd(AttachOperation* op, outputStream* out) {
   DCmd::parse_and_execute(out, op->arg(0), ' ', THREAD);
   if (HAS_PENDING_EXCEPTION) {
     java_lang_Throwable::print(PENDING_EXCEPTION, out);
+    out->cr();
     CLEAR_PENDING_EXCEPTION;
     // The exception has been printed on the output stream
     // If the JVM returns JNI_ERR, the attachAPI throws a generic I/O
@@ -169,6 +172,7 @@ static jint jcmd(AttachOperation* op, outputStream* out) {
 
 #ifndef SERVICES_KERNEL   // Heap dumping not supported
 // Implementation of "dumpheap" command.
+// See also: HeapDumpDCmd class
 //
 // Input arguments :-
 //   arg0: Name of the dump file
@@ -211,6 +215,7 @@ jint dump_heap(AttachOperation* op, outputStream* out) {
 #endif // SERVICES_KERNEL
 
 // Implementation of "inspectheap" command
+// See also: ClassHistogramDCmd class
 //
 // Input arguments :-
 //   arg0: "-live" or "-all"
@@ -354,6 +359,7 @@ static jint set_flag(AttachOperation* op, outputStream* out) {
 }
 
 // Implementation of "printflag" command
+// See also: PrintVMFlagsDCmd class
 static jint print_flag(AttachOperation* op, outputStream* out) {
   const char* name = NULL;
   if ((name = op->arg(0)) == NULL) {
diff --git a/hotspot/src/share/vm/services/diagnosticCommand.cpp b/hotspot/src/share/vm/services/diagnosticCommand.cpp
index 2a268eb2de6..39c7ce25e4b 100644
--- a/hotspot/src/share/vm/services/diagnosticCommand.cpp
+++ b/hotspot/src/share/vm/services/diagnosticCommand.cpp
@@ -23,11 +23,15 @@
  */
 
 #include "precompiled.hpp"
+#include "gc_implementation/shared/vmGCOperations.hpp"
+#include "runtime/javaCalls.hpp"
 #include "services/diagnosticArgument.hpp"
 #include "services/diagnosticCommand.hpp"
 #include "services/diagnosticFramework.hpp"
+#include "services/heapDumper.hpp"
+#include "services/management.hpp"
 
-HelpDCmd::HelpDCmd(outputStream* output, bool heap) : DCmd(output, heap),
+HelpDCmd::HelpDCmd(outputStream* output, bool heap) : DCmdWithParser(output, heap),
   _all("-all", "Show help for all commands", "BOOLEAN", false, "false"),
   _cmd("command name", "The name of the command for which we want help",
         "STRING", false) {
@@ -35,14 +39,6 @@ HelpDCmd::HelpDCmd(outputStream* output, bool heap) : DCmd(output, heap),
   _dcmdparser.add_dcmd_argument(&_cmd);
 };
 
-void HelpDCmd::parse(CmdLine* line, char delim, TRAPS) {
-  _dcmdparser.parse(line, delim, CHECK);
-}
-
-void HelpDCmd::print_help(outputStream* out) {
-  _dcmdparser.print_help(out, name());
-}
-
 void HelpDCmd::execute(TRAPS) {
   if (_all.value()) {
     GrowableArray<const char*>* cmd_list = DCmdFactory::DCmd_list();
@@ -66,10 +62,11 @@ void HelpDCmd::execute(TRAPS) {
                          factory->is_enabled() ? "" : " [disabled]");
       output()->print_cr(factory->description());
       output()->print_cr("\nImpact: %s", factory->impact());
+      output()->cr();
       cmd = factory->create_resource_instance(output());
       if (cmd != NULL) {
         DCmdMark mark(cmd);
-        cmd->print_help(output());
+        cmd->print_help(factory->name());
       }
     } else {
       output()->print_cr("Help unavailable : '%s' : No such command", _cmd.value());
@@ -90,14 +87,6 @@ void HelpDCmd::execute(TRAPS) {
   }
 }
 
-void HelpDCmd::reset(TRAPS) {
-  _dcmdparser.reset(CHECK);
-}
-
-void HelpDCmd::cleanup() {
-  _dcmdparser.cleanup();
-}
-
 int HelpDCmd::num_arguments() {
   ResourceMark rm;
   HelpDCmd* dcmd = new HelpDCmd(NULL, false);
@@ -109,14 +98,6 @@ int HelpDCmd::num_arguments() {
   }
 }
 
-GrowableArray<const char*>* HelpDCmd::argument_name_array() {
-  return _dcmdparser.argument_name_array();
-}
-
-GrowableArray<DCmdArgumentInfo*>* HelpDCmd::argument_info_array() {
-  return _dcmdparser.argument_info_array();
-}
-
 void VersionDCmd::execute(TRAPS) {
   output()->print_cr("%s version %s", Abstract_VM_Version::vm_name(),
           Abstract_VM_Version::vm_release());
@@ -129,3 +110,210 @@ void VersionDCmd::execute(TRAPS) {
             jdk_version.minor_version());
   }
 }
+
+PrintVMFlagsDCmd::PrintVMFlagsDCmd(outputStream* output, bool heap) :
+                                   DCmdWithParser(output, heap),
+  _all("-all", "Print all flags supported by the VM", "BOOLEAN", false, "false") {
+  _dcmdparser.add_dcmd_option(&_all);
+}
+
+void PrintVMFlagsDCmd::execute(TRAPS) {
+  if (_all.value()) {
+    CommandLineFlags::printFlags(output(), true);
+  } else {
+    CommandLineFlags::printSetFlags(output());
+  }
+}
+
+int PrintVMFlagsDCmd::num_arguments() {
+    ResourceMark rm;
+    PrintVMFlagsDCmd* dcmd = new PrintVMFlagsDCmd(NULL, false);
+    if (dcmd != NULL) {
+      DCmdMark mark(dcmd);
+      return dcmd->_dcmdparser.num_arguments();
+    } else {
+      return 0;
+    }
+}
+
+void PrintSystemPropertiesDCmd::execute(TRAPS) {
+  // load sun.misc.VMSupport
+  Symbol* klass = vmSymbols::sun_misc_VMSupport();
+  klassOop k = SystemDictionary::resolve_or_fail(klass, true, CHECK);
+  instanceKlassHandle ik (THREAD, k);
+  if (ik->should_be_initialized()) {
+    ik->initialize(THREAD);
+  }
+  if (HAS_PENDING_EXCEPTION) {
+    java_lang_Throwable::print(PENDING_EXCEPTION, output());
+    output()->cr();
+    CLEAR_PENDING_EXCEPTION;
+    return;
+  }
+
+  // invoke the serializePropertiesToByteArray method
+  JavaValue result(T_OBJECT);
+  JavaCallArguments args;
+
+  Symbol* signature = vmSymbols::serializePropertiesToByteArray_signature();
+  JavaCalls::call_static(&result,
+                         ik,
+                         vmSymbols::serializePropertiesToByteArray_name(),
+                         signature,
+                         &args,
+                         THREAD);
+  if (HAS_PENDING_EXCEPTION) {
+    java_lang_Throwable::print(PENDING_EXCEPTION, output());
+    output()->cr();
+    CLEAR_PENDING_EXCEPTION;
+    return;
+  }
+
+  // The result should be a [B
+  oop res = (oop)result.get_jobject();
+  assert(res->is_typeArray(), "just checking");
+  assert(typeArrayKlass::cast(res->klass())->element_type() == T_BYTE, "just checking");
+
+  // copy the bytes to the output stream
+  typeArrayOop ba = typeArrayOop(res);
+  jbyte* addr = typeArrayOop(res)->byte_at_addr(0);
+  output()->print_raw((const char*)addr, ba->length());
+}
+
+VMUptimeDCmd::VMUptimeDCmd(outputStream* output, bool heap) :
+                           DCmdWithParser(output, heap),
+  _date("-date", "Add a prefix with current date", "BOOLEAN", false, "false") {
+  _dcmdparser.add_dcmd_option(&_date);
+}
+
+void VMUptimeDCmd::execute(TRAPS) {
+  if (_date.value()) {
+    output()->date_stamp(true, "", ": ");
+  }
+  output()->time_stamp().update_to(tty->time_stamp().ticks());
+  output()->stamp();
+  output()->print_cr(" s");
+}
+
+int VMUptimeDCmd::num_arguments() {
+  ResourceMark rm;
+  VMUptimeDCmd* dcmd = new VMUptimeDCmd(NULL, false);
+  if (dcmd != NULL) {
+    DCmdMark mark(dcmd);
+    return dcmd->_dcmdparser.num_arguments();
+  } else {
+    return 0;
+  }
+}
+
+void SystemGCDCmd::execute(TRAPS) {
+  Universe::heap()->collect(GCCause::_java_lang_system_gc);
+}
+
+void RunFinalizationDCmd::execute(TRAPS) {
+  klassOop k = SystemDictionary::resolve_or_fail(vmSymbols::java_lang_System(),
+                                                 true, CHECK);
+  instanceKlassHandle klass(THREAD, k);
+  JavaValue result(T_VOID);
+  JavaCalls::call_static(&result, klass,
+                         vmSymbols::run_finalization_name(),
+                         vmSymbols::void_method_signature(), CHECK);
+}
+
+#ifndef SERVICES_KERNEL   // Heap dumping not supported
+HeapDumpDCmd::HeapDumpDCmd(outputStream* output, bool heap) :
+                           DCmdWithParser(output, heap),
+  _filename("filename","Name of the dump file", "STRING",true),
+  _all("-all", "Dump all objects, including unreachable objects",
+       "BOOLEAN", false, "false") {
+  _dcmdparser.add_dcmd_option(&_all);
+  _dcmdparser.add_dcmd_argument(&_filename);
+}
+
+void HeapDumpDCmd::execute(TRAPS) {
+  // Request a full GC before heap dump if _all is false
+  // This helps reduces the amount of unreachable objects in the dump
+  // and makes it easier to browse.
+  HeapDumper dumper(!_all.value() /* request GC if _all is false*/);
+  int res = dumper.dump(_filename.value());
+  if (res == 0) {
+    output()->print_cr("Heap dump file created");
+  } else {
+    // heap dump failed
+    ResourceMark rm;
+    char* error = dumper.error_as_C_string();
+    if (error == NULL) {
+      output()->print_cr("Dump failed - reason unknown");
+    } else {
+      output()->print_cr("%s", error);
+    }
+  }
+}
+
+int HeapDumpDCmd::num_arguments() {
+  ResourceMark rm;
+  HeapDumpDCmd* dcmd = new HeapDumpDCmd(NULL, false);
+  if (dcmd != NULL) {
+    DCmdMark mark(dcmd);
+    return dcmd->_dcmdparser.num_arguments();
+  } else {
+    return 0;
+  }
+}
+#endif // SERVICES_KERNEL
+
+ClassHistogramDCmd::ClassHistogramDCmd(outputStream* output, bool heap) :
+                                       DCmdWithParser(output, heap),
+  _all("-all", "Inspect all objects, including unreachable objects",
+       "BOOLEAN", false, "false") {
+  _dcmdparser.add_dcmd_option(&_all);
+}
+
+void ClassHistogramDCmd::execute(TRAPS) {
+  VM_GC_HeapInspection heapop(output(),
+                              !_all.value() /* request full gc if false */,
+                              true /* need_prologue */);
+  VMThread::execute(&heapop);
+}
+
+int ClassHistogramDCmd::num_arguments() {
+  ResourceMark rm;
+  ClassHistogramDCmd* dcmd = new ClassHistogramDCmd(NULL, false);
+  if (dcmd != NULL) {
+    DCmdMark mark(dcmd);
+    return dcmd->_dcmdparser.num_arguments();
+  } else {
+    return 0;
+  }
+}
+
+ThreadDumpDCmd::ThreadDumpDCmd(outputStream* output, bool heap) :
+                               DCmdWithParser(output, heap),
+  _locks("-l", "print java.util.concurrent locks", "BOOLEAN", false, "false") {
+  _dcmdparser.add_dcmd_option(&_locks);
+}
+
+void ThreadDumpDCmd::execute(TRAPS) {
+  // thread stacks
+  VM_PrintThreads op1(output(), _locks.value());
+  VMThread::execute(&op1);
+
+  // JNI global handles
+  VM_PrintJNI op2(output());
+  VMThread::execute(&op2);
+
+  // Deadlock detection
+  VM_FindDeadlocks op3(output());
+  VMThread::execute(&op3);
+}
+
+int ThreadDumpDCmd::num_arguments() {
+  ResourceMark rm;
+  ThreadDumpDCmd* dcmd = new ThreadDumpDCmd(NULL, false);
+  if (dcmd != NULL) {
+    DCmdMark mark(dcmd);
+    return dcmd->_dcmdparser.num_arguments();
+  } else {
+    return 0;
+  }
+}
diff --git a/hotspot/src/share/vm/services/diagnosticCommand.hpp b/hotspot/src/share/vm/services/diagnosticCommand.hpp
index f17a208a60c..436664a9cad 100644
--- a/hotspot/src/share/vm/services/diagnosticCommand.hpp
+++ b/hotspot/src/share/vm/services/diagnosticCommand.hpp
@@ -35,9 +35,8 @@
 #include "services/diagnosticCommand.hpp"
 #include "services/diagnosticFramework.hpp"
 
-class HelpDCmd : public DCmd {
+class HelpDCmd : public DCmdWithParser {
 protected:
-  DCmdParser _dcmdparser;
   DCmdArgument<bool> _all;
   DCmdArgument<char*> _cmd;
 public:
@@ -50,13 +49,7 @@ public:
   }
   static const char* impact() { return "Low: "; }
   static int num_arguments();
-  virtual void parse(CmdLine* line, char delim, TRAPS);
   virtual void execute(TRAPS);
-  virtual void reset(TRAPS);
-  virtual void cleanup();
-  virtual void print_help(outputStream* out);
-  virtual GrowableArray<const char*>* argument_name_array();
-  virtual GrowableArray<DCmdArgumentInfo*>* argument_info_array();
 };
 
 class VersionDCmd : public DCmd {
@@ -68,9 +61,156 @@ public:
   }
   static const char* impact() { return "Low: "; }
   static int num_arguments() { return 0; }
-  virtual void parse(CmdLine* line, char delim, TRAPS) { }
   virtual void execute(TRAPS);
-  virtual void print_help(outputStream* out) { }
+};
+
+class CommandLineDCmd : public DCmd {
+public:
+  CommandLineDCmd(outputStream* output, bool heap) : DCmd(output, heap) { }
+  static const char* name() { return "VM.command_line"; }
+  static const char* description() {
+    return "Print the command line used to start this VM instance.";
+  }
+  static const char* impact() { return "Low: "; }
+  static int num_arguments() { return 0; }
+  virtual void execute(TRAPS) {
+    Arguments::print_on(_output);
+  }
+};
+
+// See also: get_system_properties in attachListener.cpp
+class PrintSystemPropertiesDCmd : public DCmd {
+public:
+  PrintSystemPropertiesDCmd(outputStream* output, bool heap) : DCmd(output, heap) { }
+    static const char* name() { return "VM.system_properties"; }
+    static const char* description() {
+      return "Print system properties.";
+    }
+    static const char* impact() {
+      return "Low: ";
+    }
+    static int num_arguments() { return 0; }
+    virtual void execute(TRAPS);
+};
+
+// See also: print_flag in attachListener.cpp
+class PrintVMFlagsDCmd : public DCmdWithParser {
+protected:
+  DCmdArgument<bool> _all;
+public:
+  PrintVMFlagsDCmd(outputStream* output, bool heap);
+  static const char* name() { return "VM.flags"; }
+  static const char* description() {
+    return "Print VM flag options and their current values.";
+  }
+  static const char* impact() {
+    return "Low: ";
+  }
+  static int num_arguments();
+  virtual void execute(TRAPS);
+};
+
+class VMUptimeDCmd : public DCmdWithParser {
+protected:
+  DCmdArgument<bool> _date;
+public:
+  VMUptimeDCmd(outputStream* output, bool heap);
+  static const char* name() { return "VM.uptime"; }
+  static const char* description() {
+    return "Print VM uptime.";
+  }
+  static const char* impact() {
+    return "Low: ";
+  }
+  static int num_arguments();
+  virtual void execute(TRAPS);
+};
+
+class SystemGCDCmd : public DCmd {
+public:
+  SystemGCDCmd(outputStream* output, bool heap) : DCmd(output, heap) { }
+    static const char* name() { return "GC.run"; }
+    static const char* description() {
+      return "Call java.lang.System.gc().";
+    }
+    static const char* impact() {
+      return "Medium: Depends on Java heap size and content.";
+    }
+    static int num_arguments() { return 0; }
+    virtual void execute(TRAPS);
+};
+
+class RunFinalizationDCmd : public DCmd {
+public:
+  RunFinalizationDCmd(outputStream* output, bool heap) : DCmd(output, heap) { }
+    static const char* name() { return "GC.run_finalization"; }
+    static const char* description() {
+      return "Call java.lang.System.runFinalization().";
+    }
+    static const char* impact() {
+      return "Medium: Depends on Java content.";
+    }
+    static int num_arguments() { return 0; }
+    virtual void execute(TRAPS);
+};
+
+#ifndef SERVICES_KERNEL   // Heap dumping not supported
+// See also: dump_heap in attachListener.cpp
+class HeapDumpDCmd : public DCmdWithParser {
+protected:
+  DCmdArgument<char*> _filename;
+  DCmdArgument<bool>  _all;
+public:
+  HeapDumpDCmd(outputStream* output, bool heap);
+  static const char* name() {
+    return "GC.heap_dump";
+  }
+  static const char* description() {
+    return "Generate a HPROF format dump of the Java heap.";
+  }
+  static const char* impact() {
+    return "High: Depends on Java heap size and content. "
+           "Request a full GC unless the '-all' option is specified.";
+  }
+  static int num_arguments();
+  virtual void execute(TRAPS);
+};
+#endif // SERVICES_KERNEL
+
+// See also: inspeactheap in attachListener.cpp
+class ClassHistogramDCmd : public DCmdWithParser {
+protected:
+  DCmdArgument<bool> _all;
+public:
+  ClassHistogramDCmd(outputStream* output, bool heap);
+  static const char* name() {
+    return "GC.class_histogram";
+  }
+  static const char* description() {
+    return "Provide statistics about the Java heap usage.";
+  }
+  static const char* impact() {
+    return "High: Depends on Java heap size and content.";
+  }
+  static int num_arguments();
+  virtual void execute(TRAPS);
+};
+
+// See also: thread_dump in attachListener.cpp
+class ThreadDumpDCmd : public DCmdWithParser {
+protected:
+  DCmdArgument<bool> _locks;
+public:
+  ThreadDumpDCmd(outputStream* output, bool heap);
+  static const char* name() { return "Thread.print"; }
+  static const char* description() {
+    return "Print all threads with stacktraces.";
+  }
+  static const char* impact() {
+    return "Medium: Depends on the number of threads.";
+  }
+  static int num_arguments();
+  virtual void execute(TRAPS);
 };
 
 #endif // SHARE_VM_SERVICES_DIAGNOSTICCOMMAND_HPP
diff --git a/hotspot/src/share/vm/services/diagnosticFramework.cpp b/hotspot/src/share/vm/services/diagnosticFramework.cpp
index 91f8ae6c1f6..2de9f81e9be 100644
--- a/hotspot/src/share/vm/services/diagnosticFramework.cpp
+++ b/hotspot/src/share/vm/services/diagnosticFramework.cpp
@@ -226,7 +226,7 @@ void DCmdParser::check(TRAPS) {
 }
 
 void DCmdParser::print_help(outputStream* out, const char* cmd_name) {
-  out->print("\nSyntax : %s %s", cmd_name, _options == NULL ? "" : "[options]");
+  out->print("Syntax : %s %s", cmd_name, _options == NULL ? "" : "[options]");
   GenDCmdArgument* arg = _arguments_list;
   while (arg != NULL) {
     if (arg->is_mandatory()) {
@@ -373,6 +373,30 @@ void DCmd::parse_and_execute(outputStream* out, const char* cmdline,
   }
 }
 
+void DCmdWithParser::parse(CmdLine* line, char delim, TRAPS) {
+  _dcmdparser.parse(line, delim, CHECK);
+}
+
+void DCmdWithParser::print_help(const char* name) {
+  _dcmdparser.print_help(output(), name);
+}
+
+void DCmdWithParser::reset(TRAPS) {
+  _dcmdparser.reset(CHECK);
+}
+
+void DCmdWithParser::cleanup() {
+  _dcmdparser.cleanup();
+}
+
+GrowableArray<const char*>* DCmdWithParser::argument_name_array() {
+  return _dcmdparser.argument_name_array();
+}
+
+GrowableArray<DCmdArgumentInfo*>* DCmdWithParser::argument_info_array() {
+  return _dcmdparser.argument_info_array();
+}
+
 Mutex* DCmdFactory::_dcmdFactory_lock = new Mutex(Mutex::leaf, "DCmdFactory", true);
 
 DCmdFactory* DCmdFactory::factory(const char* name, size_t len) {
diff --git a/hotspot/src/share/vm/services/diagnosticFramework.hpp b/hotspot/src/share/vm/services/diagnosticFramework.hpp
index 00c03c5733f..e3b78bb384b 100644
--- a/hotspot/src/share/vm/services/diagnosticFramework.hpp
+++ b/hotspot/src/share/vm/services/diagnosticFramework.hpp
@@ -241,8 +241,17 @@ public:
   static int num_arguments() { return 0; }
   outputStream* output() { return _output; }
   bool is_heap_allocated()  { return _is_heap_allocated; }
-  virtual void print_help(outputStream* out) { };
-  virtual void parse(CmdLine* line, char delim, TRAPS) { }
+  virtual void print_help(const char* name) {
+    output()->print_cr("Syntax: %s", name);
+  }
+  virtual void parse(CmdLine* line, char delim, TRAPS) {
+    DCmdArgIter iter(line->args_addr(), line->args_len(), delim);
+    bool has_arg = iter.next(CHECK);
+    if (has_arg) {
+      THROW_MSG(vmSymbols::java_lang_IllegalArgumentException(),
+                "Unknown argument in diagnostic command");
+    }
+  }
   virtual void execute(TRAPS) { }
   virtual void reset(TRAPS) { }
   virtual void cleanup() { }
@@ -262,6 +271,25 @@ public:
                                 char delim, TRAPS);
 };
 
+class DCmdWithParser : public DCmd {
+protected:
+  DCmdParser _dcmdparser;
+public:
+  DCmdWithParser (outputStream *output, bool heap=false) : DCmd(output, heap) { }
+  static const char* name() { return "No Name";}
+  static const char* description() { return "No Help";}
+  static const char* disabled_message() { return "Diagnostic command currently disabled"; }
+  static const char* impact() { return "Low: No impact"; }
+  static int num_arguments() { return 0; }
+  virtual void parse(CmdLine *line, char delim, TRAPS);
+  virtual void execute(TRAPS) { }
+  virtual void reset(TRAPS);
+  virtual void cleanup();
+  virtual void print_help(const char* name);
+  virtual GrowableArray<const char*>* argument_name_array();
+  virtual GrowableArray<DCmdArgumentInfo*>* argument_info_array();
+};
+
 class DCmdMark : public StackObj {
   DCmd* _ref;
 public:
diff --git a/hotspot/src/share/vm/services/management.cpp b/hotspot/src/share/vm/services/management.cpp
index 217836c4f91..680c5d389fc 100644
--- a/hotspot/src/share/vm/services/management.cpp
+++ b/hotspot/src/share/vm/services/management.cpp
@@ -118,8 +118,22 @@ void Management::init() {
 #endif // SERVICES_KERNEL
   _optional_support.isThreadAllocatedMemorySupported = 1;
 
+  // Registration of the diagnostic commands
+  // First boolean argument specifies if the command is enabled
+  // Second boolean argument specifies if the command is hidden
   DCmdFactory::register_DCmdFactory(new DCmdFactoryImpl<HelpDCmd>(true, false));
   DCmdFactory::register_DCmdFactory(new DCmdFactoryImpl<VersionDCmd>(true, false));
+  DCmdFactory::register_DCmdFactory(new DCmdFactoryImpl<CommandLineDCmd>(true, false));
+  DCmdFactory::register_DCmdFactory(new DCmdFactoryImpl<PrintSystemPropertiesDCmd>(true, false));
+  DCmdFactory::register_DCmdFactory(new DCmdFactoryImpl<PrintVMFlagsDCmd>(true, false));
+  DCmdFactory::register_DCmdFactory(new DCmdFactoryImpl<VMUptimeDCmd>(true, false));
+  DCmdFactory::register_DCmdFactory(new DCmdFactoryImpl<SystemGCDCmd>(true, false));
+  DCmdFactory::register_DCmdFactory(new DCmdFactoryImpl<RunFinalizationDCmd>(true, false));
+#ifndef SERVICES_KERNEL   // Heap dumping not supported
+  DCmdFactory::register_DCmdFactory(new DCmdFactoryImpl<HeapDumpDCmd>(true, false));
+#endif // SERVICES_KERNEL
+  DCmdFactory::register_DCmdFactory(new DCmdFactoryImpl<ClassHistogramDCmd>(true, false));
+  DCmdFactory::register_DCmdFactory(new DCmdFactoryImpl<ThreadDumpDCmd>(true, false));
 }
 
 void Management::initialize(TRAPS) {
diff --git a/hotspot/src/share/vm/services/management.hpp b/hotspot/src/share/vm/services/management.hpp
index 9d97988df9c..be1770aea06 100644
--- a/hotspot/src/share/vm/services/management.hpp
+++ b/hotspot/src/share/vm/services/management.hpp
@@ -76,6 +76,9 @@ public:
     _stamp.update();
   }
 
+  static jlong begin_vm_creation_time() {
+    return _begin_vm_creation_time->get_value();
+  }
   static jlong vm_init_done_time() {
     return _vm_init_done_time->get_value();
   }
diff --git a/hotspot/src/share/vm/services/threadService.cpp b/hotspot/src/share/vm/services/threadService.cpp
index 09f09f2908d..5c70fe99baa 100644
--- a/hotspot/src/share/vm/services/threadService.cpp
+++ b/hotspot/src/share/vm/services/threadService.cpp
@@ -377,7 +377,7 @@ DeadlockCycle* ThreadService::find_deadlocks_at_safepoint(bool concurrent_locks)
     }
 
   }
-
+  delete cycle;
   return deadlocks;
 }
 
diff --git a/hotspot/src/share/vm/shark/sharkIntrinsics.cpp b/hotspot/src/share/vm/shark/sharkIntrinsics.cpp
index a5d83cf1aa6..8efaa432e2e 100644
--- a/hotspot/src/share/vm/shark/sharkIntrinsics.cpp
+++ b/hotspot/src/share/vm/shark/sharkIntrinsics.cpp
@@ -213,17 +213,11 @@ void SharkIntrinsics::do_Object_getClass() {
     SharkType::oop_type(),
     "klass");
 
-  Value *klass_part = builder()->CreateAddressOfStructEntry(
-    klass,
-    in_ByteSize(klassOopDesc::klass_part_offset_in_bytes()),
-    SharkType::klass_type(),
-    "klass_part");
-
   state()->push(
     SharkValue::create_jobject(
       builder()->CreateValueOfStructEntry(
-        klass_part,
-        in_ByteSize(Klass::java_mirror_offset_in_bytes()),
+        klass,
+        Klass::java_mirror_offset(),
         SharkType::oop_type(),
         "java_mirror"),
       true));
diff --git a/hotspot/src/share/vm/shark/sharkTopLevelBlock.cpp b/hotspot/src/share/vm/shark/sharkTopLevelBlock.cpp
index 1e236ff6f0e..70e30f9fafa 100644
--- a/hotspot/src/share/vm/shark/sharkTopLevelBlock.cpp
+++ b/hotspot/src/share/vm/shark/sharkTopLevelBlock.cpp
@@ -745,15 +745,9 @@ void SharkTopLevelBlock::call_register_finalizer(Value *receiver) {
     SharkType::oop_type(),
     "klass");
 
-  Value *klass_part = builder()->CreateAddressOfStructEntry(
-    klass,
-    in_ByteSize(klassOopDesc::klass_part_offset_in_bytes()),
-    SharkType::klass_type(),
-    "klass_part");
-
   Value *access_flags = builder()->CreateValueOfStructEntry(
-    klass_part,
-    in_ByteSize(Klass::access_flags_offset_in_bytes()),
+    klass,
+    Klass::access_flags_offset(),
     SharkType::jint_type(),
     "access_flags");
 
diff --git a/hotspot/src/share/vm/utilities/workgroup.cpp b/hotspot/src/share/vm/utilities/workgroup.cpp
index 8b695528ec4..0ef1f833d04 100644
--- a/hotspot/src/share/vm/utilities/workgroup.cpp
+++ b/hotspot/src/share/vm/utilities/workgroup.cpp
@@ -53,14 +53,14 @@ AbstractWorkGang::AbstractWorkGang(const char* name,
 }
 
 WorkGang::WorkGang(const char* name,
-                   int         workers,
+                   uint        workers,
                    bool        are_GC_task_threads,
                    bool        are_ConcurrentGC_threads) :
   AbstractWorkGang(name, are_GC_task_threads, are_ConcurrentGC_threads) {
   _total_workers = workers;
 }
 
-GangWorker* WorkGang::allocate_worker(int which) {
+GangWorker* WorkGang::allocate_worker(uint which) {
   GangWorker* new_worker = new GangWorker(this, which);
   return new_worker;
 }
@@ -88,7 +88,7 @@ bool WorkGang::initialize_workers() {
   } else {
     worker_type = os::pgc_thread;
   }
-  for (int worker = 0; worker < total_workers(); worker += 1) {
+  for (uint worker = 0; worker < total_workers(); worker += 1) {
     GangWorker* new_worker = allocate_worker(worker);
     assert(new_worker != NULL, "Failed to allocate GangWorker");
     _gang_workers[worker] = new_worker;
@@ -108,14 +108,14 @@ AbstractWorkGang::~AbstractWorkGang() {
     tty->print_cr("Destructing work gang %s", name());
   }
   stop();   // stop all the workers
-  for (int worker = 0; worker < total_workers(); worker += 1) {
+  for (uint worker = 0; worker < total_workers(); worker += 1) {
     delete gang_worker(worker);
   }
   delete gang_workers();
   delete monitor();
 }
 
-GangWorker* AbstractWorkGang::gang_worker(int i) const {
+GangWorker* AbstractWorkGang::gang_worker(uint i) const {
   // Array index bounds checking.
   GangWorker* result = NULL;
   assert(gang_workers() != NULL, "No workers for indexing");
@@ -148,7 +148,7 @@ void WorkGang::run_task(AbstractGangTask* task, uint no_of_parallel_workers) {
   // Tell the workers to get to work.
   monitor()->notify_all();
   // Wait for them to be finished
-  while (finished_workers() < (int) no_of_parallel_workers) {
+  while (finished_workers() < no_of_parallel_workers) {
     if (TraceWorkGang) {
       tty->print_cr("Waiting in work gang %s: %d/%d finished sequence %d",
                     name(), finished_workers(), no_of_parallel_workers,
@@ -377,12 +377,12 @@ WorkGangBarrierSync::WorkGangBarrierSync()
     _n_workers(0), _n_completed(0), _should_reset(false) {
 }
 
-WorkGangBarrierSync::WorkGangBarrierSync(int n_workers, const char* name)
+WorkGangBarrierSync::WorkGangBarrierSync(uint n_workers, const char* name)
   : _monitor(Mutex::safepoint, name, true),
     _n_workers(n_workers), _n_completed(0), _should_reset(false) {
 }
 
-void WorkGangBarrierSync::set_n_workers(int n_workers) {
+void WorkGangBarrierSync::set_n_workers(uint n_workers) {
   _n_workers   = n_workers;
   _n_completed = 0;
   _should_reset = false;
@@ -419,9 +419,9 @@ void WorkGangBarrierSync::enter() {
 
 // SubTasksDone functions.
 
-SubTasksDone::SubTasksDone(int n) :
+SubTasksDone::SubTasksDone(uint n) :
   _n_tasks(n), _n_threads(1), _tasks(NULL) {
-  _tasks = NEW_C_HEAP_ARRAY(jint, n);
+  _tasks = NEW_C_HEAP_ARRAY(uint, n);
   guarantee(_tasks != NULL, "alloc failure");
   clear();
 }
@@ -430,14 +430,14 @@ bool SubTasksDone::valid() {
   return _tasks != NULL;
 }
 
-void SubTasksDone::set_n_threads(int t) {
+void SubTasksDone::set_n_threads(uint t) {
   assert(_claimed == 0 || _threads_completed == _n_threads,
          "should not be called while tasks are being processed!");
   _n_threads = (t == 0 ? 1 : t);
 }
 
 void SubTasksDone::clear() {
-  for (int i = 0; i < _n_tasks; i++) {
+  for (uint i = 0; i < _n_tasks; i++) {
     _tasks[i] = 0;
   }
   _threads_completed = 0;
@@ -446,9 +446,9 @@ void SubTasksDone::clear() {
 #endif
 }
 
-bool SubTasksDone::is_task_claimed(int t) {
+bool SubTasksDone::is_task_claimed(uint t) {
   assert(0 <= t && t < _n_tasks, "bad task id.");
-  jint old = _tasks[t];
+  uint old = _tasks[t];
   if (old == 0) {
     old = Atomic::cmpxchg(1, &_tasks[t], 0);
   }
@@ -457,7 +457,7 @@ bool SubTasksDone::is_task_claimed(int t) {
 #ifdef ASSERT
   if (!res) {
     assert(_claimed < _n_tasks, "Too many tasks claimed; missing clear?");
-    Atomic::inc(&_claimed);
+    Atomic::inc((volatile jint*) &_claimed);
   }
 #endif
   return res;
@@ -471,7 +471,7 @@ void SubTasksDone::all_tasks_completed() {
     observed = Atomic::cmpxchg(old+1, &_threads_completed, old);
   } while (observed != old);
   // If this was the last thread checking in, clear the tasks.
-  if (observed+1 == _n_threads) clear();
+  if (observed+1 == (jint)_n_threads) clear();
 }
 
 
@@ -490,12 +490,12 @@ bool SequentialSubTasksDone::valid() {
   return _n_threads > 0;
 }
 
-bool SequentialSubTasksDone::is_task_claimed(int& t) {
-  jint* n_claimed_ptr = &_n_claimed;
+bool SequentialSubTasksDone::is_task_claimed(uint& t) {
+  uint* n_claimed_ptr = &_n_claimed;
   t = *n_claimed_ptr;
   while (t < _n_tasks) {
     jint res = Atomic::cmpxchg(t+1, n_claimed_ptr, t);
-    if (res == t) {
+    if (res == (jint)t) {
       return false;
     }
     t = *n_claimed_ptr;
@@ -504,10 +504,10 @@ bool SequentialSubTasksDone::is_task_claimed(int& t) {
 }
 
 bool SequentialSubTasksDone::all_tasks_completed() {
-  jint* n_completed_ptr = &_n_completed;
-  jint  complete        = *n_completed_ptr;
+  uint* n_completed_ptr = &_n_completed;
+  uint  complete        = *n_completed_ptr;
   while (true) {
-    jint res = Atomic::cmpxchg(complete+1, n_completed_ptr, complete);
+    uint res = Atomic::cmpxchg(complete+1, n_completed_ptr, complete);
     if (res == complete) {
       break;
     }
diff --git a/hotspot/src/share/vm/utilities/workgroup.hpp b/hotspot/src/share/vm/utilities/workgroup.hpp
index 91161634772..7fc0ddf5674 100644
--- a/hotspot/src/share/vm/utilities/workgroup.hpp
+++ b/hotspot/src/share/vm/utilities/workgroup.hpp
@@ -68,7 +68,7 @@ class AbstractGangTask VALUE_OBJ_CLASS_SPEC {
 public:
   // The abstract work method.
   // The argument tells you which member of the gang you are.
-  virtual void work(int i) = 0;
+  virtual void work(uint worker_id) = 0;
 
   // This method configures the task for proper termination.
   // Some tasks do not have any requirements on termination
@@ -149,7 +149,7 @@ protected:
   // and notifies of changes in it.
   Monitor*  _monitor;
   // The count of the number of workers in the gang.
-  int _total_workers;
+  uint _total_workers;
   // Whether the workers should terminate.
   bool _terminate;
   // The array of worker threads for this gang.
@@ -160,18 +160,18 @@ protected:
   // A sequence number for the current task.
   int _sequence_number;
   // The number of started workers.
-  int _started_workers;
+  uint _started_workers;
   // The number of finished workers.
-  int _finished_workers;
+  uint _finished_workers;
 public:
   // Accessors for fields
   Monitor* monitor() const {
     return _monitor;
   }
-  int total_workers() const {
+  uint total_workers() const {
     return _total_workers;
   }
-  virtual int active_workers() const {
+  virtual uint active_workers() const {
     return _total_workers;
   }
   bool terminate() const {
@@ -186,10 +186,10 @@ public:
   int sequence_number() const {
     return _sequence_number;
   }
-  int started_workers() const {
+  uint started_workers() const {
     return _started_workers;
   }
-  int finished_workers() const {
+  uint finished_workers() const {
     return _finished_workers;
   }
   bool are_GC_task_threads() const {
@@ -203,7 +203,7 @@ public:
     return (task() == NULL);
   }
   // Return the Ith gang worker.
-  GangWorker* gang_worker(int i) const;
+  GangWorker* gang_worker(uint i) const;
 
   void threads_do(ThreadClosure* tc) const;
 
@@ -255,13 +255,13 @@ public:
 class WorkGang: public AbstractWorkGang {
 public:
   // Constructor
-  WorkGang(const char* name, int workers,
+  WorkGang(const char* name, uint workers,
            bool are_GC_task_threads, bool are_ConcurrentGC_threads);
   // Run a task, returns when the task is done (or terminated).
   virtual void run_task(AbstractGangTask* task);
   void run_task(AbstractGangTask* task, uint no_of_parallel_workers);
   // Allocate a worker and return a pointer to it.
-  virtual GangWorker* allocate_worker(int which);
+  virtual GangWorker* allocate_worker(uint which);
   // Initialize workers in the gang.  Return true if initialization
   // succeeded. The type of the worker can be overridden in a derived
   // class with the appropriate implementation of allocate_worker().
@@ -323,25 +323,25 @@ class FlexibleWorkGang: public WorkGang {
   // determine completion.
 
  protected:
-  int _active_workers;
+  uint _active_workers;
  public:
   // Constructor and destructor.
   // Initialize active_workers to a minimum value.  Setting it to
   // the parameter "workers" will initialize it to a maximum
   // value which is not desirable.
-  FlexibleWorkGang(const char* name, int workers,
+  FlexibleWorkGang(const char* name, uint workers,
                    bool are_GC_task_threads,
                    bool  are_ConcurrentGC_threads) :
     WorkGang(name, workers, are_GC_task_threads, are_ConcurrentGC_threads),
-    _active_workers(UseDynamicNumberOfGCThreads ? 1 : ParallelGCThreads) {};
+    _active_workers(UseDynamicNumberOfGCThreads ? 1U : ParallelGCThreads) {}
   // Accessors for fields
-  virtual int active_workers() const { return _active_workers; }
-  void set_active_workers(int v) {
+  virtual uint active_workers() const { return _active_workers; }
+  void set_active_workers(uint v) {
     assert(v <= _total_workers,
            "Trying to set more workers active than there are");
     _active_workers = MIN2(v, _total_workers);
     assert(v != 0, "Trying to set active workers to 0");
-    _active_workers = MAX2(1, _active_workers);
+    _active_workers = MAX2(1U, _active_workers);
     assert(UseDynamicNumberOfGCThreads || _active_workers == _total_workers,
            "Unless dynamic should use total workers");
   }
@@ -370,13 +370,13 @@ class FlexibleWorkGang: public WorkGang {
 class WorkGangBarrierSync : public StackObj {
 protected:
   Monitor _monitor;
-  int     _n_workers;
-  int     _n_completed;
+  uint     _n_workers;
+  uint     _n_completed;
   bool    _should_reset;
 
   Monitor* monitor()        { return &_monitor; }
-  int      n_workers()      { return _n_workers; }
-  int      n_completed()    { return _n_completed; }
+  uint     n_workers()      { return _n_workers; }
+  uint     n_completed()    { return _n_completed; }
   bool     should_reset()   { return _should_reset; }
 
   void     zero_completed() { _n_completed = 0; }
@@ -386,11 +386,11 @@ protected:
 
 public:
   WorkGangBarrierSync();
-  WorkGangBarrierSync(int n_workers, const char* name);
+  WorkGangBarrierSync(uint n_workers, const char* name);
 
   // Set the number of workers that will use the barrier.
   // Must be called before any of the workers start running.
-  void set_n_workers(int n_workers);
+  void set_n_workers(uint n_workers);
 
   // Enter the barrier. A worker that enters the barrier will
   // not be allowed to leave until all other threads have
@@ -402,18 +402,18 @@ public:
 // subtasks will be identified by integer indices, usually elements of an
 // enumeration type.
 
-class SubTasksDone: public CHeapObj {
-  jint* _tasks;
-  int _n_tasks;
+class SubTasksDone : public CHeapObj {
+  uint* _tasks;
+  uint _n_tasks;
   // _n_threads is used to determine when a sub task is done.
   // It does not control how many threads will execute the subtask
   // but must be initialized to the number that do execute the task
   // in order to correctly decide when the subtask is done (all the
   // threads working on the task have finished).
-  int _n_threads;
-  jint _threads_completed;
+  uint _n_threads;
+  uint _threads_completed;
 #ifdef ASSERT
-  volatile jint _claimed;
+  volatile uint _claimed;
 #endif
 
   // Set all tasks to unclaimed.
@@ -423,19 +423,19 @@ public:
   // Initializes "this" to a state in which there are "n" tasks to be
   // processed, none of the which are originally claimed.  The number of
   // threads doing the tasks is initialized 1.
-  SubTasksDone(int n);
+  SubTasksDone(uint n);
 
   // True iff the object is in a valid state.
   bool valid();
 
   // Get/set the number of parallel threads doing the tasks to "t".  Can only
   // be called before tasks start or after they are complete.
-  int n_threads() { return _n_threads; }
-  void set_n_threads(int t);
+  uint n_threads() { return _n_threads; }
+  void set_n_threads(uint t);
 
   // Returns "false" if the task "t" is unclaimed, and ensures that task is
   // claimed.  The task "t" is required to be within the range of "this".
-  bool is_task_claimed(int t);
+  bool is_task_claimed(uint t);
 
   // The calling thread asserts that it has attempted to claim all the
   // tasks that it will try to claim.  Every thread in the parallel task
@@ -456,12 +456,12 @@ public:
 
 class SequentialSubTasksDone : public StackObj {
 protected:
-  jint _n_tasks;     // Total number of tasks available.
-  jint _n_claimed;   // Number of tasks claimed.
+  uint _n_tasks;     // Total number of tasks available.
+  uint _n_claimed;   // Number of tasks claimed.
   // _n_threads is used to determine when a sub task is done.
   // See comments on SubTasksDone::_n_threads
-  jint _n_threads;   // Total number of parallel threads.
-  jint _n_completed; // Number of completed threads.
+  uint _n_threads;   // Total number of parallel threads.
+  uint _n_completed; // Number of completed threads.
 
   void clear();
 
@@ -475,26 +475,26 @@ public:
   bool valid();
 
   // number of tasks
-  jint n_tasks() const { return _n_tasks; }
+  uint n_tasks() const { return _n_tasks; }
 
   // Get/set the number of parallel threads doing the tasks to t.
   // Should be called before the task starts but it is safe
   // to call this once a task is running provided that all
   // threads agree on the number of threads.
-  int n_threads() { return _n_threads; }
-  void set_n_threads(int t) { _n_threads = t; }
+  uint n_threads() { return _n_threads; }
+  void set_n_threads(uint t) { _n_threads = t; }
 
   // Set the number of tasks to be claimed to t. As above,
   // should be called before the tasks start but it is safe
   // to call this once a task is running provided all threads
   // agree on the number of tasks.
-  void set_n_tasks(int t) { _n_tasks = t; }
+  void set_n_tasks(uint t) { _n_tasks = t; }
 
   // Returns false if the next task in the sequence is unclaimed,
   // and ensures that it is claimed. Will set t to be the index
   // of the claimed task in the sequence. Will return true if
   // the task cannot be claimed and there are none left to claim.
-  bool is_task_claimed(int& t);
+  bool is_task_claimed(uint& t);
 
   // The calling thread asserts that it has attempted to claim
   // all the tasks it possibly can in the sequence. Every thread
diff --git a/hotspot/src/share/vm/utilities/yieldingWorkgroup.cpp b/hotspot/src/share/vm/utilities/yieldingWorkgroup.cpp
index d8daf7a5d84..7e594232af4 100644
--- a/hotspot/src/share/vm/utilities/yieldingWorkgroup.cpp
+++ b/hotspot/src/share/vm/utilities/yieldingWorkgroup.cpp
@@ -33,11 +33,11 @@ class GangWorker;
 class WorkData;
 
 YieldingFlexibleWorkGang::YieldingFlexibleWorkGang(
-  const char* name, int workers, bool are_GC_task_threads) :
+  const char* name, uint workers, bool are_GC_task_threads) :
   FlexibleWorkGang(name, workers, are_GC_task_threads, false),
     _yielded_workers(0) {}
 
-GangWorker* YieldingFlexibleWorkGang::allocate_worker(int which) {
+GangWorker* YieldingFlexibleWorkGang::allocate_worker(uint which) {
   YieldingFlexibleGangWorker* new_member =
       new YieldingFlexibleGangWorker(this, which);
   return (YieldingFlexibleGangWorker*) new_member;
@@ -120,7 +120,7 @@ void YieldingFlexibleWorkGang::start_task(YieldingFlexibleGangTask* new_task) {
   new_task->set_gang(this);  // Establish 2-way binding to support yielding
   _sequence_number++;
 
-  int requested_size = new_task->requested_size();
+  uint requested_size = new_task->requested_size();
   assert(requested_size >= 0, "Should be non-negative");
   if (requested_size != 0) {
     _active_workers = MIN2(requested_size, total_workers());
diff --git a/hotspot/src/share/vm/utilities/yieldingWorkgroup.hpp b/hotspot/src/share/vm/utilities/yieldingWorkgroup.hpp
index 6d8c6bf66f4..a39f8227f57 100644
--- a/hotspot/src/share/vm/utilities/yieldingWorkgroup.hpp
+++ b/hotspot/src/share/vm/utilities/yieldingWorkgroup.hpp
@@ -71,7 +71,7 @@ public:
 
   // The abstract work method.
   // The argument tells you which member of the gang you are.
-  virtual void work(int i) = 0;
+  virtual void work(uint worker_id) = 0;
 
   int requested_size() const { return _requested_size; }
   int actual_size()    const { return _actual_size; }
@@ -128,7 +128,7 @@ protected:
 public:
   // The abstract work method.
   // The argument tells you which member of the gang you are.
-  virtual void work(int i) = 0;
+  virtual void work(uint worker_id) = 0;
 
   // Subclasses should call the parent's yield() method
   // after having done any work specific to the subclass.
@@ -159,7 +159,7 @@ class YieldingFlexibleWorkGang: public FlexibleWorkGang {
   // Here's the public interface to this class.
 public:
   // Constructor and destructor.
-  YieldingFlexibleWorkGang(const char* name, int workers,
+  YieldingFlexibleWorkGang(const char* name, uint workers,
                            bool are_GC_task_threads);
 
   YieldingFlexibleGangTask* yielding_task() const {
@@ -168,7 +168,7 @@ public:
     return (YieldingFlexibleGangTask*)task();
   }
   // Allocate a worker and return a pointer to it.
-  GangWorker* allocate_worker(int which);
+  GangWorker* allocate_worker(uint which);
 
   // Run a task; returns when the task is done, or the workers yield,
   // or the task is aborted, or the work gang is terminated via stop().
@@ -199,12 +199,12 @@ public:
   void abort();
 
 private:
-  int _yielded_workers;
+  uint _yielded_workers;
   void wait_for_gang();
 
 public:
   // Accessors for fields
-  int yielded_workers() const {
+  uint yielded_workers() const {
     return _yielded_workers;
   }
 
diff --git a/hotspot/test/compiler/7116216/LargeFrame.java b/hotspot/test/compiler/7116216/LargeFrame.java
new file mode 100644
index 00000000000..eb39b06e338
--- /dev/null
+++ b/hotspot/test/compiler/7116216/LargeFrame.java
@@ -0,0 +1,1329 @@
+/*
+ * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+public class LargeFrame {
+
+    public static void method_with_many_locals(Object r1, int r2, int r3, int r4, int r5, int r6, int r7, Object r8) {
+        int i00 = 0, i01 = 0, i02 = 0, i03 = 0, i04 = 0, i05 = 0, i06 = 0, i07 = 0, i08 = 0, i09 = 0;
+        int i10 = 0, i11 = 0, i12 = 0, i13 = 0, i14 = 0, i15 = 0, i16 = 0, i17 = 0, i18 = 0, i19 = 0;
+        int i20 = 0, i21 = 0, i22 = 0, i23 = 0, i24 = 0, i25 = 0, i26 = 0, i27 = 0, i28 = 0, i29 = 0;
+        int i30 = 0, i31 = 0, i32 = 0, i33 = 0, i34 = 0, i35 = 0, i36 = 0, i37 = 0, i38 = 0, i39 = 0;
+        int i40 = 0, i41 = 0, i42 = 0, i43 = 0, i44 = 0, i45 = 0, i46 = 0, i47 = 0, i48 = 0, i49 = 0;
+        int i50 = 0, i51 = 0, i52 = 0, i53 = 0, i54 = 0, i55 = 0, i56 = 0, i57 = 0, i58 = 0, i59 = 0;
+        int i60 = 0, i61 = 0, i62 = 0, i63 = 0, i64 = 0, i65 = 0, i66 = 0, i67 = 0, i68 = 0, i69 = 0;
+        int i70 = 0, i71 = 0, i72 = 0, i73 = 0, i74 = 0, i75 = 0, i76 = 0, i77 = 0, i78 = 0, i79 = 0;
+        int i80 = 0, i81 = 0, i82 = 0, i83 = 0, i84 = 0, i85 = 0, i86 = 0, i87 = 0, i88 = 0, i89 = 0;
+        int i90 = 0, i91 = 0, i92 = 0, i93 = 0, i94 = 0, i95 = 0, i96 = 0, i97 = 0, i98 = 0, i99 = 0;
+        int i100 = 0, i101 = 0, i102 = 0, i103 = 0, i104 = 0, i105 = 0, i106 = 0, i107 = 0, i108 = 0, i109 = 0;
+        int i110 = 0, i111 = 0, i112 = 0, i113 = 0, i114 = 0, i115 = 0, i116 = 0, i117 = 0, i118 = 0, i119 = 0;
+        int i120 = 0, i121 = 0, i122 = 0, i123 = 0, i124 = 0, i125 = 0, i126 = 0, i127 = 0, i128 = 0, i129 = 0;
+        int i130 = 0, i131 = 0, i132 = 0, i133 = 0, i134 = 0, i135 = 0, i136 = 0, i137 = 0, i138 = 0, i139 = 0;
+        int i140 = 0, i141 = 0, i142 = 0, i143 = 0, i144 = 0, i145 = 0, i146 = 0, i147 = 0, i148 = 0, i149 = 0;
+        int i150 = 0, i151 = 0, i152 = 0, i153 = 0, i154 = 0, i155 = 0, i156 = 0, i157 = 0, i158 = 0, i159 = 0;
+        int i160 = 0, i161 = 0, i162 = 0, i163 = 0, i164 = 0, i165 = 0, i166 = 0, i167 = 0, i168 = 0, i169 = 0;
+        int i170 = 0, i171 = 0, i172 = 0, i173 = 0, i174 = 0, i175 = 0, i176 = 0, i177 = 0, i178 = 0, i179 = 0;
+        int i180 = 0, i181 = 0, i182 = 0, i183 = 0, i184 = 0, i185 = 0, i186 = 0, i187 = 0, i188 = 0, i189 = 0;
+        int i190 = 0, i191 = 0, i192 = 0, i193 = 0, i194 = 0, i195 = 0, i196 = 0, i197 = 0, i198 = 0, i199 = 0;
+        int i200 = 0, i201 = 0, i202 = 0, i203 = 0, i204 = 0, i205 = 0, i206 = 0, i207 = 0, i208 = 0, i209 = 0;
+        int i210 = 0, i211 = 0, i212 = 0, i213 = 0, i214 = 0, i215 = 0, i216 = 0, i217 = 0, i218 = 0, i219 = 0;
+        int i220 = 0, i221 = 0, i222 = 0, i223 = 0, i224 = 0, i225 = 0, i226 = 0, i227 = 0, i228 = 0, i229 = 0;
+        int i230 = 0, i231 = 0, i232 = 0, i233 = 0, i234 = 0, i235 = 0, i236 = 0, i237 = 0, i238 = 0, i239 = 0;
+        int i240 = 0, i241 = 0, i242 = 0, i243 = 0, i244 = 0, i245 = 0, i246 = 0, i247 = 0, i248 = 0, i249 = 0;
+        int i250 = 0, i251 = 0, i252 = 0, i253 = 0, i254 = 0, i255 = 0, i256 = 0, i257 = 0, i258 = 0, i259 = 0;
+        int i260 = 0, i261 = 0, i262 = 0, i263 = 0, i264 = 0, i265 = 0, i266 = 0, i267 = 0, i268 = 0, i269 = 0;
+        int i270 = 0, i271 = 0, i272 = 0, i273 = 0, i274 = 0, i275 = 0, i276 = 0, i277 = 0, i278 = 0, i279 = 0;
+        int i280 = 0, i281 = 0, i282 = 0, i283 = 0, i284 = 0, i285 = 0, i286 = 0, i287 = 0, i288 = 0, i289 = 0;
+        int i290 = 0, i291 = 0, i292 = 0, i293 = 0, i294 = 0, i295 = 0, i296 = 0, i297 = 0, i298 = 0, i299 = 0;
+        int i300 = 0, i301 = 0, i302 = 0, i303 = 0, i304 = 0, i305 = 0, i306 = 0, i307 = 0, i308 = 0, i309 = 0;
+        int i310 = 0, i311 = 0, i312 = 0, i313 = 0, i314 = 0, i315 = 0, i316 = 0, i317 = 0, i318 = 0, i319 = 0;
+        int i320 = 0, i321 = 0, i322 = 0, i323 = 0, i324 = 0, i325 = 0, i326 = 0, i327 = 0, i328 = 0, i329 = 0;
+        int i330 = 0, i331 = 0, i332 = 0, i333 = 0, i334 = 0, i335 = 0, i336 = 0, i337 = 0, i338 = 0, i339 = 0;
+        int i340 = 0, i341 = 0, i342 = 0, i343 = 0, i344 = 0, i345 = 0, i346 = 0, i347 = 0, i348 = 0, i349 = 0;
+        int i350 = 0, i351 = 0, i352 = 0, i353 = 0, i354 = 0, i355 = 0, i356 = 0, i357 = 0, i358 = 0, i359 = 0;
+        int i360 = 0, i361 = 0, i362 = 0, i363 = 0, i364 = 0, i365 = 0, i366 = 0, i367 = 0, i368 = 0, i369 = 0;
+        int i370 = 0, i371 = 0, i372 = 0, i373 = 0, i374 = 0, i375 = 0, i376 = 0, i377 = 0, i378 = 0, i379 = 0;
+        int i380 = 0, i381 = 0, i382 = 0, i383 = 0, i384 = 0, i385 = 0, i386 = 0, i387 = 0, i388 = 0, i389 = 0;
+        int i390 = 0, i391 = 0, i392 = 0, i393 = 0, i394 = 0, i395 = 0, i396 = 0, i397 = 0, i398 = 0, i399 = 0;
+        int i400 = 0, i401 = 0, i402 = 0, i403 = 0, i404 = 0, i405 = 0, i406 = 0, i407 = 0, i408 = 0, i409 = 0;
+        int i410 = 0, i411 = 0, i412 = 0, i413 = 0, i414 = 0, i415 = 0, i416 = 0, i417 = 0, i418 = 0, i419 = 0;
+        int i420 = 0, i421 = 0, i422 = 0, i423 = 0, i424 = 0, i425 = 0, i426 = 0, i427 = 0, i428 = 0, i429 = 0;
+        int i430 = 0, i431 = 0, i432 = 0, i433 = 0, i434 = 0, i435 = 0, i436 = 0, i437 = 0, i438 = 0, i439 = 0;
+        int i440 = 0, i441 = 0, i442 = 0, i443 = 0, i444 = 0, i445 = 0, i446 = 0, i447 = 0, i448 = 0, i449 = 0;
+        int i450 = 0, i451 = 0, i452 = 0, i453 = 0, i454 = 0, i455 = 0, i456 = 0, i457 = 0, i458 = 0, i459 = 0;
+        int i460 = 0, i461 = 0, i462 = 0, i463 = 0, i464 = 0, i465 = 0, i466 = 0, i467 = 0, i468 = 0, i469 = 0;
+        int i470 = 0, i471 = 0, i472 = 0, i473 = 0, i474 = 0, i475 = 0, i476 = 0, i477 = 0, i478 = 0, i479 = 0;
+        int i480 = 0, i481 = 0, i482 = 0, i483 = 0, i484 = 0, i485 = 0, i486 = 0, i487 = 0, i488 = 0, i489 = 0;
+        int i490 = 0, i491 = 0, i492 = 0, i493 = 0, i494 = 0, i495 = 0, i496 = 0, i497 = 0, i498 = 0, i499 = 0;
+        int i500 = 0, i501 = 0, i502 = 0, i503 = 0, i504 = 0, i505 = 0, i506 = 0, i507 = 0, i508 = 0, i509 = 0;
+        int i510 = 0, i511 = 0, i512 = 0, i513 = 0, i514 = 0, i515 = 0, i516 = 0, i517 = 0, i518 = 0, i519 = 0;
+        int i520 = 0, i521 = 0, i522 = 0, i523 = 0, i524 = 0, i525 = 0, i526 = 0, i527 = 0, i528 = 0, i529 = 0;
+        int i530 = 0, i531 = 0, i532 = 0, i533 = 0, i534 = 0, i535 = 0, i536 = 0, i537 = 0, i538 = 0, i539 = 0;
+        int i540 = 0, i541 = 0, i542 = 0, i543 = 0, i544 = 0, i545 = 0, i546 = 0, i547 = 0, i548 = 0, i549 = 0;
+        int i550 = 0, i551 = 0, i552 = 0, i553 = 0, i554 = 0, i555 = 0, i556 = 0, i557 = 0, i558 = 0, i559 = 0;
+        int i560 = 0, i561 = 0, i562 = 0, i563 = 0, i564 = 0, i565 = 0, i566 = 0, i567 = 0, i568 = 0, i569 = 0;
+        int i570 = 0, i571 = 0, i572 = 0, i573 = 0, i574 = 0, i575 = 0, i576 = 0, i577 = 0, i578 = 0, i579 = 0;
+        int i580 = 0, i581 = 0, i582 = 0, i583 = 0, i584 = 0, i585 = 0, i586 = 0, i587 = 0, i588 = 0, i589 = 0;
+        int i590 = 0, i591 = 0, i592 = 0, i593 = 0, i594 = 0, i595 = 0, i596 = 0, i597 = 0, i598 = 0, i599 = 0;
+        int i600 = 0, i601 = 0, i602 = 0, i603 = 0, i604 = 0, i605 = 0, i606 = 0, i607 = 0, i608 = 0, i609 = 0;
+        int i610 = 0, i611 = 0, i612 = 0, i613 = 0, i614 = 0, i615 = 0, i616 = 0, i617 = 0, i618 = 0, i619 = 0;
+        int i620 = 0, i621 = 0, i622 = 0, i623 = 0, i624 = 0, i625 = 0, i626 = 0, i627 = 0, i628 = 0, i629 = 0;
+        int i630 = 0, i631 = 0, i632 = 0, i633 = 0, i634 = 0, i635 = 0, i636 = 0, i637 = 0, i638 = 0, i639 = 0;
+        int i640 = 0, i641 = 0, i642 = 0, i643 = 0, i644 = 0, i645 = 0, i646 = 0, i647 = 0, i648 = 0, i649 = 0;
+        int i650 = 0, i651 = 0, i652 = 0, i653 = 0, i654 = 0, i655 = 0, i656 = 0, i657 = 0, i658 = 0, i659 = 0;
+        int i660 = 0, i661 = 0, i662 = 0, i663 = 0, i664 = 0, i665 = 0, i666 = 0, i667 = 0, i668 = 0, i669 = 0;
+        int i670 = 0, i671 = 0, i672 = 0, i673 = 0, i674 = 0, i675 = 0, i676 = 0, i677 = 0, i678 = 0, i679 = 0;
+        int i680 = 0, i681 = 0, i682 = 0, i683 = 0, i684 = 0, i685 = 0, i686 = 0, i687 = 0, i688 = 0, i689 = 0;
+        int i690 = 0, i691 = 0, i692 = 0, i693 = 0, i694 = 0, i695 = 0, i696 = 0, i697 = 0, i698 = 0, i699 = 0;
+        int i700 = 0, i701 = 0, i702 = 0, i703 = 0, i704 = 0, i705 = 0, i706 = 0, i707 = 0, i708 = 0, i709 = 0;
+        int i710 = 0, i711 = 0, i712 = 0, i713 = 0, i714 = 0, i715 = 0, i716 = 0, i717 = 0, i718 = 0, i719 = 0;
+        int i720 = 0, i721 = 0, i722 = 0, i723 = 0, i724 = 0, i725 = 0, i726 = 0, i727 = 0, i728 = 0, i729 = 0;
+        int i730 = 0, i731 = 0, i732 = 0, i733 = 0, i734 = 0, i735 = 0, i736 = 0, i737 = 0, i738 = 0, i739 = 0;
+        int i740 = 0, i741 = 0, i742 = 0, i743 = 0, i744 = 0, i745 = 0, i746 = 0, i747 = 0, i748 = 0, i749 = 0;
+        int i750 = 0, i751 = 0, i752 = 0, i753 = 0, i754 = 0, i755 = 0, i756 = 0, i757 = 0, i758 = 0, i759 = 0;
+        int i760 = 0, i761 = 0, i762 = 0, i763 = 0, i764 = 0, i765 = 0, i766 = 0, i767 = 0, i768 = 0, i769 = 0;
+        int i770 = 0, i771 = 0, i772 = 0, i773 = 0, i774 = 0, i775 = 0, i776 = 0, i777 = 0, i778 = 0, i779 = 0;
+        int i780 = 0, i781 = 0, i782 = 0, i783 = 0, i784 = 0, i785 = 0, i786 = 0, i787 = 0, i788 = 0, i789 = 0;
+        int i790 = 0, i791 = 0, i792 = 0, i793 = 0, i794 = 0, i795 = 0, i796 = 0, i797 = 0, i798 = 0, i799 = 0;
+        int i800 = 0, i801 = 0, i802 = 0, i803 = 0, i804 = 0, i805 = 0, i806 = 0, i807 = 0, i808 = 0, i809 = 0;
+        int i810 = 0, i811 = 0, i812 = 0, i813 = 0, i814 = 0, i815 = 0, i816 = 0, i817 = 0, i818 = 0, i819 = 0;
+        int i820 = 0, i821 = 0, i822 = 0, i823 = 0, i824 = 0, i825 = 0, i826 = 0, i827 = 0, i828 = 0, i829 = 0;
+        int i830 = 0, i831 = 0, i832 = 0, i833 = 0, i834 = 0, i835 = 0, i836 = 0, i837 = 0, i838 = 0, i839 = 0;
+        int i840 = 0, i841 = 0, i842 = 0, i843 = 0, i844 = 0, i845 = 0, i846 = 0, i847 = 0, i848 = 0, i849 = 0;
+        int i850 = 0, i851 = 0, i852 = 0, i853 = 0, i854 = 0, i855 = 0, i856 = 0, i857 = 0, i858 = 0, i859 = 0;
+        int i860 = 0, i861 = 0, i862 = 0, i863 = 0, i864 = 0, i865 = 0, i866 = 0, i867 = 0, i868 = 0, i869 = 0;
+        int i870 = 0, i871 = 0, i872 = 0, i873 = 0, i874 = 0, i875 = 0, i876 = 0, i877 = 0, i878 = 0, i879 = 0;
+        int i880 = 0, i881 = 0, i882 = 0, i883 = 0, i884 = 0, i885 = 0, i886 = 0, i887 = 0, i888 = 0, i889 = 0;
+        int i890 = 0, i891 = 0, i892 = 0, i893 = 0, i894 = 0, i895 = 0, i896 = 0, i897 = 0, i898 = 0, i899 = 0;
+        int i900 = 0, i901 = 0, i902 = 0, i903 = 0, i904 = 0, i905 = 0, i906 = 0, i907 = 0, i908 = 0, i909 = 0;
+        int i910 = 0, i911 = 0, i912 = 0, i913 = 0, i914 = 0, i915 = 0, i916 = 0, i917 = 0, i918 = 0, i919 = 0;
+        int i920 = 0, i921 = 0, i922 = 0, i923 = 0, i924 = 0, i925 = 0, i926 = 0, i927 = 0, i928 = 0, i929 = 0;
+        int i930 = 0, i931 = 0, i932 = 0, i933 = 0, i934 = 0, i935 = 0, i936 = 0, i937 = 0, i938 = 0, i939 = 0;
+        int i940 = 0, i941 = 0, i942 = 0, i943 = 0, i944 = 0, i945 = 0, i946 = 0, i947 = 0, i948 = 0, i949 = 0;
+        int i950 = 0, i951 = 0, i952 = 0, i953 = 0, i954 = 0, i955 = 0, i956 = 0, i957 = 0, i958 = 0, i959 = 0;
+        int i960 = 0, i961 = 0, i962 = 0, i963 = 0, i964 = 0, i965 = 0, i966 = 0, i967 = 0, i968 = 0, i969 = 0;
+        int i970 = 0, i971 = 0, i972 = 0, i973 = 0, i974 = 0, i975 = 0, i976 = 0, i977 = 0, i978 = 0, i979 = 0;
+        int i980 = 0, i981 = 0, i982 = 0, i983 = 0, i984 = 0, i985 = 0, i986 = 0, i987 = 0, i988 = 0, i989 = 0;
+        int i990 = 0, i991 = 0, i992 = 0, i993 = 0, i994 = 0, i995 = 0, i996 = 0, i997 = 0, i998 = 0, i999 = 0;
+        int i1000 = 0, i1001 = 0, i1002 = 0, i1003 = 0, i1004 = 0, i1005 = 0, i1006 = 0, i1007 = 0, i1008 = 0, i1009 = 0;
+        int i1010 = 0, i1011 = 0, i1012 = 0, i1013 = 0, i1014 = 0, i1015 = 0, i1016 = 0, i1017 = 0, i1018 = 0, i1019 = 0;
+        int i1020 = 0, i1021 = 0, i1022 = 0, i1023 = 0, i1024 = 0, i1025 = 0, i1026 = 0, i1027 = 0, i1028 = 0, i1029 = 0;
+        int i1030 = 0, i1031 = 0, i1032 = 0, i1033 = 0, i1034 = 0, i1035 = 0, i1036 = 0, i1037 = 0, i1038 = 0, i1039 = 0;
+        int i1040 = 0, i1041 = 0, i1042 = 0, i1043 = 0, i1044 = 0, i1045 = 0, i1046 = 0, i1047 = 0, i1048 = 0, i1049 = 0;
+        int i1050 = 0, i1051 = 0, i1052 = 0, i1053 = 0, i1054 = 0, i1055 = 0, i1056 = 0, i1057 = 0, i1058 = 0, i1059 = 0;
+        int i1060 = 0, i1061 = 0, i1062 = 0, i1063 = 0, i1064 = 0, i1065 = 0, i1066 = 0, i1067 = 0, i1068 = 0, i1069 = 0;
+        int i1070 = 0, i1071 = 0, i1072 = 0, i1073 = 0, i1074 = 0, i1075 = 0, i1076 = 0, i1077 = 0, i1078 = 0, i1079 = 0;
+        int i1080 = 0, i1081 = 0, i1082 = 0, i1083 = 0, i1084 = 0, i1085 = 0, i1086 = 0, i1087 = 0, i1088 = 0, i1089 = 0;
+        int i1090 = 0, i1091 = 0, i1092 = 0, i1093 = 0, i1094 = 0, i1095 = 0, i1096 = 0, i1097 = 0, i1098 = 0, i1099 = 0;
+        int i1100 = 0, i1101 = 0, i1102 = 0, i1103 = 0, i1104 = 0, i1105 = 0, i1106 = 0, i1107 = 0, i1108 = 0, i1109 = 0;
+        int i1110 = 0, i1111 = 0, i1112 = 0, i1113 = 0, i1114 = 0, i1115 = 0, i1116 = 0, i1117 = 0, i1118 = 0, i1119 = 0;
+        int i1120 = 0, i1121 = 0, i1122 = 0, i1123 = 0, i1124 = 0, i1125 = 0, i1126 = 0, i1127 = 0, i1128 = 0, i1129 = 0;
+        int i1130 = 0, i1131 = 0, i1132 = 0, i1133 = 0, i1134 = 0, i1135 = 0, i1136 = 0, i1137 = 0, i1138 = 0, i1139 = 0;
+        int i1140 = 0, i1141 = 0, i1142 = 0, i1143 = 0, i1144 = 0, i1145 = 0, i1146 = 0, i1147 = 0, i1148 = 0, i1149 = 0;
+        int i1150 = 0, i1151 = 0, i1152 = 0, i1153 = 0, i1154 = 0, i1155 = 0, i1156 = 0, i1157 = 0, i1158 = 0, i1159 = 0;
+        int i1160 = 0, i1161 = 0, i1162 = 0, i1163 = 0, i1164 = 0, i1165 = 0, i1166 = 0, i1167 = 0, i1168 = 0, i1169 = 0;
+        int i1170 = 0, i1171 = 0, i1172 = 0, i1173 = 0, i1174 = 0, i1175 = 0, i1176 = 0, i1177 = 0, i1178 = 0, i1179 = 0;
+        int i1180 = 0, i1181 = 0, i1182 = 0, i1183 = 0, i1184 = 0, i1185 = 0, i1186 = 0, i1187 = 0, i1188 = 0, i1189 = 0;
+        int i1190 = 0, i1191 = 0, i1192 = 0, i1193 = 0, i1194 = 0, i1195 = 0, i1196 = 0, i1197 = 0, i1198 = 0, i1199 = 0;
+        int i1200 = 0, i1201 = 0, i1202 = 0, i1203 = 0, i1204 = 0, i1205 = 0, i1206 = 0, i1207 = 0, i1208 = 0, i1209 = 0;
+        int i1210 = 0, i1211 = 0, i1212 = 0, i1213 = 0, i1214 = 0, i1215 = 0, i1216 = 0, i1217 = 0, i1218 = 0, i1219 = 0;
+        int i1220 = 0, i1221 = 0, i1222 = 0, i1223 = 0, i1224 = 0, i1225 = 0, i1226 = 0, i1227 = 0, i1228 = 0, i1229 = 0;
+        int i1230 = 0, i1231 = 0, i1232 = 0, i1233 = 0, i1234 = 0, i1235 = 0, i1236 = 0, i1237 = 0, i1238 = 0, i1239 = 0;
+        int i1240 = 0, i1241 = 0, i1242 = 0, i1243 = 0, i1244 = 0, i1245 = 0, i1246 = 0, i1247 = 0, i1248 = 0, i1249 = 0;
+        int i1250 = 0, i1251 = 0, i1252 = 0, i1253 = 0, i1254 = 0, i1255 = 0, i1256 = 0, i1257 = 0, i1258 = 0, i1259 = 0;
+        int i1260 = 0, i1261 = 0, i1262 = 0, i1263 = 0, i1264 = 0, i1265 = 0, i1266 = 0, i1267 = 0, i1268 = 0, i1269 = 0;
+        int i1270 = 0, i1271 = 0, i1272 = 0, i1273 = 0, i1274 = 0, i1275 = 0, i1276 = 0, i1277 = 0, i1278 = 0, i1279 = 0;
+        int i1280 = 0, i1281 = 0, i1282 = 0, i1283 = 0, i1284 = 0, i1285 = 0, i1286 = 0, i1287 = 0, i1288 = 0, i1289 = 0;
+        int i1290 = 0, i1291 = 0, i1292 = 0, i1293 = 0, i1294 = 0, i1295 = 0, i1296 = 0, i1297 = 0, i1298 = 0, i1299 = 0;
+        int i1300 = 0, i1301 = 0, i1302 = 0, i1303 = 0, i1304 = 0, i1305 = 0, i1306 = 0, i1307 = 0, i1308 = 0, i1309 = 0;
+        int i1310 = 0, i1311 = 0, i1312 = 0, i1313 = 0, i1314 = 0, i1315 = 0, i1316 = 0, i1317 = 0, i1318 = 0, i1319 = 0;
+        int i1320 = 0, i1321 = 0, i1322 = 0, i1323 = 0, i1324 = 0, i1325 = 0, i1326 = 0, i1327 = 0, i1328 = 0, i1329 = 0;
+        int i1330 = 0, i1331 = 0, i1332 = 0, i1333 = 0, i1334 = 0, i1335 = 0, i1336 = 0, i1337 = 0, i1338 = 0, i1339 = 0;
+        int i1340 = 0, i1341 = 0, i1342 = 0, i1343 = 0, i1344 = 0, i1345 = 0, i1346 = 0, i1347 = 0, i1348 = 0, i1349 = 0;
+        int i1350 = 0, i1351 = 0, i1352 = 0, i1353 = 0, i1354 = 0, i1355 = 0, i1356 = 0, i1357 = 0, i1358 = 0, i1359 = 0;
+        int i1360 = 0, i1361 = 0, i1362 = 0, i1363 = 0, i1364 = 0, i1365 = 0, i1366 = 0, i1367 = 0, i1368 = 0, i1369 = 0;
+        int i1370 = 0, i1371 = 0, i1372 = 0, i1373 = 0, i1374 = 0, i1375 = 0, i1376 = 0, i1377 = 0, i1378 = 0, i1379 = 0;
+        int i1380 = 0, i1381 = 0, i1382 = 0, i1383 = 0, i1384 = 0, i1385 = 0, i1386 = 0, i1387 = 0, i1388 = 0, i1389 = 0;
+        int i1390 = 0, i1391 = 0, i1392 = 0, i1393 = 0, i1394 = 0, i1395 = 0, i1396 = 0, i1397 = 0, i1398 = 0, i1399 = 0;
+        int i1400 = 0, i1401 = 0, i1402 = 0, i1403 = 0, i1404 = 0, i1405 = 0, i1406 = 0, i1407 = 0, i1408 = 0, i1409 = 0;
+        int i1410 = 0, i1411 = 0, i1412 = 0, i1413 = 0, i1414 = 0, i1415 = 0, i1416 = 0, i1417 = 0, i1418 = 0, i1419 = 0;
+        int i1420 = 0, i1421 = 0, i1422 = 0, i1423 = 0, i1424 = 0, i1425 = 0, i1426 = 0, i1427 = 0, i1428 = 0, i1429 = 0;
+        int i1430 = 0, i1431 = 0, i1432 = 0, i1433 = 0, i1434 = 0, i1435 = 0, i1436 = 0, i1437 = 0, i1438 = 0, i1439 = 0;
+        int i1440 = 0, i1441 = 0, i1442 = 0, i1443 = 0, i1444 = 0, i1445 = 0, i1446 = 0, i1447 = 0, i1448 = 0, i1449 = 0;
+        int i1450 = 0, i1451 = 0, i1452 = 0, i1453 = 0, i1454 = 0, i1455 = 0, i1456 = 0, i1457 = 0, i1458 = 0, i1459 = 0;
+        int i1460 = 0, i1461 = 0, i1462 = 0, i1463 = 0, i1464 = 0, i1465 = 0, i1466 = 0, i1467 = 0, i1468 = 0, i1469 = 0;
+        int i1470 = 0, i1471 = 0, i1472 = 0, i1473 = 0, i1474 = 0, i1475 = 0, i1476 = 0, i1477 = 0, i1478 = 0, i1479 = 0;
+        int i1480 = 0, i1481 = 0, i1482 = 0, i1483 = 0, i1484 = 0, i1485 = 0, i1486 = 0, i1487 = 0, i1488 = 0, i1489 = 0;
+        int i1490 = 0, i1491 = 0, i1492 = 0, i1493 = 0, i1494 = 0, i1495 = 0, i1496 = 0, i1497 = 0, i1498 = 0, i1499 = 0;
+        int i1500 = 0, i1501 = 0, i1502 = 0, i1503 = 0, i1504 = 0, i1505 = 0, i1506 = 0, i1507 = 0, i1508 = 0, i1509 = 0;
+        int i1510 = 0, i1511 = 0, i1512 = 0, i1513 = 0, i1514 = 0, i1515 = 0, i1516 = 0, i1517 = 0, i1518 = 0, i1519 = 0;
+        int i1520 = 0, i1521 = 0, i1522 = 0, i1523 = 0, i1524 = 0, i1525 = 0, i1526 = 0, i1527 = 0, i1528 = 0, i1529 = 0;
+        int i1530 = 0, i1531 = 0, i1532 = 0, i1533 = 0, i1534 = 0, i1535 = 0, i1536 = 0, i1537 = 0, i1538 = 0, i1539 = 0;
+        int i1540 = 0, i1541 = 0, i1542 = 0, i1543 = 0, i1544 = 0, i1545 = 0, i1546 = 0, i1547 = 0, i1548 = 0, i1549 = 0;
+        int i1550 = 0, i1551 = 0, i1552 = 0, i1553 = 0, i1554 = 0, i1555 = 0, i1556 = 0, i1557 = 0, i1558 = 0, i1559 = 0;
+        int i1560 = 0, i1561 = 0, i1562 = 0, i1563 = 0, i1564 = 0, i1565 = 0, i1566 = 0, i1567 = 0, i1568 = 0, i1569 = 0;
+        int i1570 = 0, i1571 = 0, i1572 = 0, i1573 = 0, i1574 = 0, i1575 = 0, i1576 = 0, i1577 = 0, i1578 = 0, i1579 = 0;
+        int i1580 = 0, i1581 = 0, i1582 = 0, i1583 = 0, i1584 = 0, i1585 = 0, i1586 = 0, i1587 = 0, i1588 = 0, i1589 = 0;
+        int i1590 = 0, i1591 = 0, i1592 = 0, i1593 = 0, i1594 = 0, i1595 = 0, i1596 = 0, i1597 = 0, i1598 = 0, i1599 = 0;
+        int i1600 = 0, i1601 = 0, i1602 = 0, i1603 = 0, i1604 = 0, i1605 = 0, i1606 = 0, i1607 = 0, i1608 = 0, i1609 = 0;
+        int i1610 = 0, i1611 = 0, i1612 = 0, i1613 = 0, i1614 = 0, i1615 = 0, i1616 = 0, i1617 = 0, i1618 = 0, i1619 = 0;
+        int i1620 = 0, i1621 = 0, i1622 = 0, i1623 = 0, i1624 = 0, i1625 = 0, i1626 = 0, i1627 = 0, i1628 = 0, i1629 = 0;
+        int i1630 = 0, i1631 = 0, i1632 = 0, i1633 = 0, i1634 = 0, i1635 = 0, i1636 = 0, i1637 = 0, i1638 = 0, i1639 = 0;
+        int i1640 = 0, i1641 = 0, i1642 = 0, i1643 = 0, i1644 = 0, i1645 = 0, i1646 = 0, i1647 = 0, i1648 = 0, i1649 = 0;
+        int i1650 = 0, i1651 = 0, i1652 = 0, i1653 = 0, i1654 = 0, i1655 = 0, i1656 = 0, i1657 = 0, i1658 = 0, i1659 = 0;
+        int i1660 = 0, i1661 = 0, i1662 = 0, i1663 = 0, i1664 = 0, i1665 = 0, i1666 = 0, i1667 = 0, i1668 = 0, i1669 = 0;
+        int i1670 = 0, i1671 = 0, i1672 = 0, i1673 = 0, i1674 = 0, i1675 = 0, i1676 = 0, i1677 = 0, i1678 = 0, i1679 = 0;
+        int i1680 = 0, i1681 = 0, i1682 = 0, i1683 = 0, i1684 = 0, i1685 = 0, i1686 = 0, i1687 = 0, i1688 = 0, i1689 = 0;
+        int i1690 = 0, i1691 = 0, i1692 = 0, i1693 = 0, i1694 = 0, i1695 = 0, i1696 = 0, i1697 = 0, i1698 = 0, i1699 = 0;
+        int i1700 = 0, i1701 = 0, i1702 = 0, i1703 = 0, i1704 = 0, i1705 = 0, i1706 = 0, i1707 = 0, i1708 = 0, i1709 = 0;
+        int i1710 = 0, i1711 = 0, i1712 = 0, i1713 = 0, i1714 = 0, i1715 = 0, i1716 = 0, i1717 = 0, i1718 = 0, i1719 = 0;
+        int i1720 = 0, i1721 = 0, i1722 = 0, i1723 = 0, i1724 = 0, i1725 = 0, i1726 = 0, i1727 = 0, i1728 = 0, i1729 = 0;
+        int i1730 = 0, i1731 = 0, i1732 = 0, i1733 = 0, i1734 = 0, i1735 = 0, i1736 = 0, i1737 = 0, i1738 = 0, i1739 = 0;
+        int i1740 = 0, i1741 = 0, i1742 = 0, i1743 = 0, i1744 = 0, i1745 = 0, i1746 = 0, i1747 = 0, i1748 = 0, i1749 = 0;
+        int i1750 = 0, i1751 = 0, i1752 = 0, i1753 = 0, i1754 = 0, i1755 = 0, i1756 = 0, i1757 = 0, i1758 = 0, i1759 = 0;
+        int i1760 = 0, i1761 = 0, i1762 = 0, i1763 = 0, i1764 = 0, i1765 = 0, i1766 = 0, i1767 = 0, i1768 = 0, i1769 = 0;
+        int i1770 = 0, i1771 = 0, i1772 = 0, i1773 = 0, i1774 = 0, i1775 = 0, i1776 = 0, i1777 = 0, i1778 = 0, i1779 = 0;
+        int i1780 = 0, i1781 = 0, i1782 = 0, i1783 = 0, i1784 = 0, i1785 = 0, i1786 = 0, i1787 = 0, i1788 = 0, i1789 = 0;
+        int i1790 = 0, i1791 = 0, i1792 = 0, i1793 = 0, i1794 = 0, i1795 = 0, i1796 = 0, i1797 = 0, i1798 = 0, i1799 = 0;
+        int i1800 = 0, i1801 = 0, i1802 = 0, i1803 = 0, i1804 = 0, i1805 = 0, i1806 = 0, i1807 = 0, i1808 = 0, i1809 = 0;
+        int i1810 = 0, i1811 = 0, i1812 = 0, i1813 = 0, i1814 = 0, i1815 = 0, i1816 = 0, i1817 = 0, i1818 = 0, i1819 = 0;
+        int i1820 = 0, i1821 = 0, i1822 = 0, i1823 = 0, i1824 = 0, i1825 = 0, i1826 = 0, i1827 = 0, i1828 = 0, i1829 = 0;
+        int i1830 = 0, i1831 = 0, i1832 = 0, i1833 = 0, i1834 = 0, i1835 = 0, i1836 = 0, i1837 = 0, i1838 = 0, i1839 = 0;
+        int i1840 = 0, i1841 = 0, i1842 = 0, i1843 = 0, i1844 = 0, i1845 = 0, i1846 = 0, i1847 = 0, i1848 = 0, i1849 = 0;
+        int i1850 = 0, i1851 = 0, i1852 = 0, i1853 = 0, i1854 = 0, i1855 = 0, i1856 = 0, i1857 = 0, i1858 = 0, i1859 = 0;
+        int i1860 = 0, i1861 = 0, i1862 = 0, i1863 = 0, i1864 = 0, i1865 = 0, i1866 = 0, i1867 = 0, i1868 = 0, i1869 = 0;
+        int i1870 = 0, i1871 = 0, i1872 = 0, i1873 = 0, i1874 = 0, i1875 = 0, i1876 = 0, i1877 = 0, i1878 = 0, i1879 = 0;
+        int i1880 = 0, i1881 = 0, i1882 = 0, i1883 = 0, i1884 = 0, i1885 = 0, i1886 = 0, i1887 = 0, i1888 = 0, i1889 = 0;
+        int i1890 = 0, i1891 = 0, i1892 = 0, i1893 = 0, i1894 = 0, i1895 = 0, i1896 = 0, i1897 = 0, i1898 = 0, i1899 = 0;
+        int i1900 = 0, i1901 = 0, i1902 = 0, i1903 = 0, i1904 = 0, i1905 = 0, i1906 = 0, i1907 = 0, i1908 = 0, i1909 = 0;
+        int i1910 = 0, i1911 = 0, i1912 = 0, i1913 = 0, i1914 = 0, i1915 = 0, i1916 = 0, i1917 = 0, i1918 = 0, i1919 = 0;
+        int i1920 = 0, i1921 = 0, i1922 = 0, i1923 = 0, i1924 = 0, i1925 = 0, i1926 = 0, i1927 = 0, i1928 = 0, i1929 = 0;
+        int i1930 = 0, i1931 = 0, i1932 = 0, i1933 = 0, i1934 = 0, i1935 = 0, i1936 = 0, i1937 = 0, i1938 = 0, i1939 = 0;
+        int i1940 = 0, i1941 = 0, i1942 = 0, i1943 = 0, i1944 = 0, i1945 = 0, i1946 = 0, i1947 = 0, i1948 = 0, i1949 = 0;
+        int i1950 = 0, i1951 = 0, i1952 = 0, i1953 = 0, i1954 = 0, i1955 = 0, i1956 = 0, i1957 = 0, i1958 = 0, i1959 = 0;
+        int i1960 = 0, i1961 = 0, i1962 = 0, i1963 = 0, i1964 = 0, i1965 = 0, i1966 = 0, i1967 = 0, i1968 = 0, i1969 = 0;
+        int i1970 = 0, i1971 = 0, i1972 = 0, i1973 = 0, i1974 = 0, i1975 = 0, i1976 = 0, i1977 = 0, i1978 = 0, i1979 = 0;
+        int i1980 = 0, i1981 = 0, i1982 = 0, i1983 = 0, i1984 = 0, i1985 = 0, i1986 = 0, i1987 = 0, i1988 = 0, i1989 = 0;
+        int i1990 = 0, i1991 = 0, i1992 = 0, i1993 = 0, i1994 = 0, i1995 = 0, i1996 = 0, i1997 = 0, i1998 = 0, i1999 = 0;
+        int i2000 = 0, i2001 = 0, i2002 = 0, i2003 = 0, i2004 = 0, i2005 = 0, i2006 = 0, i2007 = 0, i2008 = 0, i2009 = 0;
+        int i2010 = 0, i2011 = 0, i2012 = 0, i2013 = 0, i2014 = 0, i2015 = 0, i2016 = 0, i2017 = 0, i2018 = 0, i2019 = 0;
+        int i2020 = 0, i2021 = 0, i2022 = 0, i2023 = 0, i2024 = 0, i2025 = 0, i2026 = 0, i2027 = 0, i2028 = 0, i2029 = 0;
+        int i2030 = 0, i2031 = 0, i2032 = 0, i2033 = 0, i2034 = 0, i2035 = 0, i2036 = 0, i2037 = 0, i2038 = 0, i2039 = 0;
+        int i2040 = 0, i2041 = 0, i2042 = 0, i2043 = 0, i2044 = 0, i2045 = 0, i2046 = 0, i2047 = 0, i2048 = 0, i2049 = 0;
+        int i2050 = 0, i2051 = 0, i2052 = 0, i2053 = 0, i2054 = 0, i2055 = 0, i2056 = 0, i2057 = 0, i2058 = 0, i2059 = 0;
+        int i2060 = 0, i2061 = 0, i2062 = 0, i2063 = 0, i2064 = 0, i2065 = 0, i2066 = 0, i2067 = 0, i2068 = 0, i2069 = 0;
+        int i2070 = 0, i2071 = 0, i2072 = 0, i2073 = 0, i2074 = 0, i2075 = 0, i2076 = 0, i2077 = 0, i2078 = 0, i2079 = 0;
+        int i2080 = 0, i2081 = 0, i2082 = 0, i2083 = 0, i2084 = 0, i2085 = 0, i2086 = 0, i2087 = 0, i2088 = 0, i2089 = 0;
+        int i2090 = 0, i2091 = 0, i2092 = 0, i2093 = 0, i2094 = 0, i2095 = 0, i2096 = 0, i2097 = 0, i2098 = 0, i2099 = 0;
+        int i2100 = 0, i2101 = 0, i2102 = 0, i2103 = 0, i2104 = 0, i2105 = 0, i2106 = 0, i2107 = 0, i2108 = 0, i2109 = 0;
+        int i2110 = 0, i2111 = 0, i2112 = 0, i2113 = 0, i2114 = 0, i2115 = 0, i2116 = 0, i2117 = 0, i2118 = 0, i2119 = 0;
+        int i2120 = 0, i2121 = 0, i2122 = 0, i2123 = 0, i2124 = 0, i2125 = 0, i2126 = 0, i2127 = 0, i2128 = 0, i2129 = 0;
+        int i2130 = 0, i2131 = 0, i2132 = 0, i2133 = 0, i2134 = 0, i2135 = 0, i2136 = 0, i2137 = 0, i2138 = 0, i2139 = 0;
+        int i2140 = 0, i2141 = 0, i2142 = 0, i2143 = 0, i2144 = 0, i2145 = 0, i2146 = 0, i2147 = 0, i2148 = 0, i2149 = 0;
+        int i2150 = 0, i2151 = 0, i2152 = 0, i2153 = 0, i2154 = 0, i2155 = 0, i2156 = 0, i2157 = 0, i2158 = 0, i2159 = 0;
+        int i2160 = 0, i2161 = 0, i2162 = 0, i2163 = 0, i2164 = 0, i2165 = 0, i2166 = 0, i2167 = 0, i2168 = 0, i2169 = 0;
+        int i2170 = 0, i2171 = 0, i2172 = 0, i2173 = 0, i2174 = 0, i2175 = 0, i2176 = 0, i2177 = 0, i2178 = 0, i2179 = 0;
+        int i2180 = 0, i2181 = 0, i2182 = 0, i2183 = 0, i2184 = 0, i2185 = 0, i2186 = 0, i2187 = 0, i2188 = 0, i2189 = 0;
+        int i2190 = 0, i2191 = 0, i2192 = 0, i2193 = 0, i2194 = 0, i2195 = 0, i2196 = 0, i2197 = 0, i2198 = 0, i2199 = 0;
+        int i2200 = 0, i2201 = 0, i2202 = 0, i2203 = 0, i2204 = 0, i2205 = 0, i2206 = 0, i2207 = 0, i2208 = 0, i2209 = 0;
+        int i2210 = 0, i2211 = 0, i2212 = 0, i2213 = 0, i2214 = 0, i2215 = 0, i2216 = 0, i2217 = 0, i2218 = 0, i2219 = 0;
+        int i2220 = 0, i2221 = 0, i2222 = 0, i2223 = 0, i2224 = 0, i2225 = 0, i2226 = 0, i2227 = 0, i2228 = 0, i2229 = 0;
+        int i2230 = 0, i2231 = 0, i2232 = 0, i2233 = 0, i2234 = 0, i2235 = 0, i2236 = 0, i2237 = 0, i2238 = 0, i2239 = 0;
+        int i2240 = 0, i2241 = 0, i2242 = 0, i2243 = 0, i2244 = 0, i2245 = 0, i2246 = 0, i2247 = 0, i2248 = 0, i2249 = 0;
+        int i2250 = 0, i2251 = 0, i2252 = 0, i2253 = 0, i2254 = 0, i2255 = 0, i2256 = 0, i2257 = 0, i2258 = 0, i2259 = 0;
+        int i2260 = 0, i2261 = 0, i2262 = 0, i2263 = 0, i2264 = 0, i2265 = 0, i2266 = 0, i2267 = 0, i2268 = 0, i2269 = 0;
+        int i2270 = 0, i2271 = 0, i2272 = 0, i2273 = 0, i2274 = 0, i2275 = 0, i2276 = 0, i2277 = 0, i2278 = 0, i2279 = 0;
+        int i2280 = 0, i2281 = 0, i2282 = 0, i2283 = 0, i2284 = 0, i2285 = 0, i2286 = 0, i2287 = 0, i2288 = 0, i2289 = 0;
+        int i2290 = 0, i2291 = 0, i2292 = 0, i2293 = 0, i2294 = 0, i2295 = 0, i2296 = 0, i2297 = 0, i2298 = 0, i2299 = 0;
+        int i2300 = 0, i2301 = 0, i2302 = 0, i2303 = 0, i2304 = 0, i2305 = 0, i2306 = 0, i2307 = 0, i2308 = 0, i2309 = 0;
+        int i2310 = 0, i2311 = 0, i2312 = 0, i2313 = 0, i2314 = 0, i2315 = 0, i2316 = 0, i2317 = 0, i2318 = 0, i2319 = 0;
+        int i2320 = 0, i2321 = 0, i2322 = 0, i2323 = 0, i2324 = 0, i2325 = 0, i2326 = 0, i2327 = 0, i2328 = 0, i2329 = 0;
+        int i2330 = 0, i2331 = 0, i2332 = 0, i2333 = 0, i2334 = 0, i2335 = 0, i2336 = 0, i2337 = 0, i2338 = 0, i2339 = 0;
+        int i2340 = 0, i2341 = 0, i2342 = 0, i2343 = 0, i2344 = 0, i2345 = 0, i2346 = 0, i2347 = 0, i2348 = 0, i2349 = 0;
+        int i2350 = 0, i2351 = 0, i2352 = 0, i2353 = 0, i2354 = 0, i2355 = 0, i2356 = 0, i2357 = 0, i2358 = 0, i2359 = 0;
+        int i2360 = 0, i2361 = 0, i2362 = 0, i2363 = 0, i2364 = 0, i2365 = 0, i2366 = 0, i2367 = 0, i2368 = 0, i2369 = 0;
+        int i2370 = 0, i2371 = 0, i2372 = 0, i2373 = 0, i2374 = 0, i2375 = 0, i2376 = 0, i2377 = 0, i2378 = 0, i2379 = 0;
+        int i2380 = 0, i2381 = 0, i2382 = 0, i2383 = 0, i2384 = 0, i2385 = 0, i2386 = 0, i2387 = 0, i2388 = 0, i2389 = 0;
+        int i2390 = 0, i2391 = 0, i2392 = 0, i2393 = 0, i2394 = 0, i2395 = 0, i2396 = 0, i2397 = 0, i2398 = 0, i2399 = 0;
+        int i2400 = 0, i2401 = 0, i2402 = 0, i2403 = 0, i2404 = 0, i2405 = 0, i2406 = 0, i2407 = 0, i2408 = 0, i2409 = 0;
+        int i2410 = 0, i2411 = 0, i2412 = 0, i2413 = 0, i2414 = 0, i2415 = 0, i2416 = 0, i2417 = 0, i2418 = 0, i2419 = 0;
+        int i2420 = 0, i2421 = 0, i2422 = 0, i2423 = 0, i2424 = 0, i2425 = 0, i2426 = 0, i2427 = 0, i2428 = 0, i2429 = 0;
+        int i2430 = 0, i2431 = 0, i2432 = 0, i2433 = 0, i2434 = 0, i2435 = 0, i2436 = 0, i2437 = 0, i2438 = 0, i2439 = 0;
+        int i2440 = 0, i2441 = 0, i2442 = 0, i2443 = 0, i2444 = 0, i2445 = 0, i2446 = 0, i2447 = 0, i2448 = 0, i2449 = 0;
+        int i2450 = 0, i2451 = 0, i2452 = 0, i2453 = 0, i2454 = 0, i2455 = 0, i2456 = 0, i2457 = 0, i2458 = 0, i2459 = 0;
+        int i2460 = 0, i2461 = 0, i2462 = 0, i2463 = 0, i2464 = 0, i2465 = 0, i2466 = 0, i2467 = 0, i2468 = 0, i2469 = 0;
+        int i2470 = 0, i2471 = 0, i2472 = 0, i2473 = 0, i2474 = 0, i2475 = 0, i2476 = 0, i2477 = 0, i2478 = 0, i2479 = 0;
+        int i2480 = 0, i2481 = 0, i2482 = 0, i2483 = 0, i2484 = 0, i2485 = 0, i2486 = 0, i2487 = 0, i2488 = 0, i2489 = 0;
+        int i2490 = 0, i2491 = 0, i2492 = 0, i2493 = 0, i2494 = 0, i2495 = 0, i2496 = 0, i2497 = 0, i2498 = 0, i2499 = 0;
+        int i2500 = 0, i2501 = 0, i2502 = 0, i2503 = 0, i2504 = 0, i2505 = 0, i2506 = 0, i2507 = 0, i2508 = 0, i2509 = 0;
+        int i2510 = 0, i2511 = 0, i2512 = 0, i2513 = 0, i2514 = 0, i2515 = 0, i2516 = 0, i2517 = 0, i2518 = 0, i2519 = 0;
+        int i2520 = 0, i2521 = 0, i2522 = 0, i2523 = 0, i2524 = 0, i2525 = 0, i2526 = 0, i2527 = 0, i2528 = 0, i2529 = 0;
+        int i2530 = 0, i2531 = 0, i2532 = 0, i2533 = 0, i2534 = 0, i2535 = 0, i2536 = 0, i2537 = 0, i2538 = 0, i2539 = 0;
+        int i2540 = 0, i2541 = 0, i2542 = 0, i2543 = 0, i2544 = 0, i2545 = 0, i2546 = 0, i2547 = 0, i2548 = 0, i2549 = 0;
+        int i2550 = 0, i2551 = 0, i2552 = 0, i2553 = 0, i2554 = 0, i2555 = 0, i2556 = 0, i2557 = 0, i2558 = 0, i2559 = 0;
+        int i2560 = 0, i2561 = 0, i2562 = 0, i2563 = 0, i2564 = 0, i2565 = 0, i2566 = 0, i2567 = 0, i2568 = 0, i2569 = 0;
+        int i2570 = 0, i2571 = 0, i2572 = 0, i2573 = 0, i2574 = 0, i2575 = 0, i2576 = 0, i2577 = 0, i2578 = 0, i2579 = 0;
+        int i2580 = 0, i2581 = 0, i2582 = 0, i2583 = 0, i2584 = 0, i2585 = 0, i2586 = 0, i2587 = 0, i2588 = 0, i2589 = 0;
+        int i2590 = 0, i2591 = 0, i2592 = 0, i2593 = 0, i2594 = 0, i2595 = 0, i2596 = 0, i2597 = 0, i2598 = 0, i2599 = 0;
+        int i2600 = 0, i2601 = 0, i2602 = 0, i2603 = 0, i2604 = 0, i2605 = 0, i2606 = 0, i2607 = 0, i2608 = 0, i2609 = 0;
+        int i2610 = 0, i2611 = 0, i2612 = 0, i2613 = 0, i2614 = 0, i2615 = 0, i2616 = 0, i2617 = 0, i2618 = 0, i2619 = 0;
+        int i2620 = 0, i2621 = 0, i2622 = 0, i2623 = 0, i2624 = 0, i2625 = 0, i2626 = 0, i2627 = 0, i2628 = 0, i2629 = 0;
+        int i2630 = 0, i2631 = 0, i2632 = 0, i2633 = 0, i2634 = 0, i2635 = 0, i2636 = 0, i2637 = 0, i2638 = 0, i2639 = 0;
+        int i2640 = 0, i2641 = 0, i2642 = 0, i2643 = 0, i2644 = 0, i2645 = 0, i2646 = 0, i2647 = 0, i2648 = 0, i2649 = 0;
+        int i2650 = 0, i2651 = 0, i2652 = 0, i2653 = 0, i2654 = 0, i2655 = 0, i2656 = 0, i2657 = 0, i2658 = 0, i2659 = 0;
+        int i2660 = 0, i2661 = 0, i2662 = 0, i2663 = 0, i2664 = 0, i2665 = 0, i2666 = 0, i2667 = 0, i2668 = 0, i2669 = 0;
+        int i2670 = 0, i2671 = 0, i2672 = 0, i2673 = 0, i2674 = 0, i2675 = 0, i2676 = 0, i2677 = 0, i2678 = 0, i2679 = 0;
+        int i2680 = 0, i2681 = 0, i2682 = 0, i2683 = 0, i2684 = 0, i2685 = 0, i2686 = 0, i2687 = 0, i2688 = 0, i2689 = 0;
+        int i2690 = 0, i2691 = 0, i2692 = 0, i2693 = 0, i2694 = 0, i2695 = 0, i2696 = 0, i2697 = 0, i2698 = 0, i2699 = 0;
+        int i2700 = 0, i2701 = 0, i2702 = 0, i2703 = 0, i2704 = 0, i2705 = 0, i2706 = 0, i2707 = 0, i2708 = 0, i2709 = 0;
+        int i2710 = 0, i2711 = 0, i2712 = 0, i2713 = 0, i2714 = 0, i2715 = 0, i2716 = 0, i2717 = 0, i2718 = 0, i2719 = 0;
+        int i2720 = 0, i2721 = 0, i2722 = 0, i2723 = 0, i2724 = 0, i2725 = 0, i2726 = 0, i2727 = 0, i2728 = 0, i2729 = 0;
+        int i2730 = 0, i2731 = 0, i2732 = 0, i2733 = 0, i2734 = 0, i2735 = 0, i2736 = 0, i2737 = 0, i2738 = 0, i2739 = 0;
+        int i2740 = 0, i2741 = 0, i2742 = 0, i2743 = 0, i2744 = 0, i2745 = 0, i2746 = 0, i2747 = 0, i2748 = 0, i2749 = 0;
+        int i2750 = 0, i2751 = 0, i2752 = 0, i2753 = 0, i2754 = 0, i2755 = 0, i2756 = 0, i2757 = 0, i2758 = 0, i2759 = 0;
+        int i2760 = 0, i2761 = 0, i2762 = 0, i2763 = 0, i2764 = 0, i2765 = 0, i2766 = 0, i2767 = 0, i2768 = 0, i2769 = 0;
+        int i2770 = 0, i2771 = 0, i2772 = 0, i2773 = 0, i2774 = 0, i2775 = 0, i2776 = 0, i2777 = 0, i2778 = 0, i2779 = 0;
+        int i2780 = 0, i2781 = 0, i2782 = 0, i2783 = 0, i2784 = 0, i2785 = 0, i2786 = 0, i2787 = 0, i2788 = 0, i2789 = 0;
+        int i2790 = 0, i2791 = 0, i2792 = 0, i2793 = 0, i2794 = 0, i2795 = 0, i2796 = 0, i2797 = 0, i2798 = 0, i2799 = 0;
+        int i2800 = 0, i2801 = 0, i2802 = 0, i2803 = 0, i2804 = 0, i2805 = 0, i2806 = 0, i2807 = 0, i2808 = 0, i2809 = 0;
+        int i2810 = 0, i2811 = 0, i2812 = 0, i2813 = 0, i2814 = 0, i2815 = 0, i2816 = 0, i2817 = 0, i2818 = 0, i2819 = 0;
+        int i2820 = 0, i2821 = 0, i2822 = 0, i2823 = 0, i2824 = 0, i2825 = 0, i2826 = 0, i2827 = 0, i2828 = 0, i2829 = 0;
+        int i2830 = 0, i2831 = 0, i2832 = 0, i2833 = 0, i2834 = 0, i2835 = 0, i2836 = 0, i2837 = 0, i2838 = 0, i2839 = 0;
+        int i2840 = 0, i2841 = 0, i2842 = 0, i2843 = 0, i2844 = 0, i2845 = 0, i2846 = 0, i2847 = 0, i2848 = 0, i2849 = 0;
+        int i2850 = 0, i2851 = 0, i2852 = 0, i2853 = 0, i2854 = 0, i2855 = 0, i2856 = 0, i2857 = 0, i2858 = 0, i2859 = 0;
+        int i2860 = 0, i2861 = 0, i2862 = 0, i2863 = 0, i2864 = 0, i2865 = 0, i2866 = 0, i2867 = 0, i2868 = 0, i2869 = 0;
+        int i2870 = 0, i2871 = 0, i2872 = 0, i2873 = 0, i2874 = 0, i2875 = 0, i2876 = 0, i2877 = 0, i2878 = 0, i2879 = 0;
+        int i2880 = 0, i2881 = 0, i2882 = 0, i2883 = 0, i2884 = 0, i2885 = 0, i2886 = 0, i2887 = 0, i2888 = 0, i2889 = 0;
+        int i2890 = 0, i2891 = 0, i2892 = 0, i2893 = 0, i2894 = 0, i2895 = 0, i2896 = 0, i2897 = 0, i2898 = 0, i2899 = 0;
+        int i2900 = 0, i2901 = 0, i2902 = 0, i2903 = 0, i2904 = 0, i2905 = 0, i2906 = 0, i2907 = 0, i2908 = 0, i2909 = 0;
+        int i2910 = 0, i2911 = 0, i2912 = 0, i2913 = 0, i2914 = 0, i2915 = 0, i2916 = 0, i2917 = 0, i2918 = 0, i2919 = 0;
+        int i2920 = 0, i2921 = 0, i2922 = 0, i2923 = 0, i2924 = 0, i2925 = 0, i2926 = 0, i2927 = 0, i2928 = 0, i2929 = 0;
+        int i2930 = 0, i2931 = 0, i2932 = 0, i2933 = 0, i2934 = 0, i2935 = 0, i2936 = 0, i2937 = 0, i2938 = 0, i2939 = 0;
+        int i2940 = 0, i2941 = 0, i2942 = 0, i2943 = 0, i2944 = 0, i2945 = 0, i2946 = 0, i2947 = 0, i2948 = 0, i2949 = 0;
+        int i2950 = 0, i2951 = 0, i2952 = 0, i2953 = 0, i2954 = 0, i2955 = 0, i2956 = 0, i2957 = 0, i2958 = 0, i2959 = 0;
+        int i2960 = 0, i2961 = 0, i2962 = 0, i2963 = 0, i2964 = 0, i2965 = 0, i2966 = 0, i2967 = 0, i2968 = 0, i2969 = 0;
+        int i2970 = 0, i2971 = 0, i2972 = 0, i2973 = 0, i2974 = 0, i2975 = 0, i2976 = 0, i2977 = 0, i2978 = 0, i2979 = 0;
+        int i2980 = 0, i2981 = 0, i2982 = 0, i2983 = 0, i2984 = 0, i2985 = 0, i2986 = 0, i2987 = 0, i2988 = 0, i2989 = 0;
+        int i2990 = 0, i2991 = 0, i2992 = 0, i2993 = 0, i2994 = 0, i2995 = 0, i2996 = 0, i2997 = 0, i2998 = 0, i2999 = 0;
+        int i3000 = 0, i3001 = 0, i3002 = 0, i3003 = 0, i3004 = 0, i3005 = 0, i3006 = 0, i3007 = 0, i3008 = 0, i3009 = 0;
+        int i3010 = 0, i3011 = 0, i3012 = 0, i3013 = 0, i3014 = 0, i3015 = 0, i3016 = 0, i3017 = 0, i3018 = 0, i3019 = 0;
+        int i3020 = 0, i3021 = 0, i3022 = 0, i3023 = 0, i3024 = 0, i3025 = 0, i3026 = 0, i3027 = 0, i3028 = 0, i3029 = 0;
+        int i3030 = 0, i3031 = 0, i3032 = 0, i3033 = 0, i3034 = 0, i3035 = 0, i3036 = 0, i3037 = 0, i3038 = 0, i3039 = 0;
+        int i3040 = 0, i3041 = 0, i3042 = 0, i3043 = 0, i3044 = 0, i3045 = 0, i3046 = 0, i3047 = 0, i3048 = 0, i3049 = 0;
+        int i3050 = 0, i3051 = 0, i3052 = 0, i3053 = 0, i3054 = 0, i3055 = 0, i3056 = 0, i3057 = 0, i3058 = 0, i3059 = 0;
+        int i3060 = 0, i3061 = 0, i3062 = 0, i3063 = 0, i3064 = 0, i3065 = 0, i3066 = 0, i3067 = 0, i3068 = 0, i3069 = 0;
+        int i3070 = 0, i3071 = 0, i3072 = 0, i3073 = 0, i3074 = 0, i3075 = 0, i3076 = 0, i3077 = 0, i3078 = 0, i3079 = 0;
+        int i3080 = 0, i3081 = 0, i3082 = 0, i3083 = 0, i3084 = 0, i3085 = 0, i3086 = 0, i3087 = 0, i3088 = 0, i3089 = 0;
+        int i3090 = 0, i3091 = 0, i3092 = 0, i3093 = 0, i3094 = 0, i3095 = 0, i3096 = 0, i3097 = 0, i3098 = 0, i3099 = 0;
+        int i3100 = 0, i3101 = 0, i3102 = 0, i3103 = 0, i3104 = 0, i3105 = 0, i3106 = 0, i3107 = 0, i3108 = 0, i3109 = 0;
+        int i3110 = 0, i3111 = 0, i3112 = 0, i3113 = 0, i3114 = 0, i3115 = 0, i3116 = 0, i3117 = 0, i3118 = 0, i3119 = 0;
+        int i3120 = 0, i3121 = 0, i3122 = 0, i3123 = 0, i3124 = 0, i3125 = 0, i3126 = 0, i3127 = 0, i3128 = 0, i3129 = 0;
+        int i3130 = 0, i3131 = 0, i3132 = 0, i3133 = 0, i3134 = 0, i3135 = 0, i3136 = 0, i3137 = 0, i3138 = 0, i3139 = 0;
+        int i3140 = 0, i3141 = 0, i3142 = 0, i3143 = 0, i3144 = 0, i3145 = 0, i3146 = 0, i3147 = 0, i3148 = 0, i3149 = 0;
+        int i3150 = 0, i3151 = 0, i3152 = 0, i3153 = 0, i3154 = 0, i3155 = 0, i3156 = 0, i3157 = 0, i3158 = 0, i3159 = 0;
+        int i3160 = 0, i3161 = 0, i3162 = 0, i3163 = 0, i3164 = 0, i3165 = 0, i3166 = 0, i3167 = 0, i3168 = 0, i3169 = 0;
+        int i3170 = 0, i3171 = 0, i3172 = 0, i3173 = 0, i3174 = 0, i3175 = 0, i3176 = 0, i3177 = 0, i3178 = 0, i3179 = 0;
+        int i3180 = 0, i3181 = 0, i3182 = 0, i3183 = 0, i3184 = 0, i3185 = 0, i3186 = 0, i3187 = 0, i3188 = 0, i3189 = 0;
+        int i3190 = 0, i3191 = 0, i3192 = 0, i3193 = 0, i3194 = 0, i3195 = 0, i3196 = 0, i3197 = 0, i3198 = 0, i3199 = 0;
+        int i3200 = 0, i3201 = 0, i3202 = 0, i3203 = 0, i3204 = 0, i3205 = 0, i3206 = 0, i3207 = 0, i3208 = 0, i3209 = 0;
+        int i3210 = 0, i3211 = 0, i3212 = 0, i3213 = 0, i3214 = 0, i3215 = 0, i3216 = 0, i3217 = 0, i3218 = 0, i3219 = 0;
+        int i3220 = 0, i3221 = 0, i3222 = 0, i3223 = 0, i3224 = 0, i3225 = 0, i3226 = 0, i3227 = 0, i3228 = 0, i3229 = 0;
+        int i3230 = 0, i3231 = 0, i3232 = 0, i3233 = 0, i3234 = 0, i3235 = 0, i3236 = 0, i3237 = 0, i3238 = 0, i3239 = 0;
+        int i3240 = 0, i3241 = 0, i3242 = 0, i3243 = 0, i3244 = 0, i3245 = 0, i3246 = 0, i3247 = 0, i3248 = 0, i3249 = 0;
+        int i3250 = 0, i3251 = 0, i3252 = 0, i3253 = 0, i3254 = 0, i3255 = 0, i3256 = 0, i3257 = 0, i3258 = 0, i3259 = 0;
+        int i3260 = 0, i3261 = 0, i3262 = 0, i3263 = 0, i3264 = 0, i3265 = 0, i3266 = 0, i3267 = 0, i3268 = 0, i3269 = 0;
+        int i3270 = 0, i3271 = 0, i3272 = 0, i3273 = 0, i3274 = 0, i3275 = 0, i3276 = 0, i3277 = 0, i3278 = 0, i3279 = 0;
+        int i3280 = 0, i3281 = 0, i3282 = 0, i3283 = 0, i3284 = 0, i3285 = 0, i3286 = 0, i3287 = 0, i3288 = 0, i3289 = 0;
+        int i3290 = 0, i3291 = 0, i3292 = 0, i3293 = 0, i3294 = 0, i3295 = 0, i3296 = 0, i3297 = 0, i3298 = 0, i3299 = 0;
+        int i3300 = 0, i3301 = 0, i3302 = 0, i3303 = 0, i3304 = 0, i3305 = 0, i3306 = 0, i3307 = 0, i3308 = 0, i3309 = 0;
+        int i3310 = 0, i3311 = 0, i3312 = 0, i3313 = 0, i3314 = 0, i3315 = 0, i3316 = 0, i3317 = 0, i3318 = 0, i3319 = 0;
+        int i3320 = 0, i3321 = 0, i3322 = 0, i3323 = 0, i3324 = 0, i3325 = 0, i3326 = 0, i3327 = 0, i3328 = 0, i3329 = 0;
+        int i3330 = 0, i3331 = 0, i3332 = 0, i3333 = 0, i3334 = 0, i3335 = 0, i3336 = 0, i3337 = 0, i3338 = 0, i3339 = 0;
+        int i3340 = 0, i3341 = 0, i3342 = 0, i3343 = 0, i3344 = 0, i3345 = 0, i3346 = 0, i3347 = 0, i3348 = 0, i3349 = 0;
+        int i3350 = 0, i3351 = 0, i3352 = 0, i3353 = 0, i3354 = 0, i3355 = 0, i3356 = 0, i3357 = 0, i3358 = 0, i3359 = 0;
+        int i3360 = 0, i3361 = 0, i3362 = 0, i3363 = 0, i3364 = 0, i3365 = 0, i3366 = 0, i3367 = 0, i3368 = 0, i3369 = 0;
+        int i3370 = 0, i3371 = 0, i3372 = 0, i3373 = 0, i3374 = 0, i3375 = 0, i3376 = 0, i3377 = 0, i3378 = 0, i3379 = 0;
+        int i3380 = 0, i3381 = 0, i3382 = 0, i3383 = 0, i3384 = 0, i3385 = 0, i3386 = 0, i3387 = 0, i3388 = 0, i3389 = 0;
+        int i3390 = 0, i3391 = 0, i3392 = 0, i3393 = 0, i3394 = 0, i3395 = 0, i3396 = 0, i3397 = 0, i3398 = 0, i3399 = 0;
+        int i3400 = 0, i3401 = 0, i3402 = 0, i3403 = 0, i3404 = 0, i3405 = 0, i3406 = 0, i3407 = 0, i3408 = 0, i3409 = 0;
+        int i3410 = 0, i3411 = 0, i3412 = 0, i3413 = 0, i3414 = 0, i3415 = 0, i3416 = 0, i3417 = 0, i3418 = 0, i3419 = 0;
+        int i3420 = 0, i3421 = 0, i3422 = 0, i3423 = 0, i3424 = 0, i3425 = 0, i3426 = 0, i3427 = 0, i3428 = 0, i3429 = 0;
+        int i3430 = 0, i3431 = 0, i3432 = 0, i3433 = 0, i3434 = 0, i3435 = 0, i3436 = 0, i3437 = 0, i3438 = 0, i3439 = 0;
+        int i3440 = 0, i3441 = 0, i3442 = 0, i3443 = 0, i3444 = 0, i3445 = 0, i3446 = 0, i3447 = 0, i3448 = 0, i3449 = 0;
+        int i3450 = 0, i3451 = 0, i3452 = 0, i3453 = 0, i3454 = 0, i3455 = 0, i3456 = 0, i3457 = 0, i3458 = 0, i3459 = 0;
+        int i3460 = 0, i3461 = 0, i3462 = 0, i3463 = 0, i3464 = 0, i3465 = 0, i3466 = 0, i3467 = 0, i3468 = 0, i3469 = 0;
+        int i3470 = 0, i3471 = 0, i3472 = 0, i3473 = 0, i3474 = 0, i3475 = 0, i3476 = 0, i3477 = 0, i3478 = 0, i3479 = 0;
+        int i3480 = 0, i3481 = 0, i3482 = 0, i3483 = 0, i3484 = 0, i3485 = 0, i3486 = 0, i3487 = 0, i3488 = 0, i3489 = 0;
+        int i3490 = 0, i3491 = 0, i3492 = 0, i3493 = 0, i3494 = 0, i3495 = 0, i3496 = 0, i3497 = 0, i3498 = 0, i3499 = 0;
+        int i3500 = 0, i3501 = 0, i3502 = 0, i3503 = 0, i3504 = 0, i3505 = 0, i3506 = 0, i3507 = 0, i3508 = 0, i3509 = 0;
+        int i3510 = 0, i3511 = 0, i3512 = 0, i3513 = 0, i3514 = 0, i3515 = 0, i3516 = 0, i3517 = 0, i3518 = 0, i3519 = 0;
+        int i3520 = 0, i3521 = 0, i3522 = 0, i3523 = 0, i3524 = 0, i3525 = 0, i3526 = 0, i3527 = 0, i3528 = 0, i3529 = 0;
+        int i3530 = 0, i3531 = 0, i3532 = 0, i3533 = 0, i3534 = 0, i3535 = 0, i3536 = 0, i3537 = 0, i3538 = 0, i3539 = 0;
+        int i3540 = 0, i3541 = 0, i3542 = 0, i3543 = 0, i3544 = 0, i3545 = 0, i3546 = 0, i3547 = 0, i3548 = 0, i3549 = 0;
+        int i3550 = 0, i3551 = 0, i3552 = 0, i3553 = 0, i3554 = 0, i3555 = 0, i3556 = 0, i3557 = 0, i3558 = 0, i3559 = 0;
+        int i3560 = 0, i3561 = 0, i3562 = 0, i3563 = 0, i3564 = 0, i3565 = 0, i3566 = 0, i3567 = 0, i3568 = 0, i3569 = 0;
+        int i3570 = 0, i3571 = 0, i3572 = 0, i3573 = 0, i3574 = 0, i3575 = 0, i3576 = 0, i3577 = 0, i3578 = 0, i3579 = 0;
+        int i3580 = 0, i3581 = 0, i3582 = 0, i3583 = 0, i3584 = 0, i3585 = 0, i3586 = 0, i3587 = 0, i3588 = 0, i3589 = 0;
+        int i3590 = 0, i3591 = 0, i3592 = 0, i3593 = 0, i3594 = 0, i3595 = 0, i3596 = 0, i3597 = 0, i3598 = 0, i3599 = 0;
+        int i3600 = 0, i3601 = 0, i3602 = 0, i3603 = 0, i3604 = 0, i3605 = 0, i3606 = 0, i3607 = 0, i3608 = 0, i3609 = 0;
+        int i3610 = 0, i3611 = 0, i3612 = 0, i3613 = 0, i3614 = 0, i3615 = 0, i3616 = 0, i3617 = 0, i3618 = 0, i3619 = 0;
+        int i3620 = 0, i3621 = 0, i3622 = 0, i3623 = 0, i3624 = 0, i3625 = 0, i3626 = 0, i3627 = 0, i3628 = 0, i3629 = 0;
+        int i3630 = 0, i3631 = 0, i3632 = 0, i3633 = 0, i3634 = 0, i3635 = 0, i3636 = 0, i3637 = 0, i3638 = 0, i3639 = 0;
+        int i3640 = 0, i3641 = 0, i3642 = 0, i3643 = 0, i3644 = 0, i3645 = 0, i3646 = 0, i3647 = 0, i3648 = 0, i3649 = 0;
+        int i3650 = 0, i3651 = 0, i3652 = 0, i3653 = 0, i3654 = 0, i3655 = 0, i3656 = 0, i3657 = 0, i3658 = 0, i3659 = 0;
+        int i3660 = 0, i3661 = 0, i3662 = 0, i3663 = 0, i3664 = 0, i3665 = 0, i3666 = 0, i3667 = 0, i3668 = 0, i3669 = 0;
+        int i3670 = 0, i3671 = 0, i3672 = 0, i3673 = 0, i3674 = 0, i3675 = 0, i3676 = 0, i3677 = 0, i3678 = 0, i3679 = 0;
+        int i3680 = 0, i3681 = 0, i3682 = 0, i3683 = 0, i3684 = 0, i3685 = 0, i3686 = 0, i3687 = 0, i3688 = 0, i3689 = 0;
+        int i3690 = 0, i3691 = 0, i3692 = 0, i3693 = 0, i3694 = 0, i3695 = 0, i3696 = 0, i3697 = 0, i3698 = 0, i3699 = 0;
+        int i3700 = 0, i3701 = 0, i3702 = 0, i3703 = 0, i3704 = 0, i3705 = 0, i3706 = 0, i3707 = 0, i3708 = 0, i3709 = 0;
+        int i3710 = 0, i3711 = 0, i3712 = 0, i3713 = 0, i3714 = 0, i3715 = 0, i3716 = 0, i3717 = 0, i3718 = 0, i3719 = 0;
+        int i3720 = 0, i3721 = 0, i3722 = 0, i3723 = 0, i3724 = 0, i3725 = 0, i3726 = 0, i3727 = 0, i3728 = 0, i3729 = 0;
+        int i3730 = 0, i3731 = 0, i3732 = 0, i3733 = 0, i3734 = 0, i3735 = 0, i3736 = 0, i3737 = 0, i3738 = 0, i3739 = 0;
+        int i3740 = 0, i3741 = 0, i3742 = 0, i3743 = 0, i3744 = 0, i3745 = 0, i3746 = 0, i3747 = 0, i3748 = 0, i3749 = 0;
+        int i3750 = 0, i3751 = 0, i3752 = 0, i3753 = 0, i3754 = 0, i3755 = 0, i3756 = 0, i3757 = 0, i3758 = 0, i3759 = 0;
+        int i3760 = 0, i3761 = 0, i3762 = 0, i3763 = 0, i3764 = 0, i3765 = 0, i3766 = 0, i3767 = 0, i3768 = 0, i3769 = 0;
+        int i3770 = 0, i3771 = 0, i3772 = 0, i3773 = 0, i3774 = 0, i3775 = 0, i3776 = 0, i3777 = 0, i3778 = 0, i3779 = 0;
+        int i3780 = 0, i3781 = 0, i3782 = 0, i3783 = 0, i3784 = 0, i3785 = 0, i3786 = 0, i3787 = 0, i3788 = 0, i3789 = 0;
+        int i3790 = 0, i3791 = 0, i3792 = 0, i3793 = 0, i3794 = 0, i3795 = 0, i3796 = 0, i3797 = 0, i3798 = 0, i3799 = 0;
+        int i3800 = 0, i3801 = 0, i3802 = 0, i3803 = 0, i3804 = 0, i3805 = 0, i3806 = 0, i3807 = 0, i3808 = 0, i3809 = 0;
+        int i3810 = 0, i3811 = 0, i3812 = 0, i3813 = 0, i3814 = 0, i3815 = 0, i3816 = 0, i3817 = 0, i3818 = 0, i3819 = 0;
+        int i3820 = 0, i3821 = 0, i3822 = 0, i3823 = 0, i3824 = 0, i3825 = 0, i3826 = 0, i3827 = 0, i3828 = 0, i3829 = 0;
+        int i3830 = 0, i3831 = 0, i3832 = 0, i3833 = 0, i3834 = 0, i3835 = 0, i3836 = 0, i3837 = 0, i3838 = 0, i3839 = 0;
+        int i3840 = 0, i3841 = 0, i3842 = 0, i3843 = 0, i3844 = 0, i3845 = 0, i3846 = 0, i3847 = 0, i3848 = 0, i3849 = 0;
+        int i3850 = 0, i3851 = 0, i3852 = 0, i3853 = 0, i3854 = 0, i3855 = 0, i3856 = 0, i3857 = 0, i3858 = 0, i3859 = 0;
+        int i3860 = 0, i3861 = 0, i3862 = 0, i3863 = 0, i3864 = 0, i3865 = 0, i3866 = 0, i3867 = 0, i3868 = 0, i3869 = 0;
+        int i3870 = 0, i3871 = 0, i3872 = 0, i3873 = 0, i3874 = 0, i3875 = 0, i3876 = 0, i3877 = 0, i3878 = 0, i3879 = 0;
+        int i3880 = 0, i3881 = 0, i3882 = 0, i3883 = 0, i3884 = 0, i3885 = 0, i3886 = 0, i3887 = 0, i3888 = 0, i3889 = 0;
+        int i3890 = 0, i3891 = 0, i3892 = 0, i3893 = 0, i3894 = 0, i3895 = 0, i3896 = 0, i3897 = 0, i3898 = 0, i3899 = 0;
+        int i3900 = 0, i3901 = 0, i3902 = 0, i3903 = 0, i3904 = 0, i3905 = 0, i3906 = 0, i3907 = 0, i3908 = 0, i3909 = 0;
+        int i3910 = 0, i3911 = 0, i3912 = 0, i3913 = 0, i3914 = 0, i3915 = 0, i3916 = 0, i3917 = 0, i3918 = 0, i3919 = 0;
+        int i3920 = 0, i3921 = 0, i3922 = 0, i3923 = 0, i3924 = 0, i3925 = 0, i3926 = 0, i3927 = 0, i3928 = 0, i3929 = 0;
+        int i3930 = 0, i3931 = 0, i3932 = 0, i3933 = 0, i3934 = 0, i3935 = 0, i3936 = 0, i3937 = 0, i3938 = 0, i3939 = 0;
+        int i3940 = 0, i3941 = 0, i3942 = 0, i3943 = 0, i3944 = 0, i3945 = 0, i3946 = 0, i3947 = 0, i3948 = 0, i3949 = 0;
+        int i3950 = 0, i3951 = 0, i3952 = 0, i3953 = 0, i3954 = 0, i3955 = 0, i3956 = 0, i3957 = 0, i3958 = 0, i3959 = 0;
+        int i3960 = 0, i3961 = 0, i3962 = 0, i3963 = 0, i3964 = 0, i3965 = 0, i3966 = 0, i3967 = 0, i3968 = 0, i3969 = 0;
+        int i3970 = 0, i3971 = 0, i3972 = 0, i3973 = 0, i3974 = 0, i3975 = 0, i3976 = 0, i3977 = 0, i3978 = 0, i3979 = 0;
+        int i3980 = 0, i3981 = 0, i3982 = 0, i3983 = 0, i3984 = 0, i3985 = 0, i3986 = 0, i3987 = 0, i3988 = 0, i3989 = 0;
+        int i3990 = 0, i3991 = 0, i3992 = 0, i3993 = 0, i3994 = 0, i3995 = 0, i3996 = 0, i3997 = 0, i3998 = 0, i3999 = 0;
+        int i4000 = 0, i4001 = 0, i4002 = 0, i4003 = 0, i4004 = 0, i4005 = 0, i4006 = 0, i4007 = 0, i4008 = 0, i4009 = 0;
+        int i4010 = 0, i4011 = 0, i4012 = 0, i4013 = 0, i4014 = 0, i4015 = 0, i4016 = 0, i4017 = 0, i4018 = 0, i4019 = 0;
+        int i4020 = 0, i4021 = 0, i4022 = 0, i4023 = 0, i4024 = 0, i4025 = 0, i4026 = 0, i4027 = 0, i4028 = 0, i4029 = 0;
+        int i4030 = 0, i4031 = 0, i4032 = 0, i4033 = 0, i4034 = 0, i4035 = 0, i4036 = 0, i4037 = 0, i4038 = 0, i4039 = 0;
+        int i4040 = 0, i4041 = 0, i4042 = 0, i4043 = 0, i4044 = 0, i4045 = 0, i4046 = 0, i4047 = 0, i4048 = 0, i4049 = 0;
+        int i4050 = 0, i4051 = 0, i4052 = 0, i4053 = 0, i4054 = 0, i4055 = 0, i4056 = 0, i4057 = 0, i4058 = 0, i4059 = 0;
+        int i4060 = 0, i4061 = 0, i4062 = 0, i4063 = 0, i4064 = 0, i4065 = 0, i4066 = 0, i4067 = 0, i4068 = 0, i4069 = 0;
+        int i4070 = 0, i4071 = 0, i4072 = 0, i4073 = 0, i4074 = 0, i4075 = 0, i4076 = 0, i4077 = 0, i4078 = 0, i4079 = 0;
+        int i4080 = 0, i4081 = 0, i4082 = 0, i4083 = 0, i4084 = 0, i4085 = 0, i4086 = 0, i4087 = 0, i4088 = 0, i4089 = 0;
+        int i4090 = 0, i4091 = 0, i4092 = 0, i4093 = 0, i4094 = 0, i4095 = 0, i4096 = 0, i4097 = 0, i4098 = 0, i4099 = 0;
+        int i4100 = 0, i4101 = 0, i4102 = 0, i4103 = 0, i4104 = 0, i4105 = 0, i4106 = 0, i4107 = 0, i4108 = 0, i4109 = 0;
+        int i4110 = 0, i4111 = 0, i4112 = 0, i4113 = 0, i4114 = 0, i4115 = 0, i4116 = 0, i4117 = 0, i4118 = 0, i4119 = 0;
+        int i4120 = 0, i4121 = 0, i4122 = 0, i4123 = 0, i4124 = 0, i4125 = 0, i4126 = 0, i4127 = 0, i4128 = 0, i4129 = 0;
+        int i4130 = 0, i4131 = 0, i4132 = 0, i4133 = 0, i4134 = 0, i4135 = 0, i4136 = 0, i4137 = 0, i4138 = 0, i4139 = 0;
+        int i4140 = 0, i4141 = 0, i4142 = 0, i4143 = 0, i4144 = 0, i4145 = 0, i4146 = 0, i4147 = 0, i4148 = 0, i4149 = 0;
+        int i4150 = 0, i4151 = 0, i4152 = 0, i4153 = 0, i4154 = 0, i4155 = 0, i4156 = 0, i4157 = 0, i4158 = 0, i4159 = 0;
+        int i4160 = 0, i4161 = 0, i4162 = 0, i4163 = 0, i4164 = 0, i4165 = 0, i4166 = 0, i4167 = 0, i4168 = 0, i4169 = 0;
+        int i4170 = 0, i4171 = 0, i4172 = 0, i4173 = 0, i4174 = 0, i4175 = 0, i4176 = 0, i4177 = 0, i4178 = 0, i4179 = 0;
+        int i4180 = 0, i4181 = 0, i4182 = 0, i4183 = 0, i4184 = 0, i4185 = 0, i4186 = 0, i4187 = 0, i4188 = 0, i4189 = 0;
+        int i4190 = 0, i4191 = 0, i4192 = 0, i4193 = 0, i4194 = 0, i4195 = 0, i4196 = 0, i4197 = 0, i4198 = 0, i4199 = 0;
+        int i4200 = 0, i4201 = 0, i4202 = 0, i4203 = 0, i4204 = 0, i4205 = 0, i4206 = 0, i4207 = 0, i4208 = 0, i4209 = 0;
+        int i4210 = 0, i4211 = 0, i4212 = 0, i4213 = 0, i4214 = 0, i4215 = 0, i4216 = 0, i4217 = 0, i4218 = 0, i4219 = 0;
+        int i4220 = 0, i4221 = 0, i4222 = 0, i4223 = 0, i4224 = 0, i4225 = 0, i4226 = 0, i4227 = 0, i4228 = 0, i4229 = 0;
+        int i4230 = 0, i4231 = 0, i4232 = 0, i4233 = 0, i4234 = 0, i4235 = 0, i4236 = 0, i4237 = 0, i4238 = 0, i4239 = 0;
+        int i4240 = 0, i4241 = 0, i4242 = 0, i4243 = 0, i4244 = 0, i4245 = 0, i4246 = 0, i4247 = 0, i4248 = 0, i4249 = 0;
+        int i4250 = 0, i4251 = 0, i4252 = 0, i4253 = 0, i4254 = 0, i4255 = 0, i4256 = 0, i4257 = 0, i4258 = 0, i4259 = 0;
+        int i4260 = 0, i4261 = 0, i4262 = 0, i4263 = 0, i4264 = 0, i4265 = 0, i4266 = 0, i4267 = 0, i4268 = 0, i4269 = 0;
+        int i4270 = 0, i4271 = 0, i4272 = 0, i4273 = 0, i4274 = 0, i4275 = 0, i4276 = 0, i4277 = 0, i4278 = 0, i4279 = 0;
+        int i4280 = 0, i4281 = 0, i4282 = 0, i4283 = 0, i4284 = 0, i4285 = 0, i4286 = 0, i4287 = 0, i4288 = 0, i4289 = 0;
+        int i4290 = 0, i4291 = 0, i4292 = 0, i4293 = 0, i4294 = 0, i4295 = 0, i4296 = 0, i4297 = 0, i4298 = 0, i4299 = 0;
+        int i4300 = 0, i4301 = 0, i4302 = 0, i4303 = 0, i4304 = 0, i4305 = 0, i4306 = 0, i4307 = 0, i4308 = 0, i4309 = 0;
+        int i4310 = 0, i4311 = 0, i4312 = 0, i4313 = 0, i4314 = 0, i4315 = 0, i4316 = 0, i4317 = 0, i4318 = 0, i4319 = 0;
+        int i4320 = 0, i4321 = 0, i4322 = 0, i4323 = 0, i4324 = 0, i4325 = 0, i4326 = 0, i4327 = 0, i4328 = 0, i4329 = 0;
+        int i4330 = 0, i4331 = 0, i4332 = 0, i4333 = 0, i4334 = 0, i4335 = 0, i4336 = 0, i4337 = 0, i4338 = 0, i4339 = 0;
+        int i4340 = 0, i4341 = 0, i4342 = 0, i4343 = 0, i4344 = 0, i4345 = 0, i4346 = 0, i4347 = 0, i4348 = 0, i4349 = 0;
+        int i4350 = 0, i4351 = 0, i4352 = 0, i4353 = 0, i4354 = 0, i4355 = 0, i4356 = 0, i4357 = 0, i4358 = 0, i4359 = 0;
+        int i4360 = 0, i4361 = 0, i4362 = 0, i4363 = 0, i4364 = 0, i4365 = 0, i4366 = 0, i4367 = 0, i4368 = 0, i4369 = 0;
+        int i4370 = 0, i4371 = 0, i4372 = 0, i4373 = 0, i4374 = 0, i4375 = 0, i4376 = 0, i4377 = 0, i4378 = 0, i4379 = 0;
+        int i4380 = 0, i4381 = 0, i4382 = 0, i4383 = 0, i4384 = 0, i4385 = 0, i4386 = 0, i4387 = 0, i4388 = 0, i4389 = 0;
+        int i4390 = 0, i4391 = 0, i4392 = 0, i4393 = 0, i4394 = 0, i4395 = 0, i4396 = 0, i4397 = 0, i4398 = 0, i4399 = 0;
+        int i4400 = 0, i4401 = 0, i4402 = 0, i4403 = 0, i4404 = 0, i4405 = 0, i4406 = 0, i4407 = 0, i4408 = 0, i4409 = 0;
+        int i4410 = 0, i4411 = 0, i4412 = 0, i4413 = 0, i4414 = 0, i4415 = 0, i4416 = 0, i4417 = 0, i4418 = 0, i4419 = 0;
+        int i4420 = 0, i4421 = 0, i4422 = 0, i4423 = 0, i4424 = 0, i4425 = 0, i4426 = 0, i4427 = 0, i4428 = 0, i4429 = 0;
+        int i4430 = 0, i4431 = 0, i4432 = 0, i4433 = 0, i4434 = 0, i4435 = 0, i4436 = 0, i4437 = 0, i4438 = 0, i4439 = 0;
+        int i4440 = 0, i4441 = 0, i4442 = 0, i4443 = 0, i4444 = 0, i4445 = 0, i4446 = 0, i4447 = 0, i4448 = 0, i4449 = 0;
+        int i4450 = 0, i4451 = 0, i4452 = 0, i4453 = 0, i4454 = 0, i4455 = 0, i4456 = 0, i4457 = 0, i4458 = 0, i4459 = 0;
+        int i4460 = 0, i4461 = 0, i4462 = 0, i4463 = 0, i4464 = 0, i4465 = 0, i4466 = 0, i4467 = 0, i4468 = 0, i4469 = 0;
+        int i4470 = 0, i4471 = 0, i4472 = 0, i4473 = 0, i4474 = 0, i4475 = 0, i4476 = 0, i4477 = 0, i4478 = 0, i4479 = 0;
+        int i4480 = 0, i4481 = 0, i4482 = 0, i4483 = 0, i4484 = 0, i4485 = 0, i4486 = 0, i4487 = 0, i4488 = 0, i4489 = 0;
+        int i4490 = 0, i4491 = 0, i4492 = 0, i4493 = 0, i4494 = 0, i4495 = 0, i4496 = 0, i4497 = 0, i4498 = 0, i4499 = 0;
+        int i4500 = 0, i4501 = 0, i4502 = 0, i4503 = 0, i4504 = 0, i4505 = 0, i4506 = 0, i4507 = 0, i4508 = 0, i4509 = 0;
+        int i4510 = 0, i4511 = 0, i4512 = 0, i4513 = 0, i4514 = 0, i4515 = 0, i4516 = 0, i4517 = 0, i4518 = 0, i4519 = 0;
+        int i4520 = 0, i4521 = 0, i4522 = 0, i4523 = 0, i4524 = 0, i4525 = 0, i4526 = 0, i4527 = 0, i4528 = 0, i4529 = 0;
+        int i4530 = 0, i4531 = 0, i4532 = 0, i4533 = 0, i4534 = 0, i4535 = 0, i4536 = 0, i4537 = 0, i4538 = 0, i4539 = 0;
+        int i4540 = 0, i4541 = 0, i4542 = 0, i4543 = 0, i4544 = 0, i4545 = 0, i4546 = 0, i4547 = 0, i4548 = 0, i4549 = 0;
+        int i4550 = 0, i4551 = 0, i4552 = 0, i4553 = 0, i4554 = 0, i4555 = 0, i4556 = 0, i4557 = 0, i4558 = 0, i4559 = 0;
+        int i4560 = 0, i4561 = 0, i4562 = 0, i4563 = 0, i4564 = 0, i4565 = 0, i4566 = 0, i4567 = 0, i4568 = 0, i4569 = 0;
+        int i4570 = 0, i4571 = 0, i4572 = 0, i4573 = 0, i4574 = 0, i4575 = 0, i4576 = 0, i4577 = 0, i4578 = 0, i4579 = 0;
+        int i4580 = 0, i4581 = 0, i4582 = 0, i4583 = 0, i4584 = 0, i4585 = 0, i4586 = 0, i4587 = 0, i4588 = 0, i4589 = 0;
+        int i4590 = 0, i4591 = 0, i4592 = 0, i4593 = 0, i4594 = 0, i4595 = 0, i4596 = 0, i4597 = 0, i4598 = 0, i4599 = 0;
+        int i4600 = 0, i4601 = 0, i4602 = 0, i4603 = 0, i4604 = 0, i4605 = 0, i4606 = 0, i4607 = 0, i4608 = 0, i4609 = 0;
+        int i4610 = 0, i4611 = 0, i4612 = 0, i4613 = 0, i4614 = 0, i4615 = 0, i4616 = 0, i4617 = 0, i4618 = 0, i4619 = 0;
+        int i4620 = 0, i4621 = 0, i4622 = 0, i4623 = 0, i4624 = 0, i4625 = 0, i4626 = 0, i4627 = 0, i4628 = 0, i4629 = 0;
+        int i4630 = 0, i4631 = 0, i4632 = 0, i4633 = 0, i4634 = 0, i4635 = 0, i4636 = 0, i4637 = 0, i4638 = 0, i4639 = 0;
+        int i4640 = 0, i4641 = 0, i4642 = 0, i4643 = 0, i4644 = 0, i4645 = 0, i4646 = 0, i4647 = 0, i4648 = 0, i4649 = 0;
+        int i4650 = 0, i4651 = 0, i4652 = 0, i4653 = 0, i4654 = 0, i4655 = 0, i4656 = 0, i4657 = 0, i4658 = 0, i4659 = 0;
+        int i4660 = 0, i4661 = 0, i4662 = 0, i4663 = 0, i4664 = 0, i4665 = 0, i4666 = 0, i4667 = 0, i4668 = 0, i4669 = 0;
+        int i4670 = 0, i4671 = 0, i4672 = 0, i4673 = 0, i4674 = 0, i4675 = 0, i4676 = 0, i4677 = 0, i4678 = 0, i4679 = 0;
+        int i4680 = 0, i4681 = 0, i4682 = 0, i4683 = 0, i4684 = 0, i4685 = 0, i4686 = 0, i4687 = 0, i4688 = 0, i4689 = 0;
+        int i4690 = 0, i4691 = 0, i4692 = 0, i4693 = 0, i4694 = 0, i4695 = 0, i4696 = 0, i4697 = 0, i4698 = 0, i4699 = 0;
+        int i4700 = 0, i4701 = 0, i4702 = 0, i4703 = 0, i4704 = 0, i4705 = 0, i4706 = 0, i4707 = 0, i4708 = 0, i4709 = 0;
+        int i4710 = 0, i4711 = 0, i4712 = 0, i4713 = 0, i4714 = 0, i4715 = 0, i4716 = 0, i4717 = 0, i4718 = 0, i4719 = 0;
+        int i4720 = 0, i4721 = 0, i4722 = 0, i4723 = 0, i4724 = 0, i4725 = 0, i4726 = 0, i4727 = 0, i4728 = 0, i4729 = 0;
+        int i4730 = 0, i4731 = 0, i4732 = 0, i4733 = 0, i4734 = 0, i4735 = 0, i4736 = 0, i4737 = 0, i4738 = 0, i4739 = 0;
+        int i4740 = 0, i4741 = 0, i4742 = 0, i4743 = 0, i4744 = 0, i4745 = 0, i4746 = 0, i4747 = 0, i4748 = 0, i4749 = 0;
+        int i4750 = 0, i4751 = 0, i4752 = 0, i4753 = 0, i4754 = 0, i4755 = 0, i4756 = 0, i4757 = 0, i4758 = 0, i4759 = 0;
+        int i4760 = 0, i4761 = 0, i4762 = 0, i4763 = 0, i4764 = 0, i4765 = 0, i4766 = 0, i4767 = 0, i4768 = 0, i4769 = 0;
+        int i4770 = 0, i4771 = 0, i4772 = 0, i4773 = 0, i4774 = 0, i4775 = 0, i4776 = 0, i4777 = 0, i4778 = 0, i4779 = 0;
+        int i4780 = 0, i4781 = 0, i4782 = 0, i4783 = 0, i4784 = 0, i4785 = 0, i4786 = 0, i4787 = 0, i4788 = 0, i4789 = 0;
+        int i4790 = 0, i4791 = 0, i4792 = 0, i4793 = 0, i4794 = 0, i4795 = 0, i4796 = 0, i4797 = 0, i4798 = 0, i4799 = 0;
+        int i4800 = 0, i4801 = 0, i4802 = 0, i4803 = 0, i4804 = 0, i4805 = 0, i4806 = 0, i4807 = 0, i4808 = 0, i4809 = 0;
+        int i4810 = 0, i4811 = 0, i4812 = 0, i4813 = 0, i4814 = 0, i4815 = 0, i4816 = 0, i4817 = 0, i4818 = 0, i4819 = 0;
+        int i4820 = 0, i4821 = 0, i4822 = 0, i4823 = 0, i4824 = 0, i4825 = 0, i4826 = 0, i4827 = 0, i4828 = 0, i4829 = 0;
+        int i4830 = 0, i4831 = 0, i4832 = 0, i4833 = 0, i4834 = 0, i4835 = 0, i4836 = 0, i4837 = 0, i4838 = 0, i4839 = 0;
+        int i4840 = 0, i4841 = 0, i4842 = 0, i4843 = 0, i4844 = 0, i4845 = 0, i4846 = 0, i4847 = 0, i4848 = 0, i4849 = 0;
+        int i4850 = 0, i4851 = 0, i4852 = 0, i4853 = 0, i4854 = 0, i4855 = 0, i4856 = 0, i4857 = 0, i4858 = 0, i4859 = 0;
+        int i4860 = 0, i4861 = 0, i4862 = 0, i4863 = 0, i4864 = 0, i4865 = 0, i4866 = 0, i4867 = 0, i4868 = 0, i4869 = 0;
+        int i4870 = 0, i4871 = 0, i4872 = 0, i4873 = 0, i4874 = 0, i4875 = 0, i4876 = 0, i4877 = 0, i4878 = 0, i4879 = 0;
+        int i4880 = 0, i4881 = 0, i4882 = 0, i4883 = 0, i4884 = 0, i4885 = 0, i4886 = 0, i4887 = 0, i4888 = 0, i4889 = 0;
+        int i4890 = 0, i4891 = 0, i4892 = 0, i4893 = 0, i4894 = 0, i4895 = 0, i4896 = 0, i4897 = 0, i4898 = 0, i4899 = 0;
+        int i4900 = 0, i4901 = 0, i4902 = 0, i4903 = 0, i4904 = 0, i4905 = 0, i4906 = 0, i4907 = 0, i4908 = 0, i4909 = 0;
+        int i4910 = 0, i4911 = 0, i4912 = 0, i4913 = 0, i4914 = 0, i4915 = 0, i4916 = 0, i4917 = 0, i4918 = 0, i4919 = 0;
+        int i4920 = 0, i4921 = 0, i4922 = 0, i4923 = 0, i4924 = 0, i4925 = 0, i4926 = 0, i4927 = 0, i4928 = 0, i4929 = 0;
+        int i4930 = 0, i4931 = 0, i4932 = 0, i4933 = 0, i4934 = 0, i4935 = 0, i4936 = 0, i4937 = 0, i4938 = 0, i4939 = 0;
+        int i4940 = 0, i4941 = 0, i4942 = 0, i4943 = 0, i4944 = 0, i4945 = 0, i4946 = 0, i4947 = 0, i4948 = 0, i4949 = 0;
+        int i4950 = 0, i4951 = 0, i4952 = 0, i4953 = 0, i4954 = 0, i4955 = 0, i4956 = 0, i4957 = 0, i4958 = 0, i4959 = 0;
+        int i4960 = 0, i4961 = 0, i4962 = 0, i4963 = 0, i4964 = 0, i4965 = 0, i4966 = 0, i4967 = 0, i4968 = 0, i4969 = 0;
+        int i4970 = 0, i4971 = 0, i4972 = 0, i4973 = 0, i4974 = 0, i4975 = 0, i4976 = 0, i4977 = 0, i4978 = 0, i4979 = 0;
+        int i4980 = 0, i4981 = 0, i4982 = 0, i4983 = 0, i4984 = 0, i4985 = 0, i4986 = 0, i4987 = 0, i4988 = 0, i4989 = 0;
+        int i4990 = 0, i4991 = 0, i4992 = 0, i4993 = 0, i4994 = 0, i4995 = 0, i4996 = 0, i4997 = 0, i4998 = 0, i4999 = 0;
+        int i5000 = 0, i5001 = 0, i5002 = 0, i5003 = 0, i5004 = 0, i5005 = 0, i5006 = 0, i5007 = 0, i5008 = 0, i5009 = 0;
+        int i5010 = 0, i5011 = 0, i5012 = 0, i5013 = 0, i5014 = 0, i5015 = 0, i5016 = 0, i5017 = 0, i5018 = 0, i5019 = 0;
+        int i5020 = 0, i5021 = 0, i5022 = 0, i5023 = 0, i5024 = 0, i5025 = 0, i5026 = 0, i5027 = 0, i5028 = 0, i5029 = 0;
+        int i5030 = 0, i5031 = 0, i5032 = 0, i5033 = 0, i5034 = 0, i5035 = 0, i5036 = 0, i5037 = 0, i5038 = 0, i5039 = 0;
+        int i5040 = 0, i5041 = 0, i5042 = 0, i5043 = 0, i5044 = 0, i5045 = 0, i5046 = 0, i5047 = 0, i5048 = 0, i5049 = 0;
+        int i5050 = 0, i5051 = 0, i5052 = 0, i5053 = 0, i5054 = 0, i5055 = 0, i5056 = 0, i5057 = 0, i5058 = 0, i5059 = 0;
+        int i5060 = 0, i5061 = 0, i5062 = 0, i5063 = 0, i5064 = 0, i5065 = 0, i5066 = 0, i5067 = 0, i5068 = 0, i5069 = 0;
+        int i5070 = 0, i5071 = 0, i5072 = 0, i5073 = 0, i5074 = 0, i5075 = 0, i5076 = 0, i5077 = 0, i5078 = 0, i5079 = 0;
+        int i5080 = 0, i5081 = 0, i5082 = 0, i5083 = 0, i5084 = 0, i5085 = 0, i5086 = 0, i5087 = 0, i5088 = 0, i5089 = 0;
+        int i5090 = 0, i5091 = 0, i5092 = 0, i5093 = 0, i5094 = 0, i5095 = 0, i5096 = 0, i5097 = 0, i5098 = 0, i5099 = 0;
+        int i5100 = 0, i5101 = 0, i5102 = 0, i5103 = 0, i5104 = 0, i5105 = 0, i5106 = 0, i5107 = 0, i5108 = 0, i5109 = 0;
+        int i5110 = 0, i5111 = 0, i5112 = 0, i5113 = 0, i5114 = 0, i5115 = 0, i5116 = 0, i5117 = 0, i5118 = 0, i5119 = 0;
+        int i5120 = 0, i5121 = 0, i5122 = 0, i5123 = 0, i5124 = 0, i5125 = 0, i5126 = 0, i5127 = 0, i5128 = 0, i5129 = 0;
+        int i5130 = 0, i5131 = 0, i5132 = 0, i5133 = 0, i5134 = 0, i5135 = 0, i5136 = 0, i5137 = 0, i5138 = 0, i5139 = 0;
+        int i5140 = 0, i5141 = 0, i5142 = 0, i5143 = 0, i5144 = 0, i5145 = 0, i5146 = 0, i5147 = 0, i5148 = 0, i5149 = 0;
+        int i5150 = 0, i5151 = 0, i5152 = 0, i5153 = 0, i5154 = 0, i5155 = 0, i5156 = 0, i5157 = 0, i5158 = 0, i5159 = 0;
+        int i5160 = 0, i5161 = 0, i5162 = 0, i5163 = 0, i5164 = 0, i5165 = 0, i5166 = 0, i5167 = 0, i5168 = 0, i5169 = 0;
+        int i5170 = 0, i5171 = 0, i5172 = 0, i5173 = 0, i5174 = 0, i5175 = 0, i5176 = 0, i5177 = 0, i5178 = 0, i5179 = 0;
+        int i5180 = 0, i5181 = 0, i5182 = 0, i5183 = 0, i5184 = 0, i5185 = 0, i5186 = 0, i5187 = 0, i5188 = 0, i5189 = 0;
+        int i5190 = 0, i5191 = 0, i5192 = 0, i5193 = 0, i5194 = 0, i5195 = 0, i5196 = 0, i5197 = 0, i5198 = 0, i5199 = 0;
+        int i5200 = 0, i5201 = 0, i5202 = 0, i5203 = 0, i5204 = 0, i5205 = 0, i5206 = 0, i5207 = 0, i5208 = 0, i5209 = 0;
+        int i5210 = 0, i5211 = 0, i5212 = 0, i5213 = 0, i5214 = 0, i5215 = 0, i5216 = 0, i5217 = 0, i5218 = 0, i5219 = 0;
+        int i5220 = 0, i5221 = 0, i5222 = 0, i5223 = 0, i5224 = 0, i5225 = 0, i5226 = 0, i5227 = 0, i5228 = 0, i5229 = 0;
+        int i5230 = 0, i5231 = 0, i5232 = 0, i5233 = 0, i5234 = 0, i5235 = 0, i5236 = 0, i5237 = 0, i5238 = 0, i5239 = 0;
+        int i5240 = 0, i5241 = 0, i5242 = 0, i5243 = 0, i5244 = 0, i5245 = 0, i5246 = 0, i5247 = 0, i5248 = 0, i5249 = 0;
+        int i5250 = 0, i5251 = 0, i5252 = 0, i5253 = 0, i5254 = 0, i5255 = 0, i5256 = 0, i5257 = 0, i5258 = 0, i5259 = 0;
+        int i5260 = 0, i5261 = 0, i5262 = 0, i5263 = 0, i5264 = 0, i5265 = 0, i5266 = 0, i5267 = 0, i5268 = 0, i5269 = 0;
+        int i5270 = 0, i5271 = 0, i5272 = 0, i5273 = 0, i5274 = 0, i5275 = 0, i5276 = 0, i5277 = 0, i5278 = 0, i5279 = 0;
+        int i5280 = 0, i5281 = 0, i5282 = 0, i5283 = 0, i5284 = 0, i5285 = 0, i5286 = 0, i5287 = 0, i5288 = 0, i5289 = 0;
+        int i5290 = 0, i5291 = 0, i5292 = 0, i5293 = 0, i5294 = 0, i5295 = 0, i5296 = 0, i5297 = 0, i5298 = 0, i5299 = 0;
+        int i5300 = 0, i5301 = 0, i5302 = 0, i5303 = 0, i5304 = 0, i5305 = 0, i5306 = 0, i5307 = 0, i5308 = 0, i5309 = 0;
+        int i5310 = 0, i5311 = 0, i5312 = 0, i5313 = 0, i5314 = 0, i5315 = 0, i5316 = 0, i5317 = 0, i5318 = 0, i5319 = 0;
+        int i5320 = 0, i5321 = 0, i5322 = 0, i5323 = 0, i5324 = 0, i5325 = 0, i5326 = 0, i5327 = 0, i5328 = 0, i5329 = 0;
+        int i5330 = 0, i5331 = 0, i5332 = 0, i5333 = 0, i5334 = 0, i5335 = 0, i5336 = 0, i5337 = 0, i5338 = 0, i5339 = 0;
+        int i5340 = 0, i5341 = 0, i5342 = 0, i5343 = 0, i5344 = 0, i5345 = 0, i5346 = 0, i5347 = 0, i5348 = 0, i5349 = 0;
+        int i5350 = 0, i5351 = 0, i5352 = 0, i5353 = 0, i5354 = 0, i5355 = 0, i5356 = 0, i5357 = 0, i5358 = 0, i5359 = 0;
+        int i5360 = 0, i5361 = 0, i5362 = 0, i5363 = 0, i5364 = 0, i5365 = 0, i5366 = 0, i5367 = 0, i5368 = 0, i5369 = 0;
+        int i5370 = 0, i5371 = 0, i5372 = 0, i5373 = 0, i5374 = 0, i5375 = 0, i5376 = 0, i5377 = 0, i5378 = 0, i5379 = 0;
+        int i5380 = 0, i5381 = 0, i5382 = 0, i5383 = 0, i5384 = 0, i5385 = 0, i5386 = 0, i5387 = 0, i5388 = 0, i5389 = 0;
+        int i5390 = 0, i5391 = 0, i5392 = 0, i5393 = 0, i5394 = 0, i5395 = 0, i5396 = 0, i5397 = 0, i5398 = 0, i5399 = 0;
+        int i5400 = 0, i5401 = 0, i5402 = 0, i5403 = 0, i5404 = 0, i5405 = 0, i5406 = 0, i5407 = 0, i5408 = 0, i5409 = 0;
+        int i5410 = 0, i5411 = 0, i5412 = 0, i5413 = 0, i5414 = 0, i5415 = 0, i5416 = 0, i5417 = 0, i5418 = 0, i5419 = 0;
+        int i5420 = 0, i5421 = 0, i5422 = 0, i5423 = 0, i5424 = 0, i5425 = 0, i5426 = 0, i5427 = 0, i5428 = 0, i5429 = 0;
+        int i5430 = 0, i5431 = 0, i5432 = 0, i5433 = 0, i5434 = 0, i5435 = 0, i5436 = 0, i5437 = 0, i5438 = 0, i5439 = 0;
+        int i5440 = 0, i5441 = 0, i5442 = 0, i5443 = 0, i5444 = 0, i5445 = 0, i5446 = 0, i5447 = 0, i5448 = 0, i5449 = 0;
+        int i5450 = 0, i5451 = 0, i5452 = 0, i5453 = 0, i5454 = 0, i5455 = 0, i5456 = 0, i5457 = 0, i5458 = 0, i5459 = 0;
+        int i5460 = 0, i5461 = 0, i5462 = 0, i5463 = 0, i5464 = 0, i5465 = 0, i5466 = 0, i5467 = 0, i5468 = 0, i5469 = 0;
+        int i5470 = 0, i5471 = 0, i5472 = 0, i5473 = 0, i5474 = 0, i5475 = 0, i5476 = 0, i5477 = 0, i5478 = 0, i5479 = 0;
+        int i5480 = 0, i5481 = 0, i5482 = 0, i5483 = 0, i5484 = 0, i5485 = 0, i5486 = 0, i5487 = 0, i5488 = 0, i5489 = 0;
+        int i5490 = 0, i5491 = 0, i5492 = 0, i5493 = 0, i5494 = 0, i5495 = 0, i5496 = 0, i5497 = 0, i5498 = 0, i5499 = 0;
+        int i5500 = 0, i5501 = 0, i5502 = 0, i5503 = 0, i5504 = 0, i5505 = 0, i5506 = 0, i5507 = 0, i5508 = 0, i5509 = 0;
+        int i5510 = 0, i5511 = 0, i5512 = 0, i5513 = 0, i5514 = 0, i5515 = 0, i5516 = 0, i5517 = 0, i5518 = 0, i5519 = 0;
+        int i5520 = 0, i5521 = 0, i5522 = 0, i5523 = 0, i5524 = 0, i5525 = 0, i5526 = 0, i5527 = 0, i5528 = 0, i5529 = 0;
+        int i5530 = 0, i5531 = 0, i5532 = 0, i5533 = 0, i5534 = 0, i5535 = 0, i5536 = 0, i5537 = 0, i5538 = 0, i5539 = 0;
+        int i5540 = 0, i5541 = 0, i5542 = 0, i5543 = 0, i5544 = 0, i5545 = 0, i5546 = 0, i5547 = 0, i5548 = 0, i5549 = 0;
+        int i5550 = 0, i5551 = 0, i5552 = 0, i5553 = 0, i5554 = 0, i5555 = 0, i5556 = 0, i5557 = 0, i5558 = 0, i5559 = 0;
+        int i5560 = 0, i5561 = 0, i5562 = 0, i5563 = 0, i5564 = 0, i5565 = 0, i5566 = 0, i5567 = 0, i5568 = 0, i5569 = 0;
+        int i5570 = 0, i5571 = 0, i5572 = 0, i5573 = 0, i5574 = 0, i5575 = 0, i5576 = 0, i5577 = 0, i5578 = 0, i5579 = 0;
+        int i5580 = 0, i5581 = 0, i5582 = 0, i5583 = 0, i5584 = 0, i5585 = 0, i5586 = 0, i5587 = 0, i5588 = 0, i5589 = 0;
+        int i5590 = 0, i5591 = 0, i5592 = 0, i5593 = 0, i5594 = 0, i5595 = 0, i5596 = 0, i5597 = 0, i5598 = 0, i5599 = 0;
+        int i5600 = 0, i5601 = 0, i5602 = 0, i5603 = 0, i5604 = 0, i5605 = 0, i5606 = 0, i5607 = 0, i5608 = 0, i5609 = 0;
+        int i5610 = 0, i5611 = 0, i5612 = 0, i5613 = 0, i5614 = 0, i5615 = 0, i5616 = 0, i5617 = 0, i5618 = 0, i5619 = 0;
+        int i5620 = 0, i5621 = 0, i5622 = 0, i5623 = 0, i5624 = 0, i5625 = 0, i5626 = 0, i5627 = 0, i5628 = 0, i5629 = 0;
+        int i5630 = 0, i5631 = 0, i5632 = 0, i5633 = 0, i5634 = 0, i5635 = 0, i5636 = 0, i5637 = 0, i5638 = 0, i5639 = 0;
+        int i5640 = 0, i5641 = 0, i5642 = 0, i5643 = 0, i5644 = 0, i5645 = 0, i5646 = 0, i5647 = 0, i5648 = 0, i5649 = 0;
+        int i5650 = 0, i5651 = 0, i5652 = 0, i5653 = 0, i5654 = 0, i5655 = 0, i5656 = 0, i5657 = 0, i5658 = 0, i5659 = 0;
+        int i5660 = 0, i5661 = 0, i5662 = 0, i5663 = 0, i5664 = 0, i5665 = 0, i5666 = 0, i5667 = 0, i5668 = 0, i5669 = 0;
+        int i5670 = 0, i5671 = 0, i5672 = 0, i5673 = 0, i5674 = 0, i5675 = 0, i5676 = 0, i5677 = 0, i5678 = 0, i5679 = 0;
+        int i5680 = 0, i5681 = 0, i5682 = 0, i5683 = 0, i5684 = 0, i5685 = 0, i5686 = 0, i5687 = 0, i5688 = 0, i5689 = 0;
+        int i5690 = 0, i5691 = 0, i5692 = 0, i5693 = 0, i5694 = 0, i5695 = 0, i5696 = 0, i5697 = 0, i5698 = 0, i5699 = 0;
+        int i5700 = 0, i5701 = 0, i5702 = 0, i5703 = 0, i5704 = 0, i5705 = 0, i5706 = 0, i5707 = 0, i5708 = 0, i5709 = 0;
+        int i5710 = 0, i5711 = 0, i5712 = 0, i5713 = 0, i5714 = 0, i5715 = 0, i5716 = 0, i5717 = 0, i5718 = 0, i5719 = 0;
+        int i5720 = 0, i5721 = 0, i5722 = 0, i5723 = 0, i5724 = 0, i5725 = 0, i5726 = 0, i5727 = 0, i5728 = 0, i5729 = 0;
+        int i5730 = 0, i5731 = 0, i5732 = 0, i5733 = 0, i5734 = 0, i5735 = 0, i5736 = 0, i5737 = 0, i5738 = 0, i5739 = 0;
+        int i5740 = 0, i5741 = 0, i5742 = 0, i5743 = 0, i5744 = 0, i5745 = 0, i5746 = 0, i5747 = 0, i5748 = 0, i5749 = 0;
+        int i5750 = 0, i5751 = 0, i5752 = 0, i5753 = 0, i5754 = 0, i5755 = 0, i5756 = 0, i5757 = 0, i5758 = 0, i5759 = 0;
+        int i5760 = 0, i5761 = 0, i5762 = 0, i5763 = 0, i5764 = 0, i5765 = 0, i5766 = 0, i5767 = 0, i5768 = 0, i5769 = 0;
+        int i5770 = 0, i5771 = 0, i5772 = 0, i5773 = 0, i5774 = 0, i5775 = 0, i5776 = 0, i5777 = 0, i5778 = 0, i5779 = 0;
+        int i5780 = 0, i5781 = 0, i5782 = 0, i5783 = 0, i5784 = 0, i5785 = 0, i5786 = 0, i5787 = 0, i5788 = 0, i5789 = 0;
+        int i5790 = 0, i5791 = 0, i5792 = 0, i5793 = 0, i5794 = 0, i5795 = 0, i5796 = 0, i5797 = 0, i5798 = 0, i5799 = 0;
+        int i5800 = 0, i5801 = 0, i5802 = 0, i5803 = 0, i5804 = 0, i5805 = 0, i5806 = 0, i5807 = 0, i5808 = 0, i5809 = 0;
+        int i5810 = 0, i5811 = 0, i5812 = 0, i5813 = 0, i5814 = 0, i5815 = 0, i5816 = 0, i5817 = 0, i5818 = 0, i5819 = 0;
+        int i5820 = 0, i5821 = 0, i5822 = 0, i5823 = 0, i5824 = 0, i5825 = 0, i5826 = 0, i5827 = 0, i5828 = 0, i5829 = 0;
+        int i5830 = 0, i5831 = 0, i5832 = 0, i5833 = 0, i5834 = 0, i5835 = 0, i5836 = 0, i5837 = 0, i5838 = 0, i5839 = 0;
+        int i5840 = 0, i5841 = 0, i5842 = 0, i5843 = 0, i5844 = 0, i5845 = 0, i5846 = 0, i5847 = 0, i5848 = 0, i5849 = 0;
+        int i5850 = 0, i5851 = 0, i5852 = 0, i5853 = 0, i5854 = 0, i5855 = 0, i5856 = 0, i5857 = 0, i5858 = 0, i5859 = 0;
+        int i5860 = 0, i5861 = 0, i5862 = 0, i5863 = 0, i5864 = 0, i5865 = 0, i5866 = 0, i5867 = 0, i5868 = 0, i5869 = 0;
+        int i5870 = 0, i5871 = 0, i5872 = 0, i5873 = 0, i5874 = 0, i5875 = 0, i5876 = 0, i5877 = 0, i5878 = 0, i5879 = 0;
+        int i5880 = 0, i5881 = 0, i5882 = 0, i5883 = 0, i5884 = 0, i5885 = 0, i5886 = 0, i5887 = 0, i5888 = 0, i5889 = 0;
+        int i5890 = 0, i5891 = 0, i5892 = 0, i5893 = 0, i5894 = 0, i5895 = 0, i5896 = 0, i5897 = 0, i5898 = 0, i5899 = 0;
+        int i5900 = 0, i5901 = 0, i5902 = 0, i5903 = 0, i5904 = 0, i5905 = 0, i5906 = 0, i5907 = 0, i5908 = 0, i5909 = 0;
+        int i5910 = 0, i5911 = 0, i5912 = 0, i5913 = 0, i5914 = 0, i5915 = 0, i5916 = 0, i5917 = 0, i5918 = 0, i5919 = 0;
+        int i5920 = 0, i5921 = 0, i5922 = 0, i5923 = 0, i5924 = 0, i5925 = 0, i5926 = 0, i5927 = 0, i5928 = 0, i5929 = 0;
+        int i5930 = 0, i5931 = 0, i5932 = 0, i5933 = 0, i5934 = 0, i5935 = 0, i5936 = 0, i5937 = 0, i5938 = 0, i5939 = 0;
+        int i5940 = 0, i5941 = 0, i5942 = 0, i5943 = 0, i5944 = 0, i5945 = 0, i5946 = 0, i5947 = 0, i5948 = 0, i5949 = 0;
+        int i5950 = 0, i5951 = 0, i5952 = 0, i5953 = 0, i5954 = 0, i5955 = 0, i5956 = 0, i5957 = 0, i5958 = 0, i5959 = 0;
+        int i5960 = 0, i5961 = 0, i5962 = 0, i5963 = 0, i5964 = 0, i5965 = 0, i5966 = 0, i5967 = 0, i5968 = 0, i5969 = 0;
+        int i5970 = 0, i5971 = 0, i5972 = 0, i5973 = 0, i5974 = 0, i5975 = 0, i5976 = 0, i5977 = 0, i5978 = 0, i5979 = 0;
+        int i5980 = 0, i5981 = 0, i5982 = 0, i5983 = 0, i5984 = 0, i5985 = 0, i5986 = 0, i5987 = 0, i5988 = 0, i5989 = 0;
+        int i5990 = 0, i5991 = 0, i5992 = 0, i5993 = 0, i5994 = 0, i5995 = 0, i5996 = 0, i5997 = 0, i5998 = 0, i5999 = 0;
+        int i6000 = 0, i6001 = 0, i6002 = 0, i6003 = 0, i6004 = 0, i6005 = 0, i6006 = 0, i6007 = 0, i6008 = 0, i6009 = 0;
+        int i6010 = 0, i6011 = 0, i6012 = 0, i6013 = 0, i6014 = 0, i6015 = 0, i6016 = 0, i6017 = 0, i6018 = 0, i6019 = 0;
+        int i6020 = 0, i6021 = 0, i6022 = 0, i6023 = 0, i6024 = 0, i6025 = 0, i6026 = 0, i6027 = 0, i6028 = 0, i6029 = 0;
+        int i6030 = 0, i6031 = 0, i6032 = 0, i6033 = 0, i6034 = 0, i6035 = 0, i6036 = 0, i6037 = 0, i6038 = 0, i6039 = 0;
+        int i6040 = 0, i6041 = 0, i6042 = 0, i6043 = 0, i6044 = 0, i6045 = 0, i6046 = 0, i6047 = 0, i6048 = 0, i6049 = 0;
+        int i6050 = 0, i6051 = 0, i6052 = 0, i6053 = 0, i6054 = 0, i6055 = 0, i6056 = 0, i6057 = 0, i6058 = 0, i6059 = 0;
+        int i6060 = 0, i6061 = 0, i6062 = 0, i6063 = 0, i6064 = 0, i6065 = 0, i6066 = 0, i6067 = 0, i6068 = 0, i6069 = 0;
+        int i6070 = 0, i6071 = 0, i6072 = 0, i6073 = 0, i6074 = 0, i6075 = 0, i6076 = 0, i6077 = 0, i6078 = 0, i6079 = 0;
+        int i6080 = 0, i6081 = 0, i6082 = 0, i6083 = 0, i6084 = 0, i6085 = 0, i6086 = 0, i6087 = 0, i6088 = 0, i6089 = 0;
+        int i6090 = 0, i6091 = 0, i6092 = 0, i6093 = 0, i6094 = 0, i6095 = 0, i6096 = 0, i6097 = 0, i6098 = 0, i6099 = 0;
+        int i6100 = 0, i6101 = 0, i6102 = 0, i6103 = 0, i6104 = 0, i6105 = 0, i6106 = 0, i6107 = 0, i6108 = 0, i6109 = 0;
+        int i6110 = 0, i6111 = 0, i6112 = 0, i6113 = 0, i6114 = 0, i6115 = 0, i6116 = 0, i6117 = 0, i6118 = 0, i6119 = 0;
+        int i6120 = 0, i6121 = 0, i6122 = 0, i6123 = 0, i6124 = 0, i6125 = 0, i6126 = 0, i6127 = 0, i6128 = 0, i6129 = 0;
+        int i6130 = 0, i6131 = 0, i6132 = 0, i6133 = 0, i6134 = 0, i6135 = 0, i6136 = 0, i6137 = 0, i6138 = 0, i6139 = 0;
+        int i6140 = 0, i6141 = 0, i6142 = 0, i6143 = 0, i6144 = 0, i6145 = 0, i6146 = 0, i6147 = 0, i6148 = 0, i6149 = 0;
+        int i6150 = 0, i6151 = 0, i6152 = 0, i6153 = 0, i6154 = 0, i6155 = 0, i6156 = 0, i6157 = 0, i6158 = 0, i6159 = 0;
+        int i6160 = 0, i6161 = 0, i6162 = 0, i6163 = 0, i6164 = 0, i6165 = 0, i6166 = 0, i6167 = 0, i6168 = 0, i6169 = 0;
+        int i6170 = 0, i6171 = 0, i6172 = 0, i6173 = 0, i6174 = 0, i6175 = 0, i6176 = 0, i6177 = 0, i6178 = 0, i6179 = 0;
+        int i6180 = 0, i6181 = 0, i6182 = 0, i6183 = 0, i6184 = 0, i6185 = 0, i6186 = 0, i6187 = 0, i6188 = 0, i6189 = 0;
+        int i6190 = 0, i6191 = 0, i6192 = 0, i6193 = 0, i6194 = 0, i6195 = 0, i6196 = 0, i6197 = 0, i6198 = 0, i6199 = 0;
+        int i6200 = 0, i6201 = 0, i6202 = 0, i6203 = 0, i6204 = 0, i6205 = 0, i6206 = 0, i6207 = 0, i6208 = 0, i6209 = 0;
+        int i6210 = 0, i6211 = 0, i6212 = 0, i6213 = 0, i6214 = 0, i6215 = 0, i6216 = 0, i6217 = 0, i6218 = 0, i6219 = 0;
+        int i6220 = 0, i6221 = 0, i6222 = 0, i6223 = 0, i6224 = 0, i6225 = 0, i6226 = 0, i6227 = 0, i6228 = 0, i6229 = 0;
+        int i6230 = 0, i6231 = 0, i6232 = 0, i6233 = 0, i6234 = 0, i6235 = 0, i6236 = 0, i6237 = 0, i6238 = 0, i6239 = 0;
+        int i6240 = 0, i6241 = 0, i6242 = 0, i6243 = 0, i6244 = 0, i6245 = 0, i6246 = 0, i6247 = 0, i6248 = 0, i6249 = 0;
+        int i6250 = 0, i6251 = 0, i6252 = 0, i6253 = 0, i6254 = 0, i6255 = 0, i6256 = 0, i6257 = 0, i6258 = 0, i6259 = 0;
+        int i6260 = 0, i6261 = 0, i6262 = 0, i6263 = 0, i6264 = 0, i6265 = 0, i6266 = 0, i6267 = 0, i6268 = 0, i6269 = 0;
+        int i6270 = 0, i6271 = 0, i6272 = 0, i6273 = 0, i6274 = 0, i6275 = 0, i6276 = 0, i6277 = 0, i6278 = 0, i6279 = 0;
+        int i6280 = 0, i6281 = 0, i6282 = 0, i6283 = 0, i6284 = 0, i6285 = 0, i6286 = 0, i6287 = 0, i6288 = 0, i6289 = 0;
+        int i6290 = 0, i6291 = 0, i6292 = 0, i6293 = 0, i6294 = 0, i6295 = 0, i6296 = 0, i6297 = 0, i6298 = 0, i6299 = 0;
+        int i6300 = 0, i6301 = 0, i6302 = 0, i6303 = 0, i6304 = 0, i6305 = 0, i6306 = 0, i6307 = 0, i6308 = 0, i6309 = 0;
+        int i6310 = 0, i6311 = 0, i6312 = 0, i6313 = 0, i6314 = 0, i6315 = 0, i6316 = 0, i6317 = 0, i6318 = 0, i6319 = 0;
+        int i6320 = 0, i6321 = 0, i6322 = 0, i6323 = 0, i6324 = 0, i6325 = 0, i6326 = 0, i6327 = 0, i6328 = 0, i6329 = 0;
+        int i6330 = 0, i6331 = 0, i6332 = 0, i6333 = 0, i6334 = 0, i6335 = 0, i6336 = 0, i6337 = 0, i6338 = 0, i6339 = 0;
+        int i6340 = 0, i6341 = 0, i6342 = 0, i6343 = 0, i6344 = 0, i6345 = 0, i6346 = 0, i6347 = 0, i6348 = 0, i6349 = 0;
+        int i6350 = 0, i6351 = 0, i6352 = 0, i6353 = 0, i6354 = 0, i6355 = 0, i6356 = 0, i6357 = 0, i6358 = 0, i6359 = 0;
+        int i6360 = 0, i6361 = 0, i6362 = 0, i6363 = 0, i6364 = 0, i6365 = 0, i6366 = 0, i6367 = 0, i6368 = 0, i6369 = 0;
+        int i6370 = 0, i6371 = 0, i6372 = 0, i6373 = 0, i6374 = 0, i6375 = 0, i6376 = 0, i6377 = 0, i6378 = 0, i6379 = 0;
+        int i6380 = 0, i6381 = 0, i6382 = 0, i6383 = 0, i6384 = 0, i6385 = 0, i6386 = 0, i6387 = 0, i6388 = 0, i6389 = 0;
+        int i6390 = 0, i6391 = 0, i6392 = 0, i6393 = 0, i6394 = 0, i6395 = 0, i6396 = 0, i6397 = 0, i6398 = 0, i6399 = 0;
+        int i6400 = 0, i6401 = 0, i6402 = 0, i6403 = 0, i6404 = 0, i6405 = 0, i6406 = 0, i6407 = 0, i6408 = 0, i6409 = 0;
+        int i6410 = 0, i6411 = 0, i6412 = 0, i6413 = 0, i6414 = 0, i6415 = 0, i6416 = 0, i6417 = 0, i6418 = 0, i6419 = 0;
+        int i6420 = 0, i6421 = 0, i6422 = 0, i6423 = 0, i6424 = 0, i6425 = 0, i6426 = 0, i6427 = 0, i6428 = 0, i6429 = 0;
+        int i6430 = 0, i6431 = 0, i6432 = 0, i6433 = 0, i6434 = 0, i6435 = 0, i6436 = 0, i6437 = 0, i6438 = 0, i6439 = 0;
+        int i6440 = 0, i6441 = 0, i6442 = 0, i6443 = 0, i6444 = 0, i6445 = 0, i6446 = 0, i6447 = 0, i6448 = 0, i6449 = 0;
+        int i6450 = 0, i6451 = 0, i6452 = 0, i6453 = 0, i6454 = 0, i6455 = 0, i6456 = 0, i6457 = 0, i6458 = 0, i6459 = 0;
+        int i6460 = 0, i6461 = 0, i6462 = 0, i6463 = 0, i6464 = 0, i6465 = 0, i6466 = 0, i6467 = 0, i6468 = 0, i6469 = 0;
+        int i6470 = 0, i6471 = 0, i6472 = 0, i6473 = 0, i6474 = 0, i6475 = 0, i6476 = 0, i6477 = 0, i6478 = 0, i6479 = 0;
+        int i6480 = 0, i6481 = 0, i6482 = 0, i6483 = 0, i6484 = 0, i6485 = 0, i6486 = 0, i6487 = 0, i6488 = 0, i6489 = 0;
+        int i6490 = 0, i6491 = 0, i6492 = 0, i6493 = 0, i6494 = 0, i6495 = 0, i6496 = 0, i6497 = 0, i6498 = 0, i6499 = 0;
+        int i6500 = 0, i6501 = 0, i6502 = 0, i6503 = 0, i6504 = 0, i6505 = 0, i6506 = 0, i6507 = 0, i6508 = 0, i6509 = 0;
+        int i6510 = 0, i6511 = 0, i6512 = 0, i6513 = 0, i6514 = 0, i6515 = 0, i6516 = 0, i6517 = 0, i6518 = 0, i6519 = 0;
+        int i6520 = 0, i6521 = 0, i6522 = 0, i6523 = 0, i6524 = 0, i6525 = 0, i6526 = 0, i6527 = 0, i6528 = 0, i6529 = 0;
+        int i6530 = 0, i6531 = 0, i6532 = 0, i6533 = 0, i6534 = 0, i6535 = 0, i6536 = 0, i6537 = 0, i6538 = 0, i6539 = 0;
+        int i6540 = 0, i6541 = 0, i6542 = 0, i6543 = 0, i6544 = 0, i6545 = 0, i6546 = 0, i6547 = 0, i6548 = 0, i6549 = 0;
+        int i6550 = 0, i6551 = 0, i6552 = 0, i6553 = 0, i6554 = 0, i6555 = 0, i6556 = 0, i6557 = 0, i6558 = 0, i6559 = 0;
+        int i6560 = 0, i6561 = 0, i6562 = 0, i6563 = 0, i6564 = 0, i6565 = 0, i6566 = 0, i6567 = 0, i6568 = 0, i6569 = 0;
+        int i6570 = 0, i6571 = 0, i6572 = 0, i6573 = 0, i6574 = 0, i6575 = 0, i6576 = 0, i6577 = 0, i6578 = 0, i6579 = 0;
+        int i6580 = 0, i6581 = 0, i6582 = 0, i6583 = 0, i6584 = 0, i6585 = 0, i6586 = 0, i6587 = 0, i6588 = 0, i6589 = 0;
+        int i6590 = 0, i6591 = 0, i6592 = 0, i6593 = 0, i6594 = 0, i6595 = 0, i6596 = 0, i6597 = 0, i6598 = 0, i6599 = 0;
+        int i6600 = 0, i6601 = 0, i6602 = 0, i6603 = 0, i6604 = 0, i6605 = 0, i6606 = 0, i6607 = 0, i6608 = 0, i6609 = 0;
+        int i6610 = 0, i6611 = 0, i6612 = 0, i6613 = 0, i6614 = 0, i6615 = 0, i6616 = 0, i6617 = 0, i6618 = 0, i6619 = 0;
+        int i6620 = 0, i6621 = 0, i6622 = 0, i6623 = 0, i6624 = 0, i6625 = 0, i6626 = 0, i6627 = 0, i6628 = 0, i6629 = 0;
+        int i6630 = 0, i6631 = 0, i6632 = 0, i6633 = 0, i6634 = 0, i6635 = 0, i6636 = 0, i6637 = 0, i6638 = 0, i6639 = 0;
+        int i6640 = 0, i6641 = 0, i6642 = 0, i6643 = 0, i6644 = 0, i6645 = 0, i6646 = 0, i6647 = 0, i6648 = 0, i6649 = 0;
+        int i6650 = 0, i6651 = 0, i6652 = 0, i6653 = 0, i6654 = 0, i6655 = 0, i6656 = 0, i6657 = 0, i6658 = 0, i6659 = 0;
+        int i6660 = 0, i6661 = 0, i6662 = 0, i6663 = 0, i6664 = 0, i6665 = 0, i6666 = 0, i6667 = 0, i6668 = 0, i6669 = 0;
+        int i6670 = 0, i6671 = 0, i6672 = 0, i6673 = 0, i6674 = 0, i6675 = 0, i6676 = 0, i6677 = 0, i6678 = 0, i6679 = 0;
+        int i6680 = 0, i6681 = 0, i6682 = 0, i6683 = 0, i6684 = 0, i6685 = 0, i6686 = 0, i6687 = 0, i6688 = 0, i6689 = 0;
+        int i6690 = 0, i6691 = 0, i6692 = 0, i6693 = 0, i6694 = 0, i6695 = 0, i6696 = 0, i6697 = 0, i6698 = 0, i6699 = 0;
+        int i6700 = 0, i6701 = 0, i6702 = 0, i6703 = 0, i6704 = 0, i6705 = 0, i6706 = 0, i6707 = 0, i6708 = 0, i6709 = 0;
+        int i6710 = 0, i6711 = 0, i6712 = 0, i6713 = 0, i6714 = 0, i6715 = 0, i6716 = 0, i6717 = 0, i6718 = 0, i6719 = 0;
+        int i6720 = 0, i6721 = 0, i6722 = 0, i6723 = 0, i6724 = 0, i6725 = 0, i6726 = 0, i6727 = 0, i6728 = 0, i6729 = 0;
+        int i6730 = 0, i6731 = 0, i6732 = 0, i6733 = 0, i6734 = 0, i6735 = 0, i6736 = 0, i6737 = 0, i6738 = 0, i6739 = 0;
+        int i6740 = 0, i6741 = 0, i6742 = 0, i6743 = 0, i6744 = 0, i6745 = 0, i6746 = 0, i6747 = 0, i6748 = 0, i6749 = 0;
+        int i6750 = 0, i6751 = 0, i6752 = 0, i6753 = 0, i6754 = 0, i6755 = 0, i6756 = 0, i6757 = 0, i6758 = 0, i6759 = 0;
+        int i6760 = 0, i6761 = 0, i6762 = 0, i6763 = 0, i6764 = 0, i6765 = 0, i6766 = 0, i6767 = 0, i6768 = 0, i6769 = 0;
+        int i6770 = 0, i6771 = 0, i6772 = 0, i6773 = 0, i6774 = 0, i6775 = 0, i6776 = 0, i6777 = 0, i6778 = 0, i6779 = 0;
+        int i6780 = 0, i6781 = 0, i6782 = 0, i6783 = 0, i6784 = 0, i6785 = 0, i6786 = 0, i6787 = 0, i6788 = 0, i6789 = 0;
+        int i6790 = 0, i6791 = 0, i6792 = 0, i6793 = 0, i6794 = 0, i6795 = 0, i6796 = 0, i6797 = 0, i6798 = 0, i6799 = 0;
+        int i6800 = 0, i6801 = 0, i6802 = 0, i6803 = 0, i6804 = 0, i6805 = 0, i6806 = 0, i6807 = 0, i6808 = 0, i6809 = 0;
+        int i6810 = 0, i6811 = 0, i6812 = 0, i6813 = 0, i6814 = 0, i6815 = 0, i6816 = 0, i6817 = 0, i6818 = 0, i6819 = 0;
+        int i6820 = 0, i6821 = 0, i6822 = 0, i6823 = 0, i6824 = 0, i6825 = 0, i6826 = 0, i6827 = 0, i6828 = 0, i6829 = 0;
+        int i6830 = 0, i6831 = 0, i6832 = 0, i6833 = 0, i6834 = 0, i6835 = 0, i6836 = 0, i6837 = 0, i6838 = 0, i6839 = 0;
+        int i6840 = 0, i6841 = 0, i6842 = 0, i6843 = 0, i6844 = 0, i6845 = 0, i6846 = 0, i6847 = 0, i6848 = 0, i6849 = 0;
+        int i6850 = 0, i6851 = 0, i6852 = 0, i6853 = 0, i6854 = 0, i6855 = 0, i6856 = 0, i6857 = 0, i6858 = 0, i6859 = 0;
+        int i6860 = 0, i6861 = 0, i6862 = 0, i6863 = 0, i6864 = 0, i6865 = 0, i6866 = 0, i6867 = 0, i6868 = 0, i6869 = 0;
+        int i6870 = 0, i6871 = 0, i6872 = 0, i6873 = 0, i6874 = 0, i6875 = 0, i6876 = 0, i6877 = 0, i6878 = 0, i6879 = 0;
+        int i6880 = 0, i6881 = 0, i6882 = 0, i6883 = 0, i6884 = 0, i6885 = 0, i6886 = 0, i6887 = 0, i6888 = 0, i6889 = 0;
+        int i6890 = 0, i6891 = 0, i6892 = 0, i6893 = 0, i6894 = 0, i6895 = 0, i6896 = 0, i6897 = 0, i6898 = 0, i6899 = 0;
+        int i6900 = 0, i6901 = 0, i6902 = 0, i6903 = 0, i6904 = 0, i6905 = 0, i6906 = 0, i6907 = 0, i6908 = 0, i6909 = 0;
+        int i6910 = 0, i6911 = 0, i6912 = 0, i6913 = 0, i6914 = 0, i6915 = 0, i6916 = 0, i6917 = 0, i6918 = 0, i6919 = 0;
+        int i6920 = 0, i6921 = 0, i6922 = 0, i6923 = 0, i6924 = 0, i6925 = 0, i6926 = 0, i6927 = 0, i6928 = 0, i6929 = 0;
+        int i6930 = 0, i6931 = 0, i6932 = 0, i6933 = 0, i6934 = 0, i6935 = 0, i6936 = 0, i6937 = 0, i6938 = 0, i6939 = 0;
+        int i6940 = 0, i6941 = 0, i6942 = 0, i6943 = 0, i6944 = 0, i6945 = 0, i6946 = 0, i6947 = 0, i6948 = 0, i6949 = 0;
+        int i6950 = 0, i6951 = 0, i6952 = 0, i6953 = 0, i6954 = 0, i6955 = 0, i6956 = 0, i6957 = 0, i6958 = 0, i6959 = 0;
+        int i6960 = 0, i6961 = 0, i6962 = 0, i6963 = 0, i6964 = 0, i6965 = 0, i6966 = 0, i6967 = 0, i6968 = 0, i6969 = 0;
+        int i6970 = 0, i6971 = 0, i6972 = 0, i6973 = 0, i6974 = 0, i6975 = 0, i6976 = 0, i6977 = 0, i6978 = 0, i6979 = 0;
+        int i6980 = 0, i6981 = 0, i6982 = 0, i6983 = 0, i6984 = 0, i6985 = 0, i6986 = 0, i6987 = 0, i6988 = 0, i6989 = 0;
+        int i6990 = 0, i6991 = 0, i6992 = 0, i6993 = 0, i6994 = 0, i6995 = 0, i6996 = 0, i6997 = 0, i6998 = 0, i6999 = 0;
+        int i7000 = 0, i7001 = 0, i7002 = 0, i7003 = 0, i7004 = 0, i7005 = 0, i7006 = 0, i7007 = 0, i7008 = 0, i7009 = 0;
+        int i7010 = 0, i7011 = 0, i7012 = 0, i7013 = 0, i7014 = 0, i7015 = 0, i7016 = 0, i7017 = 0, i7018 = 0, i7019 = 0;
+        int i7020 = 0, i7021 = 0, i7022 = 0, i7023 = 0, i7024 = 0, i7025 = 0, i7026 = 0, i7027 = 0, i7028 = 0, i7029 = 0;
+        int i7030 = 0, i7031 = 0, i7032 = 0, i7033 = 0, i7034 = 0, i7035 = 0, i7036 = 0, i7037 = 0, i7038 = 0, i7039 = 0;
+        int i7040 = 0, i7041 = 0, i7042 = 0, i7043 = 0, i7044 = 0, i7045 = 0, i7046 = 0, i7047 = 0, i7048 = 0, i7049 = 0;
+        int i7050 = 0, i7051 = 0, i7052 = 0, i7053 = 0, i7054 = 0, i7055 = 0, i7056 = 0, i7057 = 0, i7058 = 0, i7059 = 0;
+        int i7060 = 0, i7061 = 0, i7062 = 0, i7063 = 0, i7064 = 0, i7065 = 0, i7066 = 0, i7067 = 0, i7068 = 0, i7069 = 0;
+        int i7070 = 0, i7071 = 0, i7072 = 0, i7073 = 0, i7074 = 0, i7075 = 0, i7076 = 0, i7077 = 0, i7078 = 0, i7079 = 0;
+        int i7080 = 0, i7081 = 0, i7082 = 0, i7083 = 0, i7084 = 0, i7085 = 0, i7086 = 0, i7087 = 0, i7088 = 0, i7089 = 0;
+        int i7090 = 0, i7091 = 0, i7092 = 0, i7093 = 0, i7094 = 0, i7095 = 0, i7096 = 0, i7097 = 0, i7098 = 0, i7099 = 0;
+        int i7100 = 0, i7101 = 0, i7102 = 0, i7103 = 0, i7104 = 0, i7105 = 0, i7106 = 0, i7107 = 0, i7108 = 0, i7109 = 0;
+        int i7110 = 0, i7111 = 0, i7112 = 0, i7113 = 0, i7114 = 0, i7115 = 0, i7116 = 0, i7117 = 0, i7118 = 0, i7119 = 0;
+        int i7120 = 0, i7121 = 0, i7122 = 0, i7123 = 0, i7124 = 0, i7125 = 0, i7126 = 0, i7127 = 0, i7128 = 0, i7129 = 0;
+        int i7130 = 0, i7131 = 0, i7132 = 0, i7133 = 0, i7134 = 0, i7135 = 0, i7136 = 0, i7137 = 0, i7138 = 0, i7139 = 0;
+        int i7140 = 0, i7141 = 0, i7142 = 0, i7143 = 0, i7144 = 0, i7145 = 0, i7146 = 0, i7147 = 0, i7148 = 0, i7149 = 0;
+        int i7150 = 0, i7151 = 0, i7152 = 0, i7153 = 0, i7154 = 0, i7155 = 0, i7156 = 0, i7157 = 0, i7158 = 0, i7159 = 0;
+        int i7160 = 0, i7161 = 0, i7162 = 0, i7163 = 0, i7164 = 0, i7165 = 0, i7166 = 0, i7167 = 0, i7168 = 0, i7169 = 0;
+        int i7170 = 0, i7171 = 0, i7172 = 0, i7173 = 0, i7174 = 0, i7175 = 0, i7176 = 0, i7177 = 0, i7178 = 0, i7179 = 0;
+        int i7180 = 0, i7181 = 0, i7182 = 0, i7183 = 0, i7184 = 0, i7185 = 0, i7186 = 0, i7187 = 0, i7188 = 0, i7189 = 0;
+        int i7190 = 0, i7191 = 0, i7192 = 0, i7193 = 0, i7194 = 0, i7195 = 0, i7196 = 0, i7197 = 0, i7198 = 0, i7199 = 0;
+        int i7200 = 0, i7201 = 0, i7202 = 0, i7203 = 0, i7204 = 0, i7205 = 0, i7206 = 0, i7207 = 0, i7208 = 0, i7209 = 0;
+        int i7210 = 0, i7211 = 0, i7212 = 0, i7213 = 0, i7214 = 0, i7215 = 0, i7216 = 0, i7217 = 0, i7218 = 0, i7219 = 0;
+        int i7220 = 0, i7221 = 0, i7222 = 0, i7223 = 0, i7224 = 0, i7225 = 0, i7226 = 0, i7227 = 0, i7228 = 0, i7229 = 0;
+        int i7230 = 0, i7231 = 0, i7232 = 0, i7233 = 0, i7234 = 0, i7235 = 0, i7236 = 0, i7237 = 0, i7238 = 0, i7239 = 0;
+        int i7240 = 0, i7241 = 0, i7242 = 0, i7243 = 0, i7244 = 0, i7245 = 0, i7246 = 0, i7247 = 0, i7248 = 0, i7249 = 0;
+        int i7250 = 0, i7251 = 0, i7252 = 0, i7253 = 0, i7254 = 0, i7255 = 0, i7256 = 0, i7257 = 0, i7258 = 0, i7259 = 0;
+        int i7260 = 0, i7261 = 0, i7262 = 0, i7263 = 0, i7264 = 0, i7265 = 0, i7266 = 0, i7267 = 0, i7268 = 0, i7269 = 0;
+        int i7270 = 0, i7271 = 0, i7272 = 0, i7273 = 0, i7274 = 0, i7275 = 0, i7276 = 0, i7277 = 0, i7278 = 0, i7279 = 0;
+        int i7280 = 0, i7281 = 0, i7282 = 0, i7283 = 0, i7284 = 0, i7285 = 0, i7286 = 0, i7287 = 0, i7288 = 0, i7289 = 0;
+        int i7290 = 0, i7291 = 0, i7292 = 0, i7293 = 0, i7294 = 0, i7295 = 0, i7296 = 0, i7297 = 0, i7298 = 0, i7299 = 0;
+        int i7300 = 0, i7301 = 0, i7302 = 0, i7303 = 0, i7304 = 0, i7305 = 0, i7306 = 0, i7307 = 0, i7308 = 0, i7309 = 0;
+        int i7310 = 0, i7311 = 0, i7312 = 0, i7313 = 0, i7314 = 0, i7315 = 0, i7316 = 0, i7317 = 0, i7318 = 0, i7319 = 0;
+        int i7320 = 0, i7321 = 0, i7322 = 0, i7323 = 0, i7324 = 0, i7325 = 0, i7326 = 0, i7327 = 0, i7328 = 0, i7329 = 0;
+        int i7330 = 0, i7331 = 0, i7332 = 0, i7333 = 0, i7334 = 0, i7335 = 0, i7336 = 0, i7337 = 0, i7338 = 0, i7339 = 0;
+        int i7340 = 0, i7341 = 0, i7342 = 0, i7343 = 0, i7344 = 0, i7345 = 0, i7346 = 0, i7347 = 0, i7348 = 0, i7349 = 0;
+        int i7350 = 0, i7351 = 0, i7352 = 0, i7353 = 0, i7354 = 0, i7355 = 0, i7356 = 0, i7357 = 0, i7358 = 0, i7359 = 0;
+        int i7360 = 0, i7361 = 0, i7362 = 0, i7363 = 0, i7364 = 0, i7365 = 0, i7366 = 0, i7367 = 0, i7368 = 0, i7369 = 0;
+        int i7370 = 0, i7371 = 0, i7372 = 0, i7373 = 0, i7374 = 0, i7375 = 0, i7376 = 0, i7377 = 0, i7378 = 0, i7379 = 0;
+        int i7380 = 0, i7381 = 0, i7382 = 0, i7383 = 0, i7384 = 0, i7385 = 0, i7386 = 0, i7387 = 0, i7388 = 0, i7389 = 0;
+        int i7390 = 0, i7391 = 0, i7392 = 0, i7393 = 0, i7394 = 0, i7395 = 0, i7396 = 0, i7397 = 0, i7398 = 0, i7399 = 0;
+        int i7400 = 0, i7401 = 0, i7402 = 0, i7403 = 0, i7404 = 0, i7405 = 0, i7406 = 0, i7407 = 0, i7408 = 0, i7409 = 0;
+        int i7410 = 0, i7411 = 0, i7412 = 0, i7413 = 0, i7414 = 0, i7415 = 0, i7416 = 0, i7417 = 0, i7418 = 0, i7419 = 0;
+        int i7420 = 0, i7421 = 0, i7422 = 0, i7423 = 0, i7424 = 0, i7425 = 0, i7426 = 0, i7427 = 0, i7428 = 0, i7429 = 0;
+        int i7430 = 0, i7431 = 0, i7432 = 0, i7433 = 0, i7434 = 0, i7435 = 0, i7436 = 0, i7437 = 0, i7438 = 0, i7439 = 0;
+        int i7440 = 0, i7441 = 0, i7442 = 0, i7443 = 0, i7444 = 0, i7445 = 0, i7446 = 0, i7447 = 0, i7448 = 0, i7449 = 0;
+        int i7450 = 0, i7451 = 0, i7452 = 0, i7453 = 0, i7454 = 0, i7455 = 0, i7456 = 0, i7457 = 0, i7458 = 0, i7459 = 0;
+        int i7460 = 0, i7461 = 0, i7462 = 0, i7463 = 0, i7464 = 0, i7465 = 0, i7466 = 0, i7467 = 0, i7468 = 0, i7469 = 0;
+        int i7470 = 0, i7471 = 0, i7472 = 0, i7473 = 0, i7474 = 0, i7475 = 0, i7476 = 0, i7477 = 0, i7478 = 0, i7479 = 0;
+        int i7480 = 0, i7481 = 0, i7482 = 0, i7483 = 0, i7484 = 0, i7485 = 0, i7486 = 0, i7487 = 0, i7488 = 0, i7489 = 0;
+        int i7490 = 0, i7491 = 0, i7492 = 0, i7493 = 0, i7494 = 0, i7495 = 0, i7496 = 0, i7497 = 0, i7498 = 0, i7499 = 0;
+        int i7500 = 0, i7501 = 0, i7502 = 0, i7503 = 0, i7504 = 0, i7505 = 0, i7506 = 0, i7507 = 0, i7508 = 0, i7509 = 0;
+        int i7510 = 0, i7511 = 0, i7512 = 0, i7513 = 0, i7514 = 0, i7515 = 0, i7516 = 0, i7517 = 0, i7518 = 0, i7519 = 0;
+        int i7520 = 0, i7521 = 0, i7522 = 0, i7523 = 0, i7524 = 0, i7525 = 0, i7526 = 0, i7527 = 0, i7528 = 0, i7529 = 0;
+        int i7530 = 0, i7531 = 0, i7532 = 0, i7533 = 0, i7534 = 0, i7535 = 0, i7536 = 0, i7537 = 0, i7538 = 0, i7539 = 0;
+        int i7540 = 0, i7541 = 0, i7542 = 0, i7543 = 0, i7544 = 0, i7545 = 0, i7546 = 0, i7547 = 0, i7548 = 0, i7549 = 0;
+        int i7550 = 0, i7551 = 0, i7552 = 0, i7553 = 0, i7554 = 0, i7555 = 0, i7556 = 0, i7557 = 0, i7558 = 0, i7559 = 0;
+        int i7560 = 0, i7561 = 0, i7562 = 0, i7563 = 0, i7564 = 0, i7565 = 0, i7566 = 0, i7567 = 0, i7568 = 0, i7569 = 0;
+        int i7570 = 0, i7571 = 0, i7572 = 0, i7573 = 0, i7574 = 0, i7575 = 0, i7576 = 0, i7577 = 0, i7578 = 0, i7579 = 0;
+        int i7580 = 0, i7581 = 0, i7582 = 0, i7583 = 0, i7584 = 0, i7585 = 0, i7586 = 0, i7587 = 0, i7588 = 0, i7589 = 0;
+        int i7590 = 0, i7591 = 0, i7592 = 0, i7593 = 0, i7594 = 0, i7595 = 0, i7596 = 0, i7597 = 0, i7598 = 0, i7599 = 0;
+        int i7600 = 0, i7601 = 0, i7602 = 0, i7603 = 0, i7604 = 0, i7605 = 0, i7606 = 0, i7607 = 0, i7608 = 0, i7609 = 0;
+        int i7610 = 0, i7611 = 0, i7612 = 0, i7613 = 0, i7614 = 0, i7615 = 0, i7616 = 0, i7617 = 0, i7618 = 0, i7619 = 0;
+        int i7620 = 0, i7621 = 0, i7622 = 0, i7623 = 0, i7624 = 0, i7625 = 0, i7626 = 0, i7627 = 0, i7628 = 0, i7629 = 0;
+        int i7630 = 0, i7631 = 0, i7632 = 0, i7633 = 0, i7634 = 0, i7635 = 0, i7636 = 0, i7637 = 0, i7638 = 0, i7639 = 0;
+        int i7640 = 0, i7641 = 0, i7642 = 0, i7643 = 0, i7644 = 0, i7645 = 0, i7646 = 0, i7647 = 0, i7648 = 0, i7649 = 0;
+        int i7650 = 0, i7651 = 0, i7652 = 0, i7653 = 0, i7654 = 0, i7655 = 0, i7656 = 0, i7657 = 0, i7658 = 0, i7659 = 0;
+        int i7660 = 0, i7661 = 0, i7662 = 0, i7663 = 0, i7664 = 0, i7665 = 0, i7666 = 0, i7667 = 0, i7668 = 0, i7669 = 0;
+        int i7670 = 0, i7671 = 0, i7672 = 0, i7673 = 0, i7674 = 0, i7675 = 0, i7676 = 0, i7677 = 0, i7678 = 0, i7679 = 0;
+        int i7680 = 0, i7681 = 0, i7682 = 0, i7683 = 0, i7684 = 0, i7685 = 0, i7686 = 0, i7687 = 0, i7688 = 0, i7689 = 0;
+        int i7690 = 0, i7691 = 0, i7692 = 0, i7693 = 0, i7694 = 0, i7695 = 0, i7696 = 0, i7697 = 0, i7698 = 0, i7699 = 0;
+        int i7700 = 0, i7701 = 0, i7702 = 0, i7703 = 0, i7704 = 0, i7705 = 0, i7706 = 0, i7707 = 0, i7708 = 0, i7709 = 0;
+        int i7710 = 0, i7711 = 0, i7712 = 0, i7713 = 0, i7714 = 0, i7715 = 0, i7716 = 0, i7717 = 0, i7718 = 0, i7719 = 0;
+        int i7720 = 0, i7721 = 0, i7722 = 0, i7723 = 0, i7724 = 0, i7725 = 0, i7726 = 0, i7727 = 0, i7728 = 0, i7729 = 0;
+        int i7730 = 0, i7731 = 0, i7732 = 0, i7733 = 0, i7734 = 0, i7735 = 0, i7736 = 0, i7737 = 0, i7738 = 0, i7739 = 0;
+        int i7740 = 0, i7741 = 0, i7742 = 0, i7743 = 0, i7744 = 0, i7745 = 0, i7746 = 0, i7747 = 0, i7748 = 0, i7749 = 0;
+        int i7750 = 0, i7751 = 0, i7752 = 0, i7753 = 0, i7754 = 0, i7755 = 0, i7756 = 0, i7757 = 0, i7758 = 0, i7759 = 0;
+        int i7760 = 0, i7761 = 0, i7762 = 0, i7763 = 0, i7764 = 0, i7765 = 0, i7766 = 0, i7767 = 0, i7768 = 0, i7769 = 0;
+        int i7770 = 0, i7771 = 0, i7772 = 0, i7773 = 0, i7774 = 0, i7775 = 0, i7776 = 0, i7777 = 0, i7778 = 0, i7779 = 0;
+        int i7780 = 0, i7781 = 0, i7782 = 0, i7783 = 0, i7784 = 0, i7785 = 0, i7786 = 0, i7787 = 0, i7788 = 0, i7789 = 0;
+        int i7790 = 0, i7791 = 0, i7792 = 0, i7793 = 0, i7794 = 0, i7795 = 0, i7796 = 0, i7797 = 0, i7798 = 0, i7799 = 0;
+        int i7800 = 0, i7801 = 0, i7802 = 0, i7803 = 0, i7804 = 0, i7805 = 0, i7806 = 0, i7807 = 0, i7808 = 0, i7809 = 0;
+        int i7810 = 0, i7811 = 0, i7812 = 0, i7813 = 0, i7814 = 0, i7815 = 0, i7816 = 0, i7817 = 0, i7818 = 0, i7819 = 0;
+        int i7820 = 0, i7821 = 0, i7822 = 0, i7823 = 0, i7824 = 0, i7825 = 0, i7826 = 0, i7827 = 0, i7828 = 0, i7829 = 0;
+        int i7830 = 0, i7831 = 0, i7832 = 0, i7833 = 0, i7834 = 0, i7835 = 0, i7836 = 0, i7837 = 0, i7838 = 0, i7839 = 0;
+        int i7840 = 0, i7841 = 0, i7842 = 0, i7843 = 0, i7844 = 0, i7845 = 0, i7846 = 0, i7847 = 0, i7848 = 0, i7849 = 0;
+        int i7850 = 0, i7851 = 0, i7852 = 0, i7853 = 0, i7854 = 0, i7855 = 0, i7856 = 0, i7857 = 0, i7858 = 0, i7859 = 0;
+        int i7860 = 0, i7861 = 0, i7862 = 0, i7863 = 0, i7864 = 0, i7865 = 0, i7866 = 0, i7867 = 0, i7868 = 0, i7869 = 0;
+        int i7870 = 0, i7871 = 0, i7872 = 0, i7873 = 0, i7874 = 0, i7875 = 0, i7876 = 0, i7877 = 0, i7878 = 0, i7879 = 0;
+        int i7880 = 0, i7881 = 0, i7882 = 0, i7883 = 0, i7884 = 0, i7885 = 0, i7886 = 0, i7887 = 0, i7888 = 0, i7889 = 0;
+        int i7890 = 0, i7891 = 0, i7892 = 0, i7893 = 0, i7894 = 0, i7895 = 0, i7896 = 0, i7897 = 0, i7898 = 0, i7899 = 0;
+        int i7900 = 0, i7901 = 0, i7902 = 0, i7903 = 0, i7904 = 0, i7905 = 0, i7906 = 0, i7907 = 0, i7908 = 0, i7909 = 0;
+        int i7910 = 0, i7911 = 0, i7912 = 0, i7913 = 0, i7914 = 0, i7915 = 0, i7916 = 0, i7917 = 0, i7918 = 0, i7919 = 0;
+        int i7920 = 0, i7921 = 0, i7922 = 0, i7923 = 0, i7924 = 0, i7925 = 0, i7926 = 0, i7927 = 0, i7928 = 0, i7929 = 0;
+        int i7930 = 0, i7931 = 0, i7932 = 0, i7933 = 0, i7934 = 0, i7935 = 0, i7936 = 0, i7937 = 0, i7938 = 0, i7939 = 0;
+        int i7940 = 0, i7941 = 0, i7942 = 0, i7943 = 0, i7944 = 0, i7945 = 0, i7946 = 0, i7947 = 0, i7948 = 0, i7949 = 0;
+        int i7950 = 0, i7951 = 0, i7952 = 0, i7953 = 0, i7954 = 0, i7955 = 0, i7956 = 0, i7957 = 0, i7958 = 0, i7959 = 0;
+        int i7960 = 0, i7961 = 0, i7962 = 0, i7963 = 0, i7964 = 0, i7965 = 0, i7966 = 0, i7967 = 0, i7968 = 0, i7969 = 0;
+        int i7970 = 0, i7971 = 0, i7972 = 0, i7973 = 0, i7974 = 0, i7975 = 0, i7976 = 0, i7977 = 0, i7978 = 0, i7979 = 0;
+        int i7980 = 0, i7981 = 0, i7982 = 0, i7983 = 0, i7984 = 0, i7985 = 0, i7986 = 0, i7987 = 0, i7988 = 0, i7989 = 0;
+        int i7990 = 0, i7991 = 0, i7992 = 0, i7993 = 0, i7994 = 0, i7995 = 0, i7996 = 0, i7997 = 0, i7998 = 0, i7999 = 0;
+        int i8000 = 0, i8001 = 0, i8002 = 0, i8003 = 0, i8004 = 0, i8005 = 0, i8006 = 0, i8007 = 0, i8008 = 0, i8009 = 0;
+        int i8010 = 0, i8011 = 0, i8012 = 0, i8013 = 0, i8014 = 0, i8015 = 0, i8016 = 0, i8017 = 0, i8018 = 0, i8019 = 0;
+        int i8020 = 0, i8021 = 0, i8022 = 0, i8023 = 0, i8024 = 0, i8025 = 0, i8026 = 0, i8027 = 0, i8028 = 0, i8029 = 0;
+        int i8030 = 0, i8031 = 0, i8032 = 0, i8033 = 0, i8034 = 0, i8035 = 0, i8036 = 0, i8037 = 0, i8038 = 0, i8039 = 0;
+        int i8040 = 0, i8041 = 0, i8042 = 0, i8043 = 0, i8044 = 0, i8045 = 0, i8046 = 0, i8047 = 0, i8048 = 0, i8049 = 0;
+        int i8050 = 0, i8051 = 0, i8052 = 0, i8053 = 0, i8054 = 0, i8055 = 0, i8056 = 0, i8057 = 0, i8058 = 0, i8059 = 0;
+        int i8060 = 0, i8061 = 0, i8062 = 0, i8063 = 0, i8064 = 0, i8065 = 0, i8066 = 0, i8067 = 0, i8068 = 0, i8069 = 0;
+        int i8070 = 0, i8071 = 0, i8072 = 0, i8073 = 0, i8074 = 0, i8075 = 0, i8076 = 0, i8077 = 0, i8078 = 0, i8079 = 0;
+        int i8080 = 0, i8081 = 0, i8082 = 0, i8083 = 0, i8084 = 0, i8085 = 0, i8086 = 0, i8087 = 0, i8088 = 0, i8089 = 0;
+        int i8090 = 0, i8091 = 0, i8092 = 0, i8093 = 0, i8094 = 0, i8095 = 0, i8096 = 0, i8097 = 0, i8098 = 0, i8099 = 0;
+        int i8100 = 0, i8101 = 0, i8102 = 0, i8103 = 0, i8104 = 0, i8105 = 0, i8106 = 0, i8107 = 0, i8108 = 0, i8109 = 0;
+        int i8110 = 0, i8111 = 0, i8112 = 0, i8113 = 0, i8114 = 0, i8115 = 0, i8116 = 0, i8117 = 0, i8118 = 0, i8119 = 0;
+        int i8120 = 0, i8121 = 0, i8122 = 0, i8123 = 0, i8124 = 0, i8125 = 0, i8126 = 0, i8127 = 0, i8128 = 0, i8129 = 0;
+        int i8130 = 0, i8131 = 0, i8132 = 0, i8133 = 0, i8134 = 0, i8135 = 0, i8136 = 0, i8137 = 0, i8138 = 0, i8139 = 0;
+        int i8140 = 0, i8141 = 0, i8142 = 0, i8143 = 0, i8144 = 0, i8145 = 0, i8146 = 0, i8147 = 0, i8148 = 0, i8149 = 0;
+        int i8150 = 0, i8151 = 0, i8152 = 0, i8153 = 0, i8154 = 0, i8155 = 0, i8156 = 0, i8157 = 0, i8158 = 0, i8159 = 0;
+        int i8160 = 0, i8161 = 0, i8162 = 0, i8163 = 0, i8164 = 0, i8165 = 0, i8166 = 0, i8167 = 0, i8168 = 0, i8169 = 0;
+        int i8170 = 0, i8171 = 0, i8172 = 0, i8173 = 0, i8174 = 0, i8175 = 0, i8176 = 0, i8177 = 0, i8178 = 0, i8179 = 0;
+        int i8180 = 0, i8181 = 0, i8182 = 0, i8183 = 0, i8184 = 0, i8185 = 0, i8186 = 0, i8187 = 0, i8188 = 0, i8189 = 0;
+        int i8190 = 0, i8191 = 0, i8192 = 0, i8193 = 0, i8194 = 0, i8195 = 0, i8196 = 0, i8197 = 0, i8198 = 0, i8199 = 0;
+        int i8200 = 0, i8201 = 0, i8202 = 0, i8203 = 0, i8204 = 0, i8205 = 0, i8206 = 0, i8207 = 0, i8208 = 0, i8209 = 0;
+        int i8210 = 0, i8211 = 0, i8212 = 0, i8213 = 0, i8214 = 0, i8215 = 0, i8216 = 0, i8217 = 0, i8218 = 0, i8219 = 0;
+        int i8220 = 0, i8221 = 0, i8222 = 0, i8223 = 0, i8224 = 0, i8225 = 0, i8226 = 0, i8227 = 0, i8228 = 0, i8229 = 0;
+        int i8230 = 0, i8231 = 0, i8232 = 0, i8233 = 0, i8234 = 0, i8235 = 0, i8236 = 0, i8237 = 0, i8238 = 0, i8239 = 0;
+        int i8240 = 0, i8241 = 0, i8242 = 0, i8243 = 0, i8244 = 0, i8245 = 0, i8246 = 0, i8247 = 0, i8248 = 0, i8249 = 0;
+        int i8250 = 0, i8251 = 0, i8252 = 0, i8253 = 0, i8254 = 0, i8255 = 0, i8256 = 0, i8257 = 0, i8258 = 0, i8259 = 0;
+        int i8260 = 0, i8261 = 0, i8262 = 0, i8263 = 0, i8264 = 0, i8265 = 0, i8266 = 0, i8267 = 0, i8268 = 0, i8269 = 0;
+        int i8270 = 0, i8271 = 0, i8272 = 0, i8273 = 0, i8274 = 0, i8275 = 0, i8276 = 0, i8277 = 0, i8278 = 0, i8279 = 0;
+        int i8280 = 0, i8281 = 0, i8282 = 0, i8283 = 0, i8284 = 0, i8285 = 0, i8286 = 0, i8287 = 0, i8288 = 0, i8289 = 0;
+        int i8290 = 0, i8291 = 0, i8292 = 0, i8293 = 0, i8294 = 0, i8295 = 0, i8296 = 0, i8297 = 0, i8298 = 0, i8299 = 0;
+        int i8300 = 0, i8301 = 0, i8302 = 0, i8303 = 0, i8304 = 0, i8305 = 0, i8306 = 0, i8307 = 0, i8308 = 0, i8309 = 0;
+        int i8310 = 0, i8311 = 0, i8312 = 0, i8313 = 0, i8314 = 0, i8315 = 0, i8316 = 0, i8317 = 0, i8318 = 0, i8319 = 0;
+        int i8320 = 0, i8321 = 0, i8322 = 0, i8323 = 0, i8324 = 0, i8325 = 0, i8326 = 0, i8327 = 0, i8328 = 0, i8329 = 0;
+        int i8330 = 0, i8331 = 0, i8332 = 0, i8333 = 0, i8334 = 0, i8335 = 0, i8336 = 0, i8337 = 0, i8338 = 0, i8339 = 0;
+        int i8340 = 0, i8341 = 0, i8342 = 0, i8343 = 0, i8344 = 0, i8345 = 0, i8346 = 0, i8347 = 0, i8348 = 0, i8349 = 0;
+        int i8350 = 0, i8351 = 0, i8352 = 0, i8353 = 0, i8354 = 0, i8355 = 0, i8356 = 0, i8357 = 0, i8358 = 0, i8359 = 0;
+        int i8360 = 0, i8361 = 0, i8362 = 0, i8363 = 0, i8364 = 0, i8365 = 0, i8366 = 0, i8367 = 0, i8368 = 0, i8369 = 0;
+        int i8370 = 0, i8371 = 0, i8372 = 0, i8373 = 0, i8374 = 0, i8375 = 0, i8376 = 0, i8377 = 0, i8378 = 0, i8379 = 0;
+        int i8380 = 0, i8381 = 0, i8382 = 0, i8383 = 0, i8384 = 0, i8385 = 0, i8386 = 0, i8387 = 0, i8388 = 0, i8389 = 0;
+        int i8390 = 0, i8391 = 0, i8392 = 0, i8393 = 0, i8394 = 0, i8395 = 0, i8396 = 0, i8397 = 0, i8398 = 0, i8399 = 0;
+        int i8400 = 0, i8401 = 0, i8402 = 0, i8403 = 0, i8404 = 0, i8405 = 0, i8406 = 0, i8407 = 0, i8408 = 0, i8409 = 0;
+        int i8410 = 0, i8411 = 0, i8412 = 0, i8413 = 0, i8414 = 0, i8415 = 0, i8416 = 0, i8417 = 0, i8418 = 0, i8419 = 0;
+        int i8420 = 0, i8421 = 0, i8422 = 0, i8423 = 0, i8424 = 0, i8425 = 0, i8426 = 0, i8427 = 0, i8428 = 0, i8429 = 0;
+        int i8430 = 0, i8431 = 0, i8432 = 0, i8433 = 0, i8434 = 0, i8435 = 0, i8436 = 0, i8437 = 0, i8438 = 0, i8439 = 0;
+        int i8440 = 0, i8441 = 0, i8442 = 0, i8443 = 0, i8444 = 0, i8445 = 0, i8446 = 0, i8447 = 0, i8448 = 0, i8449 = 0;
+        int i8450 = 0, i8451 = 0, i8452 = 0, i8453 = 0, i8454 = 0, i8455 = 0, i8456 = 0, i8457 = 0, i8458 = 0, i8459 = 0;
+        int i8460 = 0, i8461 = 0, i8462 = 0, i8463 = 0, i8464 = 0, i8465 = 0, i8466 = 0, i8467 = 0, i8468 = 0, i8469 = 0;
+        int i8470 = 0, i8471 = 0, i8472 = 0, i8473 = 0, i8474 = 0, i8475 = 0, i8476 = 0, i8477 = 0, i8478 = 0, i8479 = 0;
+        int i8480 = 0, i8481 = 0, i8482 = 0, i8483 = 0, i8484 = 0, i8485 = 0, i8486 = 0, i8487 = 0, i8488 = 0, i8489 = 0;
+        int i8490 = 0, i8491 = 0, i8492 = 0, i8493 = 0, i8494 = 0, i8495 = 0, i8496 = 0, i8497 = 0, i8498 = 0, i8499 = 0;
+        int i8500 = 0, i8501 = 0, i8502 = 0, i8503 = 0, i8504 = 0, i8505 = 0, i8506 = 0, i8507 = 0, i8508 = 0, i8509 = 0;
+        int i8510 = 0, i8511 = 0, i8512 = 0, i8513 = 0, i8514 = 0, i8515 = 0, i8516 = 0, i8517 = 0, i8518 = 0, i8519 = 0;
+        int i8520 = 0, i8521 = 0, i8522 = 0, i8523 = 0, i8524 = 0, i8525 = 0, i8526 = 0, i8527 = 0, i8528 = 0, i8529 = 0;
+        int i8530 = 0, i8531 = 0, i8532 = 0, i8533 = 0, i8534 = 0, i8535 = 0, i8536 = 0, i8537 = 0, i8538 = 0, i8539 = 0;
+        int i8540 = 0, i8541 = 0, i8542 = 0, i8543 = 0, i8544 = 0, i8545 = 0, i8546 = 0, i8547 = 0, i8548 = 0, i8549 = 0;
+        int i8550 = 0, i8551 = 0, i8552 = 0, i8553 = 0, i8554 = 0, i8555 = 0, i8556 = 0, i8557 = 0, i8558 = 0, i8559 = 0;
+        int i8560 = 0, i8561 = 0, i8562 = 0, i8563 = 0, i8564 = 0, i8565 = 0, i8566 = 0, i8567 = 0, i8568 = 0, i8569 = 0;
+        int i8570 = 0, i8571 = 0, i8572 = 0, i8573 = 0, i8574 = 0, i8575 = 0, i8576 = 0, i8577 = 0, i8578 = 0, i8579 = 0;
+        int i8580 = 0, i8581 = 0, i8582 = 0, i8583 = 0, i8584 = 0, i8585 = 0, i8586 = 0, i8587 = 0, i8588 = 0, i8589 = 0;
+        int i8590 = 0, i8591 = 0, i8592 = 0, i8593 = 0, i8594 = 0, i8595 = 0, i8596 = 0, i8597 = 0, i8598 = 0, i8599 = 0;
+        int i8600 = 0, i8601 = 0, i8602 = 0, i8603 = 0, i8604 = 0, i8605 = 0, i8606 = 0, i8607 = 0, i8608 = 0, i8609 = 0;
+        int i8610 = 0, i8611 = 0, i8612 = 0, i8613 = 0, i8614 = 0, i8615 = 0, i8616 = 0, i8617 = 0, i8618 = 0, i8619 = 0;
+        int i8620 = 0, i8621 = 0, i8622 = 0, i8623 = 0, i8624 = 0, i8625 = 0, i8626 = 0, i8627 = 0, i8628 = 0, i8629 = 0;
+        int i8630 = 0, i8631 = 0, i8632 = 0, i8633 = 0, i8634 = 0, i8635 = 0, i8636 = 0, i8637 = 0, i8638 = 0, i8639 = 0;
+        int i8640 = 0, i8641 = 0, i8642 = 0, i8643 = 0, i8644 = 0, i8645 = 0, i8646 = 0, i8647 = 0, i8648 = 0, i8649 = 0;
+        int i8650 = 0, i8651 = 0, i8652 = 0, i8653 = 0, i8654 = 0, i8655 = 0, i8656 = 0, i8657 = 0, i8658 = 0, i8659 = 0;
+        int i8660 = 0, i8661 = 0, i8662 = 0, i8663 = 0, i8664 = 0, i8665 = 0, i8666 = 0, i8667 = 0, i8668 = 0, i8669 = 0;
+        int i8670 = 0, i8671 = 0, i8672 = 0, i8673 = 0, i8674 = 0, i8675 = 0, i8676 = 0, i8677 = 0, i8678 = 0, i8679 = 0;
+        int i8680 = 0, i8681 = 0, i8682 = 0, i8683 = 0, i8684 = 0, i8685 = 0, i8686 = 0, i8687 = 0, i8688 = 0, i8689 = 0;
+        int i8690 = 0, i8691 = 0, i8692 = 0, i8693 = 0, i8694 = 0, i8695 = 0, i8696 = 0, i8697 = 0, i8698 = 0, i8699 = 0;
+        int i8700 = 0, i8701 = 0, i8702 = 0, i8703 = 0, i8704 = 0, i8705 = 0, i8706 = 0, i8707 = 0, i8708 = 0, i8709 = 0;
+        int i8710 = 0, i8711 = 0, i8712 = 0, i8713 = 0, i8714 = 0, i8715 = 0, i8716 = 0, i8717 = 0, i8718 = 0, i8719 = 0;
+        int i8720 = 0, i8721 = 0, i8722 = 0, i8723 = 0, i8724 = 0, i8725 = 0, i8726 = 0, i8727 = 0, i8728 = 0, i8729 = 0;
+        int i8730 = 0, i8731 = 0, i8732 = 0, i8733 = 0, i8734 = 0, i8735 = 0, i8736 = 0, i8737 = 0, i8738 = 0, i8739 = 0;
+        int i8740 = 0, i8741 = 0, i8742 = 0, i8743 = 0, i8744 = 0, i8745 = 0, i8746 = 0, i8747 = 0, i8748 = 0, i8749 = 0;
+        int i8750 = 0, i8751 = 0, i8752 = 0, i8753 = 0, i8754 = 0, i8755 = 0, i8756 = 0, i8757 = 0, i8758 = 0, i8759 = 0;
+        int i8760 = 0, i8761 = 0, i8762 = 0, i8763 = 0, i8764 = 0, i8765 = 0, i8766 = 0, i8767 = 0, i8768 = 0, i8769 = 0;
+        int i8770 = 0, i8771 = 0, i8772 = 0, i8773 = 0, i8774 = 0, i8775 = 0, i8776 = 0, i8777 = 0, i8778 = 0, i8779 = 0;
+        int i8780 = 0, i8781 = 0, i8782 = 0, i8783 = 0, i8784 = 0, i8785 = 0, i8786 = 0, i8787 = 0, i8788 = 0, i8789 = 0;
+        int i8790 = 0, i8791 = 0, i8792 = 0, i8793 = 0, i8794 = 0, i8795 = 0, i8796 = 0, i8797 = 0, i8798 = 0, i8799 = 0;
+        int i8800 = 0, i8801 = 0, i8802 = 0, i8803 = 0, i8804 = 0, i8805 = 0, i8806 = 0, i8807 = 0, i8808 = 0, i8809 = 0;
+        int i8810 = 0, i8811 = 0, i8812 = 0, i8813 = 0, i8814 = 0, i8815 = 0, i8816 = 0, i8817 = 0, i8818 = 0, i8819 = 0;
+        int i8820 = 0, i8821 = 0, i8822 = 0, i8823 = 0, i8824 = 0, i8825 = 0, i8826 = 0, i8827 = 0, i8828 = 0, i8829 = 0;
+        int i8830 = 0, i8831 = 0, i8832 = 0, i8833 = 0, i8834 = 0, i8835 = 0, i8836 = 0, i8837 = 0, i8838 = 0, i8839 = 0;
+        int i8840 = 0, i8841 = 0, i8842 = 0, i8843 = 0, i8844 = 0, i8845 = 0, i8846 = 0, i8847 = 0, i8848 = 0, i8849 = 0;
+        int i8850 = 0, i8851 = 0, i8852 = 0, i8853 = 0, i8854 = 0, i8855 = 0, i8856 = 0, i8857 = 0, i8858 = 0, i8859 = 0;
+        int i8860 = 0, i8861 = 0, i8862 = 0, i8863 = 0, i8864 = 0, i8865 = 0, i8866 = 0, i8867 = 0, i8868 = 0, i8869 = 0;
+        int i8870 = 0, i8871 = 0, i8872 = 0, i8873 = 0, i8874 = 0, i8875 = 0, i8876 = 0, i8877 = 0, i8878 = 0, i8879 = 0;
+        int i8880 = 0, i8881 = 0, i8882 = 0, i8883 = 0, i8884 = 0, i8885 = 0, i8886 = 0, i8887 = 0, i8888 = 0, i8889 = 0;
+        int i8890 = 0, i8891 = 0, i8892 = 0, i8893 = 0, i8894 = 0, i8895 = 0, i8896 = 0, i8897 = 0, i8898 = 0, i8899 = 0;
+        int i8900 = 0, i8901 = 0, i8902 = 0, i8903 = 0, i8904 = 0, i8905 = 0, i8906 = 0, i8907 = 0, i8908 = 0, i8909 = 0;
+        int i8910 = 0, i8911 = 0, i8912 = 0, i8913 = 0, i8914 = 0, i8915 = 0, i8916 = 0, i8917 = 0, i8918 = 0, i8919 = 0;
+        int i8920 = 0, i8921 = 0, i8922 = 0, i8923 = 0, i8924 = 0, i8925 = 0, i8926 = 0, i8927 = 0, i8928 = 0, i8929 = 0;
+        int i8930 = 0, i8931 = 0, i8932 = 0, i8933 = 0, i8934 = 0, i8935 = 0, i8936 = 0, i8937 = 0, i8938 = 0, i8939 = 0;
+        int i8940 = 0, i8941 = 0, i8942 = 0, i8943 = 0, i8944 = 0, i8945 = 0, i8946 = 0, i8947 = 0, i8948 = 0, i8949 = 0;
+        int i8950 = 0, i8951 = 0, i8952 = 0, i8953 = 0, i8954 = 0, i8955 = 0, i8956 = 0, i8957 = 0, i8958 = 0, i8959 = 0;
+        int i8960 = 0, i8961 = 0, i8962 = 0, i8963 = 0, i8964 = 0, i8965 = 0, i8966 = 0, i8967 = 0, i8968 = 0, i8969 = 0;
+        int i8970 = 0, i8971 = 0, i8972 = 0, i8973 = 0, i8974 = 0, i8975 = 0, i8976 = 0, i8977 = 0, i8978 = 0, i8979 = 0;
+        int i8980 = 0, i8981 = 0, i8982 = 0, i8983 = 0, i8984 = 0, i8985 = 0, i8986 = 0, i8987 = 0, i8988 = 0, i8989 = 0;
+        int i8990 = 0, i8991 = 0, i8992 = 0, i8993 = 0, i8994 = 0, i8995 = 0, i8996 = 0, i8997 = 0, i8998 = 0, i8999 = 0;
+        int i9000 = 0, i9001 = 0, i9002 = 0, i9003 = 0, i9004 = 0, i9005 = 0, i9006 = 0, i9007 = 0, i9008 = 0, i9009 = 0;
+        int i9010 = 0, i9011 = 0, i9012 = 0, i9013 = 0, i9014 = 0, i9015 = 0, i9016 = 0, i9017 = 0, i9018 = 0, i9019 = 0;
+        int i9020 = 0, i9021 = 0, i9022 = 0, i9023 = 0, i9024 = 0, i9025 = 0, i9026 = 0, i9027 = 0, i9028 = 0, i9029 = 0;
+        int i9030 = 0, i9031 = 0, i9032 = 0, i9033 = 0, i9034 = 0, i9035 = 0, i9036 = 0, i9037 = 0, i9038 = 0, i9039 = 0;
+        int i9040 = 0, i9041 = 0, i9042 = 0, i9043 = 0, i9044 = 0, i9045 = 0, i9046 = 0, i9047 = 0, i9048 = 0, i9049 = 0;
+        int i9050 = 0, i9051 = 0, i9052 = 0, i9053 = 0, i9054 = 0, i9055 = 0, i9056 = 0, i9057 = 0, i9058 = 0, i9059 = 0;
+        int i9060 = 0, i9061 = 0, i9062 = 0, i9063 = 0, i9064 = 0, i9065 = 0, i9066 = 0, i9067 = 0, i9068 = 0, i9069 = 0;
+        int i9070 = 0, i9071 = 0, i9072 = 0, i9073 = 0, i9074 = 0, i9075 = 0, i9076 = 0, i9077 = 0, i9078 = 0, i9079 = 0;
+        int i9080 = 0, i9081 = 0, i9082 = 0, i9083 = 0, i9084 = 0, i9085 = 0, i9086 = 0, i9087 = 0, i9088 = 0, i9089 = 0;
+        int i9090 = 0, i9091 = 0, i9092 = 0, i9093 = 0, i9094 = 0, i9095 = 0, i9096 = 0, i9097 = 0, i9098 = 0, i9099 = 0;
+        int i9100 = 0, i9101 = 0, i9102 = 0, i9103 = 0, i9104 = 0, i9105 = 0, i9106 = 0, i9107 = 0, i9108 = 0, i9109 = 0;
+        int i9110 = 0, i9111 = 0, i9112 = 0, i9113 = 0, i9114 = 0, i9115 = 0, i9116 = 0, i9117 = 0, i9118 = 0, i9119 = 0;
+        int i9120 = 0, i9121 = 0, i9122 = 0, i9123 = 0, i9124 = 0, i9125 = 0, i9126 = 0, i9127 = 0, i9128 = 0, i9129 = 0;
+        int i9130 = 0, i9131 = 0, i9132 = 0, i9133 = 0, i9134 = 0, i9135 = 0, i9136 = 0, i9137 = 0, i9138 = 0, i9139 = 0;
+        int i9140 = 0, i9141 = 0, i9142 = 0, i9143 = 0, i9144 = 0, i9145 = 0, i9146 = 0, i9147 = 0, i9148 = 0, i9149 = 0;
+        int i9150 = 0, i9151 = 0, i9152 = 0, i9153 = 0, i9154 = 0, i9155 = 0, i9156 = 0, i9157 = 0, i9158 = 0, i9159 = 0;
+        int i9160 = 0, i9161 = 0, i9162 = 0, i9163 = 0, i9164 = 0, i9165 = 0, i9166 = 0, i9167 = 0, i9168 = 0, i9169 = 0;
+        int i9170 = 0, i9171 = 0, i9172 = 0, i9173 = 0, i9174 = 0, i9175 = 0, i9176 = 0, i9177 = 0, i9178 = 0, i9179 = 0;
+        int i9180 = 0, i9181 = 0, i9182 = 0, i9183 = 0, i9184 = 0, i9185 = 0, i9186 = 0, i9187 = 0, i9188 = 0, i9189 = 0;
+        int i9190 = 0, i9191 = 0, i9192 = 0, i9193 = 0, i9194 = 0, i9195 = 0, i9196 = 0, i9197 = 0, i9198 = 0, i9199 = 0;
+        int i9200 = 0, i9201 = 0, i9202 = 0, i9203 = 0, i9204 = 0, i9205 = 0, i9206 = 0, i9207 = 0, i9208 = 0, i9209 = 0;
+        int i9210 = 0, i9211 = 0, i9212 = 0, i9213 = 0, i9214 = 0, i9215 = 0, i9216 = 0, i9217 = 0, i9218 = 0, i9219 = 0;
+        int i9220 = 0, i9221 = 0, i9222 = 0, i9223 = 0, i9224 = 0, i9225 = 0, i9226 = 0, i9227 = 0, i9228 = 0, i9229 = 0;
+        int i9230 = 0, i9231 = 0, i9232 = 0, i9233 = 0, i9234 = 0, i9235 = 0, i9236 = 0, i9237 = 0, i9238 = 0, i9239 = 0;
+        int i9240 = 0, i9241 = 0, i9242 = 0, i9243 = 0, i9244 = 0, i9245 = 0, i9246 = 0, i9247 = 0, i9248 = 0, i9249 = 0;
+        int i9250 = 0, i9251 = 0, i9252 = 0, i9253 = 0, i9254 = 0, i9255 = 0, i9256 = 0, i9257 = 0, i9258 = 0, i9259 = 0;
+        int i9260 = 0, i9261 = 0, i9262 = 0, i9263 = 0, i9264 = 0, i9265 = 0, i9266 = 0, i9267 = 0, i9268 = 0, i9269 = 0;
+        int i9270 = 0, i9271 = 0, i9272 = 0, i9273 = 0, i9274 = 0, i9275 = 0, i9276 = 0, i9277 = 0, i9278 = 0, i9279 = 0;
+        int i9280 = 0, i9281 = 0, i9282 = 0, i9283 = 0, i9284 = 0, i9285 = 0, i9286 = 0, i9287 = 0, i9288 = 0, i9289 = 0;
+        int i9290 = 0, i9291 = 0, i9292 = 0, i9293 = 0, i9294 = 0, i9295 = 0, i9296 = 0, i9297 = 0, i9298 = 0, i9299 = 0;
+        int i9300 = 0, i9301 = 0, i9302 = 0, i9303 = 0, i9304 = 0, i9305 = 0, i9306 = 0, i9307 = 0, i9308 = 0, i9309 = 0;
+        int i9310 = 0, i9311 = 0, i9312 = 0, i9313 = 0, i9314 = 0, i9315 = 0, i9316 = 0, i9317 = 0, i9318 = 0, i9319 = 0;
+        int i9320 = 0, i9321 = 0, i9322 = 0, i9323 = 0, i9324 = 0, i9325 = 0, i9326 = 0, i9327 = 0, i9328 = 0, i9329 = 0;
+        int i9330 = 0, i9331 = 0, i9332 = 0, i9333 = 0, i9334 = 0, i9335 = 0, i9336 = 0, i9337 = 0, i9338 = 0, i9339 = 0;
+        int i9340 = 0, i9341 = 0, i9342 = 0, i9343 = 0, i9344 = 0, i9345 = 0, i9346 = 0, i9347 = 0, i9348 = 0, i9349 = 0;
+        int i9350 = 0, i9351 = 0, i9352 = 0, i9353 = 0, i9354 = 0, i9355 = 0, i9356 = 0, i9357 = 0, i9358 = 0, i9359 = 0;
+        int i9360 = 0, i9361 = 0, i9362 = 0, i9363 = 0, i9364 = 0, i9365 = 0, i9366 = 0, i9367 = 0, i9368 = 0, i9369 = 0;
+        int i9370 = 0, i9371 = 0, i9372 = 0, i9373 = 0, i9374 = 0, i9375 = 0, i9376 = 0, i9377 = 0, i9378 = 0, i9379 = 0;
+        int i9380 = 0, i9381 = 0, i9382 = 0, i9383 = 0, i9384 = 0, i9385 = 0, i9386 = 0, i9387 = 0, i9388 = 0, i9389 = 0;
+        int i9390 = 0, i9391 = 0, i9392 = 0, i9393 = 0, i9394 = 0, i9395 = 0, i9396 = 0, i9397 = 0, i9398 = 0, i9399 = 0;
+        int i9400 = 0, i9401 = 0, i9402 = 0, i9403 = 0, i9404 = 0, i9405 = 0, i9406 = 0, i9407 = 0, i9408 = 0, i9409 = 0;
+        int i9410 = 0, i9411 = 0, i9412 = 0, i9413 = 0, i9414 = 0, i9415 = 0, i9416 = 0, i9417 = 0, i9418 = 0, i9419 = 0;
+        int i9420 = 0, i9421 = 0, i9422 = 0, i9423 = 0, i9424 = 0, i9425 = 0, i9426 = 0, i9427 = 0, i9428 = 0, i9429 = 0;
+        int i9430 = 0, i9431 = 0, i9432 = 0, i9433 = 0, i9434 = 0, i9435 = 0, i9436 = 0, i9437 = 0, i9438 = 0, i9439 = 0;
+        int i9440 = 0, i9441 = 0, i9442 = 0, i9443 = 0, i9444 = 0, i9445 = 0, i9446 = 0, i9447 = 0, i9448 = 0, i9449 = 0;
+        int i9450 = 0, i9451 = 0, i9452 = 0, i9453 = 0, i9454 = 0, i9455 = 0, i9456 = 0, i9457 = 0, i9458 = 0, i9459 = 0;
+        int i9460 = 0, i9461 = 0, i9462 = 0, i9463 = 0, i9464 = 0, i9465 = 0, i9466 = 0, i9467 = 0, i9468 = 0, i9469 = 0;
+        int i9470 = 0, i9471 = 0, i9472 = 0, i9473 = 0, i9474 = 0, i9475 = 0, i9476 = 0, i9477 = 0, i9478 = 0, i9479 = 0;
+        int i9480 = 0, i9481 = 0, i9482 = 0, i9483 = 0, i9484 = 0, i9485 = 0, i9486 = 0, i9487 = 0, i9488 = 0, i9489 = 0;
+        int i9490 = 0, i9491 = 0, i9492 = 0, i9493 = 0, i9494 = 0, i9495 = 0, i9496 = 0, i9497 = 0, i9498 = 0, i9499 = 0;
+        int i9500 = 0, i9501 = 0, i9502 = 0, i9503 = 0, i9504 = 0, i9505 = 0, i9506 = 0, i9507 = 0, i9508 = 0, i9509 = 0;
+        int i9510 = 0, i9511 = 0, i9512 = 0, i9513 = 0, i9514 = 0, i9515 = 0, i9516 = 0, i9517 = 0, i9518 = 0, i9519 = 0;
+        int i9520 = 0, i9521 = 0, i9522 = 0, i9523 = 0, i9524 = 0, i9525 = 0, i9526 = 0, i9527 = 0, i9528 = 0, i9529 = 0;
+        int i9530 = 0, i9531 = 0, i9532 = 0, i9533 = 0, i9534 = 0, i9535 = 0, i9536 = 0, i9537 = 0, i9538 = 0, i9539 = 0;
+        int i9540 = 0, i9541 = 0, i9542 = 0, i9543 = 0, i9544 = 0, i9545 = 0, i9546 = 0, i9547 = 0, i9548 = 0, i9549 = 0;
+        int i9550 = 0, i9551 = 0, i9552 = 0, i9553 = 0, i9554 = 0, i9555 = 0, i9556 = 0, i9557 = 0, i9558 = 0, i9559 = 0;
+        int i9560 = 0, i9561 = 0, i9562 = 0, i9563 = 0, i9564 = 0, i9565 = 0, i9566 = 0, i9567 = 0, i9568 = 0, i9569 = 0;
+        int i9570 = 0, i9571 = 0, i9572 = 0, i9573 = 0, i9574 = 0, i9575 = 0, i9576 = 0, i9577 = 0, i9578 = 0, i9579 = 0;
+        int i9580 = 0, i9581 = 0, i9582 = 0, i9583 = 0, i9584 = 0, i9585 = 0, i9586 = 0, i9587 = 0, i9588 = 0, i9589 = 0;
+        int i9590 = 0, i9591 = 0, i9592 = 0, i9593 = 0, i9594 = 0, i9595 = 0, i9596 = 0, i9597 = 0, i9598 = 0, i9599 = 0;
+        int i9600 = 0, i9601 = 0, i9602 = 0, i9603 = 0, i9604 = 0, i9605 = 0, i9606 = 0, i9607 = 0, i9608 = 0, i9609 = 0;
+        int i9610 = 0, i9611 = 0, i9612 = 0, i9613 = 0, i9614 = 0, i9615 = 0, i9616 = 0, i9617 = 0, i9618 = 0, i9619 = 0;
+        int i9620 = 0, i9621 = 0, i9622 = 0, i9623 = 0, i9624 = 0, i9625 = 0, i9626 = 0, i9627 = 0, i9628 = 0, i9629 = 0;
+        int i9630 = 0, i9631 = 0, i9632 = 0, i9633 = 0, i9634 = 0, i9635 = 0, i9636 = 0, i9637 = 0, i9638 = 0, i9639 = 0;
+        int i9640 = 0, i9641 = 0, i9642 = 0, i9643 = 0, i9644 = 0, i9645 = 0, i9646 = 0, i9647 = 0, i9648 = 0, i9649 = 0;
+        int i9650 = 0, i9651 = 0, i9652 = 0, i9653 = 0, i9654 = 0, i9655 = 0, i9656 = 0, i9657 = 0, i9658 = 0, i9659 = 0;
+        int i9660 = 0, i9661 = 0, i9662 = 0, i9663 = 0, i9664 = 0, i9665 = 0, i9666 = 0, i9667 = 0, i9668 = 0, i9669 = 0;
+        int i9670 = 0, i9671 = 0, i9672 = 0, i9673 = 0, i9674 = 0, i9675 = 0, i9676 = 0, i9677 = 0, i9678 = 0, i9679 = 0;
+        int i9680 = 0, i9681 = 0, i9682 = 0, i9683 = 0, i9684 = 0, i9685 = 0, i9686 = 0, i9687 = 0, i9688 = 0, i9689 = 0;
+        int i9690 = 0, i9691 = 0, i9692 = 0, i9693 = 0, i9694 = 0, i9695 = 0, i9696 = 0, i9697 = 0, i9698 = 0, i9699 = 0;
+        int i9700 = 0, i9701 = 0, i9702 = 0, i9703 = 0, i9704 = 0, i9705 = 0, i9706 = 0, i9707 = 0, i9708 = 0, i9709 = 0;
+        int i9710 = 0, i9711 = 0, i9712 = 0, i9713 = 0, i9714 = 0, i9715 = 0, i9716 = 0, i9717 = 0, i9718 = 0, i9719 = 0;
+        int i9720 = 0, i9721 = 0, i9722 = 0, i9723 = 0, i9724 = 0, i9725 = 0, i9726 = 0, i9727 = 0, i9728 = 0, i9729 = 0;
+        int i9730 = 0, i9731 = 0, i9732 = 0, i9733 = 0, i9734 = 0, i9735 = 0, i9736 = 0, i9737 = 0, i9738 = 0, i9739 = 0;
+        int i9740 = 0, i9741 = 0, i9742 = 0, i9743 = 0, i9744 = 0, i9745 = 0, i9746 = 0, i9747 = 0, i9748 = 0, i9749 = 0;
+        int i9750 = 0, i9751 = 0, i9752 = 0, i9753 = 0, i9754 = 0, i9755 = 0, i9756 = 0, i9757 = 0, i9758 = 0, i9759 = 0;
+        int i9760 = 0, i9761 = 0, i9762 = 0, i9763 = 0, i9764 = 0, i9765 = 0, i9766 = 0, i9767 = 0, i9768 = 0, i9769 = 0;
+        int i9770 = 0, i9771 = 0, i9772 = 0, i9773 = 0, i9774 = 0, i9775 = 0, i9776 = 0, i9777 = 0, i9778 = 0, i9779 = 0;
+        int i9780 = 0, i9781 = 0, i9782 = 0, i9783 = 0, i9784 = 0, i9785 = 0, i9786 = 0, i9787 = 0, i9788 = 0, i9789 = 0;
+        int i9790 = 0, i9791 = 0, i9792 = 0, i9793 = 0, i9794 = 0, i9795 = 0, i9796 = 0, i9797 = 0, i9798 = 0, i9799 = 0;
+        int i9800 = 0, i9801 = 0, i9802 = 0, i9803 = 0, i9804 = 0, i9805 = 0, i9806 = 0, i9807 = 0, i9808 = 0, i9809 = 0;
+        int i9810 = 0, i9811 = 0, i9812 = 0, i9813 = 0, i9814 = 0, i9815 = 0, i9816 = 0, i9817 = 0, i9818 = 0, i9819 = 0;
+        int i9820 = 0, i9821 = 0, i9822 = 0, i9823 = 0, i9824 = 0, i9825 = 0, i9826 = 0, i9827 = 0, i9828 = 0, i9829 = 0;
+        int i9830 = 0, i9831 = 0, i9832 = 0, i9833 = 0, i9834 = 0, i9835 = 0, i9836 = 0, i9837 = 0, i9838 = 0, i9839 = 0;
+        int i9840 = 0, i9841 = 0, i9842 = 0, i9843 = 0, i9844 = 0, i9845 = 0, i9846 = 0, i9847 = 0, i9848 = 0, i9849 = 0;
+        int i9850 = 0, i9851 = 0, i9852 = 0, i9853 = 0, i9854 = 0, i9855 = 0, i9856 = 0, i9857 = 0, i9858 = 0, i9859 = 0;
+        int i9860 = 0, i9861 = 0, i9862 = 0, i9863 = 0, i9864 = 0, i9865 = 0, i9866 = 0, i9867 = 0, i9868 = 0, i9869 = 0;
+        int i9870 = 0, i9871 = 0, i9872 = 0, i9873 = 0, i9874 = 0, i9875 = 0, i9876 = 0, i9877 = 0, i9878 = 0, i9879 = 0;
+        int i9880 = 0, i9881 = 0, i9882 = 0, i9883 = 0, i9884 = 0, i9885 = 0, i9886 = 0, i9887 = 0, i9888 = 0, i9889 = 0;
+        int i9890 = 0, i9891 = 0, i9892 = 0, i9893 = 0, i9894 = 0, i9895 = 0, i9896 = 0, i9897 = 0, i9898 = 0, i9899 = 0;
+        int i9900 = 0, i9901 = 0, i9902 = 0, i9903 = 0, i9904 = 0, i9905 = 0, i9906 = 0, i9907 = 0, i9908 = 0, i9909 = 0;
+        int i9910 = 0, i9911 = 0, i9912 = 0, i9913 = 0, i9914 = 0, i9915 = 0, i9916 = 0, i9917 = 0, i9918 = 0, i9919 = 0;
+        int i9920 = 0, i9921 = 0, i9922 = 0, i9923 = 0, i9924 = 0, i9925 = 0, i9926 = 0, i9927 = 0, i9928 = 0, i9929 = 0;
+        int i9930 = 0, i9931 = 0, i9932 = 0, i9933 = 0, i9934 = 0, i9935 = 0, i9936 = 0, i9937 = 0, i9938 = 0, i9939 = 0;
+        int i9940 = 0, i9941 = 0, i9942 = 0, i9943 = 0, i9944 = 0, i9945 = 0, i9946 = 0, i9947 = 0, i9948 = 0, i9949 = 0;
+        int i9950 = 0, i9951 = 0, i9952 = 0, i9953 = 0, i9954 = 0, i9955 = 0, i9956 = 0, i9957 = 0, i9958 = 0, i9959 = 0;
+        int i9960 = 0, i9961 = 0, i9962 = 0, i9963 = 0, i9964 = 0, i9965 = 0, i9966 = 0, i9967 = 0, i9968 = 0, i9969 = 0;
+        int i9970 = 0, i9971 = 0, i9972 = 0, i9973 = 0, i9974 = 0, i9975 = 0, i9976 = 0, i9977 = 0, i9978 = 0, i9979 = 0;
+        int i9980 = 0, i9981 = 0, i9982 = 0, i9983 = 0, i9984 = 0, i9985 = 0, i9986 = 0, i9987 = 0, i9988 = 0, i9989 = 0;
+        int i9990 = 0, i9991 = 0, i9992 = 0, i9993 = 0, i9994 = 0, i9995 = 0, i9996 = 0, i9997 = 0, i9998 = 0, i9999 = 0;
+        int i10000 = 0, i10001 = 0, i10002 = 0, i10003 = 0, i10004 = 0, i10005 = 0, i10006 = 0, i10007 = 0, i10008 = 0, i10009 = 0;
+        int i10010 = 0, i10011 = 0, i10012 = 0, i10013 = 0, i10014 = 0, i10015 = 0, i10016 = 0, i10017 = 0, i10018 = 0, i10019 = 0;
+        int i10020 = 0, i10021 = 0, i10022 = 0, i10023 = 0, i10024 = 0, i10025 = 0, i10026 = 0, i10027 = 0, i10028 = 0, i10029 = 0;
+        int i10030 = 0, i10031 = 0, i10032 = 0, i10033 = 0, i10034 = 0, i10035 = 0, i10036 = 0, i10037 = 0, i10038 = 0, i10039 = 0;
+        int i10040 = 0, i10041 = 0, i10042 = 0, i10043 = 0, i10044 = 0, i10045 = 0, i10046 = 0, i10047 = 0, i10048 = 0, i10049 = 0;
+        int i10050 = 0, i10051 = 0, i10052 = 0, i10053 = 0, i10054 = 0, i10055 = 0, i10056 = 0, i10057 = 0, i10058 = 0, i10059 = 0;
+        int i10060 = 0, i10061 = 0, i10062 = 0, i10063 = 0, i10064 = 0, i10065 = 0, i10066 = 0, i10067 = 0, i10068 = 0, i10069 = 0;
+        int i10070 = 0, i10071 = 0, i10072 = 0, i10073 = 0, i10074 = 0, i10075 = 0, i10076 = 0, i10077 = 0, i10078 = 0, i10079 = 0;
+        int i10080 = 0, i10081 = 0, i10082 = 0, i10083 = 0, i10084 = 0, i10085 = 0, i10086 = 0, i10087 = 0, i10088 = 0, i10089 = 0;
+        int i10090 = 0, i10091 = 0, i10092 = 0, i10093 = 0, i10094 = 0, i10095 = 0, i10096 = 0, i10097 = 0, i10098 = 0, i10099 = 0;
+        int i10100 = 0, i10101 = 0, i10102 = 0, i10103 = 0, i10104 = 0, i10105 = 0, i10106 = 0, i10107 = 0, i10108 = 0, i10109 = 0;
+        int i10110 = 0, i10111 = 0, i10112 = 0, i10113 = 0, i10114 = 0, i10115 = 0, i10116 = 0, i10117 = 0, i10118 = 0, i10119 = 0;
+        int i10120 = 0, i10121 = 0, i10122 = 0, i10123 = 0, i10124 = 0, i10125 = 0, i10126 = 0, i10127 = 0, i10128 = 0, i10129 = 0;
+        int i10130 = 0, i10131 = 0, i10132 = 0, i10133 = 0, i10134 = 0, i10135 = 0, i10136 = 0, i10137 = 0, i10138 = 0, i10139 = 0;
+        int i10140 = 0, i10141 = 0, i10142 = 0, i10143 = 0, i10144 = 0, i10145 = 0, i10146 = 0, i10147 = 0, i10148 = 0, i10149 = 0;
+        int i10150 = 0, i10151 = 0, i10152 = 0, i10153 = 0, i10154 = 0, i10155 = 0, i10156 = 0, i10157 = 0, i10158 = 0, i10159 = 0;
+        int i10160 = 0, i10161 = 0, i10162 = 0, i10163 = 0, i10164 = 0, i10165 = 0, i10166 = 0, i10167 = 0, i10168 = 0, i10169 = 0;
+        int i10170 = 0, i10171 = 0, i10172 = 0, i10173 = 0, i10174 = 0, i10175 = 0, i10176 = 0, i10177 = 0, i10178 = 0, i10179 = 0;
+        int i10180 = 0, i10181 = 0, i10182 = 0, i10183 = 0, i10184 = 0, i10185 = 0, i10186 = 0, i10187 = 0, i10188 = 0, i10189 = 0;
+        int i10190 = 0, i10191 = 0, i10192 = 0, i10193 = 0, i10194 = 0, i10195 = 0, i10196 = 0, i10197 = 0, i10198 = 0, i10199 = 0;
+        int i10200 = 0, i10201 = 0, i10202 = 0, i10203 = 0, i10204 = 0, i10205 = 0, i10206 = 0, i10207 = 0, i10208 = 0, i10209 = 0;
+        int i10210 = 0, i10211 = 0, i10212 = 0, i10213 = 0, i10214 = 0, i10215 = 0, i10216 = 0, i10217 = 0, i10218 = 0, i10219 = 0;
+        int i10220 = 0, i10221 = 0, i10222 = 0, i10223 = 0, i10224 = 0, i10225 = 0, i10226 = 0, i10227 = 0, i10228 = 0, i10229 = 0;
+        int i10230 = 0, i10231 = 0, i10232 = 0, i10233 = 0, i10234 = 0, i10235 = 0, i10236 = 0, i10237 = 0, i10238 = 0, i10239 = 0;
+        int i10240 = 0, i10241 = 0, i10242 = 0, i10243 = 0, i10244 = 0, i10245 = 0, i10246 = 0, i10247 = 0, i10248 = 0, i10249 = 0;
+        int i10250 = 0, i10251 = 0, i10252 = 0, i10253 = 0, i10254 = 0, i10255 = 0, i10256 = 0, i10257 = 0, i10258 = 0, i10259 = 0;
+        int i10260 = 0, i10261 = 0, i10262 = 0, i10263 = 0, i10264 = 0, i10265 = 0, i10266 = 0, i10267 = 0, i10268 = 0, i10269 = 0;
+        int i10270 = 0, i10271 = 0, i10272 = 0, i10273 = 0, i10274 = 0, i10275 = 0, i10276 = 0, i10277 = 0, i10278 = 0, i10279 = 0;
+        int i10280 = 0, i10281 = 0, i10282 = 0, i10283 = 0, i10284 = 0, i10285 = 0, i10286 = 0, i10287 = 0, i10288 = 0, i10289 = 0;
+        int i10290 = 0, i10291 = 0, i10292 = 0, i10293 = 0, i10294 = 0, i10295 = 0, i10296 = 0, i10297 = 0, i10298 = 0, i10299 = 0;
+        int i10300 = 0, i10301 = 0, i10302 = 0, i10303 = 0, i10304 = 0, i10305 = 0, i10306 = 0, i10307 = 0, i10308 = 0, i10309 = 0;
+        int i10310 = 0, i10311 = 0, i10312 = 0, i10313 = 0, i10314 = 0, i10315 = 0, i10316 = 0, i10317 = 0, i10318 = 0, i10319 = 0;
+        int i10320 = 0, i10321 = 0, i10322 = 0, i10323 = 0, i10324 = 0, i10325 = 0, i10326 = 0, i10327 = 0, i10328 = 0, i10329 = 0;
+        int i10330 = 0, i10331 = 0, i10332 = 0, i10333 = 0, i10334 = 0, i10335 = 0, i10336 = 0, i10337 = 0, i10338 = 0, i10339 = 0;
+        int i10340 = 0, i10341 = 0, i10342 = 0, i10343 = 0, i10344 = 0, i10345 = 0, i10346 = 0, i10347 = 0, i10348 = 0, i10349 = 0;
+        int i10350 = 0, i10351 = 0, i10352 = 0, i10353 = 0, i10354 = 0, i10355 = 0, i10356 = 0, i10357 = 0, i10358 = 0, i10359 = 0;
+        int i10360 = 0, i10361 = 0, i10362 = 0, i10363 = 0, i10364 = 0, i10365 = 0, i10366 = 0, i10367 = 0, i10368 = 0, i10369 = 0;
+        int i10370 = 0, i10371 = 0, i10372 = 0, i10373 = 0, i10374 = 0, i10375 = 0, i10376 = 0, i10377 = 0, i10378 = 0, i10379 = 0;
+        int i10380 = 0, i10381 = 0, i10382 = 0, i10383 = 0, i10384 = 0, i10385 = 0, i10386 = 0, i10387 = 0, i10388 = 0, i10389 = 0;
+        int i10390 = 0, i10391 = 0, i10392 = 0, i10393 = 0, i10394 = 0, i10395 = 0, i10396 = 0, i10397 = 0, i10398 = 0, i10399 = 0;
+        int i10400 = 0, i10401 = 0, i10402 = 0, i10403 = 0, i10404 = 0, i10405 = 0, i10406 = 0, i10407 = 0, i10408 = 0, i10409 = 0;
+        int i10410 = 0, i10411 = 0, i10412 = 0, i10413 = 0, i10414 = 0, i10415 = 0, i10416 = 0, i10417 = 0, i10418 = 0, i10419 = 0;
+        int i10420 = 0, i10421 = 0, i10422 = 0, i10423 = 0, i10424 = 0, i10425 = 0, i10426 = 0, i10427 = 0, i10428 = 0, i10429 = 0;
+        int i10430 = 0, i10431 = 0, i10432 = 0, i10433 = 0, i10434 = 0, i10435 = 0, i10436 = 0, i10437 = 0, i10438 = 0, i10439 = 0;
+        int i10440 = 0, i10441 = 0, i10442 = 0, i10443 = 0, i10444 = 0, i10445 = 0, i10446 = 0, i10447 = 0, i10448 = 0, i10449 = 0;
+        int i10450 = 0, i10451 = 0, i10452 = 0, i10453 = 0, i10454 = 0, i10455 = 0, i10456 = 0, i10457 = 0, i10458 = 0, i10459 = 0;
+        int i10460 = 0, i10461 = 0, i10462 = 0, i10463 = 0, i10464 = 0, i10465 = 0, i10466 = 0, i10467 = 0, i10468 = 0, i10469 = 0;
+        int i10470 = 0, i10471 = 0, i10472 = 0, i10473 = 0, i10474 = 0, i10475 = 0, i10476 = 0, i10477 = 0, i10478 = 0, i10479 = 0;
+        int i10480 = 0, i10481 = 0, i10482 = 0, i10483 = 0, i10484 = 0, i10485 = 0, i10486 = 0, i10487 = 0, i10488 = 0, i10489 = 0;
+        int i10490 = 0, i10491 = 0, i10492 = 0, i10493 = 0, i10494 = 0, i10495 = 0, i10496 = 0, i10497 = 0, i10498 = 0, i10499 = 0;
+        int i10500 = 0, i10501 = 0, i10502 = 0, i10503 = 0, i10504 = 0, i10505 = 0, i10506 = 0, i10507 = 0, i10508 = 0, i10509 = 0;
+        int i10510 = 0, i10511 = 0, i10512 = 0, i10513 = 0, i10514 = 0, i10515 = 0, i10516 = 0, i10517 = 0, i10518 = 0, i10519 = 0;
+        int i10520 = 0, i10521 = 0, i10522 = 0, i10523 = 0, i10524 = 0, i10525 = 0, i10526 = 0, i10527 = 0, i10528 = 0, i10529 = 0;
+        int i10530 = 0, i10531 = 0, i10532 = 0, i10533 = 0, i10534 = 0, i10535 = 0, i10536 = 0, i10537 = 0, i10538 = 0, i10539 = 0;
+        int i10540 = 0, i10541 = 0, i10542 = 0, i10543 = 0, i10544 = 0, i10545 = 0, i10546 = 0, i10547 = 0, i10548 = 0, i10549 = 0;
+        int i10550 = 0, i10551 = 0, i10552 = 0, i10553 = 0, i10554 = 0, i10555 = 0, i10556 = 0, i10557 = 0, i10558 = 0, i10559 = 0;
+        int i10560 = 0, i10561 = 0, i10562 = 0, i10563 = 0, i10564 = 0, i10565 = 0, i10566 = 0, i10567 = 0, i10568 = 0, i10569 = 0;
+        int i10570 = 0, i10571 = 0, i10572 = 0, i10573 = 0, i10574 = 0, i10575 = 0, i10576 = 0, i10577 = 0, i10578 = 0, i10579 = 0;
+        int i10580 = 0, i10581 = 0, i10582 = 0, i10583 = 0, i10584 = 0, i10585 = 0, i10586 = 0, i10587 = 0, i10588 = 0, i10589 = 0;
+        int i10590 = 0, i10591 = 0, i10592 = 0, i10593 = 0, i10594 = 0, i10595 = 0, i10596 = 0, i10597 = 0, i10598 = 0, i10599 = 0;
+        int i10600 = 0, i10601 = 0, i10602 = 0, i10603 = 0, i10604 = 0, i10605 = 0, i10606 = 0, i10607 = 0, i10608 = 0, i10609 = 0;
+        int i10610 = 0, i10611 = 0, i10612 = 0, i10613 = 0, i10614 = 0, i10615 = 0, i10616 = 0, i10617 = 0, i10618 = 0, i10619 = 0;
+        int i10620 = 0, i10621 = 0, i10622 = 0, i10623 = 0, i10624 = 0, i10625 = 0, i10626 = 0, i10627 = 0, i10628 = 0, i10629 = 0;
+        int i10630 = 0, i10631 = 0, i10632 = 0, i10633 = 0, i10634 = 0, i10635 = 0, i10636 = 0, i10637 = 0, i10638 = 0, i10639 = 0;
+        int i10640 = 0, i10641 = 0, i10642 = 0, i10643 = 0, i10644 = 0, i10645 = 0, i10646 = 0, i10647 = 0, i10648 = 0, i10649 = 0;
+        int i10650 = 0, i10651 = 0, i10652 = 0, i10653 = 0, i10654 = 0, i10655 = 0, i10656 = 0, i10657 = 0, i10658 = 0, i10659 = 0;
+        int i10660 = 0, i10661 = 0, i10662 = 0, i10663 = 0, i10664 = 0, i10665 = 0, i10666 = 0, i10667 = 0, i10668 = 0, i10669 = 0;
+        int i10670 = 0, i10671 = 0, i10672 = 0, i10673 = 0, i10674 = 0, i10675 = 0, i10676 = 0, i10677 = 0, i10678 = 0, i10679 = 0;
+        int i10680 = 0, i10681 = 0, i10682 = 0, i10683 = 0, i10684 = 0, i10685 = 0, i10686 = 0, i10687 = 0, i10688 = 0, i10689 = 0;
+        int i10690 = 0, i10691 = 0, i10692 = 0, i10693 = 0, i10694 = 0, i10695 = 0, i10696 = 0, i10697 = 0, i10698 = 0, i10699 = 0;
+        int i10700 = 0, i10701 = 0, i10702 = 0, i10703 = 0, i10704 = 0, i10705 = 0, i10706 = 0, i10707 = 0, i10708 = 0, i10709 = 0;
+        int i10710 = 0, i10711 = 0, i10712 = 0, i10713 = 0, i10714 = 0, i10715 = 0, i10716 = 0, i10717 = 0, i10718 = 0, i10719 = 0;
+        int i10720 = 0, i10721 = 0, i10722 = 0, i10723 = 0, i10724 = 0, i10725 = 0, i10726 = 0, i10727 = 0, i10728 = 0, i10729 = 0;
+        int i10730 = 0, i10731 = 0, i10732 = 0, i10733 = 0, i10734 = 0, i10735 = 0, i10736 = 0, i10737 = 0, i10738 = 0, i10739 = 0;
+        int i10740 = 0, i10741 = 0, i10742 = 0, i10743 = 0, i10744 = 0, i10745 = 0, i10746 = 0, i10747 = 0, i10748 = 0, i10749 = 0;
+        int i10750 = 0, i10751 = 0, i10752 = 0, i10753 = 0, i10754 = 0, i10755 = 0, i10756 = 0, i10757 = 0, i10758 = 0, i10759 = 0;
+        int i10760 = 0, i10761 = 0, i10762 = 0, i10763 = 0, i10764 = 0, i10765 = 0, i10766 = 0, i10767 = 0, i10768 = 0, i10769 = 0;
+        int i10770 = 0, i10771 = 0, i10772 = 0, i10773 = 0, i10774 = 0, i10775 = 0, i10776 = 0, i10777 = 0, i10778 = 0, i10779 = 0;
+        int i10780 = 0, i10781 = 0, i10782 = 0, i10783 = 0, i10784 = 0, i10785 = 0, i10786 = 0, i10787 = 0, i10788 = 0, i10789 = 0;
+        int i10790 = 0, i10791 = 0, i10792 = 0, i10793 = 0, i10794 = 0, i10795 = 0, i10796 = 0, i10797 = 0, i10798 = 0, i10799 = 0;
+        int i10800 = 0, i10801 = 0, i10802 = 0, i10803 = 0, i10804 = 0, i10805 = 0, i10806 = 0, i10807 = 0, i10808 = 0, i10809 = 0;
+        int i10810 = 0, i10811 = 0, i10812 = 0, i10813 = 0, i10814 = 0, i10815 = 0, i10816 = 0, i10817 = 0, i10818 = 0, i10819 = 0;
+        int i10820 = 0, i10821 = 0, i10822 = 0, i10823 = 0, i10824 = 0, i10825 = 0, i10826 = 0, i10827 = 0, i10828 = 0, i10829 = 0;
+        int i10830 = 0, i10831 = 0, i10832 = 0, i10833 = 0, i10834 = 0, i10835 = 0, i10836 = 0, i10837 = 0, i10838 = 0, i10839 = 0;
+        int i10840 = 0, i10841 = 0, i10842 = 0, i10843 = 0, i10844 = 0, i10845 = 0, i10846 = 0, i10847 = 0, i10848 = 0, i10849 = 0;
+        int i10850 = 0, i10851 = 0, i10852 = 0, i10853 = 0, i10854 = 0, i10855 = 0, i10856 = 0, i10857 = 0, i10858 = 0, i10859 = 0;
+        int i10860 = 0, i10861 = 0, i10862 = 0, i10863 = 0, i10864 = 0, i10865 = 0, i10866 = 0, i10867 = 0, i10868 = 0, i10869 = 0;
+        int i10870 = 0, i10871 = 0, i10872 = 0, i10873 = 0, i10874 = 0, i10875 = 0, i10876 = 0, i10877 = 0, i10878 = 0, i10879 = 0;
+        int i10880 = 0, i10881 = 0, i10882 = 0, i10883 = 0, i10884 = 0, i10885 = 0, i10886 = 0, i10887 = 0, i10888 = 0, i10889 = 0;
+        int i10890 = 0, i10891 = 0, i10892 = 0, i10893 = 0, i10894 = 0, i10895 = 0, i10896 = 0, i10897 = 0, i10898 = 0, i10899 = 0;
+        int i10900 = 0, i10901 = 0, i10902 = 0, i10903 = 0, i10904 = 0, i10905 = 0, i10906 = 0, i10907 = 0, i10908 = 0, i10909 = 0;
+        int i10910 = 0, i10911 = 0, i10912 = 0, i10913 = 0, i10914 = 0, i10915 = 0, i10916 = 0, i10917 = 0, i10918 = 0, i10919 = 0;
+        int i10920 = 0, i10921 = 0, i10922 = 0, i10923 = 0, i10924 = 0, i10925 = 0, i10926 = 0, i10927 = 0, i10928 = 0, i10929 = 0;
+        int i10930 = 0, i10931 = 0, i10932 = 0, i10933 = 0, i10934 = 0, i10935 = 0, i10936 = 0, i10937 = 0, i10938 = 0, i10939 = 0;
+        int i10940 = 0, i10941 = 0, i10942 = 0, i10943 = 0, i10944 = 0, i10945 = 0, i10946 = 0, i10947 = 0, i10948 = 0, i10949 = 0;
+        int i10950 = 0, i10951 = 0, i10952 = 0, i10953 = 0, i10954 = 0, i10955 = 0, i10956 = 0, i10957 = 0, i10958 = 0, i10959 = 0;
+        int i10960 = 0, i10961 = 0, i10962 = 0, i10963 = 0, i10964 = 0, i10965 = 0, i10966 = 0, i10967 = 0, i10968 = 0, i10969 = 0;
+        int i10970 = 0, i10971 = 0, i10972 = 0, i10973 = 0, i10974 = 0, i10975 = 0, i10976 = 0, i10977 = 0, i10978 = 0, i10979 = 0;
+        int i10980 = 0, i10981 = 0, i10982 = 0, i10983 = 0, i10984 = 0, i10985 = 0, i10986 = 0, i10987 = 0, i10988 = 0, i10989 = 0;
+        int i10990 = 0, i10991 = 0, i10992 = 0, i10993 = 0, i10994 = 0, i10995 = 0, i10996 = 0, i10997 = 0, i10998 = 0, i10999 = 0;
+        int i11000 = 0, i11001 = 0, i11002 = 0, i11003 = 0, i11004 = 0, i11005 = 0, i11006 = 0, i11007 = 0, i11008 = 0, i11009 = 0;
+        int i11010 = 0, i11011 = 0, i11012 = 0, i11013 = 0, i11014 = 0, i11015 = 0, i11016 = 0, i11017 = 0, i11018 = 0, i11019 = 0;
+        int i11020 = 0, i11021 = 0, i11022 = 0, i11023 = 0, i11024 = 0, i11025 = 0, i11026 = 0, i11027 = 0, i11028 = 0, i11029 = 0;
+        int i11030 = 0, i11031 = 0, i11032 = 0, i11033 = 0, i11034 = 0, i11035 = 0, i11036 = 0, i11037 = 0, i11038 = 0, i11039 = 0;
+        int i11040 = 0, i11041 = 0, i11042 = 0, i11043 = 0, i11044 = 0, i11045 = 0, i11046 = 0, i11047 = 0, i11048 = 0, i11049 = 0;
+        int i11050 = 0, i11051 = 0, i11052 = 0, i11053 = 0, i11054 = 0, i11055 = 0, i11056 = 0, i11057 = 0, i11058 = 0, i11059 = 0;
+        int i11060 = 0, i11061 = 0, i11062 = 0, i11063 = 0, i11064 = 0, i11065 = 0, i11066 = 0, i11067 = 0, i11068 = 0, i11069 = 0;
+        int i11070 = 0, i11071 = 0, i11072 = 0, i11073 = 0, i11074 = 0, i11075 = 0, i11076 = 0, i11077 = 0, i11078 = 0, i11079 = 0;
+        int i11080 = 0, i11081 = 0, i11082 = 0, i11083 = 0, i11084 = 0, i11085 = 0, i11086 = 0, i11087 = 0, i11088 = 0, i11089 = 0;
+        int i11090 = 0, i11091 = 0, i11092 = 0, i11093 = 0, i11094 = 0, i11095 = 0, i11096 = 0, i11097 = 0, i11098 = 0, i11099 = 0;
+        int i11100 = 0, i11101 = 0, i11102 = 0, i11103 = 0, i11104 = 0, i11105 = 0, i11106 = 0, i11107 = 0, i11108 = 0, i11109 = 0;
+        int i11110 = 0, i11111 = 0, i11112 = 0, i11113 = 0, i11114 = 0, i11115 = 0, i11116 = 0, i11117 = 0, i11118 = 0, i11119 = 0;
+        int i11120 = 0, i11121 = 0, i11122 = 0, i11123 = 0, i11124 = 0, i11125 = 0, i11126 = 0, i11127 = 0, i11128 = 0, i11129 = 0;
+        int i11130 = 0, i11131 = 0, i11132 = 0, i11133 = 0, i11134 = 0, i11135 = 0, i11136 = 0, i11137 = 0, i11138 = 0, i11139 = 0;
+        int i11140 = 0, i11141 = 0, i11142 = 0, i11143 = 0, i11144 = 0, i11145 = 0, i11146 = 0, i11147 = 0, i11148 = 0, i11149 = 0;
+        int i11150 = 0, i11151 = 0, i11152 = 0, i11153 = 0, i11154 = 0, i11155 = 0, i11156 = 0, i11157 = 0, i11158 = 0, i11159 = 0;
+        int i11160 = 0, i11161 = 0, i11162 = 0, i11163 = 0, i11164 = 0, i11165 = 0, i11166 = 0, i11167 = 0, i11168 = 0, i11169 = 0;
+        int i11170 = 0, i11171 = 0, i11172 = 0, i11173 = 0, i11174 = 0, i11175 = 0, i11176 = 0, i11177 = 0, i11178 = 0, i11179 = 0;
+        int i11180 = 0, i11181 = 0, i11182 = 0, i11183 = 0, i11184 = 0, i11185 = 0, i11186 = 0, i11187 = 0, i11188 = 0, i11189 = 0;
+        int i11190 = 0, i11191 = 0, i11192 = 0, i11193 = 0, i11194 = 0, i11195 = 0, i11196 = 0, i11197 = 0, i11198 = 0, i11199 = 0;
+        int i11200 = 0, i11201 = 0, i11202 = 0, i11203 = 0, i11204 = 0, i11205 = 0, i11206 = 0, i11207 = 0, i11208 = 0, i11209 = 0;
+        int i11210 = 0, i11211 = 0, i11212 = 0, i11213 = 0, i11214 = 0, i11215 = 0, i11216 = 0, i11217 = 0, i11218 = 0, i11219 = 0;
+        int i11220 = 0, i11221 = 0, i11222 = 0, i11223 = 0, i11224 = 0, i11225 = 0, i11226 = 0, i11227 = 0, i11228 = 0, i11229 = 0;
+        int i11230 = 0, i11231 = 0, i11232 = 0, i11233 = 0, i11234 = 0, i11235 = 0, i11236 = 0, i11237 = 0, i11238 = 0, i11239 = 0;
+        int i11240 = 0, i11241 = 0, i11242 = 0, i11243 = 0, i11244 = 0, i11245 = 0, i11246 = 0, i11247 = 0, i11248 = 0, i11249 = 0;
+        int i11250 = 0, i11251 = 0, i11252 = 0, i11253 = 0, i11254 = 0, i11255 = 0, i11256 = 0, i11257 = 0, i11258 = 0, i11259 = 0;
+        int i11260 = 0, i11261 = 0, i11262 = 0, i11263 = 0, i11264 = 0, i11265 = 0, i11266 = 0, i11267 = 0, i11268 = 0, i11269 = 0;
+        int i11270 = 0, i11271 = 0, i11272 = 0, i11273 = 0, i11274 = 0, i11275 = 0, i11276 = 0, i11277 = 0, i11278 = 0, i11279 = 0;
+        int i11280 = 0, i11281 = 0, i11282 = 0, i11283 = 0, i11284 = 0, i11285 = 0, i11286 = 0, i11287 = 0, i11288 = 0, i11289 = 0;
+        int i11290 = 0, i11291 = 0, i11292 = 0, i11293 = 0, i11294 = 0, i11295 = 0, i11296 = 0, i11297 = 0, i11298 = 0, i11299 = 0;
+        int i11300 = 0, i11301 = 0, i11302 = 0, i11303 = 0, i11304 = 0, i11305 = 0, i11306 = 0, i11307 = 0, i11308 = 0, i11309 = 0;
+        int i11310 = 0, i11311 = 0, i11312 = 0, i11313 = 0, i11314 = 0, i11315 = 0, i11316 = 0, i11317 = 0, i11318 = 0, i11319 = 0;
+        int i11320 = 0, i11321 = 0, i11322 = 0, i11323 = 0, i11324 = 0, i11325 = 0, i11326 = 0, i11327 = 0, i11328 = 0, i11329 = 0;
+        int i11330 = 0, i11331 = 0, i11332 = 0, i11333 = 0, i11334 = 0, i11335 = 0, i11336 = 0, i11337 = 0, i11338 = 0, i11339 = 0;
+        int i11340 = 0, i11341 = 0, i11342 = 0, i11343 = 0, i11344 = 0, i11345 = 0, i11346 = 0, i11347 = 0, i11348 = 0, i11349 = 0;
+        int i11350 = 0, i11351 = 0, i11352 = 0, i11353 = 0, i11354 = 0, i11355 = 0, i11356 = 0, i11357 = 0, i11358 = 0, i11359 = 0;
+        int i11360 = 0, i11361 = 0, i11362 = 0, i11363 = 0, i11364 = 0, i11365 = 0, i11366 = 0, i11367 = 0, i11368 = 0, i11369 = 0;
+        int i11370 = 0, i11371 = 0, i11372 = 0, i11373 = 0, i11374 = 0, i11375 = 0, i11376 = 0, i11377 = 0, i11378 = 0, i11379 = 0;
+        int i11380 = 0, i11381 = 0, i11382 = 0, i11383 = 0, i11384 = 0, i11385 = 0, i11386 = 0, i11387 = 0, i11388 = 0, i11389 = 0;
+        int i11390 = 0, i11391 = 0, i11392 = 0, i11393 = 0, i11394 = 0, i11395 = 0, i11396 = 0, i11397 = 0, i11398 = 0, i11399 = 0;
+        int i11400 = 0, i11401 = 0, i11402 = 0, i11403 = 0, i11404 = 0, i11405 = 0, i11406 = 0, i11407 = 0, i11408 = 0, i11409 = 0;
+        int i11410 = 0, i11411 = 0, i11412 = 0, i11413 = 0, i11414 = 0, i11415 = 0, i11416 = 0, i11417 = 0, i11418 = 0, i11419 = 0;
+        int i11420 = 0, i11421 = 0, i11422 = 0, i11423 = 0, i11424 = 0, i11425 = 0, i11426 = 0, i11427 = 0, i11428 = 0, i11429 = 0;
+        int i11430 = 0, i11431 = 0, i11432 = 0, i11433 = 0, i11434 = 0, i11435 = 0, i11436 = 0, i11437 = 0, i11438 = 0, i11439 = 0;
+        int i11440 = 0, i11441 = 0, i11442 = 0, i11443 = 0, i11444 = 0, i11445 = 0, i11446 = 0, i11447 = 0, i11448 = 0, i11449 = 0;
+        int i11450 = 0, i11451 = 0, i11452 = 0, i11453 = 0, i11454 = 0, i11455 = 0, i11456 = 0, i11457 = 0, i11458 = 0, i11459 = 0;
+        int i11460 = 0, i11461 = 0, i11462 = 0, i11463 = 0, i11464 = 0, i11465 = 0, i11466 = 0, i11467 = 0, i11468 = 0, i11469 = 0;
+        int i11470 = 0, i11471 = 0, i11472 = 0, i11473 = 0, i11474 = 0, i11475 = 0, i11476 = 0, i11477 = 0, i11478 = 0, i11479 = 0;
+        int i11480 = 0, i11481 = 0, i11482 = 0, i11483 = 0, i11484 = 0, i11485 = 0, i11486 = 0, i11487 = 0, i11488 = 0, i11489 = 0;
+        int i11490 = 0, i11491 = 0, i11492 = 0, i11493 = 0, i11494 = 0, i11495 = 0, i11496 = 0, i11497 = 0, i11498 = 0, i11499 = 0;
+        int i11500 = 0, i11501 = 0, i11502 = 0, i11503 = 0, i11504 = 0, i11505 = 0, i11506 = 0, i11507 = 0, i11508 = 0, i11509 = 0;
+        int i11510 = 0, i11511 = 0, i11512 = 0, i11513 = 0, i11514 = 0, i11515 = 0, i11516 = 0, i11517 = 0, i11518 = 0, i11519 = 0;
+        int i11520 = 0, i11521 = 0, i11522 = 0, i11523 = 0, i11524 = 0, i11525 = 0, i11526 = 0, i11527 = 0, i11528 = 0, i11529 = 0;
+        int i11530 = 0, i11531 = 0, i11532 = 0, i11533 = 0, i11534 = 0, i11535 = 0, i11536 = 0, i11537 = 0, i11538 = 0, i11539 = 0;
+        int i11540 = 0, i11541 = 0, i11542 = 0, i11543 = 0, i11544 = 0, i11545 = 0, i11546 = 0, i11547 = 0, i11548 = 0, i11549 = 0;
+        int i11550 = 0, i11551 = 0, i11552 = 0, i11553 = 0, i11554 = 0, i11555 = 0, i11556 = 0, i11557 = 0, i11558 = 0, i11559 = 0;
+        int i11560 = 0, i11561 = 0, i11562 = 0, i11563 = 0, i11564 = 0, i11565 = 0, i11566 = 0, i11567 = 0, i11568 = 0, i11569 = 0;
+        int i11570 = 0, i11571 = 0, i11572 = 0, i11573 = 0, i11574 = 0, i11575 = 0, i11576 = 0, i11577 = 0, i11578 = 0, i11579 = 0;
+        int i11580 = 0, i11581 = 0, i11582 = 0, i11583 = 0, i11584 = 0, i11585 = 0, i11586 = 0, i11587 = 0, i11588 = 0, i11589 = 0;
+        int i11590 = 0, i11591 = 0, i11592 = 0, i11593 = 0, i11594 = 0, i11595 = 0, i11596 = 0, i11597 = 0, i11598 = 0, i11599 = 0;
+        int i11600 = 0, i11601 = 0, i11602 = 0, i11603 = 0, i11604 = 0, i11605 = 0, i11606 = 0, i11607 = 0, i11608 = 0, i11609 = 0;
+        int i11610 = 0, i11611 = 0, i11612 = 0, i11613 = 0, i11614 = 0, i11615 = 0, i11616 = 0, i11617 = 0, i11618 = 0, i11619 = 0;
+        int i11620 = 0, i11621 = 0, i11622 = 0, i11623 = 0, i11624 = 0, i11625 = 0, i11626 = 0, i11627 = 0, i11628 = 0, i11629 = 0;
+        int i11630 = 0, i11631 = 0, i11632 = 0, i11633 = 0, i11634 = 0, i11635 = 0, i11636 = 0, i11637 = 0, i11638 = 0, i11639 = 0;
+        int i11640 = 0, i11641 = 0, i11642 = 0, i11643 = 0, i11644 = 0, i11645 = 0, i11646 = 0, i11647 = 0, i11648 = 0, i11649 = 0;
+        int i11650 = 0, i11651 = 0, i11652 = 0, i11653 = 0, i11654 = 0, i11655 = 0, i11656 = 0, i11657 = 0, i11658 = 0, i11659 = 0;
+        int i11660 = 0, i11661 = 0, i11662 = 0, i11663 = 0, i11664 = 0, i11665 = 0, i11666 = 0, i11667 = 0, i11668 = 0, i11669 = 0;
+        int i11670 = 0, i11671 = 0, i11672 = 0, i11673 = 0, i11674 = 0, i11675 = 0, i11676 = 0, i11677 = 0, i11678 = 0, i11679 = 0;
+        int i11680 = 0, i11681 = 0, i11682 = 0, i11683 = 0, i11684 = 0, i11685 = 0, i11686 = 0, i11687 = 0, i11688 = 0, i11689 = 0;
+        int i11690 = 0, i11691 = 0, i11692 = 0, i11693 = 0, i11694 = 0, i11695 = 0, i11696 = 0, i11697 = 0, i11698 = 0, i11699 = 0;
+        int i11700 = 0, i11701 = 0, i11702 = 0, i11703 = 0, i11704 = 0, i11705 = 0, i11706 = 0, i11707 = 0, i11708 = 0, i11709 = 0;
+        int i11710 = 0, i11711 = 0, i11712 = 0, i11713 = 0, i11714 = 0, i11715 = 0, i11716 = 0, i11717 = 0, i11718 = 0, i11719 = 0;
+        int i11720 = 0, i11721 = 0, i11722 = 0, i11723 = 0, i11724 = 0, i11725 = 0, i11726 = 0, i11727 = 0, i11728 = 0, i11729 = 0;
+        int i11730 = 0, i11731 = 0, i11732 = 0, i11733 = 0, i11734 = 0, i11735 = 0, i11736 = 0, i11737 = 0, i11738 = 0, i11739 = 0;
+        int i11740 = 0, i11741 = 0, i11742 = 0, i11743 = 0, i11744 = 0, i11745 = 0, i11746 = 0, i11747 = 0, i11748 = 0, i11749 = 0;
+        int i11750 = 0, i11751 = 0, i11752 = 0, i11753 = 0, i11754 = 0, i11755 = 0, i11756 = 0, i11757 = 0, i11758 = 0, i11759 = 0;
+        int i11760 = 0, i11761 = 0, i11762 = 0, i11763 = 0, i11764 = 0, i11765 = 0, i11766 = 0, i11767 = 0, i11768 = 0, i11769 = 0;
+        int i11770 = 0, i11771 = 0, i11772 = 0, i11773 = 0, i11774 = 0, i11775 = 0, i11776 = 0, i11777 = 0, i11778 = 0, i11779 = 0;
+        int i11780 = 0, i11781 = 0, i11782 = 0, i11783 = 0, i11784 = 0, i11785 = 0, i11786 = 0, i11787 = 0, i11788 = 0, i11789 = 0;
+        int i11790 = 0, i11791 = 0, i11792 = 0, i11793 = 0, i11794 = 0, i11795 = 0, i11796 = 0, i11797 = 0, i11798 = 0, i11799 = 0;
+        int i11800 = 0, i11801 = 0, i11802 = 0, i11803 = 0, i11804 = 0, i11805 = 0, i11806 = 0, i11807 = 0, i11808 = 0, i11809 = 0;
+        int i11810 = 0, i11811 = 0, i11812 = 0, i11813 = 0, i11814 = 0, i11815 = 0, i11816 = 0, i11817 = 0, i11818 = 0, i11819 = 0;
+        int i11820 = 0, i11821 = 0, i11822 = 0, i11823 = 0, i11824 = 0, i11825 = 0, i11826 = 0, i11827 = 0, i11828 = 0, i11829 = 0;
+        int i11830 = 0, i11831 = 0, i11832 = 0, i11833 = 0, i11834 = 0, i11835 = 0, i11836 = 0, i11837 = 0, i11838 = 0, i11839 = 0;
+        int i11840 = 0, i11841 = 0, i11842 = 0, i11843 = 0, i11844 = 0, i11845 = 0, i11846 = 0, i11847 = 0, i11848 = 0, i11849 = 0;
+        int i11850 = 0, i11851 = 0, i11852 = 0, i11853 = 0, i11854 = 0, i11855 = 0, i11856 = 0, i11857 = 0, i11858 = 0, i11859 = 0;
+        int i11860 = 0, i11861 = 0, i11862 = 0, i11863 = 0, i11864 = 0, i11865 = 0, i11866 = 0, i11867 = 0, i11868 = 0, i11869 = 0;
+        int i11870 = 0, i11871 = 0, i11872 = 0, i11873 = 0, i11874 = 0, i11875 = 0, i11876 = 0, i11877 = 0, i11878 = 0, i11879 = 0;
+        int i11880 = 0, i11881 = 0, i11882 = 0, i11883 = 0, i11884 = 0, i11885 = 0, i11886 = 0, i11887 = 0, i11888 = 0, i11889 = 0;
+        int i11890 = 0, i11891 = 0, i11892 = 0, i11893 = 0, i11894 = 0, i11895 = 0, i11896 = 0, i11897 = 0, i11898 = 0, i11899 = 0;
+        int i11900 = 0, i11901 = 0, i11902 = 0, i11903 = 0, i11904 = 0, i11905 = 0, i11906 = 0, i11907 = 0, i11908 = 0, i11909 = 0;
+        int i11910 = 0, i11911 = 0, i11912 = 0, i11913 = 0, i11914 = 0, i11915 = 0, i11916 = 0, i11917 = 0, i11918 = 0, i11919 = 0;
+        int i11920 = 0, i11921 = 0, i11922 = 0, i11923 = 0, i11924 = 0, i11925 = 0, i11926 = 0, i11927 = 0, i11928 = 0, i11929 = 0;
+        int i11930 = 0, i11931 = 0, i11932 = 0, i11933 = 0, i11934 = 0, i11935 = 0, i11936 = 0, i11937 = 0, i11938 = 0, i11939 = 0;
+        int i11940 = 0, i11941 = 0, i11942 = 0, i11943 = 0, i11944 = 0, i11945 = 0, i11946 = 0, i11947 = 0, i11948 = 0, i11949 = 0;
+        int i11950 = 0, i11951 = 0, i11952 = 0, i11953 = 0, i11954 = 0, i11955 = 0, i11956 = 0, i11957 = 0, i11958 = 0, i11959 = 0;
+        int i11960 = 0, i11961 = 0, i11962 = 0, i11963 = 0, i11964 = 0, i11965 = 0, i11966 = 0, i11967 = 0, i11968 = 0, i11969 = 0;
+        int i11970 = 0, i11971 = 0, i11972 = 0, i11973 = 0, i11974 = 0, i11975 = 0, i11976 = 0, i11977 = 0, i11978 = 0, i11979 = 0;
+        int i11980 = 0, i11981 = 0, i11982 = 0, i11983 = 0, i11984 = 0, i11985 = 0, i11986 = 0, i11987 = 0, i11988 = 0, i11989 = 0;
+        int i11990 = 0, i11991 = 0, i11992 = 0, i11993 = 0, i11994 = 0, i11995 = 0, i11996 = 0, i11997 = 0, i11998 = 0, i11999 = 0;
+        int i12000 = 0, i12001 = 0, i12002 = 0, i12003 = 0, i12004 = 0, i12005 = 0, i12006 = 0, i12007 = 0, i12008 = 0, i12009 = 0;
+        int i12010 = 0, i12011 = 0, i12012 = 0, i12013 = 0, i12014 = 0, i12015 = 0, i12016 = 0, i12017 = 0, i12018 = 0, i12019 = 0;
+        int i12020 = 0, i12021 = 0, i12022 = 0, i12023 = 0, i12024 = 0, i12025 = 0, i12026 = 0, i12027 = 0, i12028 = 0, i12029 = 0;
+        int i12030 = 0, i12031 = 0, i12032 = 0, i12033 = 0, i12034 = 0, i12035 = 0, i12036 = 0, i12037 = 0, i12038 = 0, i12039 = 0;
+        int i12040 = 0, i12041 = 0, i12042 = 0, i12043 = 0, i12044 = 0, i12045 = 0, i12046 = 0, i12047 = 0, i12048 = 0, i12049 = 0;
+        int i12050 = 0, i12051 = 0, i12052 = 0, i12053 = 0, i12054 = 0, i12055 = 0, i12056 = 0, i12057 = 0, i12058 = 0, i12059 = 0;
+        int i12060 = 0, i12061 = 0, i12062 = 0, i12063 = 0, i12064 = 0, i12065 = 0, i12066 = 0, i12067 = 0, i12068 = 0, i12069 = 0;
+        int i12070 = 0, i12071 = 0, i12072 = 0, i12073 = 0, i12074 = 0, i12075 = 0, i12076 = 0, i12077 = 0, i12078 = 0, i12079 = 0;
+        int i12080 = 0, i12081 = 0, i12082 = 0, i12083 = 0, i12084 = 0, i12085 = 0, i12086 = 0, i12087 = 0, i12088 = 0, i12089 = 0;
+        int i12090 = 0, i12091 = 0, i12092 = 0, i12093 = 0, i12094 = 0, i12095 = 0, i12096 = 0, i12097 = 0, i12098 = 0, i12099 = 0;
+        int i12100 = 0, i12101 = 0, i12102 = 0, i12103 = 0, i12104 = 0, i12105 = 0, i12106 = 0, i12107 = 0, i12108 = 0, i12109 = 0;
+        int i12110 = 0, i12111 = 0, i12112 = 0, i12113 = 0, i12114 = 0, i12115 = 0, i12116 = 0, i12117 = 0, i12118 = 0, i12119 = 0;
+        int i12120 = 0, i12121 = 0, i12122 = 0, i12123 = 0, i12124 = 0, i12125 = 0, i12126 = 0, i12127 = 0, i12128 = 0, i12129 = 0;
+        int i12130 = 0, i12131 = 0, i12132 = 0, i12133 = 0, i12134 = 0, i12135 = 0, i12136 = 0, i12137 = 0, i12138 = 0, i12139 = 0;
+        int i12140 = 0, i12141 = 0, i12142 = 0, i12143 = 0, i12144 = 0, i12145 = 0, i12146 = 0, i12147 = 0, i12148 = 0, i12149 = 0;
+        int i12150 = 0, i12151 = 0, i12152 = 0, i12153 = 0, i12154 = 0, i12155 = 0, i12156 = 0, i12157 = 0, i12158 = 0, i12159 = 0;
+        int i12160 = 0, i12161 = 0, i12162 = 0, i12163 = 0, i12164 = 0, i12165 = 0, i12166 = 0, i12167 = 0, i12168 = 0, i12169 = 0;
+        int i12170 = 0, i12171 = 0, i12172 = 0, i12173 = 0, i12174 = 0, i12175 = 0, i12176 = 0, i12177 = 0, i12178 = 0, i12179 = 0;
+        int i12180 = 0, i12181 = 0, i12182 = 0, i12183 = 0, i12184 = 0, i12185 = 0, i12186 = 0, i12187 = 0, i12188 = 0, i12189 = 0;
+        int i12190 = 0, i12191 = 0, i12192 = 0, i12193 = 0, i12194 = 0, i12195 = 0, i12196 = 0, i12197 = 0, i12198 = 0, i12199 = 0;
+        int i12200 = 0, i12201 = 0, i12202 = 0, i12203 = 0, i12204 = 0, i12205 = 0, i12206 = 0, i12207 = 0, i12208 = 0, i12209 = 0;
+        int i12210 = 0, i12211 = 0, i12212 = 0, i12213 = 0, i12214 = 0, i12215 = 0, i12216 = 0, i12217 = 0, i12218 = 0, i12219 = 0;
+        int i12220 = 0, i12221 = 0, i12222 = 0, i12223 = 0, i12224 = 0, i12225 = 0, i12226 = 0, i12227 = 0, i12228 = 0, i12229 = 0;
+        int i12230 = 0, i12231 = 0, i12232 = 0, i12233 = 0, i12234 = 0, i12235 = 0, i12236 = 0, i12237 = 0, i12238 = 0, i12239 = 0;
+        int i12240 = 0, i12241 = 0, i12242 = 0, i12243 = 0, i12244 = 0, i12245 = 0, i12246 = 0, i12247 = 0, i12248 = 0, i12249 = 0;
+        int i12250 = 0, i12251 = 0, i12252 = 0, i12253 = 0, i12254 = 0, i12255 = 0, i12256 = 0, i12257 = 0, i12258 = 0, i12259 = 0;
+        int i12260 = 0, i12261 = 0, i12262 = 0, i12263 = 0, i12264 = 0, i12265 = 0, i12266 = 0, i12267 = 0, i12268 = 0, i12269 = 0;
+        int i12270 = 0, i12271 = 0, i12272 = 0, i12273 = 0, i12274 = 0, i12275 = 0, i12276 = 0, i12277 = 0, i12278 = 0, i12279 = 0;
+        int i12280 = 0, i12281 = 0, i12282 = 0, i12283 = 0, i12284 = 0, i12285 = 0, i12286 = 0, i12287 = 0, i12288 = 0, i12289 = 0;
+        int i12290 = 0, i12291 = 0, i12292 = 0, i12293 = 0, i12294 = 0, i12295 = 0, i12296 = 0, i12297 = 0, i12298 = 0, i12299 = 0;
+        int i12300 = 0, i12301 = 0, i12302 = 0, i12303 = 0, i12304 = 0, i12305 = 0, i12306 = 0, i12307 = 0, i12308 = 0, i12309 = 0;
+        int i12310 = 0, i12311 = 0, i12312 = 0, i12313 = 0, i12314 = 0, i12315 = 0, i12316 = 0, i12317 = 0, i12318 = 0, i12319 = 0;
+        int i12320 = 0, i12321 = 0, i12322 = 0, i12323 = 0, i12324 = 0, i12325 = 0, i12326 = 0, i12327 = 0, i12328 = 0, i12329 = 0;
+        int i12330 = 0, i12331 = 0, i12332 = 0, i12333 = 0, i12334 = 0, i12335 = 0, i12336 = 0, i12337 = 0, i12338 = 0, i12339 = 0;
+        int i12340 = 0, i12341 = 0, i12342 = 0, i12343 = 0, i12344 = 0, i12345 = 0, i12346 = 0, i12347 = 0, i12348 = 0, i12349 = 0;
+        int i12350 = 0, i12351 = 0, i12352 = 0, i12353 = 0, i12354 = 0, i12355 = 0, i12356 = 0, i12357 = 0, i12358 = 0, i12359 = 0;
+        int i12360 = 0, i12361 = 0, i12362 = 0, i12363 = 0, i12364 = 0, i12365 = 0, i12366 = 0, i12367 = 0, i12368 = 0, i12369 = 0;
+        int i12370 = 0, i12371 = 0, i12372 = 0, i12373 = 0, i12374 = 0, i12375 = 0, i12376 = 0, i12377 = 0, i12378 = 0, i12379 = 0;
+        int i12380 = 0, i12381 = 0, i12382 = 0, i12383 = 0, i12384 = 0, i12385 = 0, i12386 = 0, i12387 = 0, i12388 = 0, i12389 = 0;
+        int i12390 = 0, i12391 = 0, i12392 = 0, i12393 = 0, i12394 = 0, i12395 = 0, i12396 = 0, i12397 = 0, i12398 = 0, i12399 = 0;
+        int i12400 = 0, i12401 = 0, i12402 = 0, i12403 = 0, i12404 = 0, i12405 = 0, i12406 = 0, i12407 = 0, i12408 = 0, i12409 = 0;
+        int i12410 = 0, i12411 = 0, i12412 = 0, i12413 = 0, i12414 = 0, i12415 = 0, i12416 = 0, i12417 = 0, i12418 = 0, i12419 = 0;
+        int i12420 = 0, i12421 = 0, i12422 = 0, i12423 = 0, i12424 = 0, i12425 = 0, i12426 = 0, i12427 = 0, i12428 = 0, i12429 = 0;
+        int i12430 = 0, i12431 = 0, i12432 = 0, i12433 = 0, i12434 = 0, i12435 = 0, i12436 = 0, i12437 = 0, i12438 = 0, i12439 = 0;
+        int i12440 = 0, i12441 = 0, i12442 = 0, i12443 = 0, i12444 = 0, i12445 = 0, i12446 = 0, i12447 = 0, i12448 = 0, i12449 = 0;
+        int i12450 = 0, i12451 = 0, i12452 = 0, i12453 = 0, i12454 = 0, i12455 = 0, i12456 = 0, i12457 = 0, i12458 = 0, i12459 = 0;
+        int i12460 = 0, i12461 = 0, i12462 = 0, i12463 = 0, i12464 = 0, i12465 = 0, i12466 = 0, i12467 = 0, i12468 = 0, i12469 = 0;
+        int i12470 = 0, i12471 = 0, i12472 = 0, i12473 = 0, i12474 = 0, i12475 = 0, i12476 = 0, i12477 = 0, i12478 = 0, i12479 = 0;
+        int i12480 = 0, i12481 = 0, i12482 = 0, i12483 = 0, i12484 = 0, i12485 = 0, i12486 = 0, i12487 = 0, i12488 = 0, i12489 = 0;
+        int i12490 = 0, i12491 = 0, i12492 = 0, i12493 = 0, i12494 = 0, i12495 = 0, i12496 = 0, i12497 = 0, i12498 = 0, i12499 = 0;
+        int i12500 = 0, i12501 = 0, i12502 = 0, i12503 = 0, i12504 = 0, i12505 = 0, i12506 = 0, i12507 = 0, i12508 = 0, i12509 = 0;
+        int i12510 = 0, i12511 = 0, i12512 = 0, i12513 = 0, i12514 = 0, i12515 = 0, i12516 = 0, i12517 = 0, i12518 = 0, i12519 = 0;
+        int i12520 = 0, i12521 = 0, i12522 = 0, i12523 = 0, i12524 = 0, i12525 = 0, i12526 = 0, i12527 = 0, i12528 = 0, i12529 = 0;
+        int i12530 = 0, i12531 = 0, i12532 = 0, i12533 = 0, i12534 = 0, i12535 = 0, i12536 = 0, i12537 = 0, i12538 = 0, i12539 = 0;
+        int i12540 = 0, i12541 = 0, i12542 = 0, i12543 = 0, i12544 = 0, i12545 = 0, i12546 = 0, i12547 = 0, i12548 = 0, i12549 = 0;
+        int i12550 = 0, i12551 = 0, i12552 = 0, i12553 = 0, i12554 = 0, i12555 = 0, i12556 = 0, i12557 = 0, i12558 = 0, i12559 = 0;
+        int i12560 = 0, i12561 = 0, i12562 = 0, i12563 = 0, i12564 = 0, i12565 = 0, i12566 = 0, i12567 = 0, i12568 = 0, i12569 = 0;
+        int i12570 = 0, i12571 = 0, i12572 = 0, i12573 = 0, i12574 = 0, i12575 = 0, i12576 = 0, i12577 = 0, i12578 = 0, i12579 = 0;
+        int i12580 = 0, i12581 = 0, i12582 = 0, i12583 = 0, i12584 = 0, i12585 = 0, i12586 = 0, i12587 = 0, i12588 = 0, i12589 = 0;
+        int i12590 = 0, i12591 = 0, i12592 = 0, i12593 = 0, i12594 = 0, i12595 = 0, i12596 = 0, i12597 = 0, i12598 = 0, i12599 = 0;
+        int i12600 = 0, i12601 = 0, i12602 = 0, i12603 = 0, i12604 = 0, i12605 = 0, i12606 = 0, i12607 = 0, i12608 = 0, i12609 = 0;
+        int i12610 = 0, i12611 = 0, i12612 = 0, i12613 = 0, i12614 = 0, i12615 = 0, i12616 = 0, i12617 = 0, i12618 = 0, i12619 = 0;
+        int i12620 = 0, i12621 = 0, i12622 = 0, i12623 = 0, i12624 = 0, i12625 = 0, i12626 = 0, i12627 = 0, i12628 = 0, i12629 = 0;
+        int i12630 = 0, i12631 = 0, i12632 = 0, i12633 = 0, i12634 = 0, i12635 = 0, i12636 = 0, i12637 = 0, i12638 = 0, i12639 = 0;
+        int i12640 = 0, i12641 = 0, i12642 = 0, i12643 = 0, i12644 = 0, i12645 = 0, i12646 = 0, i12647 = 0, i12648 = 0, i12649 = 0;
+        int i12650 = 0, i12651 = 0, i12652 = 0, i12653 = 0, i12654 = 0, i12655 = 0, i12656 = 0, i12657 = 0, i12658 = 0, i12659 = 0;
+        int i12660 = 0, i12661 = 0, i12662 = 0, i12663 = 0, i12664 = 0, i12665 = 0, i12666 = 0, i12667 = 0, i12668 = 0, i12669 = 0;
+        int i12670 = 0, i12671 = 0, i12672 = 0, i12673 = 0, i12674 = 0, i12675 = 0, i12676 = 0, i12677 = 0, i12678 = 0, i12679 = 0;
+        int i12680 = 0, i12681 = 0, i12682 = 0, i12683 = 0, i12684 = 0, i12685 = 0, i12686 = 0, i12687 = 0, i12688 = 0, i12689 = 0;
+        int i12690 = 0, i12691 = 0, i12692 = 0, i12693 = 0, i12694 = 0, i12695 = 0, i12696 = 0, i12697 = 0, i12698 = 0, i12699 = 0;
+        int i12700 = 0, i12701 = 0, i12702 = 0, i12703 = 0, i12704 = 0, i12705 = 0, i12706 = 0, i12707 = 0, i12708 = 0, i12709 = 0;
+        int i12710 = 0, i12711 = 0, i12712 = 0, i12713 = 0, i12714 = 0, i12715 = 0, i12716 = 0, i12717 = 0, i12718 = 0, i12719 = 0;
+        int i12720 = 0, i12721 = 0, i12722 = 0, i12723 = 0, i12724 = 0, i12725 = 0, i12726 = 0, i12727 = 0, i12728 = 0, i12729 = 0;
+        int i12730 = 0, i12731 = 0, i12732 = 0, i12733 = 0, i12734 = 0, i12735 = 0, i12736 = 0, i12737 = 0, i12738 = 0, i12739 = 0;
+        int i12740 = 0, i12741 = 0, i12742 = 0, i12743 = 0, i12744 = 0, i12745 = 0, i12746 = 0, i12747 = 0, i12748 = 0, i12749 = 0;
+        int i12750 = 0, i12751 = 0, i12752 = 0, i12753 = 0, i12754 = 0, i12755 = 0, i12756 = 0, i12757 = 0, i12758 = 0, i12759 = 0;
+        int i12760 = 0, i12761 = 0, i12762 = 0, i12763 = 0, i12764 = 0, i12765 = 0, i12766 = 0, i12767 = 0, i12768 = 0, i12769 = 0;
+        int i12770 = 0, i12771 = 0, i12772 = 0, i12773 = 0, i12774 = 0, i12775 = 0, i12776 = 0, i12777 = 0, i12778 = 0, i12779 = 0;
+        int i12780 = 0, i12781 = 0, i12782 = 0, i12783 = 0, i12784 = 0, i12785 = 0, i12786 = 0, i12787 = 0, i12788 = 0, i12789 = 0;
+        int i12790 = 0, i12791 = 0, i12792 = 0, i12793 = 0, i12794 = 0, i12795 = 0, i12796 = 0, i12797 = 0, i12798 = 0, i12799 = 0;
+        int i12800 = 0, i12801 = 0, i12802 = 0, i12803 = 0, i12804 = 0, i12805 = 0, i12806 = 0, i12807 = 0, i12808 = 0, i12809 = 0;
+        int i12810 = 0, i12811 = 0, i12812 = 0, i12813 = 0, i12814 = 0, i12815 = 0, i12816 = 0, i12817 = 0, i12818 = 0, i12819 = 0;
+        int i12820 = 0, i12821 = 0, i12822 = 0, i12823 = 0, i12824 = 0, i12825 = 0, i12826 = 0, i12827 = 0, i12828 = 0, i12829 = 0;
+        int i12830 = 0, i12831 = 0, i12832 = 0, i12833 = 0, i12834 = 0, i12835 = 0, i12836 = 0, i12837 = 0, i12838 = 0, i12839 = 0;
+        int i12840 = 0, i12841 = 0, i12842 = 0, i12843 = 0, i12844 = 0, i12845 = 0, i12846 = 0, i12847 = 0, i12848 = 0, i12849 = 0;
+        int i12850 = 0, i12851 = 0, i12852 = 0, i12853 = 0, i12854 = 0, i12855 = 0, i12856 = 0, i12857 = 0, i12858 = 0, i12859 = 0;
+        int i12860 = 0, i12861 = 0, i12862 = 0, i12863 = 0, i12864 = 0, i12865 = 0, i12866 = 0, i12867 = 0, i12868 = 0, i12869 = 0;
+        int i12870 = 0, i12871 = 0, i12872 = 0, i12873 = 0, i12874 = 0, i12875 = 0, i12876 = 0, i12877 = 0, i12878 = 0, i12879 = 0;
+        int i12880 = 0, i12881 = 0, i12882 = 0, i12883 = 0, i12884 = 0, i12885 = 0, i12886 = 0, i12887 = 0, i12888 = 0, i12889 = 0;
+        int i12890 = 0, i12891 = 0, i12892 = 0, i12893 = 0, i12894 = 0, i12895 = 0, i12896 = 0, i12897 = 0, i12898 = 0, i12899 = 0;
+        int i12900 = 0, i12901 = 0, i12902 = 0, i12903 = 0, i12904 = 0, i12905 = 0, i12906 = 0, i12907 = 0, i12908 = 0, i12909 = 0;
+        int i12910 = 0, i12911 = 0, i12912 = 0, i12913 = 0, i12914 = 0, i12915 = 0, i12916 = 0, i12917 = 0, i12918 = 0, i12919 = 0;
+        int i12920 = 0, i12921 = 0, i12922 = 0, i12923 = 0, i12924 = 0, i12925 = 0, i12926 = 0, i12927 = 0, i12928 = 0, i12929 = 0;
+        int i12930 = 0, i12931 = 0, i12932 = 0, i12933 = 0, i12934 = 0, i12935 = 0, i12936 = 0, i12937 = 0, i12938 = 0, i12939 = 0;
+        int i12940 = 0, i12941 = 0, i12942 = 0, i12943 = 0, i12944 = 0, i12945 = 0, i12946 = 0, i12947 = 0, i12948 = 0, i12949 = 0;
+        int i12950 = 0, i12951 = 0, i12952 = 0, i12953 = 0, i12954 = 0, i12955 = 0, i12956 = 0, i12957 = 0, i12958 = 0, i12959 = 0;
+        int i12960 = 0, i12961 = 0, i12962 = 0, i12963 = 0, i12964 = 0, i12965 = 0, i12966 = 0, i12967 = 0, i12968 = 0, i12969 = 0;
+        int i12970 = 0, i12971 = 0, i12972 = 0, i12973 = 0, i12974 = 0, i12975 = 0, i12976 = 0, i12977 = 0, i12978 = 0, i12979 = 0;
+        int i12980 = 0, i12981 = 0, i12982 = 0, i12983 = 0, i12984 = 0, i12985 = 0, i12986 = 0, i12987 = 0, i12988 = 0, i12989 = 0;
+        int i12990 = 0, i12991 = 0, i12992 = 0, i12993 = 0, i12994 = 0, i12995 = 0, i12996 = 0, i12997 = 0, i12998 = 0, i12999 = 0;
+    }
+}
diff --git a/hotspot/test/compiler/7116216/StackOverflow.java b/hotspot/test/compiler/7116216/StackOverflow.java
new file mode 100644
index 00000000000..d77658ae74a
--- /dev/null
+++ b/hotspot/test/compiler/7116216/StackOverflow.java
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 7116216
+ * @summary The vm crashes when GC happens during throwing a StackOverflow exception
+ *
+ * @run main/othervm -Xcomp -Xbatch StackOverflow
+ */
+
+public class StackOverflow {
+    static String stackOverflow_largeFrame_liveOopForGC;
+
+    public static int stackOverflow_largeFrame(int call_count, String liveOopForGC) {
+        try {
+            int return_count = stackOverflow_largeFrame(++call_count, liveOopForGC);
+            if (return_count == 0) {
+                try {
+                    LargeFrame.method_with_many_locals(liveOopForGC, 2,3,4,5,6,7,liveOopForGC);
+                } catch (StackOverflowError e2) {
+                    // access liveOopForGC to make it a live variable
+                    stackOverflow_largeFrame_liveOopForGC = liveOopForGC;
+                }
+            }
+            return return_count - 1;
+        } catch (StackOverflowError e) {
+            // Return a value that is large enough such that no unrecoverable
+            // stack overflow will occur afterwards, but that is small enough
+            // such that calling LargeFrame.method_with_many_locals() will
+            // cause a StackOverflowError.
+            // Don't use a call here because we're out of stack space anyway!
+            int tmp = call_count / 2;
+            return (tmp < 100 ? tmp : 100);
+        }
+    }
+    public static void main(String args[]) {
+        LargeFrame.method_with_many_locals(new Object(), 2,3,4,5,6,7,new Object());
+
+        stackOverflow_largeFrame(0, "this is a live oop to test GC");
+        System.out.println("finished ok!");
+    }
+}
diff --git a/hotspot/test/compiler/7123108/Test7123108.java b/hotspot/test/compiler/7123108/Test7123108.java
new file mode 100644
index 00000000000..66d3a01db09
--- /dev/null
+++ b/hotspot/test/compiler/7123108/Test7123108.java
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 7123108
+ * @summary C1 crashes with assert(if_state != NULL) failed: states do not match up
+ *
+ * @run main/othervm -Xcomp Test7123108
+ */
+
+public class Test7123108 {
+
+    static class Test_Class_0 {
+        final static byte var_2 = 67;
+        byte var_3;
+    }
+
+    Object var_25 = "kgfpyhcms";
+    static long var_27 = 6899666748616086528L;
+
+    static float func_1()
+    {
+        return 0.0F;
+    }
+
+    private void test()
+    {
+        "dlwq".charAt(((short)'x' > var_27 | func_1() <= (((Test_Class_0)var_25).var_3) ? true : true) ? Test_Class_0.var_2 & (short)-1.1173839E38F : 'Y');
+    }
+
+    public static void main(String[] args)
+    {
+        Test7123108 t = new Test7123108();
+        try {
+            t.test();
+        } catch (Throwable e) { }
+    }
+}
diff --git a/hotspot/test/compiler/7125879/Test7125879.java b/hotspot/test/compiler/7125879/Test7125879.java
new file mode 100644
index 00000000000..729aac66939
--- /dev/null
+++ b/hotspot/test/compiler/7125879/Test7125879.java
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 7125879
+ * @summary assert(proj != NULL) failed: must be found
+ *
+ * @run main/othervm -Xcomp Test7125879
+ */
+
+public class Test7125879 {
+    String var_1 = "abc";
+
+    public Test7125879() {
+        var_1 = var_1.replaceAll("d", "e") + var_1;
+    }
+
+    public static void main(String[] args) {
+        Test7125879 t = new Test7125879();
+        try {
+            t.test();
+        } catch(Throwable e) { }
+    }
+
+    private void test() {
+        new Test7125879().var_1 = ((Test7125879)(new Object[-1])[0]).var_1;
+    }
+}
+
diff --git a/jdk/.hgtags b/jdk/.hgtags
index a0cd20022ac..58e1636aaaf 100644
--- a/jdk/.hgtags
+++ b/jdk/.hgtags
@@ -142,3 +142,4 @@ b71d1acfae5240d8c1359443cd02b5ddb587231c jdk8-b17
 334bd51fb3f321cd6777416ae7bafac71a84140a jdk8-b18
 3778f85773055e81eab6c5ef828935ecca241810 jdk8-b19
 39e938cd1b82ec3aab0a9aa66fd8a0457cd0c9c2 jdk8-b20
+664fa4fb0ee411ef048903c479f8b962fcdb2f4b jdk8-b21