From 0b467e902d591ae9feeec1669918d1588987cd1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Roberto=20Casta=C3=B1eda=20Lozano?= Date: Thu, 3 Oct 2024 08:36:33 +0000 Subject: [PATCH] 8334060: Implementation of Late Barrier Expansion for G1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Roberto Castañeda Lozano Co-authored-by: Erik Österlund Co-authored-by: Siyao Liu Co-authored-by: Kim Barrett Co-authored-by: Amit Kumar Co-authored-by: Martin Doerr Co-authored-by: Feilong Jiang Co-authored-by: Sergey Nazarkin Reviewed-by: kvn, tschatzl, fyang, ayang, kbarrett --- make/hotspot/gensrc/GensrcAdlc.gmk | 7 + src/hotspot/cpu/aarch64/aarch64.ad | 24 +- src/hotspot/cpu/aarch64/cas.m4 | 4 + .../gc/g1/g1BarrierSetAssembler_aarch64.cpp | 282 ++-- .../gc/g1/g1BarrierSetAssembler_aarch64.hpp | 23 + src/hotspot/cpu/aarch64/gc/g1/g1_aarch64.ad | 680 +++++++++ src/hotspot/cpu/aarch64/gc/g1/g1_aarch64.m4 | 384 ++++++ src/hotspot/cpu/arm/arm.ad | 4 + src/hotspot/cpu/arm/assembler_arm_32.hpp | 23 +- .../arm/gc/g1/g1BarrierSetAssembler_arm.cpp | 302 ++-- .../arm/gc/g1/g1BarrierSetAssembler_arm.hpp | 26 +- src/hotspot/cpu/arm/gc/g1/g1_arm.ad | 201 +++ .../arm/gc/shared/barrierSetAssembler_arm.cpp | 56 +- .../arm/gc/shared/barrierSetAssembler_arm.hpp | 24 + src/hotspot/cpu/arm/register_arm.hpp | 25 + .../ppc/gc/g1/g1BarrierSetAssembler_ppc.cpp | 294 ++-- .../ppc/gc/g1/g1BarrierSetAssembler_ppc.hpp | 25 + src/hotspot/cpu/ppc/gc/g1/g1_ppc.ad | 684 ++++++++++ src/hotspot/cpu/ppc/ppc.ad | 20 +- src/hotspot/cpu/ppc/register_ppc.hpp | 9 + .../gc/g1/g1BarrierSetAssembler_riscv.cpp | 288 ++-- .../gc/g1/g1BarrierSetAssembler_riscv.hpp | 25 +- src/hotspot/cpu/riscv/gc/g1/g1_riscv.ad | 564 ++++++++ src/hotspot/cpu/riscv/riscv.ad | 19 +- .../s390/gc/g1/g1BarrierSetAssembler_s390.cpp | 279 +++- .../s390/gc/g1/g1BarrierSetAssembler_s390.hpp | 28 +- src/hotspot/cpu/s390/gc/g1/g1_s390.ad | 457 +++++++ .../gc/shared/barrierSetAssembler_s390.cpp | 92 +- .../gc/shared/barrierSetAssembler_s390.hpp | 38 +- src/hotspot/cpu/s390/macroAssembler_s390.cpp | 3 +- src/hotspot/cpu/s390/register_s390.hpp | 8 + src/hotspot/cpu/s390/s390.ad | 18 +- .../x86/gc/g1/g1BarrierSetAssembler_x86.cpp | 301 ++-- .../x86/gc/g1/g1BarrierSetAssembler_x86.hpp | 23 + src/hotspot/cpu/x86/gc/g1/g1_x86_64.ad | 371 +++++ src/hotspot/cpu/x86/x86.ad | 4 + src/hotspot/cpu/x86/x86_64.ad | 16 +- src/hotspot/share/gc/g1/c2/g1BarrierSetC2.cpp | 1214 +++++------------ src/hotspot/share/gc/g1/c2/g1BarrierSetC2.hpp | 124 +- .../share/gc/g1/g1BarrierSetRuntime.cpp | 8 + .../share/gc/g1/g1BarrierSetRuntime.hpp | 4 + .../share/gc/shared/c2/barrierSetC2.cpp | 4 + .../share/gc/shared/c2/barrierSetC2.hpp | 4 + .../gc/shared/c2/cardTableBarrierSetC2.cpp | 29 - .../gc/shared/c2/cardTableBarrierSetC2.hpp | 2 - src/hotspot/share/opto/buildOopMap.cpp | 7 + src/hotspot/share/opto/lcm.cpp | 8 + src/hotspot/share/opto/matcher.cpp | 20 + src/hotspot/share/opto/matcher.hpp | 2 + src/hotspot/share/opto/memnode.cpp | 5 + src/hotspot/share/opto/output.cpp | 2 + .../compiler/c2/aarch64/TestVolatiles.java | 44 +- .../AllocationMergesTests.java | 9 +- .../gcbarriers/TestG1BarrierGeneration.java | 639 +++++++++ .../compiler/lib/ir_framework/IRNode.java | 102 ++ .../TestMachTempsAcrossSafepoints.java | 98 ++ .../src/sun/hotspot/tools/ctw/CtwRunner.java | 5 +- test/jdk/java/lang/invoke/BigArityTest.java | 2 +- 58 files changed, 6451 insertions(+), 1512 deletions(-) create mode 100644 src/hotspot/cpu/aarch64/gc/g1/g1_aarch64.ad create mode 
100644 src/hotspot/cpu/aarch64/gc/g1/g1_aarch64.m4 create mode 100644 src/hotspot/cpu/arm/gc/g1/g1_arm.ad create mode 100644 src/hotspot/cpu/ppc/gc/g1/g1_ppc.ad create mode 100644 src/hotspot/cpu/riscv/gc/g1/g1_riscv.ad create mode 100644 src/hotspot/cpu/s390/gc/g1/g1_s390.ad create mode 100644 src/hotspot/cpu/x86/gc/g1/g1_x86_64.ad create mode 100644 test/hotspot/jtreg/compiler/gcbarriers/TestG1BarrierGeneration.java create mode 100644 test/hotspot/jtreg/compiler/runtime/safepoints/TestMachTempsAcrossSafepoints.java diff --git a/make/hotspot/gensrc/GensrcAdlc.gmk b/make/hotspot/gensrc/GensrcAdlc.gmk index 8dada3cec0a..ddb2c3e33e5 100644 --- a/make/hotspot/gensrc/GensrcAdlc.gmk +++ b/make/hotspot/gensrc/GensrcAdlc.gmk @@ -200,6 +200,13 @@ ifeq ($(call check-jvm-feature, compiler2), true) ))) endif + ifeq ($(call check-jvm-feature, g1gc), true) + AD_SRC_FILES += $(call uniq, $(wildcard $(foreach d, $(AD_SRC_ROOTS), \ + $d/cpu/$(HOTSPOT_TARGET_CPU_ARCH)/gc/g1/g1_$(HOTSPOT_TARGET_CPU).ad \ + $d/cpu/$(HOTSPOT_TARGET_CPU_ARCH)/gc/g1/g1_$(HOTSPOT_TARGET_CPU_ARCH).ad \ + ))) + endif + SINGLE_AD_SRCFILE := $(ADLC_SUPPORT_DIR)/all-ad-src.ad INSERT_FILENAME_AWK_SCRIPT := \ diff --git a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad index 39eae43a287..7d2a35cefd8 100644 --- a/src/hotspot/cpu/aarch64/aarch64.ad +++ b/src/hotspot/cpu/aarch64/aarch64.ad @@ -2620,7 +2620,8 @@ static bool is_vector_bitwise_not_pattern(Node* n, Node* m) { bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) { if (is_vshift_con_pattern(n, m) || is_vector_bitwise_not_pattern(n, m) || - is_valid_sve_arith_imm_pattern(n, m)) { + is_valid_sve_arith_imm_pattern(n, m) || + is_encode_and_store_pattern(n, m)) { mstack.push(m, Visit); return true; } @@ -6410,7 +6411,7 @@ instruct loadP(iRegPNoSp dst, memory mem) instruct loadN(iRegNNoSp dst, memory mem) %{ match(Set dst (LoadN mem)); - predicate(!needs_acquiring_load(n)); + predicate(!needs_acquiring_load(n) && n->as_Load()->barrier_data() == 0); ins_cost(4 * INSN_COST); format %{ "ldrw $dst, $mem\t# compressed ptr" %} @@ -6839,7 +6840,7 @@ instruct storeimmP0(immP0 zero, memory mem) instruct storeN(iRegN src, memory mem) %{ match(Set mem (StoreN mem src)); - predicate(!needs_releasing_store(n)); + predicate(!needs_releasing_store(n) && n->as_Store()->barrier_data() == 0); ins_cost(INSN_COST); format %{ "strw $src, $mem\t# compressed ptr" %} @@ -6852,7 +6853,7 @@ instruct storeN(iRegN src, memory mem) instruct storeImmN0(immN0 zero, memory mem) %{ match(Set mem (StoreN mem zero)); - predicate(!needs_releasing_store(n)); + predicate(!needs_releasing_store(n) && n->as_Store()->barrier_data() == 0); ins_cost(INSN_COST); format %{ "strw zr, $mem\t# compressed ptr" %} @@ -7086,6 +7087,7 @@ instruct loadP_volatile(iRegPNoSp dst, /* sync_memory*/indirect mem) instruct loadN_volatile(iRegNNoSp dst, /* sync_memory*/indirect mem) %{ match(Set dst (LoadN mem)); + predicate(n->as_Load()->barrier_data() == 0); ins_cost(VOLATILE_REF_COST); format %{ "ldarw $dst, $mem\t# compressed ptr" %} @@ -7253,6 +7255,7 @@ instruct storeimmP0_volatile(immP0 zero, /* sync_memory*/indirect mem) instruct storeN_volatile(iRegN src, /* sync_memory*/indirect mem) %{ match(Set mem (StoreN mem src)); + predicate(n->as_Store()->barrier_data() == 0); ins_cost(VOLATILE_REF_COST); format %{ "stlrw $src, $mem\t# compressed ptr" %} @@ -7265,6 +7268,7 @@ instruct storeN_volatile(iRegN src, /* sync_memory*/indirect mem) instruct storeimmN0_volatile(immN0 zero, /* 
sync_memory*/indirect mem) %{ match(Set mem (StoreN mem zero)); + predicate(n->as_Store()->barrier_data() == 0); ins_cost(VOLATILE_REF_COST); format %{ "stlrw zr, $mem\t# compressed ptr" %} @@ -8061,6 +8065,7 @@ instruct compareAndSwapP(iRegINoSp res, indirect mem, iRegP oldval, iRegP newval instruct compareAndSwapN(iRegINoSp res, indirect mem, iRegNNoSp oldval, iRegNNoSp newval, rFlagsReg cr) %{ match(Set res (CompareAndSwapN mem (Binary oldval newval))); + predicate(n->as_LoadStore()->barrier_data() == 0); ins_cost(2 * VOLATILE_REF_COST); effect(KILL cr); @@ -8175,7 +8180,7 @@ instruct compareAndSwapPAcq(iRegINoSp res, indirect mem, iRegP oldval, iRegP new instruct compareAndSwapNAcq(iRegINoSp res, indirect mem, iRegNNoSp oldval, iRegNNoSp newval, rFlagsReg cr) %{ - predicate(needs_acquiring_load_exclusive(n)); + predicate(needs_acquiring_load_exclusive(n) && n->as_LoadStore()->barrier_data() == 0); match(Set res (CompareAndSwapN mem (Binary oldval newval))); ins_cost(VOLATILE_REF_COST); @@ -8280,6 +8285,7 @@ instruct compareAndExchangeL(iRegLNoSp res, indirect mem, iRegL oldval, iRegL ne // This pattern is generated automatically from cas.m4. // DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE instruct compareAndExchangeN(iRegNNoSp res, indirect mem, iRegN oldval, iRegN newval, rFlagsReg cr) %{ + predicate(n->as_LoadStore()->barrier_data() == 0); match(Set res (CompareAndExchangeN mem (Binary oldval newval))); ins_cost(2 * VOLATILE_REF_COST); effect(TEMP_DEF res, KILL cr); @@ -8389,7 +8395,7 @@ instruct compareAndExchangeLAcq(iRegLNoSp res, indirect mem, iRegL oldval, iRegL // This pattern is generated automatically from cas.m4. // DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE instruct compareAndExchangeNAcq(iRegNNoSp res, indirect mem, iRegN oldval, iRegN newval, rFlagsReg cr) %{ - predicate(needs_acquiring_load_exclusive(n)); + predicate(needs_acquiring_load_exclusive(n) && n->as_LoadStore()->barrier_data() == 0); match(Set res (CompareAndExchangeN mem (Binary oldval newval))); ins_cost(VOLATILE_REF_COST); effect(TEMP_DEF res, KILL cr); @@ -8501,6 +8507,7 @@ instruct weakCompareAndSwapL(iRegINoSp res, indirect mem, iRegL oldval, iRegL ne // This pattern is generated automatically from cas.m4. // DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE instruct weakCompareAndSwapN(iRegINoSp res, indirect mem, iRegN oldval, iRegN newval, rFlagsReg cr) %{ + predicate(n->as_LoadStore()->barrier_data() == 0); match(Set res (WeakCompareAndSwapN mem (Binary oldval newval))); ins_cost(2 * VOLATILE_REF_COST); effect(KILL cr); @@ -8620,7 +8627,7 @@ instruct weakCompareAndSwapLAcq(iRegINoSp res, indirect mem, iRegL oldval, iRegL // This pattern is generated automatically from cas.m4. 
// DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE instruct weakCompareAndSwapNAcq(iRegINoSp res, indirect mem, iRegN oldval, iRegN newval, rFlagsReg cr) %{ - predicate(needs_acquiring_load_exclusive(n)); + predicate(needs_acquiring_load_exclusive(n) && n->as_LoadStore()->barrier_data() == 0); match(Set res (WeakCompareAndSwapN mem (Binary oldval newval))); ins_cost(VOLATILE_REF_COST); effect(KILL cr); @@ -8681,6 +8688,7 @@ instruct get_and_setL(indirect mem, iRegL newv, iRegLNoSp prev) %{ %} instruct get_and_setN(indirect mem, iRegN newv, iRegINoSp prev) %{ + predicate(n->as_LoadStore()->barrier_data() == 0); match(Set prev (GetAndSetN mem newv)); ins_cost(2 * VOLATILE_REF_COST); format %{ "atomic_xchgw $prev, $newv, [$mem]" %} @@ -8724,7 +8732,7 @@ instruct get_and_setLAcq(indirect mem, iRegL newv, iRegLNoSp prev) %{ %} instruct get_and_setNAcq(indirect mem, iRegN newv, iRegINoSp prev) %{ - predicate(needs_acquiring_load_exclusive(n)); + predicate(needs_acquiring_load_exclusive(n) && n->as_LoadStore()->barrier_data() == 0); match(Set prev (GetAndSetN mem newv)); ins_cost(VOLATILE_REF_COST); format %{ "atomic_xchgw_acq $prev, $newv, [$mem]" %} diff --git a/src/hotspot/cpu/aarch64/cas.m4 b/src/hotspot/cpu/aarch64/cas.m4 index f8aac0c4939..7e13e153db1 100644 --- a/src/hotspot/cpu/aarch64/cas.m4 +++ b/src/hotspot/cpu/aarch64/cas.m4 @@ -45,7 +45,9 @@ define(`CAS_INSN', // DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE instruct compareAndExchange$1$6(iReg$2NoSp res, indirect mem, iReg$2 oldval, iReg$2 newval, rFlagsReg cr) %{ ifelse($1$6,PAcq,INDENT(predicate(needs_acquiring_load_exclusive(n) && (n->as_LoadStore()->barrier_data() == 0));), + $1$6,NAcq,INDENT(predicate(needs_acquiring_load_exclusive(n) && n->as_LoadStore()->barrier_data() == 0);), $1,P,INDENT(predicate(n->as_LoadStore()->barrier_data() == 0);), + $1,N,INDENT(predicate(n->as_LoadStore()->barrier_data() == 0);), $6,Acq,INDENT(predicate(needs_acquiring_load_exclusive(n));), `dnl') match(Set res (CompareAndExchange$1 mem (Binary oldval newval))); @@ -122,7 +124,9 @@ define(`CAS_INSN3', // DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE instruct weakCompareAndSwap$1$6(iRegINoSp res, indirect mem, iReg$2 oldval, iReg$2 newval, rFlagsReg cr) %{ ifelse($1$6,PAcq,INDENT(predicate(needs_acquiring_load_exclusive(n) && (n->as_LoadStore()->barrier_data() == 0));), + $1$6,NAcq,INDENT(predicate(needs_acquiring_load_exclusive(n) && n->as_LoadStore()->barrier_data() == 0);), $1,P,INDENT(predicate(n->as_LoadStore()->barrier_data() == 0);), + $1,N,INDENT(predicate(n->as_LoadStore()->barrier_data() == 0);), $6,Acq,INDENT(predicate(needs_acquiring_load_exclusive(n));), `dnl') match(Set res (WeakCompareAndSwap$1 mem (Binary oldval newval))); diff --git a/src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.cpp index d02038b6e91..b978c350ce1 100644 --- a/src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.cpp @@ -38,7 +38,10 @@ #include "c1/c1_LIRAssembler.hpp" #include "c1/c1_MacroAssembler.hpp" #include "gc/g1/c1/g1BarrierSetC1.hpp" -#endif +#endif // COMPILER1 +#ifdef COMPILER2 +#include "gc/g1/c2/g1BarrierSetC2.hpp" +#endif // COMPILER2 #define __ masm-> @@ -95,6 +98,54 @@ void G1BarrierSetAssembler::gen_write_ref_array_post_barrier(MacroAssembler* mas __ pop(saved_regs, sp); } +static void generate_queue_test_and_insertion(MacroAssembler* masm, ByteSize index_offset, ByteSize buffer_offset, 
Label& runtime, + const Register thread, const Register value, const Register temp1, const Register temp2) { + // Can we store a value in the given thread's buffer? + // (The index field is typed as size_t.) + __ ldr(temp1, Address(thread, in_bytes(index_offset))); // temp1 := *(index address) + __ cbz(temp1, runtime); // jump to runtime if index == 0 (full buffer) + // The buffer is not full, store value into it. + __ sub(temp1, temp1, wordSize); // temp1 := next index + __ str(temp1, Address(thread, in_bytes(index_offset))); // *(index address) := next index + __ ldr(temp2, Address(thread, in_bytes(buffer_offset))); // temp2 := buffer address + __ str(value, Address(temp2, temp1)); // *(buffer address + next index) := value +} + +static void generate_pre_barrier_fast_path(MacroAssembler* masm, + const Register thread, + const Register tmp1) { + Address in_progress(thread, in_bytes(G1ThreadLocalData::satb_mark_queue_active_offset())); + // Is marking active? + if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) { + __ ldrw(tmp1, in_progress); + } else { + assert(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption"); + __ ldrb(tmp1, in_progress); + } +} + +static void generate_pre_barrier_slow_path(MacroAssembler* masm, + const Register obj, + const Register pre_val, + const Register thread, + const Register tmp1, + const Register tmp2, + Label& done, + Label& runtime) { + // Do we need to load the previous value? + if (obj != noreg) { + __ load_heap_oop(pre_val, Address(obj, 0), noreg, noreg, AS_RAW); + } + // Is the previous value null? + __ cbz(pre_val, done); + generate_queue_test_and_insertion(masm, + G1ThreadLocalData::satb_mark_queue_index_offset(), + G1ThreadLocalData::satb_mark_queue_buffer_offset(), + runtime, + thread, pre_val, tmp1, tmp2); + __ b(done); +} + void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm, Register obj, Register pre_val, @@ -115,43 +166,10 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm, assert_different_registers(obj, pre_val, tmp1, tmp2); assert(pre_val != noreg && tmp1 != noreg && tmp2 != noreg, "expecting a register"); - Address in_progress(thread, in_bytes(G1ThreadLocalData::satb_mark_queue_active_offset())); - Address index(thread, in_bytes(G1ThreadLocalData::satb_mark_queue_index_offset())); - Address buffer(thread, in_bytes(G1ThreadLocalData::satb_mark_queue_buffer_offset())); - - // Is marking active? - if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) { - __ ldrw(tmp1, in_progress); - } else { - assert(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption"); - __ ldrb(tmp1, in_progress); - } + generate_pre_barrier_fast_path(masm, thread, tmp1); + // If marking is not active (*(mark queue active address) == 0), jump to done __ cbzw(tmp1, done); - - // Do we need to load the previous value? - if (obj != noreg) { - __ load_heap_oop(pre_val, Address(obj, 0), noreg, noreg, AS_RAW); - } - - // Is the previous value null? - __ cbz(pre_val, done); - - // Can we store original value in the thread's buffer? - // Is index == 0? - // (The index field is typed as size_t.) - - __ ldr(tmp1, index); // tmp := *index_adr - __ cbz(tmp1, runtime); // tmp == 0? 
- // If yes, goto runtime - - __ sub(tmp1, tmp1, wordSize); // tmp := tmp - wordSize - __ str(tmp1, index); // *index_adr := tmp - __ ldr(tmp2, buffer); - __ add(tmp1, tmp1, tmp2); // tmp := tmp + *buffer_adr - - // Record the previous value - __ str(pre_val, Address(tmp1, 0)); - __ b(done); + generate_pre_barrier_slow_path(masm, obj, pre_val, thread, tmp1, tmp2, done, runtime); __ bind(runtime); @@ -182,6 +200,50 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm, } +static void generate_post_barrier_fast_path(MacroAssembler* masm, + const Register store_addr, + const Register new_val, + const Register tmp1, + const Register tmp2, + Label& done, + bool new_val_may_be_null) { + // Does store cross heap regions? + __ eor(tmp1, store_addr, new_val); // tmp1 := store address ^ new value + __ lsr(tmp1, tmp1, G1HeapRegion::LogOfHRGrainBytes); // tmp1 := ((store address ^ new value) >> LogOfHRGrainBytes) + __ cbz(tmp1, done); + // Crosses regions, storing null? + if (new_val_may_be_null) { + __ cbz(new_val, done); + } + // Storing region crossing non-null, is card young? + __ lsr(tmp1, store_addr, CardTable::card_shift()); // tmp1 := card address relative to card table base + __ load_byte_map_base(tmp2); // tmp2 := card table base address + __ add(tmp1, tmp1, tmp2); // tmp1 := card address + __ ldrb(tmp2, Address(tmp1)); // tmp2 := card + __ cmpw(tmp2, (int)G1CardTable::g1_young_card_val()); // tmp2 := card == young_card_val? +} + +static void generate_post_barrier_slow_path(MacroAssembler* masm, + const Register thread, + const Register tmp1, + const Register tmp2, + Label& done, + Label& runtime) { + __ membar(Assembler::StoreLoad); // StoreLoad membar + __ ldrb(tmp2, Address(tmp1)); // tmp2 := card + __ cbzw(tmp2, done); + // Storing a region crossing, non-null oop, card is clean. + // Dirty card and log. + STATIC_ASSERT(CardTable::dirty_card_val() == 0); + __ strb(zr, Address(tmp1)); // *(card address) := dirty_card_val + generate_queue_test_and_insertion(masm, + G1ThreadLocalData::dirty_card_queue_index_offset(), + G1ThreadLocalData::dirty_card_queue_buffer_offset(), + runtime, + thread, tmp1, tmp2, rscratch1); + __ b(done); +} + void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, Register store_addr, Register new_val, @@ -194,70 +256,116 @@ void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, assert(store_addr != noreg && new_val != noreg && tmp1 != noreg && tmp2 != noreg, "expecting a register"); - Address queue_index(thread, in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset())); - Address buffer(thread, in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset())); - - BarrierSet* bs = BarrierSet::barrier_set(); - CardTableBarrierSet* ctbs = barrier_set_cast(bs); - CardTable* ct = ctbs->card_table(); - Label done; Label runtime; - // Does store cross heap regions? - - __ eor(tmp1, store_addr, new_val); - __ lsr(tmp1, tmp1, G1HeapRegion::LogOfHRGrainBytes); - __ cbz(tmp1, done); - - // crosses regions, storing null? - - __ cbz(new_val, done); - - // storing region crossing non-null, is card already dirty? 
- - const Register card_addr = tmp1; - - __ lsr(card_addr, store_addr, CardTable::card_shift()); - - // get the address of the card - __ load_byte_map_base(tmp2); - __ add(card_addr, card_addr, tmp2); - __ ldrb(tmp2, Address(card_addr)); - __ cmpw(tmp2, (int)G1CardTable::g1_young_card_val()); + generate_post_barrier_fast_path(masm, store_addr, new_val, tmp1, tmp2, done, true /* new_val_may_be_null */); + // If card is young, jump to done __ br(Assembler::EQ, done); - - assert((int)CardTable::dirty_card_val() == 0, "must be 0"); - - __ membar(Assembler::StoreLoad); - - __ ldrb(tmp2, Address(card_addr)); - __ cbzw(tmp2, done); - - // storing a region crossing, non-null oop, card is clean. - // dirty card and log. - - __ strb(zr, Address(card_addr)); - - __ ldr(rscratch1, queue_index); - __ cbz(rscratch1, runtime); - __ sub(rscratch1, rscratch1, wordSize); - __ str(rscratch1, queue_index); - - __ ldr(tmp2, buffer); - __ str(card_addr, Address(tmp2, rscratch1)); - __ b(done); + generate_post_barrier_slow_path(masm, thread, tmp1, tmp2, done, runtime); __ bind(runtime); // save the live input values RegSet saved = RegSet::of(store_addr); __ push(saved, sp); - __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), card_addr, thread); + __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), tmp1, thread); __ pop(saved, sp); __ bind(done); } +#if defined(COMPILER2) + +static void generate_c2_barrier_runtime_call(MacroAssembler* masm, G1BarrierStubC2* stub, const Register arg, const address runtime_path) { + SaveLiveRegisters save_registers(masm, stub); + if (c_rarg0 != arg) { + __ mov(c_rarg0, arg); + } + __ mov(c_rarg1, rthread); + __ mov(rscratch1, runtime_path); + __ blr(rscratch1); +} + +void G1BarrierSetAssembler::g1_write_barrier_pre_c2(MacroAssembler* masm, + Register obj, + Register pre_val, + Register thread, + Register tmp1, + Register tmp2, + G1PreBarrierStubC2* stub) { + assert(thread == rthread, "must be"); + assert_different_registers(obj, pre_val, tmp1, tmp2); + assert(pre_val != noreg && tmp1 != noreg && tmp2 != noreg, "expecting a register"); + + stub->initialize_registers(obj, pre_val, thread, tmp1, tmp2); + + generate_pre_barrier_fast_path(masm, thread, tmp1); + // If marking is active (*(mark queue active address) != 0), jump to stub (slow path) + __ cbnzw(tmp1, *stub->entry()); + + __ bind(*stub->continuation()); +} + +void G1BarrierSetAssembler::generate_c2_pre_barrier_stub(MacroAssembler* masm, + G1PreBarrierStubC2* stub) const { + Assembler::InlineSkippedInstructionsCounter skip_counter(masm); + Label runtime; + Register obj = stub->obj(); + Register pre_val = stub->pre_val(); + Register thread = stub->thread(); + Register tmp1 = stub->tmp1(); + Register tmp2 = stub->tmp2(); + + __ bind(*stub->entry()); + generate_pre_barrier_slow_path(masm, obj, pre_val, thread, tmp1, tmp2, *stub->continuation(), runtime); + + __ bind(runtime); + generate_c2_barrier_runtime_call(masm, stub, pre_val, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_pre_entry)); + __ b(*stub->continuation()); +} + +void G1BarrierSetAssembler::g1_write_barrier_post_c2(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register thread, + Register tmp1, + Register tmp2, + G1PostBarrierStubC2* stub) { + assert(thread == rthread, "must be"); + assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, + rscratch1); + assert(store_addr != noreg && new_val != noreg && tmp1 != noreg + && tmp2 != noreg, 
"expecting a register"); + + stub->initialize_registers(thread, tmp1, tmp2); + + bool new_val_may_be_null = (stub->barrier_data() & G1C2BarrierPostNotNull) == 0; + generate_post_barrier_fast_path(masm, store_addr, new_val, tmp1, tmp2, *stub->continuation(), new_val_may_be_null); + // If card is not young, jump to stub (slow path) + __ br(Assembler::NE, *stub->entry()); + + __ bind(*stub->continuation()); +} + +void G1BarrierSetAssembler::generate_c2_post_barrier_stub(MacroAssembler* masm, + G1PostBarrierStubC2* stub) const { + Assembler::InlineSkippedInstructionsCounter skip_counter(masm); + Label runtime; + Register thread = stub->thread(); + Register tmp1 = stub->tmp1(); // tmp1 holds the card address. + Register tmp2 = stub->tmp2(); + assert(stub->tmp3() == noreg, "not needed in this platform"); + + __ bind(*stub->entry()); + generate_post_barrier_slow_path(masm, thread, tmp1, tmp2, *stub->continuation(), runtime); + + __ bind(runtime); + generate_c2_barrier_runtime_call(masm, stub, tmp1, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry)); + __ b(*stub->continuation()); +} + +#endif // COMPILER2 + void G1BarrierSetAssembler::load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, Register dst, Address src, Register tmp1, Register tmp2) { bool on_oop = is_reference_type(type); diff --git a/src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.hpp b/src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.hpp index 7b4bc8cdc49..4baa18cb945 100644 --- a/src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.hpp @@ -33,6 +33,8 @@ class LIR_Assembler; class StubAssembler; class G1PreBarrierStub; class G1PostBarrierStub; +class G1PreBarrierStubC2; +class G1PostBarrierStubC2; class G1BarrierSetAssembler: public ModRefBarrierSetAssembler { protected: @@ -69,6 +71,27 @@ public: void generate_c1_post_barrier_runtime_stub(StubAssembler* sasm); #endif +#ifdef COMPILER2 + void g1_write_barrier_pre_c2(MacroAssembler* masm, + Register obj, + Register pre_val, + Register thread, + Register tmp1, + Register tmp2, + G1PreBarrierStubC2* c2_stub); + void generate_c2_pre_barrier_stub(MacroAssembler* masm, + G1PreBarrierStubC2* stub) const; + void g1_write_barrier_post_c2(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register thread, + Register tmp1, + Register tmp2, + G1PostBarrierStubC2* c2_stub); + void generate_c2_post_barrier_stub(MacroAssembler* masm, + G1PostBarrierStubC2* stub) const; +#endif + void load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, Register dst, Address src, Register tmp1, Register tmp2); }; diff --git a/src/hotspot/cpu/aarch64/gc/g1/g1_aarch64.ad b/src/hotspot/cpu/aarch64/gc/g1/g1_aarch64.ad new file mode 100644 index 00000000000..081a67d6880 --- /dev/null +++ b/src/hotspot/cpu/aarch64/gc/g1/g1_aarch64.ad @@ -0,0 +1,680 @@ +// +// Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. +// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +// +// This code is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License version 2 only, as +// published by the Free Software Foundation. +// +// This code is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License
+// version 2 for more details (a copy is included in the LICENSE file that
+// accompanied this code).
+//
+// You should have received a copy of the GNU General Public License version
+// 2 along with this work; if not, write to the Free Software Foundation,
+// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+//
+// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+// or visit www.oracle.com if you need additional information or have any
+// questions.
+//
+
+source_hpp %{
+
+#include "gc/g1/c2/g1BarrierSetC2.hpp"
+#include "gc/shared/gc_globals.hpp"
+
+%}
+
+source %{
+
+#include "gc/g1/g1BarrierSetAssembler_aarch64.hpp"
+#include "gc/g1/g1BarrierSetRuntime.hpp"
+
+static void write_barrier_pre(MacroAssembler* masm,
+                              const MachNode* node,
+                              Register obj,
+                              Register pre_val,
+                              Register tmp1,
+                              Register tmp2,
+                              RegSet preserve = RegSet(),
+                              RegSet no_preserve = RegSet()) {
+  if (!G1PreBarrierStubC2::needs_barrier(node)) {
+    return;
+  }
+  Assembler::InlineSkippedInstructionsCounter skip_counter(masm);
+  G1BarrierSetAssembler* g1_asm = static_cast<G1BarrierSetAssembler*>(BarrierSet::barrier_set()->barrier_set_assembler());
+  G1PreBarrierStubC2* const stub = G1PreBarrierStubC2::create(node);
+  for (RegSetIterator<Register> reg = preserve.begin(); *reg != noreg; ++reg) {
+    stub->preserve(*reg);
+  }
+  for (RegSetIterator<Register> reg = no_preserve.begin(); *reg != noreg; ++reg) {
+    stub->dont_preserve(*reg);
+  }
+  g1_asm->g1_write_barrier_pre_c2(masm, obj, pre_val, rthread, tmp1, tmp2, stub);
+}
+
+static void write_barrier_post(MacroAssembler* masm,
+                               const MachNode* node,
+                               Register store_addr,
+                               Register new_val,
+                               Register tmp1,
+                               Register tmp2) {
+  if (!G1PostBarrierStubC2::needs_barrier(node)) {
+    return;
+  }
+  Assembler::InlineSkippedInstructionsCounter skip_counter(masm);
+  G1BarrierSetAssembler* g1_asm = static_cast<G1BarrierSetAssembler*>(BarrierSet::barrier_set()->barrier_set_assembler());
+  G1PostBarrierStubC2* const stub = G1PostBarrierStubC2::create(node);
+  g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, rthread, tmp1, tmp2, stub);
+}
+
+%}
+
+// BEGIN This section of the file is automatically generated. Do not edit --------------
+
+// This section is generated from g1_aarch64.m4
+
+
+// This pattern is generated automatically from g1_aarch64.m4.
+// DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE
+instruct g1StoreP(indirect mem, iRegP src, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegPNoSp tmp3, rFlagsReg cr)
+%{
+  predicate(UseG1GC && !needs_releasing_store(n) && n->as_Store()->barrier_data() != 0);
+  match(Set mem (StoreP mem src));
+  effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr);
+  ins_cost(INSN_COST);
+  format %{ "str $src, $mem\t# ptr" %}
+  ins_encode %{
+    write_barrier_pre(masm, this,
+                      $mem$$Register /* obj */,
+                      $tmp1$$Register /* pre_val */,
+                      $tmp2$$Register /* tmp1 */,
+                      $tmp3$$Register /* tmp2 */,
+                      RegSet::of($mem$$Register, $src$$Register) /* preserve */);
+    __ str($src$$Register, $mem$$Register);
+    write_barrier_post(masm, this,
+                       $mem$$Register /* store_addr */,
+                       $src$$Register /* new_val */,
+                       $tmp2$$Register /* tmp1 */,
+                       $tmp3$$Register /* tmp2 */);
+  %}
+  ins_pipe(istore_reg_mem);
+%}
+
+// This pattern is generated automatically from g1_aarch64.m4.
+// DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE +instruct g1StorePVolatile(indirect mem, iRegP src, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegPNoSp tmp3, rFlagsReg cr) +%{ + predicate(UseG1GC && needs_releasing_store(n) && n->as_Store()->barrier_data() != 0); + match(Set mem (StoreP mem src)); + effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + ins_cost(VOLATILE_REF_COST); + format %{ "stlr $src, $mem\t# ptr" %} + ins_encode %{ + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */, + RegSet::of($mem$$Register, $src$$Register) /* preserve */); + __ stlr($src$$Register, $mem$$Register); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $src$$Register /* new_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + %} + ins_pipe(pipe_class_memory); +%} + +// This pattern is generated automatically from g1_aarch64.m4. +// DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE +instruct g1StoreN(indirect mem, iRegN src, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegPNoSp tmp3, rFlagsReg cr) +%{ + predicate(UseG1GC && !needs_releasing_store(n) && n->as_Store()->barrier_data() != 0); + match(Set mem (StoreN mem src)); + effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + ins_cost(INSN_COST); + format %{ "strw $src, $mem\t# compressed ptr" %} + ins_encode %{ + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */, + RegSet::of($mem$$Register, $src$$Register) /* preserve */); + __ strw($src$$Register, $mem$$Register); + if ((barrier_data() & G1C2BarrierPost) != 0) { + if ((barrier_data() & G1C2BarrierPostNotNull) == 0) { + __ decode_heap_oop($tmp1$$Register, $src$$Register); + } else { + __ decode_heap_oop_not_null($tmp1$$Register, $src$$Register); + } + } + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $tmp1$$Register /* new_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + %} + ins_pipe(istore_reg_mem); +%} + +// This pattern is generated automatically from g1_aarch64.m4. +// DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE +instruct g1StoreNVolatile(indirect mem, iRegN src, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegPNoSp tmp3, rFlagsReg cr) +%{ + predicate(UseG1GC && needs_releasing_store(n) && n->as_Store()->barrier_data() != 0); + match(Set mem (StoreN mem src)); + effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + ins_cost(VOLATILE_REF_COST); + format %{ "stlrw $src, $mem\t# compressed ptr" %} + ins_encode %{ + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */, + RegSet::of($mem$$Register, $src$$Register) /* preserve */); + __ stlrw($src$$Register, $mem$$Register); + if ((barrier_data() & G1C2BarrierPost) != 0) { + if ((barrier_data() & G1C2BarrierPostNotNull) == 0) { + __ decode_heap_oop($tmp1$$Register, $src$$Register); + } else { + __ decode_heap_oop_not_null($tmp1$$Register, $src$$Register); + } + } + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $tmp1$$Register /* new_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + %} + ins_pipe(pipe_class_memory); +%} + +// This pattern is generated automatically from g1_aarch64.m4. 
+// DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE +instruct g1EncodePAndStoreN(indirect mem, iRegP src, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegPNoSp tmp3, rFlagsReg cr) +%{ + predicate(UseG1GC && !needs_releasing_store(n) && n->as_Store()->barrier_data() != 0); + match(Set mem (StoreN mem (EncodeP src))); + effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + ins_cost(INSN_COST); + format %{ "encode_heap_oop $tmp1, $src\n\t" + "strw $tmp1, $mem\t# compressed ptr" %} + ins_encode %{ + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */, + RegSet::of($mem$$Register, $src$$Register) /* preserve */); + if ((barrier_data() & G1C2BarrierPostNotNull) == 0) { + __ encode_heap_oop($tmp1$$Register, $src$$Register); + } else { + __ encode_heap_oop_not_null($tmp1$$Register, $src$$Register); + } + __ strw($tmp1$$Register, $mem$$Register); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $src$$Register /* new_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + %} + ins_pipe(istore_reg_mem); +%} + +// This pattern is generated automatically from g1_aarch64.m4. +// DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE +instruct g1EncodePAndStoreNVolatile(indirect mem, iRegP src, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegPNoSp tmp3, rFlagsReg cr) +%{ + predicate(UseG1GC && needs_releasing_store(n) && n->as_Store()->barrier_data() != 0); + match(Set mem (StoreN mem (EncodeP src))); + effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + ins_cost(VOLATILE_REF_COST); + format %{ "encode_heap_oop $tmp1, $src\n\t" + "stlrw $tmp1, $mem\t# compressed ptr" %} + ins_encode %{ + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */, + RegSet::of($mem$$Register, $src$$Register) /* preserve */); + if ((barrier_data() & G1C2BarrierPostNotNull) == 0) { + __ encode_heap_oop($tmp1$$Register, $src$$Register); + } else { + __ encode_heap_oop_not_null($tmp1$$Register, $src$$Register); + } + __ stlrw($tmp1$$Register, $mem$$Register); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $src$$Register /* new_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + %} + ins_pipe(pipe_class_memory); +%} + +// This pattern is generated automatically from g1_aarch64.m4. +// DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE +instruct g1CompareAndExchangeP(iRegPNoSp res, indirect mem, iRegP oldval, iRegP newval, iRegPNoSp tmp1, iRegPNoSp tmp2, rFlagsReg cr) +%{ + predicate(UseG1GC && !needs_acquiring_load_exclusive(n) && n->as_LoadStore()->barrier_data() != 0); + match(Set res (CompareAndExchangeP mem (Binary oldval newval))); + effect(TEMP res, TEMP tmp1, TEMP tmp2, KILL cr); + ins_cost(2 * VOLATILE_REF_COST); + format %{ "cmpxchg $res = $mem, $oldval, $newval\t# ptr" %} + ins_encode %{ + assert_different_registers($oldval$$Register, $mem$$Register); + assert_different_registers($newval$$Register, $mem$$Register); + // Pass $oldval to the pre-barrier (instead of loading from $mem), because + // $oldval is the only value that can be overwritten. + // The same holds for g1CompareAndSwapP and its Acq variant. 
+ write_barrier_pre(masm, this, + noreg /* obj */, + $oldval$$Register /* pre_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */, + RegSet::of($mem$$Register, $oldval$$Register, $newval$$Register) /* preserve */, + RegSet::of($res$$Register) /* no_preserve */); + __ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register, Assembler::xword, + false /* acquire */, true /* release */, false /* weak */, $res$$Register); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $newval$$Register /* new_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */); + %} + ins_pipe(pipe_slow); +%} + +// This pattern is generated automatically from g1_aarch64.m4. +// DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE +instruct g1CompareAndExchangePAcq(iRegPNoSp res, indirect mem, iRegP oldval, iRegP newval, iRegPNoSp tmp1, iRegPNoSp tmp2, rFlagsReg cr) +%{ + predicate(UseG1GC && needs_acquiring_load_exclusive(n) && n->as_LoadStore()->barrier_data() != 0); + match(Set res (CompareAndExchangeP mem (Binary oldval newval))); + effect(TEMP res, TEMP tmp1, TEMP tmp2, KILL cr); + ins_cost(VOLATILE_REF_COST); + format %{ "cmpxchg_acq $res = $mem, $oldval, $newval\t# ptr" %} + ins_encode %{ + assert_different_registers($oldval$$Register, $mem$$Register); + assert_different_registers($newval$$Register, $mem$$Register); + // Pass $oldval to the pre-barrier (instead of loading from $mem), because + // $oldval is the only value that can be overwritten. + // The same holds for g1CompareAndSwapP and its Acq variant. + write_barrier_pre(masm, this, + noreg /* obj */, + $oldval$$Register /* pre_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */, + RegSet::of($mem$$Register, $oldval$$Register, $newval$$Register) /* preserve */, + RegSet::of($res$$Register) /* no_preserve */); + __ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register, Assembler::xword, + true /* acquire */, true /* release */, false /* weak */, $res$$Register); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $newval$$Register /* new_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */); + %} + ins_pipe(pipe_slow); +%} + +// This pattern is generated automatically from g1_aarch64.m4. 
+// DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE +instruct g1CompareAndExchangeN(iRegNNoSp res, indirect mem, iRegN oldval, iRegN newval, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegPNoSp tmp3, rFlagsReg cr) +%{ + predicate(UseG1GC && !needs_acquiring_load_exclusive(n) && n->as_LoadStore()->barrier_data() != 0); + match(Set res (CompareAndExchangeN mem (Binary oldval newval))); + effect(TEMP res, TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + ins_cost(2 * VOLATILE_REF_COST); + format %{ "cmpxchg $res = $mem, $oldval, $newval\t# narrow oop" %} + ins_encode %{ + assert_different_registers($oldval$$Register, $mem$$Register); + assert_different_registers($newval$$Register, $mem$$Register); + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */, + RegSet::of($mem$$Register, $oldval$$Register, $newval$$Register) /* preserve */, + RegSet::of($res$$Register) /* no_preserve */); + __ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register, Assembler::word, + false /* acquire */, true /* release */, false /* weak */, $res$$Register); + __ decode_heap_oop($tmp1$$Register, $newval$$Register); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $tmp1$$Register /* new_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + %} + ins_pipe(pipe_slow); +%} + +// This pattern is generated automatically from g1_aarch64.m4. +// DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE +instruct g1CompareAndExchangeNAcq(iRegNNoSp res, indirect mem, iRegN oldval, iRegN newval, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegPNoSp tmp3, rFlagsReg cr) +%{ + predicate(UseG1GC && needs_acquiring_load_exclusive(n) && n->as_LoadStore()->barrier_data() != 0); + match(Set res (CompareAndExchangeN mem (Binary oldval newval))); + effect(TEMP res, TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + ins_cost(VOLATILE_REF_COST); + format %{ "cmpxchg_acq $res = $mem, $oldval, $newval\t# narrow oop" %} + ins_encode %{ + assert_different_registers($oldval$$Register, $mem$$Register); + assert_different_registers($newval$$Register, $mem$$Register); + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */, + RegSet::of($mem$$Register, $oldval$$Register, $newval$$Register) /* preserve */, + RegSet::of($res$$Register) /* no_preserve */); + __ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register, Assembler::word, + true /* acquire */, true /* release */, false /* weak */, $res$$Register); + __ decode_heap_oop($tmp1$$Register, $newval$$Register); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $tmp1$$Register /* new_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + %} + ins_pipe(pipe_slow); +%} + +// This pattern is generated automatically from g1_aarch64.m4. 
+// DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE +instruct g1CompareAndSwapP(iRegINoSp res, indirect mem, iRegP newval, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegP oldval, rFlagsReg cr) +%{ + predicate(UseG1GC && !needs_acquiring_load_exclusive(n) && n->as_LoadStore()->barrier_data() != 0); + match(Set res (CompareAndSwapP mem (Binary oldval newval))); + match(Set res (WeakCompareAndSwapP mem (Binary oldval newval))); + effect(TEMP res, TEMP tmp1, TEMP tmp2, KILL cr); + ins_cost(2 * VOLATILE_REF_COST); + format %{ "cmpxchg $mem, $oldval, $newval\t# (ptr)\n\t" + "cset $res, EQ" %} + ins_encode %{ + assert_different_registers($oldval$$Register, $mem$$Register); + assert_different_registers($newval$$Register, $mem$$Register); + write_barrier_pre(masm, this, + noreg /* obj */, + $oldval$$Register /* pre_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */, + RegSet::of($mem$$Register, $oldval$$Register, $newval$$Register) /* preserve */, + RegSet::of($res$$Register) /* no_preserve */); + __ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register, Assembler::xword, + false /* acquire */, true /* release */, false /* weak */, noreg); + __ cset($res$$Register, Assembler::EQ); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $newval$$Register /* new_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */); + %} + ins_pipe(pipe_slow); +%} + +// This pattern is generated automatically from g1_aarch64.m4. +// DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE +instruct g1CompareAndSwapPAcq(iRegINoSp res, indirect mem, iRegP newval, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegP oldval, rFlagsReg cr) +%{ + predicate(UseG1GC && needs_acquiring_load_exclusive(n) && n->as_LoadStore()->barrier_data() != 0); + match(Set res (CompareAndSwapP mem (Binary oldval newval))); + match(Set res (WeakCompareAndSwapP mem (Binary oldval newval))); + effect(TEMP res, TEMP tmp1, TEMP tmp2, KILL cr); + ins_cost(VOLATILE_REF_COST); + format %{ "cmpxchg_acq $mem, $oldval, $newval\t# (ptr)\n\t" + "cset $res, EQ" %} + ins_encode %{ + assert_different_registers($oldval$$Register, $mem$$Register); + assert_different_registers($newval$$Register, $mem$$Register); + write_barrier_pre(masm, this, + noreg /* obj */, + $oldval$$Register /* pre_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */, + RegSet::of($mem$$Register, $oldval$$Register, $newval$$Register) /* preserve */, + RegSet::of($res$$Register) /* no_preserve */); + __ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register, Assembler::xword, + true /* acquire */, true /* release */, false /* weak */, noreg); + __ cset($res$$Register, Assembler::EQ); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $newval$$Register /* new_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */); + %} + ins_pipe(pipe_slow); +%} + +// This pattern is generated automatically from g1_aarch64.m4. 
+// DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE +instruct g1CompareAndSwapN(iRegINoSp res, indirect mem, iRegN newval, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegPNoSp tmp3, iRegN oldval, rFlagsReg cr) +%{ + predicate(UseG1GC && !needs_acquiring_load_exclusive(n) && n->as_LoadStore()->barrier_data() != 0); + match(Set res (CompareAndSwapN mem (Binary oldval newval))); + match(Set res (WeakCompareAndSwapN mem (Binary oldval newval))); + effect(TEMP res, TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + ins_cost(2 * VOLATILE_REF_COST); + format %{ "cmpxchg $mem, $oldval, $newval\t# (narrow oop)\n\t" + "cset $res, EQ" %} + ins_encode %{ + assert_different_registers($oldval$$Register, $mem$$Register); + assert_different_registers($newval$$Register, $mem$$Register); + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */, + RegSet::of($mem$$Register, $oldval$$Register, $newval$$Register) /* preserve */, + RegSet::of($res$$Register) /* no_preserve */); + __ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register, Assembler::word, + false /* acquire */, true /* release */, false /* weak */, noreg); + __ cset($res$$Register, Assembler::EQ); + __ decode_heap_oop($tmp1$$Register, $newval$$Register); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $tmp1$$Register /* new_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + %} + ins_pipe(pipe_slow); +%} + +// This pattern is generated automatically from g1_aarch64.m4. +// DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE +instruct g1CompareAndSwapNAcq(iRegINoSp res, indirect mem, iRegN newval, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegPNoSp tmp3, iRegN oldval, rFlagsReg cr) +%{ + predicate(UseG1GC && needs_acquiring_load_exclusive(n) && n->as_LoadStore()->barrier_data() != 0); + match(Set res (CompareAndSwapN mem (Binary oldval newval))); + match(Set res (WeakCompareAndSwapN mem (Binary oldval newval))); + effect(TEMP res, TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + ins_cost(VOLATILE_REF_COST); + format %{ "cmpxchg_acq $mem, $oldval, $newval\t# (narrow oop)\n\t" + "cset $res, EQ" %} + ins_encode %{ + assert_different_registers($oldval$$Register, $mem$$Register); + assert_different_registers($newval$$Register, $mem$$Register); + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */, + RegSet::of($mem$$Register, $oldval$$Register, $newval$$Register) /* preserve */, + RegSet::of($res$$Register) /* no_preserve */); + __ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register, Assembler::word, + true /* acquire */, true /* release */, false /* weak */, noreg); + __ cset($res$$Register, Assembler::EQ); + __ decode_heap_oop($tmp1$$Register, $newval$$Register); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $tmp1$$Register /* new_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + %} + ins_pipe(pipe_slow); +%} + +// This pattern is generated automatically from g1_aarch64.m4. 
+// DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE +instruct g1GetAndSetP(indirect mem, iRegP newval, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegPNoSp preval, rFlagsReg cr) +%{ + predicate(UseG1GC && !needs_acquiring_load_exclusive(n) && n->as_LoadStore()->barrier_data() != 0); + match(Set preval (GetAndSetP mem newval)); + effect(TEMP preval, TEMP tmp1, TEMP tmp2, KILL cr); + ins_cost(2 * VOLATILE_REF_COST); + format %{ "atomic_xchg $preval, $newval, [$mem]" %} + ins_encode %{ + assert_different_registers($mem$$Register, $newval$$Register); + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $preval$$Register /* pre_val (as a temporary register) */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */, + RegSet::of($mem$$Register, $preval$$Register, $newval$$Register) /* preserve */); + __ atomic_xchg($preval$$Register, $newval$$Register, $mem$$Register); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $newval$$Register /* new_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */); + %} + ins_pipe(pipe_serial); +%} + +// This pattern is generated automatically from g1_aarch64.m4. +// DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE +instruct g1GetAndSetPAcq(indirect mem, iRegP newval, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegPNoSp preval, rFlagsReg cr) +%{ + predicate(UseG1GC && needs_acquiring_load_exclusive(n) && n->as_LoadStore()->barrier_data() != 0); + match(Set preval (GetAndSetP mem newval)); + effect(TEMP preval, TEMP tmp1, TEMP tmp2, KILL cr); + ins_cost(VOLATILE_REF_COST); + format %{ "atomic_xchg_acq $preval, $newval, [$mem]" %} + ins_encode %{ + assert_different_registers($mem$$Register, $newval$$Register); + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $preval$$Register /* pre_val (as a temporary register) */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */, + RegSet::of($mem$$Register, $preval$$Register, $newval$$Register) /* preserve */); + __ atomic_xchgal($preval$$Register, $newval$$Register, $mem$$Register); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $newval$$Register /* new_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */); + %} + ins_pipe(pipe_serial); +%} + +// This pattern is generated automatically from g1_aarch64.m4. +// DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE +instruct g1GetAndSetN(indirect mem, iRegN newval, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegPNoSp tmp3, iRegNNoSp preval, rFlagsReg cr) +%{ + predicate(UseG1GC && !needs_acquiring_load_exclusive(n) && n->as_LoadStore()->barrier_data() != 0); + match(Set preval (GetAndSetN mem newval)); + effect(TEMP preval, TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + ins_cost(2 * VOLATILE_REF_COST); + format %{ "atomic_xchgw $preval, $newval, [$mem]" %} + ins_encode %{ + assert_different_registers($mem$$Register, $newval$$Register); + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */, + RegSet::of($mem$$Register, $preval$$Register, $newval$$Register) /* preserve */); + __ atomic_xchgw($preval$$Register, $newval$$Register, $mem$$Register); + __ decode_heap_oop($tmp1$$Register, $newval$$Register); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $tmp1$$Register /* new_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + %} + ins_pipe(pipe_serial); +%} + +// This pattern is generated automatically from g1_aarch64.m4. 
+// DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE +instruct g1GetAndSetNAcq(indirect mem, iRegN newval, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegPNoSp tmp3, iRegNNoSp preval, rFlagsReg cr) +%{ + predicate(UseG1GC && needs_acquiring_load_exclusive(n) && n->as_LoadStore()->barrier_data() != 0); + match(Set preval (GetAndSetN mem newval)); + effect(TEMP preval, TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + ins_cost(VOLATILE_REF_COST); + format %{ "atomic_xchgw_acq $preval, $newval, [$mem]" %} + ins_encode %{ + assert_different_registers($mem$$Register, $newval$$Register); + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */, + RegSet::of($mem$$Register, $preval$$Register, $newval$$Register) /* preserve */); + __ atomic_xchgalw($preval$$Register, $newval$$Register, $mem$$Register); + __ decode_heap_oop($tmp1$$Register, $newval$$Register); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $tmp1$$Register /* new_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + %} + ins_pipe(pipe_serial); +%} + +// This pattern is generated automatically from g1_aarch64.m4. +// DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE +instruct g1LoadP(iRegPNoSp dst, indirect mem, iRegPNoSp tmp1, iRegPNoSp tmp2, rFlagsReg cr) +%{ + // This instruction does not need an acquiring counterpart because it is only + // used for reference loading (Reference::get()). The same holds for g1LoadN. + predicate(UseG1GC && !needs_acquiring_load(n) && n->as_Load()->barrier_data() != 0); + match(Set dst (LoadP mem)); + effect(TEMP dst, TEMP tmp1, TEMP tmp2, KILL cr); + ins_cost(4 * INSN_COST); + format %{ "ldr $dst, $mem\t# ptr" %} + ins_encode %{ + __ ldr($dst$$Register, $mem$$Register); + write_barrier_pre(masm, this, + noreg /* obj */, + $dst$$Register /* pre_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */); + %} + ins_pipe(iload_reg_mem); +%} + +// This pattern is generated automatically from g1_aarch64.m4. +// DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE +instruct g1LoadN(iRegNNoSp dst, indirect mem, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegPNoSp tmp3, rFlagsReg cr) +%{ + predicate(UseG1GC && !needs_acquiring_load(n) && n->as_Load()->barrier_data() != 0); + match(Set dst (LoadN mem)); + effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + ins_cost(4 * INSN_COST); + format %{ "ldrw $dst, $mem\t# compressed ptr" %} + ins_encode %{ + __ ldrw($dst$$Register, $mem$$Register); + if ((barrier_data() & G1C2BarrierPre) != 0) { + __ decode_heap_oop($tmp1$$Register, $dst$$Register); + write_barrier_pre(masm, this, + noreg /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + } + %} + ins_pipe(iload_reg_mem); +%} + +// END This section of the file is automatically generated. Do not edit -------------- diff --git a/src/hotspot/cpu/aarch64/gc/g1/g1_aarch64.m4 b/src/hotspot/cpu/aarch64/gc/g1/g1_aarch64.m4 new file mode 100644 index 00000000000..8fb1f7e8e42 --- /dev/null +++ b/src/hotspot/cpu/aarch64/gc/g1/g1_aarch64.m4 @@ -0,0 +1,384 @@ +dnl Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. +dnl DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +dnl +dnl This code is free software; you can redistribute it and/or modify it +dnl under the terms of the GNU General Public License version 2 only, as +dnl published by the Free Software Foundation. 
+dnl +dnl This code is distributed in the hope that it will be useful, but WITHOUT +dnl ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +dnl FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl version 2 for more details (a copy is included in the LICENSE file that +dnl accompanied this code). +dnl +dnl You should have received a copy of the GNU General Public License version +dnl 2 along with this work; if not, write to the Free Software Foundation, +dnl Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. +dnl +dnl Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA +dnl or visit www.oracle.com if you need additional information or have any +dnl questions. +dnl +// BEGIN This section of the file is automatically generated. Do not edit -------------- + +// This section is generated from g1_aarch64.m4 + +define(`STOREP_INSN', +` +// This pattern is generated automatically from g1_aarch64.m4. +// DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE +instruct g1StoreP$1(indirect mem, iRegP src, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegPNoSp tmp3, rFlagsReg cr) +%{ + predicate(UseG1GC && ifelse($1,Volatile,'needs_releasing_store(n)`,'!needs_releasing_store(n)`) && n->as_Store()->barrier_data() != 0); + match(Set mem (StoreP mem src)); + effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + ins_cost(ifelse($1,Volatile,VOLATILE_REF_COST,INSN_COST)); + format %{ "$2 $src, $mem\t# ptr" %} + ins_encode %{ + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */, + RegSet::of($mem$$Register, $src$$Register) /* preserve */); + __ $2($src$$Register, $mem$$Register); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $src$$Register /* new_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + %} + ins_pipe(ifelse($1,Volatile,pipe_class_memory,istore_reg_mem)); +%}')dnl +STOREP_INSN(,str) +STOREP_INSN(Volatile,stlr) +dnl +define(`STOREN_INSN', +` +// This pattern is generated automatically from g1_aarch64.m4. +// DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE +instruct g1StoreN$1(indirect mem, iRegN src, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegPNoSp tmp3, rFlagsReg cr) +%{ + predicate(UseG1GC && ifelse($1,Volatile,'needs_releasing_store(n)`,'!needs_releasing_store(n)`) && n->as_Store()->barrier_data() != 0); + match(Set mem (StoreN mem src)); + effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + ins_cost(ifelse($1,Volatile,VOLATILE_REF_COST,INSN_COST)); + format %{ "$2 $src, $mem\t# compressed ptr" %} + ins_encode %{ + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */, + RegSet::of($mem$$Register, $src$$Register) /* preserve */); + __ $2($src$$Register, $mem$$Register); + if ((barrier_data() & G1C2BarrierPost) != 0) { + if ((barrier_data() & G1C2BarrierPostNotNull) == 0) { + __ decode_heap_oop($tmp1$$Register, $src$$Register); + } else { + __ decode_heap_oop_not_null($tmp1$$Register, $src$$Register); + } + } + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $tmp1$$Register /* new_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + %} + ins_pipe(ifelse($1,Volatile,pipe_class_memory,istore_reg_mem)); +%}')dnl +STOREN_INSN(,strw) +STOREN_INSN(Volatile,stlrw) +dnl +define(`ENCODESTOREN_INSN', +` +// This pattern is generated automatically from g1_aarch64.m4. 
+// DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE +instruct g1EncodePAndStoreN$1(indirect mem, iRegP src, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegPNoSp tmp3, rFlagsReg cr) +%{ + predicate(UseG1GC && ifelse($1,Volatile,'needs_releasing_store(n)`,'!needs_releasing_store(n)`) && n->as_Store()->barrier_data() != 0); + match(Set mem (StoreN mem (EncodeP src))); + effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + ins_cost(ifelse($1,Volatile,VOLATILE_REF_COST,INSN_COST)); + format %{ "encode_heap_oop $tmp1, $src\n\t" + "$2 $tmp1, $mem\t# compressed ptr" %} + ins_encode %{ + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */, + RegSet::of($mem$$Register, $src$$Register) /* preserve */); + if ((barrier_data() & G1C2BarrierPostNotNull) == 0) { + __ encode_heap_oop($tmp1$$Register, $src$$Register); + } else { + __ encode_heap_oop_not_null($tmp1$$Register, $src$$Register); + } + __ $2($tmp1$$Register, $mem$$Register); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $src$$Register /* new_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + %} + ins_pipe(ifelse($1,Volatile,pipe_class_memory,istore_reg_mem)); +%}')dnl +ENCODESTOREN_INSN(,strw) +ENCODESTOREN_INSN(Volatile,stlrw) +dnl +define(`CAEP_INSN', +` +// This pattern is generated automatically from g1_aarch64.m4. +// DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE +instruct g1CompareAndExchangeP$1(iRegPNoSp res, indirect mem, iRegP oldval, iRegP newval, iRegPNoSp tmp1, iRegPNoSp tmp2, rFlagsReg cr) +%{ + predicate(UseG1GC && ifelse($1,Acq,'needs_acquiring_load_exclusive(n)`,'!needs_acquiring_load_exclusive(n)`) && n->as_LoadStore()->barrier_data() != 0); + match(Set res (CompareAndExchangeP mem (Binary oldval newval))); + effect(TEMP res, TEMP tmp1, TEMP tmp2, KILL cr); + ins_cost(ifelse($1,Acq,VOLATILE_REF_COST,2 * VOLATILE_REF_COST)); + format %{ "cmpxchg$2 $res = $mem, $oldval, $newval\t# ptr" %} + ins_encode %{ + assert_different_registers($oldval$$Register, $mem$$Register); + assert_different_registers($newval$$Register, $mem$$Register); + // Pass $oldval to the pre-barrier (instead of loading from $mem), because + // $oldval is the only value that can be overwritten. + // The same holds for g1CompareAndSwapP and its Acq variant. + write_barrier_pre(masm, this, + noreg /* obj */, + $oldval$$Register /* pre_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */, + RegSet::of($mem$$Register, $oldval$$Register, $newval$$Register) /* preserve */, + RegSet::of($res$$Register) /* no_preserve */); + __ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register, Assembler::xword, + $3 /* acquire */, true /* release */, false /* weak */, $res$$Register); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $newval$$Register /* new_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */); + %} + ins_pipe(pipe_slow); +%}')dnl +CAEP_INSN(,,false) +CAEP_INSN(Acq,_acq,true) +dnl +define(`CAEN_INSN', +` +// This pattern is generated automatically from g1_aarch64.m4. 
+// DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE +instruct g1CompareAndExchangeN$1(iRegNNoSp res, indirect mem, iRegN oldval, iRegN newval, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegPNoSp tmp3, rFlagsReg cr) +%{ + predicate(UseG1GC && ifelse($1,Acq,'needs_acquiring_load_exclusive(n)`,'!needs_acquiring_load_exclusive(n)`) && n->as_LoadStore()->barrier_data() != 0); + match(Set res (CompareAndExchangeN mem (Binary oldval newval))); + effect(TEMP res, TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + ins_cost(ifelse($1,Acq,VOLATILE_REF_COST,2 * VOLATILE_REF_COST)); + format %{ "cmpxchg$2 $res = $mem, $oldval, $newval\t# narrow oop" %} + ins_encode %{ + assert_different_registers($oldval$$Register, $mem$$Register); + assert_different_registers($newval$$Register, $mem$$Register); + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */, + RegSet::of($mem$$Register, $oldval$$Register, $newval$$Register) /* preserve */, + RegSet::of($res$$Register) /* no_preserve */); + __ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register, Assembler::word, + $3 /* acquire */, true /* release */, false /* weak */, $res$$Register); + __ decode_heap_oop($tmp1$$Register, $newval$$Register); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $tmp1$$Register /* new_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + %} + ins_pipe(pipe_slow); +%}')dnl +CAEN_INSN(,,false) +CAEN_INSN(Acq,_acq,true) +dnl +define(`CASP_INSN', +` +// This pattern is generated automatically from g1_aarch64.m4. +// DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE +instruct g1CompareAndSwapP$1(iRegINoSp res, indirect mem, iRegP newval, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegP oldval, rFlagsReg cr) +%{ + predicate(UseG1GC && ifelse($1,Acq,'needs_acquiring_load_exclusive(n)`,'!needs_acquiring_load_exclusive(n)`) && n->as_LoadStore()->barrier_data() != 0); + match(Set res (CompareAndSwapP mem (Binary oldval newval))); + match(Set res (WeakCompareAndSwapP mem (Binary oldval newval))); + effect(TEMP res, TEMP tmp1, TEMP tmp2, KILL cr); + ins_cost(ifelse($1,Acq,VOLATILE_REF_COST,2 * VOLATILE_REF_COST)); + format %{ "cmpxchg$2 $mem, $oldval, $newval\t# (ptr)\n\t" + "cset $res, EQ" %} + ins_encode %{ + assert_different_registers($oldval$$Register, $mem$$Register); + assert_different_registers($newval$$Register, $mem$$Register); + write_barrier_pre(masm, this, + noreg /* obj */, + $oldval$$Register /* pre_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */, + RegSet::of($mem$$Register, $oldval$$Register, $newval$$Register) /* preserve */, + RegSet::of($res$$Register) /* no_preserve */); + __ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register, Assembler::xword, + $3 /* acquire */, true /* release */, false /* weak */, noreg); + __ cset($res$$Register, Assembler::EQ); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $newval$$Register /* new_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */); + %} + ins_pipe(pipe_slow); +%}')dnl +CASP_INSN(,,false) +CASP_INSN(Acq,_acq,true) +dnl +define(`CASN_INSN', +` +// This pattern is generated automatically from g1_aarch64.m4. 
+// DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE +instruct g1CompareAndSwapN$1(iRegINoSp res, indirect mem, iRegN newval, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegPNoSp tmp3, iRegN oldval, rFlagsReg cr) +%{ + predicate(UseG1GC && ifelse($1,Acq,'needs_acquiring_load_exclusive(n)`,'!needs_acquiring_load_exclusive(n)`) && n->as_LoadStore()->barrier_data() != 0); + match(Set res (CompareAndSwapN mem (Binary oldval newval))); + match(Set res (WeakCompareAndSwapN mem (Binary oldval newval))); + effect(TEMP res, TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + ins_cost(ifelse($1,Acq,VOLATILE_REF_COST,2 * VOLATILE_REF_COST)); + format %{ "cmpxchg$2 $mem, $oldval, $newval\t# (narrow oop)\n\t" + "cset $res, EQ" %} + ins_encode %{ + assert_different_registers($oldval$$Register, $mem$$Register); + assert_different_registers($newval$$Register, $mem$$Register); + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */, + RegSet::of($mem$$Register, $oldval$$Register, $newval$$Register) /* preserve */, + RegSet::of($res$$Register) /* no_preserve */); + __ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register, Assembler::word, + $3 /* acquire */, true /* release */, false /* weak */, noreg); + __ cset($res$$Register, Assembler::EQ); + __ decode_heap_oop($tmp1$$Register, $newval$$Register); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $tmp1$$Register /* new_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + %} + ins_pipe(pipe_slow); +%}')dnl +CASN_INSN(,,false) +CASN_INSN(Acq,_acq,true) +dnl +define(`XCHGP_INSN', +` +// This pattern is generated automatically from g1_aarch64.m4. +// DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE +instruct g1GetAndSetP$1(indirect mem, iRegP newval, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegPNoSp preval, rFlagsReg cr) +%{ + predicate(UseG1GC && ifelse($1,Acq,'needs_acquiring_load_exclusive(n)`,'!needs_acquiring_load_exclusive(n)`) && n->as_LoadStore()->barrier_data() != 0); + match(Set preval (GetAndSetP mem newval)); + effect(TEMP preval, TEMP tmp1, TEMP tmp2, KILL cr); + ins_cost(ifelse($1,Acq,VOLATILE_REF_COST,2 * VOLATILE_REF_COST)); + format %{ "atomic_xchg$2 $preval, $newval, [$mem]" %} + ins_encode %{ + assert_different_registers($mem$$Register, $newval$$Register); + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $preval$$Register /* pre_val (as a temporary register) */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */, + RegSet::of($mem$$Register, $preval$$Register, $newval$$Register) /* preserve */); + __ $3($preval$$Register, $newval$$Register, $mem$$Register); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $newval$$Register /* new_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */); + %} + ins_pipe(pipe_serial); +%}')dnl +XCHGP_INSN(,,atomic_xchg) +XCHGP_INSN(Acq,_acq,atomic_xchgal) +dnl +define(`XCHGN_INSN', +` +// This pattern is generated automatically from g1_aarch64.m4. 
+// DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE +instruct g1GetAndSetN$1(indirect mem, iRegN newval, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegPNoSp tmp3, iRegNNoSp preval, rFlagsReg cr) +%{ + predicate(UseG1GC && ifelse($1,Acq,'needs_acquiring_load_exclusive(n)`,'!needs_acquiring_load_exclusive(n)`) && n->as_LoadStore()->barrier_data() != 0); + match(Set preval (GetAndSetN mem newval)); + effect(TEMP preval, TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + ins_cost(ifelse($1,Acq,VOLATILE_REF_COST,2 * VOLATILE_REF_COST)); + format %{ "$2 $preval, $newval, [$mem]" %} + ins_encode %{ + assert_different_registers($mem$$Register, $newval$$Register); + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */, + RegSet::of($mem$$Register, $preval$$Register, $newval$$Register) /* preserve */); + __ $3($preval$$Register, $newval$$Register, $mem$$Register); + __ decode_heap_oop($tmp1$$Register, $newval$$Register); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $tmp1$$Register /* new_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + %} + ins_pipe(pipe_serial); +%}')dnl +XCHGN_INSN(,atomic_xchgw,atomic_xchgw) +XCHGN_INSN(Acq,atomic_xchgw_acq,atomic_xchgalw) + +// This pattern is generated automatically from g1_aarch64.m4. +// DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE +instruct g1LoadP(iRegPNoSp dst, indirect mem, iRegPNoSp tmp1, iRegPNoSp tmp2, rFlagsReg cr) +%{ + // This instruction does not need an acquiring counterpart because it is only + // used for reference loading (Reference::get()). The same holds for g1LoadN. + predicate(UseG1GC && !needs_acquiring_load(n) && n->as_Load()->barrier_data() != 0); + match(Set dst (LoadP mem)); + effect(TEMP dst, TEMP tmp1, TEMP tmp2, KILL cr); + ins_cost(4 * INSN_COST); + format %{ "ldr $dst, $mem\t# ptr" %} + ins_encode %{ + __ ldr($dst$$Register, $mem$$Register); + write_barrier_pre(masm, this, + noreg /* obj */, + $dst$$Register /* pre_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */); + %} + ins_pipe(iload_reg_mem); +%} + +// This pattern is generated automatically from g1_aarch64.m4. +// DO NOT EDIT ANYTHING IN THIS SECTION OF THE FILE +instruct g1LoadN(iRegNNoSp dst, indirect mem, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegPNoSp tmp3, rFlagsReg cr) +%{ + predicate(UseG1GC && !needs_acquiring_load(n) && n->as_Load()->barrier_data() != 0); + match(Set dst (LoadN mem)); + effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + ins_cost(4 * INSN_COST); + format %{ "ldrw $dst, $mem\t# compressed ptr" %} + ins_encode %{ + __ ldrw($dst$$Register, $mem$$Register); + if ((barrier_data() & G1C2BarrierPre) != 0) { + __ decode_heap_oop($tmp1$$Register, $dst$$Register); + write_barrier_pre(masm, this, + noreg /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + } + %} + ins_pipe(iload_reg_mem); +%} + +// END This section of the file is automatically generated. 
Do not edit -------------- diff --git a/src/hotspot/cpu/arm/arm.ad b/src/hotspot/cpu/arm/arm.ad index 2c7de0a58a2..716f6d87230 100644 --- a/src/hotspot/cpu/arm/arm.ad +++ b/src/hotspot/cpu/arm/arm.ad @@ -3890,6 +3890,7 @@ instruct loadRange(iRegI dst, memoryI mem) %{ instruct loadP(iRegP dst, memoryP mem) %{ + predicate(!(UseG1GC && n->as_Load()->barrier_data() != 0)); match(Set dst (LoadP mem)); ins_cost(MEMORY_REF_COST); size(4); @@ -4356,6 +4357,7 @@ instruct movSP(store_ptr_RegP dst, SPRegP src) %{ instruct storeP(memoryP mem, store_ptr_RegP src) %{ + predicate(!(UseG1GC && n->as_Store()->barrier_data() != 0)); match(Set mem (StoreP mem src)); ins_cost(MEMORY_REF_COST); size(4); @@ -5390,6 +5392,7 @@ instruct compareAndSwapI_bool(memoryex mem, iRegI oldval, iRegI newval, iRegI re %} instruct compareAndSwapP_bool(memoryex mem, iRegP oldval, iRegP newval, iRegI res, iRegI tmp, flagsReg ccr ) %{ + predicate(!(UseG1GC && n->as_LoadStore()->barrier_data() != 0)); match(Set res (CompareAndSwapP mem (Binary oldval newval))); effect( KILL ccr, TEMP tmp); size(28); @@ -5659,6 +5662,7 @@ instruct xchgL(memoryex mem, iRegLd newval, iRegLd res, iRegI tmp, flagsReg ccr) %} instruct xchgP(memoryex mem, iRegP newval, iRegP res, iRegI tmp, flagsReg ccr) %{ + predicate(!(UseG1GC && n->as_LoadStore()->barrier_data() != 0)); match(Set res (GetAndSetP mem newval)); effect(KILL ccr, TEMP tmp, TEMP res); size(16); diff --git a/src/hotspot/cpu/arm/assembler_arm_32.hpp b/src/hotspot/cpu/arm/assembler_arm_32.hpp index dd04ad1ab3a..e53eefac097 100644 --- a/src/hotspot/cpu/arm/assembler_arm_32.hpp +++ b/src/hotspot/cpu/arm/assembler_arm_32.hpp @@ -119,8 +119,9 @@ class RegisterSet { } friend RegisterSet operator | (const RegisterSet set1, const RegisterSet set2) { - assert((set1._encoding & set2._encoding) == 0, - "encoding constraint"); +// why so strong constraint? +// assert((set1._encoding & set2._encoding) == 0, +// "encoding constraint"); return RegisterSet(set1._encoding | set2._encoding); } @@ -142,6 +143,11 @@ class RegisterSet { } return count; } + + static RegisterSet from(RegSet set) { + assert(set.size(), "RegSet must not be empty"); + return RegisterSet(set.bits()); + } }; #if R9_IS_SCRATCHED @@ -157,6 +163,10 @@ class FloatRegisterSet { public: + FloatRegisterSet() { + _encoding = 0; + } + FloatRegisterSet(FloatRegister reg) { if (reg->hi_bit() == 0) { _encoding = reg->hi_bits() << 12 | reg->lo_bit() << 22 | 1; @@ -185,6 +195,15 @@ class FloatRegisterSet { return (_encoding & 0xFFFFFF00) | ((_encoding & 0xFF) << 1); } + static FloatRegisterSet from(FloatRegSet set) { + assert(set.size(), "FloatRegSet must not be empty"); + // the vector load/store instructions operate on a set of consecutive registers. 
+ // for the sake of simplicity, write all registers between the first and last in the set + size_t range = (*set.rbegin())->encoding() - (*set.begin())->encoding() + 1; + // push_float stores float registers by pairs + return FloatRegisterSet(*set.begin(), (range+1)/2); + } + }; diff --git a/src/hotspot/cpu/arm/gc/g1/g1BarrierSetAssembler_arm.cpp b/src/hotspot/cpu/arm/gc/g1/g1BarrierSetAssembler_arm.cpp index 3c5e29aa871..56ae7707fbf 100644 --- a/src/hotspot/cpu/arm/gc/g1/g1BarrierSetAssembler_arm.cpp +++ b/src/hotspot/cpu/arm/gc/g1/g1BarrierSetAssembler_arm.cpp @@ -39,8 +39,10 @@ #include "c1/c1_LIRAssembler.hpp" #include "c1/c1_MacroAssembler.hpp" #include "gc/g1/c1/g1BarrierSetC1.hpp" -#endif - +#endif // COMPILER1 +#ifdef COMPILER2 +#include "gc/g1/c2/g1BarrierSetC2.hpp" +#endif // COMPILER2 #define __ masm-> #ifdef PRODUCT @@ -106,70 +108,87 @@ void G1BarrierSetAssembler::gen_write_ref_array_post_barrier(MacroAssembler* mas #endif // !R9_IS_SCRATCHED } +static void generate_queue_test_and_insertion(MacroAssembler* masm, ByteSize index_offset, ByteSize buffer_offset, Label& runtime, + const Register thread, const Register value, const Register temp1, const Register temp2) { + assert_different_registers(value, temp1, temp2); + // Can we store original value in the thread's buffer? + // (The index field is typed as size_t.) + __ ldr(temp1, Address(thread, in_bytes(index_offset))); // temp1 := *(index address) + __ cbz(temp1, runtime); // jump to runtime if index == 0 (full buffer) + // The buffer is not full, store value into it. + __ sub(temp1, temp1, wordSize); // temp1 := next index + __ str(temp1, Address(thread, in_bytes(index_offset))); // *(index address) := next index + __ ldr(temp2, Address(thread, in_bytes(buffer_offset))); // temp2 := buffer address + // Record the previous value + __ str(value, Address(temp2, temp1)); // *(buffer address + next index) := value + } + +static void generate_pre_barrier_fast_path(MacroAssembler* masm, + const Register thread, + const Register tmp1) { + Address in_progress(thread, in_bytes(G1ThreadLocalData::satb_mark_queue_active_offset())); + // Is marking active? + assert(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "adjust this code"); + __ ldrb(tmp1, in_progress); +} + +static void generate_pre_barrier_slow_path(MacroAssembler* masm, + const Register obj, + const Register pre_val, + const Register thread, + const Register tmp1, + const Register tmp2, + Label& done, + Label& runtime) { + // Do we need to load the previous value? + if (obj != noreg) { + __ load_heap_oop(pre_val, Address(obj, 0)); + } + + // Is the previous value null? + __ cbz(pre_val, done); + + generate_queue_test_and_insertion(masm, + G1ThreadLocalData::satb_mark_queue_index_offset(), + G1ThreadLocalData::satb_mark_queue_buffer_offset(), + runtime, + thread, pre_val, tmp1, tmp2); + __ b(done); +} + // G1 pre-barrier. -// Blows all volatile registers R0-R3, Rtemp, LR). -// If store_addr != noreg, then previous value is loaded from [store_addr]; -// in such case store_addr and new_val registers are preserved; +// Blows all volatile registers R0-R3, LR). +// If obj != noreg, then previous value is loaded from [obj]; +// in such case obj and pre_val registers are preserved; +// otherwise pre_val register is preserved.
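For orientation, the following is a minimal, self-contained C++ model of the logic the helpers above emit; it is a sketch, not HotSpot code, and names such as SatbQueueModel, pre_barrier_model, and runtime_enqueue are illustrative stand-ins for the per-thread SATB queue fields and G1BarrierSetRuntime::write_ref_field_pre_entry.

    #include <cstddef>

    // Stand-in for the per-thread SATB mark queue fields addressed off Rthread
    // (satb_mark_queue_active/index/buffer offsets).
    struct SatbQueueModel {
      bool   active;   // "is marking active?" flag tested by the fast path
      size_t index;    // bytes remaining in the buffer; 0 means the buffer is full
      void** buffer;   // thread-local buffer of recorded previous values
    };

    // Models the fast path + slow path split: test the active flag, load and
    // null-check the previous value, try the thread-local buffer, else call the runtime.
    inline void pre_barrier_model(SatbQueueModel& q, void** obj, void* pre_val,
                                  void (*runtime_enqueue)(void*)) {
      if (!q.active) {               // generate_pre_barrier_fast_path
        return;
      }
      if (obj != nullptr) {          // generate_pre_barrier_slow_path:
        pre_val = *obj;              // load the previous value if not preloaded
      }
      if (pre_val == nullptr) {
        return;                      // null previous values are not recorded
      }
      if (q.index != 0) {            // generate_queue_test_and_insertion
        q.index -= sizeof(void*);
        q.buffer[q.index / sizeof(void*)] = pre_val;
      } else {
        runtime_enqueue(pre_val);    // runtime fallback (write_ref_field_pre_entry)
      }
    }
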
void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm, - Register store_addr, - Register new_val, + Register obj, Register pre_val, Register tmp1, Register tmp2) { Label done; Label runtime; - if (store_addr != noreg) { - assert_different_registers(store_addr, new_val, pre_val, tmp1, tmp2, noreg); - } else { - assert (new_val == noreg, "should be"); - assert_different_registers(pre_val, tmp1, tmp2, noreg); - } + assert_different_registers(obj, pre_val, tmp1, tmp2, noreg); - Address in_progress(Rthread, in_bytes(G1ThreadLocalData::satb_mark_queue_active_offset())); - Address index(Rthread, in_bytes(G1ThreadLocalData::satb_mark_queue_index_offset())); - Address buffer(Rthread, in_bytes(G1ThreadLocalData::satb_mark_queue_buffer_offset())); - - // Is marking active? - assert(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "adjust this code"); - __ ldrb(tmp1, in_progress); + generate_pre_barrier_fast_path(masm, Rthread, tmp1); + // If marking is not active (*(mark queue active address) == 0), jump to done __ cbz(tmp1, done); - // Do we need to load the previous value? - if (store_addr != noreg) { - __ load_heap_oop(pre_val, Address(store_addr, 0)); - } - - // Is the previous value null? - __ cbz(pre_val, done); - - // Can we store original value in the thread's buffer? - // Is index == 0? - // (The index field is typed as size_t.) - - __ ldr(tmp1, index); // tmp1 := *index_adr - __ ldr(tmp2, buffer); - - __ subs(tmp1, tmp1, wordSize); // tmp1 := tmp1 - wordSize - __ b(runtime, lt); // If negative, goto runtime - - __ str(tmp1, index); // *index_adr := tmp1 - - // Record the previous value - __ str(pre_val, Address(tmp2, tmp1)); - __ b(done); + generate_pre_barrier_slow_path(masm, obj, pre_val, Rthread, tmp1, tmp2, done, runtime); __ bind(runtime); // save the live input values - if (store_addr != noreg) { - // avoid raw_push to support any ordering of store_addr and new_val - __ push(RegisterSet(store_addr) | RegisterSet(new_val)); - } else { - __ push(pre_val); + RegisterSet set = RegisterSet(pre_val) | RegisterSet(R0, R3) | RegisterSet(R12); + // save the live input values + if (obj != noreg) { + // avoid raw_push to support any ordering of store_addr and pre_val + set = set | RegisterSet(obj); } + __ push(set); + if (pre_val != R0) { __ mov(R0, pre_val); } @@ -177,33 +196,17 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm, __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_pre_entry), R0, R1); - if (store_addr != noreg) { - __ pop(RegisterSet(store_addr) | RegisterSet(new_val)); - } else { - __ pop(pre_val); - } - + __ pop(set); __ bind(done); } -// G1 post-barrier. -// Blows all volatile registers R0-R3, Rtemp, LR). -void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, - Register store_addr, - Register new_val, - Register tmp1, - Register tmp2, - Register tmp3) { - - Address queue_index(Rthread, in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset())); - Address buffer(Rthread, in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset())); - - BarrierSet* bs = BarrierSet::barrier_set(); - CardTableBarrierSet* ctbs = barrier_set_cast(bs); - CardTable* ct = ctbs->card_table(); - Label done; - Label runtime; - +static void generate_post_barrier_fast_path(MacroAssembler* masm, + const Register store_addr, + const Register new_val, + const Register tmp1, + const Register tmp2, + Label& done, + bool new_val_may_be_null) { // Does store cross heap regions? 
__ eor(tmp1, store_addr, new_val); @@ -211,22 +214,31 @@ void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, __ b(done, eq); // crosses regions, storing null? - - __ cbz(new_val, done); - + if (new_val_may_be_null) { + __ cbz(new_val, done); + } // storing region crossing non-null, is card already dirty? const Register card_addr = tmp1; - __ mov_address(tmp2, (address)ct->byte_map_base()); + CardTableBarrierSet* ct = barrier_set_cast(BarrierSet::barrier_set()); + __ mov_address(tmp2, (address)ct->card_table()->byte_map_base()); __ add(card_addr, tmp2, AsmOperand(store_addr, lsr, CardTable::card_shift())); __ ldrb(tmp2, Address(card_addr)); __ cmp(tmp2, (int)G1CardTable::g1_young_card_val()); - __ b(done, eq); +} +static void generate_post_barrier_slow_path(MacroAssembler* masm, + const Register thread, + const Register tmp1, + const Register tmp2, + const Register tmp3, + Label& done, + Label& runtime) { __ membar(MacroAssembler::Membar_mask_bits(MacroAssembler::StoreLoad), tmp2); - assert(CardTable::dirty_card_val() == 0, "adjust this code"); + // card_addr is loaded by generate_post_barrier_fast_path + const Register card_addr = tmp1; __ ldrb(tmp2, Address(card_addr)); __ cbz(tmp2, done); @@ -234,29 +246,139 @@ void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, // dirty card and log. __ strb(__ zero_register(tmp2), Address(card_addr)); - - __ ldr(tmp2, queue_index); - __ ldr(tmp3, buffer); - - __ subs(tmp2, tmp2, wordSize); - __ b(runtime, lt); // go to runtime if now negative - - __ str(tmp2, queue_index); - - __ str(card_addr, Address(tmp3, tmp2)); + generate_queue_test_and_insertion(masm, + G1ThreadLocalData::dirty_card_queue_index_offset(), + G1ThreadLocalData::dirty_card_queue_buffer_offset(), + runtime, + thread, card_addr, tmp2, tmp3); __ b(done); +} + + +// G1 post-barrier. +// Blows all volatile registers R0-R3, LR). 
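As with the pre-barrier, a minimal, self-contained C++ model of the filtering done by generate_post_barrier_fast_path and generate_post_barrier_slow_path is sketched below; it is not HotSpot code, and CardQueueModel, post_barrier_model, and runtime_enqueue are illustrative stand-ins for the dirty card queue fields and G1BarrierSetRuntime::write_ref_field_post_entry.

    #include <atomic>
    #include <cstddef>
    #include <cstdint>

    // Stand-in for the per-thread dirty card queue fields (index/buffer offsets off Rthread).
    struct CardQueueModel {
      size_t    index;    // bytes remaining in the buffer; 0 means the buffer is full
      uint8_t** buffer;   // thread-local buffer of card addresses
    };

    // Models the fast-path filters (same region, null value, young card) followed by the
    // slow path (StoreLoad fence, re-check for dirty, dirty the card, enqueue it).
    inline void post_barrier_model(uintptr_t store_addr, uintptr_t new_val,
                                   uint8_t* byte_map_base, unsigned card_shift,
                                   unsigned region_shift, uint8_t young_card_val,
                                   uint8_t dirty_card_val, CardQueueModel& q,
                                   void (*runtime_enqueue)(uint8_t*)) {
      if (((store_addr ^ new_val) >> region_shift) == 0) return;  // same region: filtered
      if (new_val == 0) return;                                   // storing null: filtered
      uint8_t* card = byte_map_base + (store_addr >> card_shift);
      if (*card == young_card_val) return;                        // young card: filtered
      std::atomic_thread_fence(std::memory_order_seq_cst);        // StoreLoad membar
      if (*card == dirty_card_val) return;                        // already dirty: filtered
      *card = dirty_card_val;                                     // dirty the card
      if (q.index != 0) {                                         // room in the buffer?
        q.index -= sizeof(void*);
        q.buffer[q.index / sizeof(void*)] = card;
      } else {
        runtime_enqueue(card);                                    // write_ref_field_post_entry
      }
    }
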
+void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register tmp1, + Register tmp2, + Register tmp3) { + Label done; + Label runtime; + + generate_post_barrier_fast_path(masm, store_addr, new_val, tmp1, tmp2, done, true /* new_val_may_be_null */); + // If card is young, jump to done + // card_addr and card are loaded by generate_post_barrier_fast_path + const Register card = tmp2; + const Register card_addr = tmp1; + __ b(done, eq); + generate_post_barrier_slow_path(masm, Rthread, card_addr, tmp2, tmp3, done, runtime); __ bind(runtime); + RegisterSet set = RegisterSet(store_addr) | RegisterSet(R0, R3) | RegisterSet(R12); + __ push(set); + if (card_addr != R0) { __ mov(R0, card_addr); } __ mov(R1, Rthread); __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), R0, R1); + __ pop(set); + __ bind(done); } +#if defined(COMPILER2) + +static void generate_c2_barrier_runtime_call(MacroAssembler* masm, G1BarrierStubC2* stub, const Register arg, const address runtime_path, Register tmp1) { + SaveLiveRegisters save_registers(masm, stub); + if (c_rarg0 != arg) { + __ mov(c_rarg0, arg); + } + __ mov(c_rarg1, Rthread); + __ call_VM_leaf(runtime_path, R0, R1); +} + +void G1BarrierSetAssembler::g1_write_barrier_pre_c2(MacroAssembler* masm, + Register obj, + Register pre_val, + Register thread, + Register tmp1, + Register tmp2, + G1PreBarrierStubC2* stub) { + assert(thread == Rthread, "must be"); + assert_different_registers(obj, pre_val, tmp1, tmp2); + assert(pre_val != noreg && tmp1 != noreg && tmp2 != noreg, "expecting a register"); + + stub->initialize_registers(obj, pre_val, thread, tmp1, tmp2); + + generate_pre_barrier_fast_path(masm, thread, tmp1); + // If marking is active (*(mark queue active address) != 0), jump to stub (slow path) + __ cbnz(tmp1, *stub->entry()); + + __ bind(*stub->continuation()); +} + +void G1BarrierSetAssembler::generate_c2_pre_barrier_stub(MacroAssembler* masm, + G1PreBarrierStubC2* stub) const { + Assembler::InlineSkippedInstructionsCounter skip_counter(masm); + Label runtime; + Register obj = stub->obj(); + Register pre_val = stub->pre_val(); + Register thread = stub->thread(); + Register tmp1 = stub->tmp1(); + Register tmp2 = stub->tmp2(); + + __ bind(*stub->entry()); + generate_pre_barrier_slow_path(masm, obj, pre_val, thread, tmp1, tmp2, *stub->continuation(), runtime); + + __ bind(runtime); + generate_c2_barrier_runtime_call(masm, stub, pre_val, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_pre_entry), tmp1); + __ b(*stub->continuation()); +} + +void G1BarrierSetAssembler::g1_write_barrier_post_c2(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register thread, + Register tmp1, + Register tmp2, + Register tmp3, + G1PostBarrierStubC2* stub) { + assert(thread == Rthread, "must be"); + assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, noreg); + + stub->initialize_registers(thread, tmp1, tmp2, tmp3); + + bool new_val_may_be_null = (stub->barrier_data() & G1C2BarrierPostNotNull) == 0; + generate_post_barrier_fast_path(masm, store_addr, new_val, tmp1, tmp2, *stub->continuation(), new_val_may_be_null); + // If card is not young, jump to stub (slow path) + __ b(*stub->entry(), ne); + + __ bind(*stub->continuation()); +} + +void G1BarrierSetAssembler::generate_c2_post_barrier_stub(MacroAssembler* masm, + G1PostBarrierStubC2* stub) const { + Assembler::InlineSkippedInstructionsCounter skip_counter(masm); + Label runtime; + 
Register thread = stub->thread(); + Register tmp1 = stub->tmp1(); // tmp1 holds the card address. + Register tmp2 = stub->tmp2(); + Register tmp3 = stub->tmp3(); + + __ bind(*stub->entry()); + generate_post_barrier_slow_path(masm, thread, tmp1, tmp2, tmp3, *stub->continuation(), runtime); + + __ bind(runtime); + generate_c2_barrier_runtime_call(masm, stub, tmp1, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), tmp2); + __ b(*stub->continuation()); +} + +#endif // COMPILER2 + void G1BarrierSetAssembler::load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, Register dst, Address src, Register tmp1, Register tmp2, Register tmp3) { bool on_oop = type == T_OBJECT || type == T_ARRAY; @@ -268,7 +390,7 @@ void G1BarrierSetAssembler::load_at(MacroAssembler* masm, DecoratorSet decorator if (on_oop && on_reference) { // Generate the G1 pre-barrier code to log the value of // the referent field in an SATB buffer. - g1_write_barrier_pre(masm, noreg, noreg, dst, tmp1, tmp2); + g1_write_barrier_pre(masm, noreg, dst, tmp1, tmp2); } } @@ -295,7 +417,7 @@ void G1BarrierSetAssembler::oop_store_at(MacroAssembler* masm, DecoratorSet deco } if (needs_pre_barrier) { - g1_write_barrier_pre(masm, store_addr, new_val, tmp1, tmp2, tmp3); + g1_write_barrier_pre(masm, store_addr, tmp3 /*pre_val*/, tmp1, tmp2); } if (is_null) { diff --git a/src/hotspot/cpu/arm/gc/g1/g1BarrierSetAssembler_arm.hpp b/src/hotspot/cpu/arm/gc/g1/g1BarrierSetAssembler_arm.hpp index 52932faa3e4..aefde19142e 100644 --- a/src/hotspot/cpu/arm/gc/g1/g1BarrierSetAssembler_arm.hpp +++ b/src/hotspot/cpu/arm/gc/g1/g1BarrierSetAssembler_arm.hpp @@ -33,6 +33,8 @@ class LIR_Assembler; class StubAssembler; class G1PreBarrierStub; class G1PostBarrierStub; +class G1PreBarrierStubC2; +class G1PostBarrierStubC2; class G1BarrierSetAssembler: public ModRefBarrierSetAssembler { protected: @@ -43,7 +45,6 @@ protected: void g1_write_barrier_pre(MacroAssembler* masm, Register store_addr, - Register new_val, Register pre_val, Register tmp1, Register tmp2); @@ -70,6 +71,29 @@ public: void generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm); void generate_c1_post_barrier_runtime_stub(StubAssembler* sasm); #endif + +#ifdef COMPILER2 + void g1_write_barrier_pre_c2(MacroAssembler* masm, + Register obj, + Register pre_val, + Register thread, + Register tmp1, + Register tmp2, + G1PreBarrierStubC2* c2_stub); + void generate_c2_pre_barrier_stub(MacroAssembler* masm, + G1PreBarrierStubC2* stub) const; + void g1_write_barrier_post_c2(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register thread, + Register tmp1, + Register tmp2, + Register tmp3, + G1PostBarrierStubC2* c2_stub); + void generate_c2_post_barrier_stub(MacroAssembler* masm, + G1PostBarrierStubC2* stub) const; +#endif + }; #endif // CPU_ARM_GC_G1_G1BARRIERSETASSEMBLER_ARM_HPP diff --git a/src/hotspot/cpu/arm/gc/g1/g1_arm.ad b/src/hotspot/cpu/arm/gc/g1/g1_arm.ad new file mode 100644 index 00000000000..8a0a9e1aa53 --- /dev/null +++ b/src/hotspot/cpu/arm/gc/g1/g1_arm.ad @@ -0,0 +1,201 @@ +// +// Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. +// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +// +// This code is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License version 2 only, as +// published by the Free Software Foundation. 
+// +// This code is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// version 2 for more details (a copy is included in the LICENSE file that +// accompanied this code). +// +// You should have received a copy of the GNU General Public License version +// 2 along with this work; if not, write to the Free Software Foundation, +// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. +// +// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA +// or visit www.oracle.com if you need additional information or have any +// questions. +// + +source_hpp %{ + +#include "gc/g1/c2/g1BarrierSetC2.hpp" +#include "gc/shared/gc_globals.hpp" + +%} + +source %{ + +#include "gc/g1/g1BarrierSetAssembler_arm.hpp" +#include "gc/g1/g1BarrierSetRuntime.hpp" + +static void write_barrier_pre(MacroAssembler* masm, + const MachNode* node, + Register obj, + Register pre_val, + Register tmp1, + Register tmp2, + RegSet preserve = RegSet(), + RegSet no_preserve = RegSet()) { + if (!G1PreBarrierStubC2::needs_barrier(node)) { + return; + } + Assembler::InlineSkippedInstructionsCounter skip_counter(masm); + G1BarrierSetAssembler* g1_asm = static_cast(BarrierSet::barrier_set()->barrier_set_assembler()); + G1PreBarrierStubC2* const stub = G1PreBarrierStubC2::create(node); + for (RegSetIterator reg = preserve.begin(); *reg != noreg; ++reg) { + stub->preserve(*reg); + } + for (RegSetIterator reg = no_preserve.begin(); *reg != noreg; ++reg) { + stub->dont_preserve(*reg); + } + g1_asm->g1_write_barrier_pre_c2(masm, obj, pre_val, Rthread, tmp1, tmp2, stub); +} + +static void write_barrier_post(MacroAssembler* masm, + const MachNode* node, + Register store_addr, + Register new_val, + Register tmp1, + Register tmp2, + Register tmp3) { + if (!G1PostBarrierStubC2::needs_barrier(node)) { + return; + } + Assembler::InlineSkippedInstructionsCounter skip_counter(masm); + G1BarrierSetAssembler* g1_asm = static_cast(BarrierSet::barrier_set()->barrier_set_assembler()); + G1PostBarrierStubC2* const stub = G1PostBarrierStubC2::create(node); + g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, Rthread, tmp1, tmp2, tmp3, stub); +} + +%} + +instruct g1StoreP(indirect mem, iRegP src, iRegP tmp1, iRegP tmp2, iRegP tmp3, flagsReg icc) +%{ + predicate(UseG1GC && n->as_Store()->barrier_data() != 0); + match(Set mem (StoreP mem src)); + effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL icc); + ins_cost(2 * (MEMORY_REF_COST + BRANCH_COST)); + format %{ "sd $src, $mem\t# ptr" %} + ins_encode %{ + guarantee($mem$$disp == 0, "impossible encoding"); + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */, + RegSet::of($mem$$Register, $src$$Register) /* preserve */); + __ str($src$$Register, Address($mem$$Register)); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $src$$Register /* new_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */, + $tmp3$$Register /* tmp3 */); + %} + ins_pipe(istore_mem_reg); +%} + +instruct g1CompareAndSwapP(iRegI res, indirect mem, iRegP newval, iRegP tmp1, iRegP tmp2, iRegP tmp3, iRegP oldval, flagsReg ccr ) +%{ + predicate(UseG1GC && n->as_LoadStore()->barrier_data() != 0); + match(Set res (CompareAndSwapP mem (Binary oldval newval))); + effect(KILL ccr, TEMP res, TEMP tmp1, TEMP tmp2, TEMP tmp3); + 
ins_cost(4 * (MEMORY_REF_COST + BRANCH_COST)); + format %{ "loop: \n\t" + "LDREX $tmp1, $mem\t! If $oldval==[$mem] Then store $newval into [$mem]\n\t" + "CMP $tmp1, $oldval\n\t" + "STREX.eq $tmp1, $newval, $mem\n\t" + "MOV.ne $tmp1, 0 \n\t" + "EORS.eq $tmp1,$tmp1, 1 \n\t" + "B.eq loop \n\t" + "MOV $res, $tmp1" %} + ins_encode %{ + guarantee($mem$$disp == 0, "impossible encoding"); + assert_different_registers($oldval$$Register, $mem$$Register); + assert_different_registers($newval$$Register, $mem$$Register); + write_barrier_pre(masm, this, + noreg /* obj */, + $oldval$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */, + RegSet::of($mem$$Register, $oldval$$Register, $newval$$Register) /* preserve */, + RegSet::of($res$$Register) /* no_preserve */); + Label loop; + __ bind(loop); + __ ldrex($tmp1$$Register,$mem$$Address); + __ cmp($tmp1$$Register, $oldval$$Register); + __ strex($tmp1$$Register, $newval$$Register, $mem$$Address, eq); + __ mov($tmp1$$Register, 0, ne); + __ eors($tmp1$$Register, $tmp1$$Register, 1, eq); + __ b(loop, eq); + __ mov($res$$Register, $tmp1$$Register); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $newval$$Register /* new_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */, + $tmp3$$Register /* tmp3 */); + %} + ins_pipe(long_memory_op); +%} + + +instruct g1GetAndSetP(indirect mem, iRegP newval, iRegP tmp1, iRegP tmp2, iRegP tmp3, iRegP preval, flagsReg ccr) +%{ + predicate(UseG1GC && n->as_LoadStore()->barrier_data() != 0); + match(Set preval (GetAndSetP mem newval)); + effect(KILL ccr, TEMP preval, TEMP tmp1, TEMP tmp2, TEMP tmp3); + ins_cost(4 * (MEMORY_REF_COST + BRANCH_COST)); + format %{ "loop: \n\t" + "LDREX $preval, $mem\n\t" + "STREX $tmp1, $newval, $mem\n\t" + "CMP $tmp1, 0 \n\t" + "B.ne loop \n\t" %} + ins_encode %{ + guarantee($mem$$disp == 0, "impossible encoding"); + assert_different_registers($mem$$Register, $newval$$Register); + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $preval$$Register /* pre_val (as a temporary register) */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */, + RegSet::of($mem$$Register, $preval$$Register, $newval$$Register) /* preserve */); + Label loop; + __ bind(loop); + __ ldrex($preval$$Register,$mem$$Address); + __ strex($tmp1$$Register, $newval$$Register, $mem$$Address); + __ cmp($tmp1$$Register, 0); + __ b(loop, ne); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $newval$$Register /* new_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */, + $tmp3$$Register /* tmp3 */); + %} + ins_pipe(long_memory_op); +%} + +instruct g1LoadP(iRegP dst, indirect mem, iRegP tmp1, iRegP tmp2, flagsReg icc) +%{ + predicate(UseG1GC && n->as_Load()->barrier_data() != 0); + match(Set dst (LoadP mem)); + effect(TEMP dst, TEMP tmp1, TEMP tmp2, KILL icc); + ins_cost(MEMORY_REF_COST + BRANCH_COST); + format %{ "ld $dst, $mem\t# ptr" %} + ins_encode %{ + guarantee($mem$$disp == 0, "impossible encoding"); + __ ldr($dst$$Register, Address($mem$$Register)); + write_barrier_pre(masm, this, + noreg /* obj */, + $dst$$Register /* pre_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */); + %} + ins_pipe(iload_mem); +%} diff --git a/src/hotspot/cpu/arm/gc/shared/barrierSetAssembler_arm.cpp b/src/hotspot/cpu/arm/gc/shared/barrierSetAssembler_arm.cpp index ea19730673c..c13a259a1b9 100644 --- a/src/hotspot/cpu/arm/gc/shared/barrierSetAssembler_arm.cpp +++ b/src/hotspot/cpu/arm/gc/shared/barrierSetAssembler_arm.cpp 
@@ -31,6 +31,10 @@ #include "runtime/javaThread.hpp" #include "runtime/stubRoutines.hpp" +#ifdef COMPILER2 +#include "gc/shared/c2/barrierSetC2.hpp" +#endif // COMPILER2 + #define __ masm-> void BarrierSetAssembler::load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, @@ -206,7 +210,57 @@ void BarrierSetAssembler::nmethod_entry_barrier(MacroAssembler* masm) { #ifdef COMPILER2 OptoReg::Name BarrierSetAssembler::refine_register(const Node* node, OptoReg::Name opto_reg) { - Unimplemented(); // This must be implemented to support late barrier expansion. + if (!OptoReg::is_reg(opto_reg)) { + return OptoReg::Bad; + } + + const VMReg vm_reg = OptoReg::as_VMReg(opto_reg); + if (!vm_reg->is_valid()){ + // skip APSR and FPSCR + return OptoReg::Bad; + } + + return opto_reg; } +void SaveLiveRegisters::initialize(BarrierStubC2* stub) { + // Record registers that needs to be saved/restored + RegMaskIterator rmi(stub->preserve_set()); + while (rmi.has_next()) { + const OptoReg::Name opto_reg = rmi.next(); + if (OptoReg::is_reg(opto_reg)) { + const VMReg vm_reg = OptoReg::as_VMReg(opto_reg); + if (vm_reg->is_Register()) { + gp_regs += RegSet::of(vm_reg->as_Register()); + } else if (vm_reg->is_FloatRegister()) { + fp_regs += FloatRegSet::of(vm_reg->as_FloatRegister()); + } else { + fatal("Unknown register type"); + } + } + } + // Remove C-ABI SOE registers that will be updated + gp_regs -= RegSet::range(R4, R11) + RegSet::of(R13, R15); + + // Remove C-ABI SOE fp registers + fp_regs -= FloatRegSet::range(S16, S31); +} + +SaveLiveRegisters::SaveLiveRegisters(MacroAssembler* masm, BarrierStubC2* stub) + : masm(masm), + gp_regs(), + fp_regs() { + // Figure out what registers to save/restore + initialize(stub); + + // Save registers + if (gp_regs.size() > 0) __ push(RegisterSet::from(gp_regs)); + if (fp_regs.size() > 0) __ fpush(FloatRegisterSet::from(fp_regs)); +} + +SaveLiveRegisters::~SaveLiveRegisters() { + // Restore registers + if (fp_regs.size() > 0) __ fpop(FloatRegisterSet::from(fp_regs)); + if (gp_regs.size() > 0) __ pop(RegisterSet::from(gp_regs)); +} #endif // COMPILER2 diff --git a/src/hotspot/cpu/arm/gc/shared/barrierSetAssembler_arm.hpp b/src/hotspot/cpu/arm/gc/shared/barrierSetAssembler_arm.hpp index 60021390ea2..054d172f463 100644 --- a/src/hotspot/cpu/arm/gc/shared/barrierSetAssembler_arm.hpp +++ b/src/hotspot/cpu/arm/gc/shared/barrierSetAssembler_arm.hpp @@ -31,7 +31,9 @@ #ifdef COMPILER2 #include "code/vmreg.hpp" #include "opto/optoreg.hpp" +#include "opto/regmask.hpp" +class BarrierStubC2; class Node; #endif // COMPILER2 @@ -69,4 +71,26 @@ public: #endif // COMPILER2 }; +#ifdef COMPILER2 +// This class saves and restores the registers that need to be preserved across +// the runtime call represented by a given C2 barrier stub. Use as follows: +// { +// SaveLiveRegisters save(masm, stub); +// .. +// __ bl(...); +// .. 
+// } +class SaveLiveRegisters { +private: + MacroAssembler* const masm; + RegSet gp_regs; + FloatRegSet fp_regs; + +public: + void initialize(BarrierStubC2* stub); + SaveLiveRegisters(MacroAssembler* masm, BarrierStubC2* stub); + ~SaveLiveRegisters(); +}; + +#endif // COMPILER2 #endif // CPU_ARM_GC_SHARED_BARRIERSETASSEMBLER_ARM_HPP diff --git a/src/hotspot/cpu/arm/register_arm.hpp b/src/hotspot/cpu/arm/register_arm.hpp index 9f486d2a625..d8961fd2935 100644 --- a/src/hotspot/cpu/arm/register_arm.hpp +++ b/src/hotspot/cpu/arm/register_arm.hpp @@ -303,6 +303,31 @@ class ConcreteRegisterImpl : public AbstractRegisterImpl { static const int max_fpr; }; +typedef AbstractRegSet RegSet; +typedef AbstractRegSet FloatRegSet; + +template <> +inline Register AbstractRegSet::first() { + if (_bitset == 0) { return noreg; } + return as_Register(count_trailing_zeros(_bitset)); +} + + +template <> +inline FloatRegister AbstractRegSet::first() { + uint32_t first = _bitset & -_bitset; + return first ? as_FloatRegister(exact_log2(first)) : fnoreg; +} + +template <> +inline FloatRegister AbstractRegSet::last() { + if (_bitset == 0) { return fnoreg; } + int last = max_size() - 1 - count_leading_zeros(_bitset); + return as_FloatRegister(last); +} + + + class VFPSystemRegisterImpl; typedef VFPSystemRegisterImpl* VFPSystemRegister; class VFPSystemRegisterImpl : public AbstractRegisterImpl { diff --git a/src/hotspot/cpu/ppc/gc/g1/g1BarrierSetAssembler_ppc.cpp b/src/hotspot/cpu/ppc/gc/g1/g1BarrierSetAssembler_ppc.cpp index 7d230d301c2..39693bdf925 100644 --- a/src/hotspot/cpu/ppc/gc/g1/g1BarrierSetAssembler_ppc.cpp +++ b/src/hotspot/cpu/ppc/gc/g1/g1BarrierSetAssembler_ppc.cpp @@ -41,10 +41,20 @@ #include "c1/c1_LIRAssembler.hpp" #include "c1/c1_MacroAssembler.hpp" #include "gc/g1/c1/g1BarrierSetC1.hpp" -#endif +#endif // COMPILER1 +#ifdef COMPILER2 +#include "gc/g1/c2/g1BarrierSetC2.hpp" +#endif // COMPILER2 #define __ masm-> +static void generate_marking_inactive_test(MacroAssembler* masm) { + int active_offset = in_bytes(G1ThreadLocalData::satb_mark_queue_active_offset()); + assert(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption"); + __ lbz(R0, active_offset, R16_thread); // tmp1 := *(mark queue active address) + __ cmpwi(CCR0, R0, 0); +} + void G1BarrierSetAssembler::gen_write_ref_array_pre_barrier(MacroAssembler* masm, DecoratorSet decorators, Register from, Register to, Register count, Register preserve1, Register preserve2) { @@ -58,13 +68,7 @@ void G1BarrierSetAssembler::gen_write_ref_array_pre_barrier(MacroAssembler* masm Label filtered; // Is marking active? - if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) { - __ lwz(R0, in_bytes(G1ThreadLocalData::satb_mark_queue_active_offset()), R16_thread); - } else { - guarantee(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption"); - __ lbz(R0, in_bytes(G1ThreadLocalData::satb_mark_queue_active_offset()), R16_thread); - } - __ cmpdi(CCR0, R0, 0); + generate_marking_inactive_test(masm); __ beq(CCR0, filtered); __ save_LR(R0); @@ -109,35 +113,48 @@ void G1BarrierSetAssembler::gen_write_ref_array_post_barrier(MacroAssembler* mas __ restore_LR(R0); } +static void generate_queue_insertion(MacroAssembler* masm, ByteSize index_offset, ByteSize buffer_offset, Label& runtime, + const Register value, const Register temp) { + assert_different_registers(value, temp); + // Can we store a value in the given thread's buffer? + // (The index field is typed as size_t.) 
+ __ ld(temp, in_bytes(index_offset), R16_thread); // temp := *(index address) + __ cmpdi(CCR0, temp, 0); // jump to runtime if index == 0 (full buffer) + __ beq(CCR0, runtime); + // The buffer is not full, store value into it. + __ ld(R0, in_bytes(buffer_offset), R16_thread); // R0 := buffer address + __ addi(temp, temp, -wordSize); // temp := next index + __ std(temp, in_bytes(index_offset), R16_thread); // *(index address) := next index + __ stdx(value, temp, R0); // *(buffer address + next index) := value +} + void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm, DecoratorSet decorators, Register obj, RegisterOrConstant ind_or_offs, Register pre_val, Register tmp1, Register tmp2, MacroAssembler::PreservationLevel preservation_level) { + assert_different_registers(pre_val, tmp1, tmp2); + bool not_null = (decorators & IS_NOT_NULL) != 0, preloaded = obj == noreg; Register nv_save = noreg; - if (preloaded) { + // Determine necessary runtime invocation preservation measures + const bool needs_frame = preservation_level >= MacroAssembler::PRESERVATION_FRAME_LR; + const bool preserve_gp_registers = preservation_level >= MacroAssembler::PRESERVATION_FRAME_LR_GP_REGS; + const bool preserve_fp_registers = preservation_level >= MacroAssembler::PRESERVATION_FRAME_LR_GP_FP_REGS; + int nbytes_save = 0; + + if (pre_val->is_volatile() && preloaded && !preserve_gp_registers) { // We are not loading the previous value so make // sure that we don't trash the value in pre_val // with the code below. - assert_different_registers(pre_val, tmp1, tmp2); - if (pre_val->is_volatile()) { - nv_save = !tmp1->is_volatile() ? tmp1 : tmp2; - assert(!nv_save->is_volatile(), "need one nv temp register if pre_val lives in volatile register"); - } + nv_save = !tmp1->is_volatile() ? tmp1 : tmp2; + assert(!nv_save->is_volatile(), "need one nv temp register if pre_val lives in volatile register"); } Label runtime, filtered; - // Is marking active? - if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) { - __ lwz(tmp1, in_bytes(G1ThreadLocalData::satb_mark_queue_active_offset()), R16_thread); - } else { - guarantee(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption"); - __ lbz(tmp1, in_bytes(G1ThreadLocalData::satb_mark_queue_active_offset()), R16_thread); - } - __ cmpdi(CCR0, tmp1, 0); + generate_marking_inactive_test(masm); __ beq(CCR0, filtered); // Do we need to load the previous value? @@ -175,28 +192,12 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm, Decorator // Can we store original value in the thread's buffer? // Is index == 0? // (The index field is typed as size_t.) - const Register Rbuffer = tmp1, Rindex = tmp2; - - __ ld(Rindex, in_bytes(G1ThreadLocalData::satb_mark_queue_index_offset()), R16_thread); - __ cmpdi(CCR0, Rindex, 0); - __ beq(CCR0, runtime); // If index == 0, goto runtime. - __ ld(Rbuffer, in_bytes(G1ThreadLocalData::satb_mark_queue_buffer_offset()), R16_thread); - - __ addi(Rindex, Rindex, -wordSize); // Decrement index. - __ std(Rindex, in_bytes(G1ThreadLocalData::satb_mark_queue_index_offset()), R16_thread); - - // Record the previous value. 
- __ stdx(pre_val, Rbuffer, Rindex); + generate_queue_insertion(masm, G1ThreadLocalData::satb_mark_queue_index_offset(), G1ThreadLocalData::satb_mark_queue_buffer_offset(), + runtime, pre_val, tmp1); __ b(filtered); __ bind(runtime); - // Determine necessary runtime invocation preservation measures - const bool needs_frame = preservation_level >= MacroAssembler::PRESERVATION_FRAME_LR; - const bool preserve_gp_registers = preservation_level >= MacroAssembler::PRESERVATION_FRAME_LR_GP_REGS; - const bool preserve_fp_registers = preservation_level >= MacroAssembler::PRESERVATION_FRAME_LR_GP_FP_REGS; - int nbytes_save = 0; - // May need to preserve LR. Also needed if current frame is not compatible with C calling convention. if (needs_frame) { if (preserve_gp_registers) { @@ -210,11 +211,11 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm, Decorator __ push_frame_reg_args(nbytes_save, tmp2); } - if (pre_val->is_volatile() && preloaded && !preserve_gp_registers) { + if (nv_save != noreg) { __ mr(nv_save, pre_val); // Save pre_val across C call if it was preloaded. } __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_pre_entry), pre_val, R16_thread); - if (pre_val->is_volatile() && preloaded && !preserve_gp_registers) { + if (nv_save != noreg) { __ mr(pre_val, nv_save); // restore } @@ -230,6 +231,26 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm, Decorator __ bind(filtered); } +static void generate_region_crossing_test(MacroAssembler* masm, const Register store_addr, const Register new_val) { + __ xorr(R0, store_addr, new_val); // tmp1 := store address ^ new value + __ srdi_(R0, R0, G1HeapRegion::LogOfHRGrainBytes); // tmp1 := ((store address ^ new value) >> LogOfHRGrainBytes) +} + +static Address generate_card_young_test(MacroAssembler* masm, const Register store_addr, const Register tmp1, const Register tmp2) { + CardTableBarrierSet* ct = barrier_set_cast(BarrierSet::barrier_set()); + __ load_const_optimized(tmp1, (address)(ct->card_table()->byte_map_base()), tmp2); + __ srdi(tmp2, store_addr, CardTable::card_shift()); // tmp1 := card address relative to card table base + __ lbzx(R0, tmp1, tmp2); // tmp1 := card address + __ cmpwi(CCR0, R0, (int)G1CardTable::g1_young_card_val()); + return Address(tmp1, tmp2); // return card address +} + +static void generate_card_dirty_test(MacroAssembler* masm, Address card_addr) { + __ membar(Assembler::StoreLoad); // Must reload after StoreLoad membar due to concurrent refinement + __ lbzx(R0, card_addr.base(), card_addr.index()); // tmp2 := card + __ cmpwi(CCR0, R0, (int)G1CardTable::dirty_card_val()); // tmp2 := card == dirty_card_val? +} + void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, DecoratorSet decorators, Register store_addr, Register new_val, Register tmp1, Register tmp2, Register tmp3, @@ -241,9 +262,7 @@ void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, Decorato CardTableBarrierSet* ct = barrier_set_cast(BarrierSet::barrier_set()); - // Does store cross heap regions? - __ xorr(tmp1, store_addr, new_val); - __ srdi_(tmp1, tmp1, G1HeapRegion::LogOfHRGrainBytes); + generate_region_crossing_test(masm, store_addr, new_val); __ beq(CCR0, filtered); // Crosses regions, storing null? @@ -257,43 +276,22 @@ void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, Decorato __ beq(CCR0, filtered); } - // Storing region crossing non-null, is card already dirty? 
- const Register Rcard_addr = tmp1; - Register Rbase = tmp2; - __ load_const_optimized(Rbase, (address)(ct->card_table()->byte_map_base()), /*temp*/ tmp3); - - __ srdi(Rcard_addr, store_addr, CardTable::card_shift()); - - // Get the address of the card. - __ lbzx(/*card value*/ tmp3, Rbase, Rcard_addr); - __ cmpwi(CCR0, tmp3, (int)G1CardTable::g1_young_card_val()); + Address card_addr = generate_card_young_test(masm, store_addr, tmp1, tmp2); __ beq(CCR0, filtered); - __ membar(Assembler::StoreLoad); - __ lbzx(/*card value*/ tmp3, Rbase, Rcard_addr); // Reload after membar. - __ cmpwi(CCR0, tmp3 /* card value */, (int)G1CardTable::dirty_card_val()); + generate_card_dirty_test(masm, card_addr); __ beq(CCR0, filtered); - // Storing a region crossing, non-null oop, card is clean. - // Dirty card and log. - __ li(tmp3, (int)G1CardTable::dirty_card_val()); - //release(); // G1: oops are allowed to get visible after dirty marking. - __ stbx(tmp3, Rbase, Rcard_addr); + __ li(R0, (int)G1CardTable::dirty_card_val()); + __ stbx(R0, card_addr.base(), card_addr.index()); // *(card address) := dirty_card_val - __ add(Rcard_addr, Rbase, Rcard_addr); // This is the address which needs to get enqueued. - Rbase = noreg; // end of lifetime + Register Rcard_addr = tmp3; + __ add(Rcard_addr, card_addr.base(), card_addr.index()); // This is the address which needs to get enqueued. - const Register Rqueue_index = tmp2, - Rqueue_buf = tmp3; - __ ld(Rqueue_index, in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset()), R16_thread); - __ cmpdi(CCR0, Rqueue_index, 0); - __ beq(CCR0, runtime); // index == 0 then jump to runtime - __ ld(Rqueue_buf, in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset()), R16_thread); - - __ addi(Rqueue_index, Rqueue_index, -wordSize); // decrement index - __ std(Rqueue_index, in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset()), R16_thread); - - __ stdx(Rcard_addr, Rqueue_buf, Rqueue_index); // store card + generate_queue_insertion(masm, + G1ThreadLocalData::dirty_card_queue_index_offset(), + G1ThreadLocalData::dirty_card_queue_buffer_offset(), + runtime, Rcard_addr, tmp1); __ b(filtered); __ bind(runtime); @@ -392,6 +390,142 @@ void G1BarrierSetAssembler::resolve_jobject(MacroAssembler* masm, Register value __ bind(done); } +#ifdef COMPILER2 + +static void generate_c2_barrier_runtime_call(MacroAssembler* masm, G1BarrierStubC2* stub, const Register arg, const address runtime_path) { + SaveLiveRegisters save_registers(masm, stub); + __ call_VM_leaf(runtime_path, arg, R16_thread); +} + +void G1BarrierSetAssembler::g1_write_barrier_pre_c2(MacroAssembler* masm, + Register obj, + Register pre_val, + Register tmp1, + Register tmp2, + G1PreBarrierStubC2* stub) { + assert_different_registers(obj, tmp1, tmp2, R0); + assert_different_registers(pre_val, tmp1, R0); + assert(!UseCompressedOops || tmp2 != noreg, "tmp2 needed with CompressedOops"); + + stub->initialize_registers(obj, pre_val, R16_thread, tmp1, tmp2); + + generate_marking_inactive_test(masm); + __ bc_far_optimized(Assembler::bcondCRbiIs0, __ bi0(CCR0, Assembler::equal), *stub->entry()); + + __ bind(*stub->continuation()); +} + +void G1BarrierSetAssembler::generate_c2_pre_barrier_stub(MacroAssembler* masm, + G1PreBarrierStubC2* stub) const { + Assembler::InlineSkippedInstructionsCounter skip_counter(masm); + Label runtime; + Register obj = stub->obj(); + Register pre_val = stub->pre_val(); + Register tmp1 = stub->tmp1(); + + __ bind(*stub->entry()); + + if (obj != noreg) { + // Note: C2 currently doesn't use 
implicit null checks with barriers. + // Otherwise, obj could be null and the following instruction would raise a SIGSEGV. + if (UseCompressedOops) { + __ lwz(pre_val, 0, obj); + } else { + __ ld(pre_val, 0, obj); + } + } + __ cmpdi(CCR0, pre_val, 0); + __ bc_far_optimized(Assembler::bcondCRbiIs1, __ bi0(CCR0, Assembler::equal), *stub->continuation()); + + Register pre_val_decoded = pre_val; + if (UseCompressedOops) { + pre_val_decoded = __ decode_heap_oop_not_null(stub->tmp2(), pre_val); + } + + generate_queue_insertion(masm, + G1ThreadLocalData::satb_mark_queue_index_offset(), + G1ThreadLocalData::satb_mark_queue_buffer_offset(), + runtime, pre_val_decoded, tmp1); + __ b(*stub->continuation()); + + __ bind(runtime); + generate_c2_barrier_runtime_call(masm, stub, pre_val_decoded, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_pre_entry)); + __ b(*stub->continuation()); +} + +void G1BarrierSetAssembler::g1_write_barrier_post_c2(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register tmp1, + Register tmp2, + G1PostBarrierStubC2* stub, + bool decode_new_val) { + assert_different_registers(store_addr, new_val, tmp1, R0); + assert_different_registers(store_addr, tmp1, tmp2, R0); + + stub->initialize_registers(R16_thread, tmp1, tmp2); + + bool null_check_required = (stub->barrier_data() & G1C2BarrierPostNotNull) == 0; + Register new_val_decoded = new_val; + + if (decode_new_val) { + assert(UseCompressedOops, "or should not be here"); + if (null_check_required && CompressedOops::base() != nullptr) { + // We prefer doing the null check after the region crossing check. + // Only compressed oop modes with base != null require a null check here. + __ cmpwi(CCR0, new_val, 0); + __ beq(CCR0, *stub->continuation()); + null_check_required = false; + } + new_val_decoded = __ decode_heap_oop_not_null(tmp2, new_val); + } + + generate_region_crossing_test(masm, store_addr, new_val_decoded); + __ beq(CCR0, *stub->continuation()); + + // crosses regions, storing null? + if (null_check_required) { + __ cmpdi(CCR0, new_val_decoded, 0); + __ beq(CCR0, *stub->continuation()); + } + + Address card_addr = generate_card_young_test(masm, store_addr, tmp1, tmp2); + assert(card_addr.base() == tmp1 && card_addr.index() == tmp2, "needed by post barrier stub"); + __ bc_far_optimized(Assembler::bcondCRbiIs0, __ bi0(CCR0, Assembler::equal), *stub->entry()); + + __ bind(*stub->continuation()); +} + +void G1BarrierSetAssembler::generate_c2_post_barrier_stub(MacroAssembler* masm, + G1PostBarrierStubC2* stub) const { + Assembler::InlineSkippedInstructionsCounter skip_counter(masm); + Label runtime; + Address card_addr(stub->tmp1(), stub->tmp2()); // See above. + + __ bind(*stub->entry()); + + generate_card_dirty_test(masm, card_addr); + __ bc_far_optimized(Assembler::bcondCRbiIs1, __ bi0(CCR0, Assembler::equal), *stub->continuation()); + + __ li(R0, (int)G1CardTable::dirty_card_val()); + __ stbx(R0, card_addr.base(), card_addr.index()); // *(card address) := dirty_card_val + + Register Rcard_addr = stub->tmp1(); + __ add(Rcard_addr, card_addr.base(), card_addr.index()); // This is the address which needs to get enqueued. 
+ + generate_queue_insertion(masm, + G1ThreadLocalData::dirty_card_queue_index_offset(), + G1ThreadLocalData::dirty_card_queue_buffer_offset(), + runtime, Rcard_addr, stub->tmp2()); + __ b(*stub->continuation()); + + __ bind(runtime); + generate_c2_barrier_runtime_call(masm, stub, Rcard_addr, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry)); + __ b(*stub->continuation()); +} + +#endif // COMPILER2 + #ifdef COMPILER1 #undef __ @@ -470,13 +604,7 @@ void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler* __ std(tmp2, -24, R1_SP); // Is marking still active? - if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) { - __ lwz(tmp, satb_q_active_byte_offset, R16_thread); - } else { - assert(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption"); - __ lbz(tmp, satb_q_active_byte_offset, R16_thread); - } - __ cmpdi(CCR0, tmp, 0); + generate_marking_inactive_test(sasm); __ beq(CCR0, marking_not_active); __ bind(restart); diff --git a/src/hotspot/cpu/ppc/gc/g1/g1BarrierSetAssembler_ppc.hpp b/src/hotspot/cpu/ppc/gc/g1/g1BarrierSetAssembler_ppc.hpp index d9a252ff6ea..1c9fe8a5d10 100644 --- a/src/hotspot/cpu/ppc/gc/g1/g1BarrierSetAssembler_ppc.hpp +++ b/src/hotspot/cpu/ppc/gc/g1/g1BarrierSetAssembler_ppc.hpp @@ -30,10 +30,16 @@ #include "gc/shared/modRefBarrierSetAssembler.hpp" #include "utilities/macros.hpp" +#ifdef COMPILER2 +#include "gc/g1/c2/g1BarrierSetC2.hpp" +#endif + class LIR_Assembler; class StubAssembler; class G1PreBarrierStub; class G1PostBarrierStub; +class G1PreBarrierStubC2; +class G1PostBarrierStubC2; class G1BarrierSetAssembler: public ModRefBarrierSetAssembler { protected: @@ -59,6 +65,25 @@ protected: MacroAssembler::PreservationLevel preservation_level); public: +#ifdef COMPILER2 + void g1_write_barrier_pre_c2(MacroAssembler* masm, + Register obj, + Register pre_val, + Register tmp1, + Register tmp2, + G1PreBarrierStubC2* c2_stub); + void generate_c2_pre_barrier_stub(MacroAssembler* masm, + G1PreBarrierStubC2* stub) const; + void g1_write_barrier_post_c2(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register tmp1, + Register tmp2, + G1PostBarrierStubC2* c2_stub, + bool decode_new_val); + void generate_c2_post_barrier_stub(MacroAssembler* masm, + G1PostBarrierStubC2* stub) const; +#endif #ifdef COMPILER1 void gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrierStub* stub); void gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub); diff --git a/src/hotspot/cpu/ppc/gc/g1/g1_ppc.ad b/src/hotspot/cpu/ppc/gc/g1/g1_ppc.ad new file mode 100644 index 00000000000..f4163242cad --- /dev/null +++ b/src/hotspot/cpu/ppc/gc/g1/g1_ppc.ad @@ -0,0 +1,684 @@ +// +// Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2024 SAP SE. All rights reserved. +// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +// +// This code is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License version 2 only, as +// published by the Free Software Foundation. +// +// This code is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// version 2 for more details (a copy is included in the LICENSE file that +// accompanied this code). 
+// +// You should have received a copy of the GNU General Public License version +// 2 along with this work; if not, write to the Free Software Foundation, +// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. +// +// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA +// or visit www.oracle.com if you need additional information or have any +// questions. +// + +source_hpp %{ + +#include "gc/g1/c2/g1BarrierSetC2.hpp" +#include "gc/shared/gc_globals.hpp" + +%} + +source %{ + +#include "gc/g1/g1BarrierSetAssembler_ppc.hpp" +#include "gc/g1/g1BarrierSetRuntime.hpp" + +static void pre_write_barrier(MacroAssembler* masm, + const MachNode* node, + Register obj, + Register pre_val, + Register tmp1, + Register tmp2 = noreg, // only needed with CompressedOops when pre_val needs to be preserved + RegSet preserve = RegSet(), + RegSet no_preserve = RegSet()) { + if (!G1PreBarrierStubC2::needs_barrier(node)) { + return; + } + Assembler::InlineSkippedInstructionsCounter skip_counter(masm); + G1BarrierSetAssembler* g1_asm = static_cast(BarrierSet::barrier_set()->barrier_set_assembler()); + G1PreBarrierStubC2* const stub = G1PreBarrierStubC2::create(node); + for (RegSetIterator reg = preserve.begin(); *reg != noreg; ++reg) { + stub->preserve(*reg); + } + for (RegSetIterator reg = no_preserve.begin(); *reg != noreg; ++reg) { + stub->dont_preserve(*reg); + } + g1_asm->g1_write_barrier_pre_c2(masm, obj, pre_val, tmp1, (tmp2 != noreg) ? tmp2 : pre_val, stub); +} + +static void post_write_barrier(MacroAssembler* masm, + const MachNode* node, + Register store_addr, + Register new_val, + Register tmp1, + Register tmp2, + bool decode_new_val = false) { + if (!G1PostBarrierStubC2::needs_barrier(node)) { + return; + } + Assembler::InlineSkippedInstructionsCounter skip_counter(masm); + G1BarrierSetAssembler* g1_asm = static_cast(BarrierSet::barrier_set()->barrier_set_assembler()); + G1PostBarrierStubC2* const stub = G1PostBarrierStubC2::create(node); + g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, tmp1, tmp2, stub, decode_new_val); +} + +%} + +instruct g1StoreP(indirect mem, iRegPsrc src, iRegPdst tmp1, iRegPdst tmp2, flagsRegCR0 cr0) +%{ + predicate(UseG1GC && n->as_Store()->barrier_data() != 0); + match(Set mem (StoreP mem src)); + effect(TEMP tmp1, TEMP tmp2, KILL cr0); + ins_cost(2 * MEMORY_REF_COST); + format %{ "std $mem, $src\t# ptr" %} + ins_encode %{ + pre_write_barrier(masm, this, + $mem$$Register, + $tmp1$$Register, + $tmp2$$Register, + noreg, + RegSet::of($mem$$Register, $src$$Register) /* preserve */); + __ std($src$$Register, 0, $mem$$Register); + post_write_barrier(masm, this, + $mem$$Register, + $src$$Register /* new_val */, + $tmp1$$Register, + $tmp2$$Register); + %} + ins_pipe(pipe_class_default); +%} + +instruct g1StoreN(indirect mem, iRegNsrc src, iRegPdst tmp1, iRegPdst tmp2, flagsRegCR0 cr0) +%{ + predicate(UseG1GC && n->as_Store()->barrier_data() != 0); + match(Set mem (StoreN mem src)); + effect(TEMP tmp1, TEMP tmp2, KILL cr0); + ins_cost(2 * MEMORY_REF_COST); + format %{ "stw $mem, $src\t# ptr" %} + ins_encode %{ + pre_write_barrier(masm, this, + $mem$$Register, + $tmp1$$Register, + $tmp2$$Register, + noreg, + RegSet::of($mem$$Register, $src$$Register) /* preserve */); + __ stw($src$$Register, 0, $mem$$Register); + post_write_barrier(masm, this, + $mem$$Register, + $src$$Register /* new_val */, + $tmp1$$Register, + $tmp2$$Register, + true /* decode_new_val */); + %} + ins_pipe(pipe_class_default); +%} + +instruct g1EncodePAndStoreN(indirect 
mem, iRegPsrc src, iRegPdst tmp1, iRegPdst tmp2, flagsRegCR0 cr0) +%{ + predicate(UseG1GC && n->as_Store()->barrier_data() != 0); + match(Set mem (StoreN mem (EncodeP src))); + effect(TEMP tmp1, TEMP tmp2, KILL cr0); + ins_cost(2 * MEMORY_REF_COST); + format %{ "encode_heap_oop $src\n\t" + "stw $mem, $src\t# ptr" %} + ins_encode %{ + pre_write_barrier(masm, this, + $mem$$Register, + $tmp1$$Register, + $tmp2$$Register, + noreg, + RegSet::of($mem$$Register, $src$$Register) /* preserve */); + Register encoded_oop = noreg; + if ((barrier_data() & G1C2BarrierPostNotNull) == 0) { + encoded_oop = __ encode_heap_oop($tmp2$$Register, $src$$Register); + } else { + encoded_oop = __ encode_heap_oop_not_null($tmp2$$Register, $src$$Register); + } + __ stw(encoded_oop, 0, $mem$$Register); + post_write_barrier(masm, this, + $mem$$Register, + $src$$Register /* new_val */, + $tmp1$$Register, + $tmp2$$Register); + %} + ins_pipe(pipe_class_default); +%} + +instruct g1CompareAndExchangeP(iRegPdst res, indirect mem, iRegPsrc oldval, iRegPsrc newval, iRegPdst tmp1, iRegPdst tmp2, flagsRegCR0 cr0) +%{ + predicate(UseG1GC && n->as_LoadStore()->barrier_data() != 0 && + (((CompareAndExchangeNode*)n)->order() != MemNode::acquire && ((CompareAndExchangeNode*)n)->order() != MemNode::seqcst)); + match(Set res (CompareAndExchangeP mem (Binary oldval newval))); + effect(TEMP_DEF res, TEMP tmp1, TEMP tmp2, KILL cr0); + format %{ "cmpxchgd $newval, $mem" %} + ins_encode %{ + Label no_update; + __ cmpxchgd(CCR0, $res$$Register, $oldval$$Register, $newval$$Register, $mem$$Register, + MacroAssembler::MemBarNone, MacroAssembler::cmpxchgx_hint_atomic_update(), + noreg, &no_update, true); + // Pass oldval to SATB which is the only value which can get overwritten. + // Can be done after cmpxchg because there's no safepoint here. + pre_write_barrier(masm, this, + noreg, + $oldval$$Register, + $tmp1$$Register, + $tmp2$$Register, + RegSet::of($mem$$Register, $newval$$Register) /* preserve */); + post_write_barrier(masm, this, + $mem$$Register, + $newval$$Register, + $tmp1$$Register, + $tmp2$$Register); + __ bind(no_update); + %} + ins_pipe(pipe_class_default); +%} + +instruct g1CompareAndExchangeP_acq(iRegPdst res, indirect mem, iRegPsrc oldval, iRegPsrc newval, iRegPdst tmp1, iRegPdst tmp2, flagsRegCR0 cr0) +%{ + predicate(UseG1GC && n->as_LoadStore()->barrier_data() != 0 && + (((CompareAndExchangeNode*)n)->order() == MemNode::acquire || ((CompareAndExchangeNode*)n)->order() == MemNode::seqcst)); + match(Set res (CompareAndExchangeP mem (Binary oldval newval))); + effect(TEMP_DEF res, TEMP tmp1, TEMP tmp2, KILL cr0); + format %{ "cmpxchgd acq $newval, $mem" %} + ins_encode %{ + Label no_update; + __ cmpxchgd(CCR0, $res$$Register, $oldval$$Register, $newval$$Register, $mem$$Register, + MacroAssembler::MemBarNone, MacroAssembler::cmpxchgx_hint_atomic_update(), + noreg, &no_update, true); + // Pass oldval to SATB which is the only value which can get overwritten. + // Can be done after cmpxchg because there's no safepoint here. + pre_write_barrier(masm, this, + noreg, + $oldval$$Register, + $tmp1$$Register, + $tmp2$$Register, + RegSet::of($mem$$Register, $newval$$Register) /* preserve */); + post_write_barrier(masm, this, + $mem$$Register, + $newval$$Register, + $tmp1$$Register, + $tmp2$$Register); + __ bind(no_update); + if (support_IRIW_for_not_multiple_copy_atomic_cpu) { + __ isync(); + } else { + // isync would be sufficient in case of CompareAndExchangeAcquire, but we currently don't optimize for that. 
+ __ sync(); + } + %} + ins_pipe(pipe_class_default); +%} + +instruct g1CompareAndExchangeN(iRegNdst res, indirect mem, iRegNsrc oldval, iRegNsrc newval, iRegPdst tmp1, iRegPdst tmp2, flagsRegCR0 cr0) +%{ + predicate(UseG1GC && n->as_LoadStore()->barrier_data() != 0 && + (((CompareAndExchangeNode*)n)->order() != MemNode::acquire && ((CompareAndExchangeNode*)n)->order() != MemNode::seqcst)); + match(Set res (CompareAndExchangeN mem (Binary oldval newval))); + effect(TEMP_DEF res, TEMP tmp1, TEMP tmp2, KILL cr0); + format %{ "cmpxchgw $newval, $mem" %} + ins_encode %{ + Label no_update; + __ cmpxchgw(CCR0, $res$$Register, $oldval$$Register, $newval$$Register, $mem$$Register, + MacroAssembler::MemBarNone, MacroAssembler::cmpxchgx_hint_atomic_update(), + noreg, &no_update, true); + // Pass oldval to SATB which is the only value which can get overwritten. + // Can be done after cmpxchg because there's no safepoint here. + pre_write_barrier(masm, this, + noreg, + $oldval$$Register, + $tmp1$$Register, + $tmp2$$Register, + RegSet::of($mem$$Register, $newval$$Register) /* preserve */); + post_write_barrier(masm, this, + $mem$$Register, + $newval$$Register, + $tmp1$$Register, + $tmp2$$Register, + true /* decode_new_val */); + __ bind(no_update); + %} + ins_pipe(pipe_class_default); +%} + +instruct g1CompareAndExchangeN_acq(iRegNdst res, indirect mem, iRegNsrc oldval, iRegNsrc newval, iRegPdst tmp1, iRegPdst tmp2, flagsRegCR0 cr0) +%{ + predicate(UseG1GC && n->as_LoadStore()->barrier_data() != 0 && + (((CompareAndExchangeNode*)n)->order() == MemNode::acquire || ((CompareAndExchangeNode*)n)->order() == MemNode::seqcst)); + match(Set res (CompareAndExchangeN mem (Binary oldval newval))); + effect(TEMP_DEF res, TEMP tmp1, TEMP tmp2, KILL cr0); + format %{ "cmpxchgw acq $newval, $mem" %} + ins_encode %{ + Label no_update; + __ cmpxchgw(CCR0, $res$$Register, $oldval$$Register, $newval$$Register, $mem$$Register, + MacroAssembler::MemBarNone, MacroAssembler::cmpxchgx_hint_atomic_update(), + noreg, &no_update, true); + // Pass oldval to SATB which is the only value which can get overwritten. + // Can be done after cmpxchg because there's no safepoint here. + pre_write_barrier(masm, this, + noreg, + $oldval$$Register, + $tmp1$$Register, + $tmp2$$Register, + RegSet::of($mem$$Register, $newval$$Register) /* preserve */); + post_write_barrier(masm, this, + $mem$$Register, + $newval$$Register, + $tmp1$$Register, + $tmp2$$Register, + true /* decode_new_val */); + __ bind(no_update); + if (support_IRIW_for_not_multiple_copy_atomic_cpu) { + __ isync(); + } else { + // isync would be sufficient in case of CompareAndExchangeAcquire, but we currently don't optimize for that. 
+ __ sync(); + } + %} + ins_pipe(pipe_class_default); +%} + +instruct g1CompareAndSwapP(iRegIdst res, indirect mem, iRegPsrc oldval, iRegPsrc newval, iRegPdst tmp, flagsRegCR0 cr0) +%{ + predicate(UseG1GC && n->as_LoadStore()->barrier_data() != 0 && + (((CompareAndSwapNode*)n)->order() != MemNode::acquire && ((CompareAndSwapNode*)n)->order() != MemNode::seqcst)); + match(Set res (CompareAndSwapP mem (Binary oldval newval))); + effect(TEMP_DEF res, TEMP tmp, KILL cr0); + format %{ "CMPXCHGD $res, $mem, $oldval, $newval; as bool; ptr" %} + ins_encode %{ + Label no_update; + __ li($res$$Register, 0); + __ cmpxchgd(CCR0, R0, $oldval$$Register, $newval$$Register, $mem$$Register, + MacroAssembler::MemBarNone, MacroAssembler::cmpxchgx_hint_atomic_update(), + noreg, &no_update, true); + // Pass oldval to SATB which is the only value which can get overwritten. + // Can be done after cmpxchg because there's no safepoint here. + pre_write_barrier(masm, this, + noreg, + $oldval$$Register /* pre_val */, + $tmp$$Register, + $res$$Register /* temp */, + RegSet::of($mem$$Register, $newval$$Register) /* preserve */, + RegSet::of($res$$Register) /* no_preserve */); + post_write_barrier(masm, this, + $mem$$Register, + $newval$$Register, + $tmp$$Register, + $res$$Register /* temp */); + __ li($res$$Register, 1); + __ bind(no_update); + %} + ins_pipe(pipe_class_default); +%} + +instruct g1CompareAndSwapP_acq(iRegIdst res, indirect mem, iRegPsrc oldval, iRegPsrc newval, iRegPdst tmp, flagsRegCR0 cr0) +%{ + predicate(UseG1GC && n->as_LoadStore()->barrier_data() != 0 && + (((CompareAndSwapNode*)n)->order() == MemNode::acquire || ((CompareAndSwapNode*)n)->order() == MemNode::seqcst)); + match(Set res (CompareAndSwapP mem (Binary oldval newval))); + effect(TEMP_DEF res, TEMP tmp, KILL cr0); + format %{ "CMPXCHGD acq $res, $mem, $oldval, $newval; as bool; ptr" %} + ins_encode %{ + Label no_update; + __ li($res$$Register, 0); + __ cmpxchgd(CCR0, R0, $oldval$$Register, $newval$$Register, $mem$$Register, + MacroAssembler::MemBarNone, MacroAssembler::cmpxchgx_hint_atomic_update(), + noreg, &no_update, true); + // Pass oldval to SATB which is the only value which can get overwritten. + // Can be done after cmpxchg because there's no safepoint here. + pre_write_barrier(masm, this, + noreg, + $oldval$$Register /* pre_val */, + $tmp$$Register, + $res$$Register /* temp */, + RegSet::of($mem$$Register, $newval$$Register) /* preserve */, + RegSet::of($res$$Register) /* no_preserve */); + post_write_barrier(masm, this, + $mem$$Register, + $newval$$Register, + $tmp$$Register, + $res$$Register /* temp */); + __ li($res$$Register, 1); + __ bind(no_update); + if (support_IRIW_for_not_multiple_copy_atomic_cpu) { + __ isync(); + } else { + // isync would be sufficient in case of CompareAndExchangeAcquire, but we currently don't optimize for that. 
+ __ sync(); + } + %} + ins_pipe(pipe_class_default); +%} + +instruct g1CompareAndSwapN(iRegIdst res, indirect mem, iRegNsrc oldval, iRegNsrc newval, iRegPdst tmp, flagsRegCR0 cr0) +%{ + predicate(UseG1GC && n->as_LoadStore()->barrier_data() != 0 && + (((CompareAndSwapNode*)n)->order() != MemNode::acquire && ((CompareAndSwapNode*)n)->order() != MemNode::seqcst)); + match(Set res (CompareAndSwapN mem (Binary oldval newval))); + effect(TEMP_DEF res, TEMP tmp, KILL cr0); + format %{ "CMPXCHGW $res, $mem, $oldval, $newval; as bool; ptr" %} + ins_encode %{ + Label no_update; + __ li($res$$Register, 0); + __ cmpxchgw(CCR0, R0, $oldval$$Register, $newval$$Register, $mem$$Register, + MacroAssembler::MemBarNone, MacroAssembler::cmpxchgx_hint_atomic_update(), + noreg, &no_update, true); + // Pass oldval to SATB which is the only value which can get overwritten. + // Can be done after cmpxchg because there's no safepoint here. + pre_write_barrier(masm, this, + noreg, + $oldval$$Register /* pre_val */, + $tmp$$Register, + $res$$Register /* temp */, + RegSet::of($mem$$Register, $newval$$Register) /* preserve */, + RegSet::of($res$$Register) /* no_preserve */); + post_write_barrier(masm, this, + $mem$$Register, + $newval$$Register, + $tmp$$Register, + $res$$Register /* temp */, + true /* decode_new_val */); + __ li($res$$Register, 1); + __ bind(no_update); + %} + ins_pipe(pipe_class_default); +%} + +instruct g1CompareAndSwapN_acq(iRegIdst res, indirect mem, iRegNsrc oldval, iRegNsrc newval, iRegPdst tmp, flagsRegCR0 cr0) +%{ + predicate(UseG1GC && n->as_LoadStore()->barrier_data() != 0 && + (((CompareAndSwapNode*)n)->order() == MemNode::acquire || ((CompareAndSwapNode*)n)->order() == MemNode::seqcst)); + match(Set res (CompareAndSwapN mem (Binary oldval newval))); + effect(TEMP_DEF res, TEMP tmp, KILL cr0); + format %{ "CMPXCHGW acq $res, $mem, $oldval, $newval; as bool; ptr" %} + ins_encode %{ + Label no_update; + __ li($res$$Register, 0); + __ cmpxchgw(CCR0, R0, $oldval$$Register, $newval$$Register, $mem$$Register, + MacroAssembler::MemBarNone, MacroAssembler::cmpxchgx_hint_atomic_update(), + noreg, &no_update, true); + // Pass oldval to SATB which is the only value which can get overwritten. + // Can be done after cmpxchg because there's no safepoint here. + pre_write_barrier(masm, this, + noreg, + $oldval$$Register /* pre_val */, + $tmp$$Register, + $res$$Register /* temp */, + RegSet::of($mem$$Register, $newval$$Register) /* preserve */, + RegSet::of($res$$Register) /* no_preserve */); + post_write_barrier(masm, this, + $mem$$Register, + $newval$$Register, + $tmp$$Register, + $res$$Register /* temp */, + true /* decode_new_val */); + __ li($res$$Register, 1); + __ bind(no_update); + if (support_IRIW_for_not_multiple_copy_atomic_cpu) { + __ isync(); + } else { + // isync would be sufficient in case of CompareAndExchangeAcquire, but we currently don't optimize for that. 
+ __ sync(); + } + %} + ins_pipe(pipe_class_default); +%} + +instruct weakG1CompareAndSwapP(iRegIdst res, indirect mem, iRegPsrc oldval, iRegPsrc newval, iRegPdst tmp, flagsRegCR0 cr0) +%{ + predicate(UseG1GC && n->as_LoadStore()->barrier_data() != 0 && + (((CompareAndSwapNode*)n)->order() != MemNode::acquire && ((CompareAndSwapNode*)n)->order() != MemNode::seqcst)); + match(Set res (WeakCompareAndSwapP mem (Binary oldval newval))); + effect(TEMP_DEF res, TEMP tmp, KILL cr0); + format %{ "weak CMPXCHGD $res, $mem, $oldval, $newval; as bool; ptr" %} + ins_encode %{ + Label no_update; + __ li($res$$Register, 0); + __ cmpxchgd(CCR0, R0, $oldval$$Register, $newval$$Register, $mem$$Register, + MacroAssembler::MemBarNone, MacroAssembler::cmpxchgx_hint_atomic_update(), + noreg, &no_update, true, true); + // Pass oldval to SATB which is the only value which can get overwritten. + // Can be done after cmpxchg because there's no safepoint here. + pre_write_barrier(masm, this, + noreg, + $oldval$$Register /* pre_val */, + $tmp$$Register, + $res$$Register /* temp */, + RegSet::of($mem$$Register, $newval$$Register) /* preserve */, + RegSet::of($res$$Register) /* no_preserve */); + post_write_barrier(masm, this, + $mem$$Register, + $newval$$Register, + $tmp$$Register, + $res$$Register /* temp */); + __ li($res$$Register, 1); + __ bind(no_update); + %} + ins_pipe(pipe_class_default); +%} + +instruct weakG1CompareAndSwapP_acq(iRegIdst res, indirect mem, iRegPsrc oldval, iRegPsrc newval, iRegPdst tmp, flagsRegCR0 cr0) +%{ + predicate(UseG1GC && n->as_LoadStore()->barrier_data() != 0 && + (((CompareAndSwapNode*)n)->order() == MemNode::acquire || ((CompareAndSwapNode*)n)->order() == MemNode::seqcst)); + match(Set res (WeakCompareAndSwapP mem (Binary oldval newval))); + effect(TEMP_DEF res, TEMP tmp, KILL cr0); + format %{ "weak CMPXCHGD acq $res, $mem, $oldval, $newval; as bool; ptr" %} + ins_encode %{ + Label no_update; + __ li($res$$Register, 0); + __ cmpxchgd(CCR0, R0, $oldval$$Register, $newval$$Register, $mem$$Register, + MacroAssembler::MemBarNone, MacroAssembler::cmpxchgx_hint_atomic_update(), + noreg, &no_update, true, true); + // Pass oldval to SATB which is the only value which can get overwritten. + // Can be done after cmpxchg because there's no safepoint here. + pre_write_barrier(masm, this, + noreg, + $oldval$$Register /* pre_val */, + $tmp$$Register, + $res$$Register /* temp */, + RegSet::of($mem$$Register, $newval$$Register) /* preserve */, + RegSet::of($res$$Register) /* no_preserve */); + post_write_barrier(masm, this, + $mem$$Register, + $newval$$Register, + $tmp$$Register, + $res$$Register /* temp */); + __ li($res$$Register, 1); + if (support_IRIW_for_not_multiple_copy_atomic_cpu) { + __ isync(); + } else { + // isync would be sufficient in case of CompareAndExchangeAcquire, but we currently don't optimize for that. 
+ __ sync(); + } + __ bind(no_update); // weak version requires no memory barrier on failure + %} + ins_pipe(pipe_class_default); +%} + +instruct weakG1CompareAndSwapN(iRegIdst res, indirect mem, iRegNsrc oldval, iRegNsrc newval, iRegPdst tmp, flagsRegCR0 cr0) +%{ + predicate(UseG1GC && n->as_LoadStore()->barrier_data() != 0 && + (((CompareAndSwapNode*)n)->order() != MemNode::acquire && ((CompareAndSwapNode*)n)->order() != MemNode::seqcst)); + match(Set res (WeakCompareAndSwapN mem (Binary oldval newval))); + effect(TEMP_DEF res, TEMP tmp, KILL cr0); + format %{ "weak CMPXCHGW $res, $mem, $oldval, $newval; as bool; ptr" %} + ins_encode %{ + Label no_update; + __ li($res$$Register, 0); + __ cmpxchgw(CCR0, R0, $oldval$$Register, $newval$$Register, $mem$$Register, + MacroAssembler::MemBarNone, MacroAssembler::cmpxchgx_hint_atomic_update(), + noreg, &no_update, true, true); + // Pass oldval to SATB which is the only value which can get overwritten. + // Can be done after cmpxchg because there's no safepoint here. + pre_write_barrier(masm, this, + noreg, + $oldval$$Register /* pre_val */, + $tmp$$Register, + $res$$Register /* temp */, + RegSet::of($mem$$Register, $newval$$Register) /* preserve */, + RegSet::of($res$$Register) /* no_preserve */); + post_write_barrier(masm, this, + $mem$$Register, + $newval$$Register, + $tmp$$Register, + $res$$Register /* temp */, + true /* decode_new_val */); + __ li($res$$Register, 1); + __ bind(no_update); + %} + ins_pipe(pipe_class_default); +%} + +instruct weakG1CompareAndSwapN_acq(iRegIdst res, indirect mem, iRegNsrc oldval, iRegNsrc newval, iRegPdst tmp, flagsRegCR0 cr0) +%{ + predicate(UseG1GC && n->as_LoadStore()->barrier_data() != 0 && + (((CompareAndSwapNode*)n)->order() == MemNode::acquire || ((CompareAndSwapNode*)n)->order() == MemNode::seqcst)); + match(Set res (WeakCompareAndSwapN mem (Binary oldval newval))); + effect(TEMP_DEF res, TEMP tmp, KILL cr0); + format %{ "weak CMPXCHGW acq $res, $mem, $oldval, $newval; as bool; ptr" %} + ins_encode %{ + Label no_update; + __ li($res$$Register, 0); + __ cmpxchgw(CCR0, R0, $oldval$$Register, $newval$$Register, $mem$$Register, + MacroAssembler::MemBarNone, MacroAssembler::cmpxchgx_hint_atomic_update(), + noreg, &no_update, true, true); + // Pass oldval to SATB which is the only value which can get overwritten. + // Can be done after cmpxchg because there's no safepoint here. + pre_write_barrier(masm, this, + noreg, + $oldval$$Register /* pre_val */, + $tmp$$Register, + $res$$Register /* temp */, + RegSet::of($mem$$Register, $newval$$Register) /* preserve */, + RegSet::of($res$$Register) /* no_preserve */); + post_write_barrier(masm, this, + $mem$$Register, + $newval$$Register, + $tmp$$Register, + $res$$Register /* temp */, + true /* decode_new_val */); + __ li($res$$Register, 1); + if (support_IRIW_for_not_multiple_copy_atomic_cpu) { + __ isync(); + } else { + // isync would be sufficient in case of CompareAndExchangeAcquire, but we currently don't optimize for that. 
+ __ sync(); + } + __ bind(no_update); // weak version requires no memory barrier on failure + %} + ins_pipe(pipe_class_default); +%} + +instruct g1GetAndSetP(iRegPdst res, indirect mem, iRegPsrc newval, iRegPdst tmp1, iRegPdst tmp2, flagsRegCR0 cr0) +%{ + predicate(UseG1GC && n->as_LoadStore()->barrier_data() != 0); + match(Set res (GetAndSetP mem newval)); + effect(TEMP_DEF res, TEMP tmp1, TEMP tmp2, KILL cr0); + format %{ "GetAndSetP $newval, $mem" %} + ins_encode %{ + assert_different_registers($mem$$Register, $newval$$Register); + __ getandsetd($res$$Register, $newval$$Register, $mem$$Register, + MacroAssembler::cmpxchgx_hint_atomic_update()); + // Can be done after cmpxchg because there's no safepoint here. + pre_write_barrier(masm, this, + noreg /* obj */, + $res$$Register /* res */, + $tmp1$$Register, + $tmp2$$Register, + RegSet::of($mem$$Register, $newval$$Register) /* preserve */); + post_write_barrier(masm, this, + $mem$$Register, + $newval$$Register, + $tmp1$$Register, + $tmp2$$Register); + if (support_IRIW_for_not_multiple_copy_atomic_cpu) { + __ isync(); + } else { + __ sync(); + } + %} + ins_pipe(pipe_class_default); +%} + +instruct g1GetAndSetN(iRegNdst res, indirect mem, iRegNsrc newval, iRegPdst tmp1, iRegPdst tmp2, flagsRegCR0 cr0) +%{ + predicate(UseG1GC && n->as_LoadStore()->barrier_data() != 0); + match(Set res (GetAndSetN mem newval)); + effect(TEMP_DEF res, TEMP tmp1, TEMP tmp2, KILL cr0); + format %{ "GetAndSetN $newval, $mem" %} + ins_encode %{ + assert_different_registers($mem$$Register, $newval$$Register); + __ getandsetw($res$$Register, $newval$$Register, $mem$$Register, + MacroAssembler::cmpxchgx_hint_atomic_update()); + // Can be done after cmpxchg because there's no safepoint here. + pre_write_barrier(masm, this, + noreg /* obj */, + $res$$Register /* res */, + $tmp1$$Register, + $tmp2$$Register, + RegSet::of($mem$$Register, $newval$$Register) /* preserve */); + post_write_barrier(masm, this, + $mem$$Register, + $newval$$Register, + $tmp1$$Register, + $tmp2$$Register, + true /* decode_new_val */); + if (support_IRIW_for_not_multiple_copy_atomic_cpu) { + __ isync(); + } else { + __ sync(); + } + %} + ins_pipe(pipe_class_default); +%} + +instruct g1LoadP(iRegPdst dst, memoryAlg4 mem, iRegPdst tmp, flagsRegCR0 cr0) +%{ + predicate(UseG1GC && n->as_Load()->is_unordered() && n->as_Load()->barrier_data() != 0); + // This instruction does not need an acquiring counterpart because it is only + // used for reference loading (Reference::get()). + match(Set dst (LoadP mem)); + effect(TEMP_DEF dst, TEMP tmp, KILL cr0); + ins_cost(2 * MEMORY_REF_COST); + format %{ "ld $dst, $mem\t# ptr" %} + ins_encode %{ + __ ld($dst$$Register, $mem$$disp, $mem$$base$$Register); + pre_write_barrier(masm, this, + noreg /* obj */, + $dst$$Register /* pre_val */, + $tmp$$Register); + %} + ins_pipe(pipe_class_default); +%} + +instruct g1LoadN(iRegNdst dst, memoryAlg4 mem, iRegPdst tmp1, iRegPdst tmp2, flagsRegCR0 cr0) +%{ + predicate(UseG1GC && n->as_Load()->is_unordered() && n->as_Load()->barrier_data() != 0); + // This instruction does not need an acquiring counterpart because it is only + // used for reference loading (Reference::get()). 
+ match(Set dst (LoadN mem)); + effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, KILL cr0); + ins_cost(2 * MEMORY_REF_COST); + format %{ "lwz $dst, $mem\t# ptr" %} + ins_encode %{ + __ lwz($dst$$Register, $mem$$disp, $mem$$base$$Register); + pre_write_barrier(masm, this, + noreg /* obj */, + $dst$$Register, + $tmp1$$Register, + $tmp2$$Register); + %} + ins_pipe(pipe_class_default); +%} diff --git a/src/hotspot/cpu/ppc/ppc.ad b/src/hotspot/cpu/ppc/ppc.ad index ca9abfa3719..d15f9929671 100644 --- a/src/hotspot/cpu/ppc/ppc.ad +++ b/src/hotspot/cpu/ppc/ppc.ad @@ -1000,6 +1000,10 @@ int MachNode::compute_padding(int current_offset) const { // Should the matcher clone input 'm' of node 'n'? bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) { + if (is_encode_and_store_pattern(n, m)) { + mstack.push(m, Visit); + return true; + } return false; } @@ -5407,7 +5411,7 @@ instruct loadRange(iRegIdst dst, memory mem) %{ // Load Compressed Pointer instruct loadN(iRegNdst dst, memory mem) %{ match(Set dst (LoadN mem)); - predicate(n->as_Load()->is_unordered() || followed_by_acquire(n)); + predicate((n->as_Load()->is_unordered() || followed_by_acquire(n)) && n->as_Load()->barrier_data() == 0); ins_cost(MEMORY_REF_COST); format %{ "LWZ $dst, $mem \t// load compressed ptr" %} @@ -5419,6 +5423,7 @@ instruct loadN(iRegNdst dst, memory mem) %{ // Load Compressed Pointer acquire. instruct loadN_ac(iRegNdst dst, memory mem) %{ match(Set dst (LoadN mem)); + predicate(n->as_Load()->barrier_data() == 0); ins_cost(3*MEMORY_REF_COST); format %{ "LWZ $dst, $mem \t// load acquire compressed ptr\n\t" @@ -5432,7 +5437,7 @@ instruct loadN_ac(iRegNdst dst, memory mem) %{ // Load Compressed Pointer and decode it if narrow_oop_shift == 0. instruct loadN2P_unscaled(iRegPdst dst, memory mem) %{ match(Set dst (DecodeN (LoadN mem))); - predicate(_kids[0]->_leaf->as_Load()->is_unordered() && CompressedOops::shift() == 0); + predicate(_kids[0]->_leaf->as_Load()->is_unordered() && CompressedOops::shift() == 0 && _kids[0]->_leaf->as_Load()->barrier_data() == 0); ins_cost(MEMORY_REF_COST); format %{ "LWZ $dst, $mem \t// DecodeN (unscaled)" %} @@ -6423,6 +6428,7 @@ instruct reinterpretX(vecX dst) %{ // Store Compressed Oop instruct storeN(memory dst, iRegN_P2N src) %{ match(Set dst (StoreN dst src)); + predicate(n->as_Store()->barrier_data() == 0); ins_cost(MEMORY_REF_COST); format %{ "STW $src, $dst \t// compressed oop" %} @@ -7477,6 +7483,7 @@ instruct compareAndSwapI_regP_regI_regI(iRegIdst res, iRegPdst mem_ptr, iRegIsrc instruct compareAndSwapN_regP_regN_regN(iRegIdst res, iRegPdst mem_ptr, iRegNsrc src1, iRegNsrc src2, flagsRegCR0 cr0) %{ match(Set res (CompareAndSwapN mem_ptr (Binary src1 src2))); + predicate(n->as_LoadStore()->barrier_data() == 0); effect(TEMP_DEF res, TEMP cr0); // TEMP_DEF to avoid jump format %{ "CMPXCHGW $res, $mem_ptr, $src1, $src2; as bool" %} ins_encode %{ @@ -7676,7 +7683,7 @@ instruct weakCompareAndSwapI_acq_regP_regI_regI(iRegIdst res, iRegPdst mem_ptr, instruct weakCompareAndSwapN_regP_regN_regN(iRegIdst res, iRegPdst mem_ptr, iRegNsrc src1, iRegNsrc src2, flagsRegCR0 cr0) %{ match(Set res (WeakCompareAndSwapN mem_ptr (Binary src1 src2))); - predicate(((CompareAndSwapNode*)n)->order() != MemNode::acquire && ((CompareAndSwapNode*)n)->order() != MemNode::seqcst); + predicate(((CompareAndSwapNode*)n)->order() != MemNode::acquire && ((CompareAndSwapNode*)n)->order() != MemNode::seqcst && n->as_LoadStore()->barrier_data() == 0); effect(TEMP_DEF res, TEMP cr0); // TEMP_DEF to avoid jump 
format %{ "weak CMPXCHGW $res, $mem_ptr, $src1, $src2; as bool" %} ins_encode %{ @@ -7690,7 +7697,7 @@ instruct weakCompareAndSwapN_regP_regN_regN(iRegIdst res, iRegPdst mem_ptr, iReg instruct weakCompareAndSwapN_acq_regP_regN_regN(iRegIdst res, iRegPdst mem_ptr, iRegNsrc src1, iRegNsrc src2, flagsRegCR0 cr0) %{ match(Set res (WeakCompareAndSwapN mem_ptr (Binary src1 src2))); - predicate(((CompareAndSwapNode*)n)->order() == MemNode::acquire || ((CompareAndSwapNode*)n)->order() == MemNode::seqcst); + predicate((((CompareAndSwapNode*)n)->order() == MemNode::acquire || ((CompareAndSwapNode*)n)->order() == MemNode::seqcst) && n->as_LoadStore()->barrier_data() == 0); effect(TEMP_DEF res, TEMP cr0); // TEMP_DEF to avoid jump format %{ "weak CMPXCHGW acq $res, $mem_ptr, $src1, $src2; as bool" %} ins_encode %{ @@ -7939,7 +7946,7 @@ instruct compareAndExchangeI_acq_regP_regI_regI(iRegIdst res, iRegPdst mem_ptr, instruct compareAndExchangeN_regP_regN_regN(iRegNdst res, iRegPdst mem_ptr, iRegNsrc src1, iRegNsrc src2, flagsRegCR0 cr0) %{ match(Set res (CompareAndExchangeN mem_ptr (Binary src1 src2))); - predicate(((CompareAndSwapNode*)n)->order() != MemNode::acquire && ((CompareAndSwapNode*)n)->order() != MemNode::seqcst); + predicate(((CompareAndSwapNode*)n)->order() != MemNode::acquire && ((CompareAndSwapNode*)n)->order() != MemNode::seqcst && n->as_LoadStore()->barrier_data() == 0); effect(TEMP_DEF res, TEMP cr0); format %{ "CMPXCHGW $res, $mem_ptr, $src1, $src2; as narrow oop" %} ins_encode %{ @@ -7953,7 +7960,7 @@ instruct compareAndExchangeN_regP_regN_regN(iRegNdst res, iRegPdst mem_ptr, iReg instruct compareAndExchangeN_acq_regP_regN_regN(iRegNdst res, iRegPdst mem_ptr, iRegNsrc src1, iRegNsrc src2, flagsRegCR0 cr0) %{ match(Set res (CompareAndExchangeN mem_ptr (Binary src1 src2))); - predicate(((CompareAndSwapNode*)n)->order() == MemNode::acquire || ((CompareAndSwapNode*)n)->order() == MemNode::seqcst); + predicate((((CompareAndSwapNode*)n)->order() == MemNode::acquire || ((CompareAndSwapNode*)n)->order() == MemNode::seqcst) && n->as_LoadStore()->barrier_data() == 0); effect(TEMP_DEF res, TEMP cr0); format %{ "CMPXCHGW acq $res, $mem_ptr, $src1, $src2; as narrow oop" %} ins_encode %{ @@ -8262,6 +8269,7 @@ instruct getAndSetP(iRegPdst res, iRegPdst mem_ptr, iRegPsrc src, flagsRegCR0 cr instruct getAndSetN(iRegNdst res, iRegPdst mem_ptr, iRegNsrc src, flagsRegCR0 cr0) %{ match(Set res (GetAndSetN mem_ptr src)); + predicate(n->as_LoadStore()->barrier_data() == 0); effect(TEMP_DEF res, TEMP cr0); format %{ "GetAndSetN $res, $mem_ptr, $src" %} ins_encode %{ diff --git a/src/hotspot/cpu/ppc/register_ppc.hpp b/src/hotspot/cpu/ppc/register_ppc.hpp index 302d49884fa..b7ba4f053b5 100644 --- a/src/hotspot/cpu/ppc/register_ppc.hpp +++ b/src/hotspot/cpu/ppc/register_ppc.hpp @@ -27,6 +27,7 @@ #define CPU_PPC_REGISTER_PPC_HPP #include "asm/register.hpp" +#include "utilities/count_trailing_zeros.hpp" // forward declaration class VMRegImpl; @@ -555,4 +556,12 @@ constexpr Register R29_TOC = R29; constexpr Register R11_scratch1 = R11; constexpr Register R12_scratch2 = R12; +template <> +inline Register AbstractRegSet::first() { + if (_bitset == 0) { return noreg; } + return as_Register(count_trailing_zeros(_bitset)); +} + +typedef AbstractRegSet RegSet; + #endif // CPU_PPC_REGISTER_PPC_HPP diff --git a/src/hotspot/cpu/riscv/gc/g1/g1BarrierSetAssembler_riscv.cpp b/src/hotspot/cpu/riscv/gc/g1/g1BarrierSetAssembler_riscv.cpp index 062f8029062..7036c44d99d 100644 --- 
a/src/hotspot/cpu/riscv/gc/g1/g1BarrierSetAssembler_riscv.cpp +++ b/src/hotspot/cpu/riscv/gc/g1/g1BarrierSetAssembler_riscv.cpp @@ -1,6 +1,6 @@ /* * Copyright (c) 2018, 2023, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2020, 2023, Huawei Technologies Co., Ltd. All rights reserved. + * Copyright (c) 2020, 2024, Huawei Technologies Co., Ltd. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -39,7 +39,10 @@ #include "c1/c1_LIRAssembler.hpp" #include "c1/c1_MacroAssembler.hpp" #include "gc/g1/c1/g1BarrierSetC1.hpp" -#endif +#endif // COMPILER1 +#ifdef COMPILER2 +#include "gc/g1/c2/g1BarrierSetC2.hpp" +#endif // COMPILER2 #define __ masm-> @@ -96,6 +99,55 @@ void G1BarrierSetAssembler::gen_write_ref_array_post_barrier(MacroAssembler* mas __ pop_reg(saved_regs, sp); } +static void generate_queue_test_and_insertion(MacroAssembler* masm, ByteSize index_offset, ByteSize buffer_offset, Label& runtime, + const Register thread, const Register value, const Register tmp1, const Register tmp2) { + // Can we store a value in the given thread's buffer? + // (The index field is typed as size_t.) + __ ld(tmp1, Address(thread, in_bytes(index_offset))); // tmp1 := *(index address) + __ beqz(tmp1, runtime); // jump to runtime if index == 0 (full buffer) + // The buffer is not full, store value into it. + __ sub(tmp1, tmp1, wordSize); // tmp1 := next index + __ sd(tmp1, Address(thread, in_bytes(index_offset))); // *(index address) := next index + __ ld(tmp2, Address(thread, in_bytes(buffer_offset))); // tmp2 := buffer address + __ add(tmp2, tmp2, tmp1); + __ sd(value, Address(tmp2)); // *(buffer address + next index) := value +} + +static void generate_pre_barrier_fast_path(MacroAssembler* masm, + const Register thread, + const Register tmp1) { + Address in_progress(thread, in_bytes(G1ThreadLocalData::satb_mark_queue_active_offset())); + // Is marking active? + if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) { + __ lwu(tmp1, in_progress); + } else { + assert(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption"); + __ lbu(tmp1, in_progress); + } +} + +static void generate_pre_barrier_slow_path(MacroAssembler* masm, + const Register obj, + const Register pre_val, + const Register thread, + const Register tmp1, + const Register tmp2, + Label& done, + Label& runtime) { + // Do we need to load the previous value? + if (obj != noreg) { + __ load_heap_oop(pre_val, Address(obj, 0), noreg, noreg, AS_RAW); + } + // Is the previous value null? + __ beqz(pre_val, done, true); + generate_queue_test_and_insertion(masm, + G1ThreadLocalData::satb_mark_queue_index_offset(), + G1ThreadLocalData::satb_mark_queue_buffer_offset(), + runtime, + thread, pre_val, tmp1, tmp2); + __ j(done); +} + void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm, Register obj, Register pre_val, @@ -116,43 +168,10 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm, assert_different_registers(obj, pre_val, tmp1, tmp2); assert(pre_val != noreg && tmp1 != noreg && tmp2 != noreg, "expecting a register"); - Address in_progress(thread, in_bytes(G1ThreadLocalData::satb_mark_queue_active_offset())); - Address index(thread, in_bytes(G1ThreadLocalData::satb_mark_queue_index_offset())); - Address buffer(thread, in_bytes(G1ThreadLocalData::satb_mark_queue_buffer_offset())); - - // Is marking active? 
- if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) { // 4-byte width - __ lwu(tmp1, in_progress); - } else { - assert(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption"); - __ lbu(tmp1, in_progress); - } + generate_pre_barrier_fast_path(masm, thread, tmp1); + // If marking is not active (*(mark queue active address) == 0), jump to done __ beqz(tmp1, done); - - // Do we need to load the previous value? - if (obj != noreg) { - __ load_heap_oop(pre_val, Address(obj, 0), noreg, noreg, AS_RAW); - } - - // Is the previous value null? - __ beqz(pre_val, done); - - // Can we store original value in the thread's buffer? - // Is index == 0? - // (The index field is typed as size_t.) - - __ ld(tmp1, index); // tmp := *index_adr - __ beqz(tmp1, runtime); // tmp == 0? - // If yes, goto runtime - - __ sub(tmp1, tmp1, wordSize); // tmp := tmp - wordSize - __ sd(tmp1, index); // *index_adr := tmp - __ ld(tmp2, buffer); - __ add(tmp1, tmp1, tmp2); // tmp := tmp + *buffer_adr - - // Record the previous value - __ sd(pre_val, Address(tmp1, 0)); - __ j(done); + generate_pre_barrier_slow_path(masm, obj, pre_val, thread, tmp1, tmp2, done, runtime); __ bind(runtime); @@ -171,6 +190,49 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm, } +static void generate_post_barrier_fast_path(MacroAssembler* masm, + const Register store_addr, + const Register new_val, + const Register tmp1, + const Register tmp2, + Label& done, + bool new_val_may_be_null) { + // Does store cross heap regions? + __ xorr(tmp1, store_addr, new_val); // tmp1 := store address ^ new value + __ srli(tmp1, tmp1, G1HeapRegion::LogOfHRGrainBytes); // tmp1 := ((store address ^ new value) >> LogOfHRGrainBytes) + __ beqz(tmp1, done); + // Crosses regions, storing null? + if (new_val_may_be_null) { + __ beqz(new_val, done); + } + // Storing region crossing non-null, is card young? + __ srli(tmp1, store_addr, CardTable::card_shift()); // tmp1 := card address relative to card table base + __ load_byte_map_base(tmp2); // tmp2 := card table base address + __ add(tmp1, tmp1, tmp2); // tmp1 := card address + __ lbu(tmp2, Address(tmp1)); // tmp2 := card +} + +static void generate_post_barrier_slow_path(MacroAssembler* masm, + const Register thread, + const Register tmp1, + const Register tmp2, + Label& done, + Label& runtime) { + __ membar(MacroAssembler::StoreLoad); // StoreLoad membar + __ lbu(tmp2, Address(tmp1)); // tmp2 := card + __ beqz(tmp2, done, true); + // Storing a region crossing, non-null oop, card is clean. + // Dirty card and log. 
+ STATIC_ASSERT(CardTable::dirty_card_val() == 0); + __ sb(zr, Address(tmp1)); // *(card address) := dirty_card_val + generate_queue_test_and_insertion(masm, + G1ThreadLocalData::dirty_card_queue_index_offset(), + G1ThreadLocalData::dirty_card_queue_buffer_offset(), + runtime, + thread, tmp1, tmp2, t0); + __ j(done); +} + void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, Register store_addr, Register new_val, @@ -179,73 +241,119 @@ void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, Register tmp2) { assert(thread == xthread, "must be"); assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, t0); - assert(store_addr != noreg && new_val != noreg && tmp1 != noreg && - tmp2 != noreg, "expecting a register"); - - Address queue_index(thread, in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset())); - Address buffer(thread, in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset())); - - BarrierSet* bs = BarrierSet::barrier_set(); - CardTableBarrierSet* ctbs = barrier_set_cast(bs); + assert(store_addr != noreg && new_val != noreg && tmp1 != noreg && tmp2 != noreg, + "expecting a register"); Label done; Label runtime; - // Does store cross heap regions? - - __ xorr(tmp1, store_addr, new_val); - __ srli(tmp1, tmp1, G1HeapRegion::LogOfHRGrainBytes); - __ beqz(tmp1, done); - - // crosses regions, storing null? - - __ beqz(new_val, done); - - // storing region crossing non-null, is card already dirty? - - const Register card_addr = tmp1; - - __ srli(card_addr, store_addr, CardTable::card_shift()); - - // get the address of the card - __ load_byte_map_base(tmp2); - __ add(card_addr, card_addr, tmp2); - __ lbu(tmp2, Address(card_addr)); + generate_post_barrier_fast_path(masm, store_addr, new_val, tmp1, tmp2, done, true /* new_val_may_be_null */); + // If card is young, jump to done (tmp2 holds the card value) __ mv(t0, (int)G1CardTable::g1_young_card_val()); - __ beq(tmp2, t0, done); - - assert((int)CardTable::dirty_card_val() == 0, "must be 0"); - - __ membar(MacroAssembler::StoreLoad); - - __ lbu(tmp2, Address(card_addr)); - __ beqz(tmp2, done); - - // storing a region crossing, non-null oop, card is clean. - // dirty card and log. - - __ sb(zr, Address(card_addr)); - - __ ld(t0, queue_index); - __ beqz(t0, runtime); - __ sub(t0, t0, wordSize); - __ sd(t0, queue_index); - - __ ld(tmp2, buffer); - __ add(t0, tmp2, t0); - __ sd(card_addr, Address(t0, 0)); - __ j(done); + __ beq(tmp2, t0, done); // card == young_card_val? 
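+  // Card is not young: take the slow path, which re-reads the card after a
+  // StoreLoad membar and, unless it is already dirty, dirties it and enqueues
+  // the card address (jumping to runtime when the queue buffer is full).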
+ generate_post_barrier_slow_path(masm, thread, tmp1, tmp2, done, runtime); __ bind(runtime); // save the live input values RegSet saved = RegSet::of(store_addr); __ push_reg(saved, sp); - __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), card_addr, thread); + __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), tmp1, thread); __ pop_reg(saved, sp); __ bind(done); } +#if defined(COMPILER2) + +static void generate_c2_barrier_runtime_call(MacroAssembler* masm, G1BarrierStubC2* stub, const Register arg, const address runtime_path) { + SaveLiveRegisters save_registers(masm, stub); + if (c_rarg0 != arg) { + __ mv(c_rarg0, arg); + } + __ mv(c_rarg1, xthread); + __ mv(t0, runtime_path); + __ jalr(t0); +} + +void G1BarrierSetAssembler::g1_write_barrier_pre_c2(MacroAssembler* masm, + Register obj, + Register pre_val, + Register thread, + Register tmp1, + Register tmp2, + G1PreBarrierStubC2* stub) { + assert(thread == xthread, "must be"); + assert_different_registers(obj, pre_val, tmp1, tmp2); + assert(pre_val != noreg && tmp1 != noreg && tmp2 != noreg, "expecting a register"); + + stub->initialize_registers(obj, pre_val, thread, tmp1, tmp2); + + generate_pre_barrier_fast_path(masm, thread, tmp1); + // If marking is active (*(mark queue active address) != 0), jump to stub (slow path) + __ bnez(tmp1, *stub->entry(), true); + + __ bind(*stub->continuation()); +} + +void G1BarrierSetAssembler::generate_c2_pre_barrier_stub(MacroAssembler* masm, + G1PreBarrierStubC2* stub) const { + Assembler::InlineSkippedInstructionsCounter skip_counter(masm); + Label runtime; + Register obj = stub->obj(); + Register pre_val = stub->pre_val(); + Register thread = stub->thread(); + Register tmp1 = stub->tmp1(); + Register tmp2 = stub->tmp2(); + + __ bind(*stub->entry()); + generate_pre_barrier_slow_path(masm, obj, pre_val, thread, tmp1, tmp2, *stub->continuation(), runtime); + + __ bind(runtime); + generate_c2_barrier_runtime_call(masm, stub, pre_val, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_pre_entry)); + __ j(*stub->continuation()); +} + +void G1BarrierSetAssembler::g1_write_barrier_post_c2(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register thread, + Register tmp1, + Register tmp2, + G1PostBarrierStubC2* stub) { + assert(thread == xthread, "must be"); + assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, t0); + assert(store_addr != noreg && new_val != noreg && tmp1 != noreg && tmp2 != noreg, + "expecting a register"); + + stub->initialize_registers(thread, tmp1, tmp2); + + bool new_val_may_be_null = (stub->barrier_data() & G1C2BarrierPostNotNull) == 0; + generate_post_barrier_fast_path(masm, store_addr, new_val, tmp1, tmp2, *stub->continuation(), new_val_may_be_null); + // If card is not young, jump to stub (slow path) (tmp2 holds the card value) + __ mv(t0, (int)G1CardTable::g1_young_card_val()); + __ bne(tmp2, t0, *stub->entry(), true); + + __ bind(*stub->continuation()); +} + +void G1BarrierSetAssembler::generate_c2_post_barrier_stub(MacroAssembler* masm, + G1PostBarrierStubC2* stub) const { + Assembler::InlineSkippedInstructionsCounter skip_counter(masm); + Label runtime; + Register thread = stub->thread(); + Register tmp1 = stub->tmp1(); // tmp1 holds the card address. 
+ Register tmp2 = stub->tmp2(); + + __ bind(*stub->entry()); + generate_post_barrier_slow_path(masm, thread, tmp1, tmp2, *stub->continuation(), runtime); + + __ bind(runtime); + generate_c2_barrier_runtime_call(masm, stub, tmp1, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry)); + __ j(*stub->continuation()); +} + +#endif // COMPILER2 + void G1BarrierSetAssembler::load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, Register dst, Address src, Register tmp1, Register tmp2) { bool on_oop = is_reference_type(type); diff --git a/src/hotspot/cpu/riscv/gc/g1/g1BarrierSetAssembler_riscv.hpp b/src/hotspot/cpu/riscv/gc/g1/g1BarrierSetAssembler_riscv.hpp index 96568994079..c7bee2ef6f3 100644 --- a/src/hotspot/cpu/riscv/gc/g1/g1BarrierSetAssembler_riscv.hpp +++ b/src/hotspot/cpu/riscv/gc/g1/g1BarrierSetAssembler_riscv.hpp @@ -1,6 +1,6 @@ /* * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2020, 2021, Huawei Technologies Co., Ltd. All rights reserved. + * Copyright (c) 2020, 2024, Huawei Technologies Co., Ltd. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -36,6 +36,8 @@ class LIR_Assembler; class StubAssembler; class G1PreBarrierStub; class G1PostBarrierStub; +class G1PreBarrierStubC2; +class G1PostBarrierStubC2; class G1BarrierSetAssembler: public ModRefBarrierSetAssembler { protected: @@ -72,6 +74,27 @@ public: void generate_c1_post_barrier_runtime_stub(StubAssembler* sasm); #endif +#ifdef COMPILER2 + void g1_write_barrier_pre_c2(MacroAssembler* masm, + Register obj, + Register pre_val, + Register thread, + Register tmp1, + Register tmp2, + G1PreBarrierStubC2* c2_stub); + void generate_c2_pre_barrier_stub(MacroAssembler* masm, + G1PreBarrierStubC2* stub) const; + void g1_write_barrier_post_c2(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register thread, + Register tmp1, + Register tmp2, + G1PostBarrierStubC2* c2_stub); + void generate_c2_post_barrier_stub(MacroAssembler* masm, + G1PostBarrierStubC2* stub) const; +#endif + void load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, Register dst, Address src, Register tmp1, Register tmp2); }; diff --git a/src/hotspot/cpu/riscv/gc/g1/g1_riscv.ad b/src/hotspot/cpu/riscv/gc/g1/g1_riscv.ad new file mode 100644 index 00000000000..1dc5834dbdc --- /dev/null +++ b/src/hotspot/cpu/riscv/gc/g1/g1_riscv.ad @@ -0,0 +1,564 @@ +// +// Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2024, Huawei Technologies Co., Ltd. All rights reserved. +// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +// +// This code is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License version 2 only, as +// published by the Free Software Foundation. +// +// This code is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// version 2 for more details (a copy is included in the LICENSE file that +// accompanied this code). +// +// You should have received a copy of the GNU General Public License version +// 2 along with this work; if not, write to the Free Software Foundation, +// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+// +// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA +// or visit www.oracle.com if you need additional information or have any +// questions. +// + +source_hpp %{ + +#include "gc/g1/c2/g1BarrierSetC2.hpp" +#include "gc/shared/gc_globals.hpp" + +%} + +source %{ + +#include "gc/g1/g1BarrierSetAssembler_riscv.hpp" +#include "gc/g1/g1BarrierSetRuntime.hpp" + +static void write_barrier_pre(MacroAssembler* masm, + const MachNode* node, + Register obj, + Register pre_val, + Register tmp1, + Register tmp2, + RegSet preserve = RegSet(), + RegSet no_preserve = RegSet()) { + if (!G1PreBarrierStubC2::needs_barrier(node)) { + return; + } + Assembler::InlineSkippedInstructionsCounter skip_counter(masm); + G1BarrierSetAssembler* g1_asm = static_cast(BarrierSet::barrier_set()->barrier_set_assembler()); + G1PreBarrierStubC2* const stub = G1PreBarrierStubC2::create(node); + for (RegSetIterator reg = preserve.begin(); *reg != noreg; ++reg) { + stub->preserve(*reg); + } + for (RegSetIterator reg = no_preserve.begin(); *reg != noreg; ++reg) { + stub->dont_preserve(*reg); + } + g1_asm->g1_write_barrier_pre_c2(masm, obj, pre_val, xthread, tmp1, tmp2, stub); +} + +static void write_barrier_post(MacroAssembler* masm, + const MachNode* node, + Register store_addr, + Register new_val, + Register tmp1, + Register tmp2) { + if (!G1PostBarrierStubC2::needs_barrier(node)) { + return; + } + Assembler::InlineSkippedInstructionsCounter skip_counter(masm); + G1BarrierSetAssembler* g1_asm = static_cast(BarrierSet::barrier_set()->barrier_set_assembler()); + G1PostBarrierStubC2* const stub = G1PostBarrierStubC2::create(node); + g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, xthread, tmp1, tmp2, stub); +} + +%} + +instruct g1StoreP(indirect mem, iRegP src, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegPNoSp tmp3) +%{ + predicate(UseG1GC && n->as_Store()->barrier_data() != 0); + match(Set mem (StoreP mem src)); + effect(TEMP tmp1, TEMP tmp2, TEMP tmp3); + ins_cost(STORE_COST); + format %{ "sd $src, $mem\t# ptr" %} + ins_encode %{ + guarantee($mem$$disp == 0, "impossible encoding"); + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */, + RegSet::of($mem$$Register, $src$$Register) /* preserve */); + __ sd($src$$Register, Address($mem$$Register)); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $src$$Register /* new_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + %} + ins_pipe(istore_reg_mem); +%} + +instruct g1StoreN(indirect mem, iRegN src, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegPNoSp tmp3) +%{ + predicate(UseG1GC && n->as_Store()->barrier_data() != 0); + match(Set mem (StoreN mem src)); + effect(TEMP tmp1, TEMP tmp2, TEMP tmp3); + ins_cost(STORE_COST); + format %{ "sw $src, $mem\t# compressed ptr" %} + ins_encode %{ + guarantee($mem$$disp == 0, "impossible encoding"); + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */, + RegSet::of($mem$$Register, $src$$Register) /* preserve */); + __ sw($src$$Register, Address($mem$$Register)); + if ((barrier_data() & G1C2BarrierPost) != 0) { + if ((barrier_data() & G1C2BarrierPostNotNull) == 0) { + __ decode_heap_oop($tmp1$$Register, $src$$Register); + } else { + __ decode_heap_oop_not_null($tmp1$$Register, $src$$Register); + } + } + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $tmp1$$Register /* 
new_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + %} + ins_pipe(istore_reg_mem); +%} + +instruct g1EncodePAndStoreN(indirect mem, iRegP src, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegPNoSp tmp3) +%{ + predicate(UseG1GC && n->as_Store()->barrier_data() != 0); + match(Set mem (StoreN mem (EncodeP src))); + effect(TEMP tmp1, TEMP tmp2, TEMP tmp3); + ins_cost(STORE_COST); + format %{ "encode_heap_oop $tmp1, $src\n\t" + "sw $tmp1, $mem\t# compressed ptr" %} + ins_encode %{ + guarantee($mem$$disp == 0, "impossible encoding"); + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */, + RegSet::of($mem$$Register, $src$$Register) /* preserve */); + if ((barrier_data() & G1C2BarrierPostNotNull) == 0) { + __ encode_heap_oop($tmp1$$Register, $src$$Register); + } else { + __ encode_heap_oop_not_null($tmp1$$Register, $src$$Register); + } + __ sw($tmp1$$Register, Address($mem$$Register)); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $src$$Register /* new_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + %} + ins_pipe(istore_reg_mem); +%} + +instruct g1CompareAndExchangeP(iRegPNoSp res, indirect mem, iRegP oldval, iRegP newval, iRegPNoSp tmp1, iRegPNoSp tmp2) +%{ + predicate(UseG1GC && n->as_LoadStore()->barrier_data() != 0); + match(Set res (CompareAndExchangeP mem (Binary oldval newval))); + effect(TEMP res, TEMP tmp1, TEMP tmp2); + ins_cost(2 * VOLATILE_REF_COST); + format %{ "cmpxchg $res = $mem, $oldval, $newval\t# ptr" %} + ins_encode %{ + guarantee($mem$$disp == 0, "impossible encoding"); + assert_different_registers($oldval$$Register, $mem$$Register); + assert_different_registers($newval$$Register, $mem$$Register); + // Pass $oldval to the pre-barrier (instead of loading from $mem), because + // $oldval is the only value that can be overwritten. + // The same holds for g1CompareAndSwapP and its Acq variant. + write_barrier_pre(masm, this, + noreg /* obj */, + $oldval$$Register /* pre_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */, + RegSet::of($mem$$Register, $oldval$$Register, $newval$$Register) /* preserve */, + RegSet::of($res$$Register) /* no_preserve */); + __ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register, Assembler::int64, + /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, $res$$Register); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $newval$$Register /* new_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */); + %} + ins_pipe(pipe_slow); +%} + +instruct g1CompareAndExchangePAcq(iRegPNoSp res, indirect mem, iRegP oldval, iRegP newval, iRegPNoSp tmp1, iRegPNoSp tmp2) +%{ + predicate(UseG1GC && needs_acquiring_load_reserved(n) && n->as_LoadStore()->barrier_data() != 0); + match(Set res (CompareAndExchangeP mem (Binary oldval newval))); + effect(TEMP res, TEMP tmp1, TEMP tmp2); + ins_cost(VOLATILE_REF_COST); + format %{ "cmpxchg_acq $res = $mem, $oldval, $newval\t# ptr" %} + ins_encode %{ + guarantee($mem$$disp == 0, "impossible encoding"); + assert_different_registers($oldval$$Register, $mem$$Register); + assert_different_registers($newval$$Register, $mem$$Register); + // Pass $oldval to the pre-barrier (instead of loading from $mem), because + // $oldval is the only value that can be overwritten. + // The same holds for g1CompareAndSwapP and its Acq variant. 
+ write_barrier_pre(masm, this, + noreg /* obj */, + $oldval$$Register /* pre_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */, + RegSet::of($mem$$Register, $oldval$$Register, $newval$$Register) /* preserve */, + RegSet::of($res$$Register) /* no_preserve */); + __ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register, Assembler::int64, + /*acquire*/ Assembler::aq, /*release*/ Assembler::rl, $res$$Register); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $newval$$Register /* new_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */); + %} + ins_pipe(pipe_slow); +%} + +instruct g1CompareAndExchangeN(iRegNNoSp res, indirect mem, iRegN oldval, iRegN newval, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegPNoSp tmp3) +%{ + predicate(UseG1GC && n->as_LoadStore()->barrier_data() != 0); + match(Set res (CompareAndExchangeN mem (Binary oldval newval))); + effect(TEMP res, TEMP tmp1, TEMP tmp2, TEMP tmp3); + ins_cost(2 * VOLATILE_REF_COST); + format %{ "cmpxchg $res = $mem, $oldval, $newval\t# narrow oop" %} + ins_encode %{ + guarantee($mem$$disp == 0, "impossible encoding"); + assert_different_registers($oldval$$Register, $mem$$Register); + assert_different_registers($newval$$Register, $mem$$Register); + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */, + RegSet::of($mem$$Register, $oldval$$Register, $newval$$Register) /* preserve */, + RegSet::of($res$$Register) /* no_preserve */); + __ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register, Assembler::uint32, + /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, $res$$Register); + __ decode_heap_oop($tmp1$$Register, $newval$$Register); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $tmp1$$Register /* new_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + %} + ins_pipe(pipe_slow); +%} + +instruct g1CompareAndExchangeNAcq(iRegNNoSp res, indirect mem, iRegN oldval, iRegN newval, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegPNoSp tmp3) +%{ + predicate(UseG1GC && needs_acquiring_load_reserved(n) && n->as_LoadStore()->barrier_data() != 0); + match(Set res (CompareAndExchangeN mem (Binary oldval newval))); + effect(TEMP res, TEMP tmp1, TEMP tmp2, TEMP tmp3); + ins_cost(VOLATILE_REF_COST); + format %{ "cmpxchg_acq $res = $mem, $oldval, $newval\t# narrow oop" %} + ins_encode %{ + guarantee($mem$$disp == 0, "impossible encoding"); + assert_different_registers($oldval$$Register, $mem$$Register); + assert_different_registers($newval$$Register, $mem$$Register); + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */, + RegSet::of($mem$$Register, $oldval$$Register, $newval$$Register) /* preserve */, + RegSet::of($res$$Register) /* no_preserve */); + __ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register, Assembler::uint32, + /*acquire*/ Assembler::aq, /*release*/ Assembler::rl, $res$$Register); + __ decode_heap_oop($tmp1$$Register, $newval$$Register); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $tmp1$$Register /* new_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + %} + ins_pipe(pipe_slow); +%} + +instruct g1CompareAndSwapP(iRegINoSp res, indirect mem, iRegP newval, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegP oldval) +%{ + predicate(UseG1GC && n->as_LoadStore()->barrier_data() != 0); + match(Set res (CompareAndSwapP mem 
(Binary oldval newval))); + match(Set res (WeakCompareAndSwapP mem (Binary oldval newval))); + effect(TEMP res, TEMP tmp1, TEMP tmp2); + ins_cost(2 * VOLATILE_REF_COST); + format %{ "cmpxchg $mem, $oldval, $newval\t# (ptr)\n\t" + "mv $res, $res == $oldval" %} + ins_encode %{ + guarantee($mem$$disp == 0, "impossible encoding"); + assert_different_registers($oldval$$Register, $mem$$Register); + assert_different_registers($newval$$Register, $mem$$Register); + write_barrier_pre(masm, this, + noreg /* obj */, + $oldval$$Register /* pre_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */, + RegSet::of($mem$$Register, $oldval$$Register, $newval$$Register) /* preserve */, + RegSet::of($res$$Register) /* no_preserve */); + __ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register, Assembler::int64, + /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, $res$$Register, + /*result as bool*/ true); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $newval$$Register /* new_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */); + %} + ins_pipe(pipe_slow); +%} + +instruct g1CompareAndSwapPAcq(iRegINoSp res, indirect mem, iRegP newval, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegP oldval) +%{ + predicate(UseG1GC && needs_acquiring_load_reserved(n) && n->as_LoadStore()->barrier_data() != 0); + match(Set res (CompareAndSwapP mem (Binary oldval newval))); + match(Set res (WeakCompareAndSwapP mem (Binary oldval newval))); + effect(TEMP res, TEMP tmp1, TEMP tmp2); + ins_cost(VOLATILE_REF_COST); + format %{ "cmpxchg_acq $mem, $oldval, $newval\t# (ptr)\n\t" + "mv $res, $res == $oldval" %} + ins_encode %{ + guarantee($mem$$disp == 0, "impossible encoding"); + assert_different_registers($oldval$$Register, $mem$$Register); + assert_different_registers($newval$$Register, $mem$$Register); + write_barrier_pre(masm, this, + noreg /* obj */, + $oldval$$Register /* pre_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */, + RegSet::of($mem$$Register, $oldval$$Register, $newval$$Register) /* preserve */, + RegSet::of($res$$Register) /* no_preserve */); + __ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register, Assembler::int64, + /*acquire*/ Assembler::aq, /*release*/ Assembler::rl, $res$$Register, + /*result as bool*/ true); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $newval$$Register /* new_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */); + %} + ins_pipe(pipe_slow); +%} + +instruct g1CompareAndSwapN(iRegINoSp res, indirect mem, iRegN newval, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegPNoSp tmp3, iRegN oldval) +%{ + predicate(UseG1GC && n->as_LoadStore()->barrier_data() != 0); + match(Set res (CompareAndSwapN mem (Binary oldval newval))); + match(Set res (WeakCompareAndSwapN mem (Binary oldval newval))); + effect(TEMP res, TEMP tmp1, TEMP tmp2, TEMP tmp3); + ins_cost(2 * VOLATILE_REF_COST); + format %{ "cmpxchg $mem, $oldval, $newval\t# (narrow oop)\n\t" + "mv $res, $res == $oldval" %} + ins_encode %{ + guarantee($mem$$disp == 0, "impossible encoding"); + assert_different_registers($oldval$$Register, $mem$$Register); + assert_different_registers($newval$$Register, $mem$$Register); + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */, + RegSet::of($mem$$Register, $oldval$$Register, $newval$$Register) /* preserve */, + RegSet::of($res$$Register) /* no_preserve */); + __ cmpxchg($mem$$Register, $oldval$$Register, 
$newval$$Register, Assembler::uint32, + /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, $res$$Register, + /*result as bool*/ true); + __ decode_heap_oop($tmp1$$Register, $newval$$Register); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $tmp1$$Register /* new_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + %} + ins_pipe(pipe_slow); +%} + +instruct g1CompareAndSwapNAcq(iRegINoSp res, indirect mem, iRegN newval, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegPNoSp tmp3, iRegN oldval) +%{ + predicate(UseG1GC && needs_acquiring_load_reserved(n) && n->as_LoadStore()->barrier_data() != 0); + match(Set res (CompareAndSwapN mem (Binary oldval newval))); + match(Set res (WeakCompareAndSwapN mem (Binary oldval newval))); + effect(TEMP res, TEMP tmp1, TEMP tmp2, TEMP tmp3); + ins_cost(VOLATILE_REF_COST); + format %{ "cmpxchg_acq $mem, $oldval, $newval\t# (narrow oop)\n\t" + "mv $res, $res == $oldval" %} + ins_encode %{ + guarantee($mem$$disp == 0, "impossible encoding"); + assert_different_registers($oldval$$Register, $mem$$Register); + assert_different_registers($newval$$Register, $mem$$Register); + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */, + RegSet::of($mem$$Register, $oldval$$Register, $newval$$Register) /* preserve */, + RegSet::of($res$$Register) /* no_preserve */); + __ cmpxchg($mem$$Register, $oldval$$Register, $newval$$Register, Assembler::uint32, + /*acquire*/ Assembler::aq, /*release*/ Assembler::rl, $res$$Register, + /*result as bool*/ true); + __ decode_heap_oop($tmp1$$Register, $newval$$Register); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $tmp1$$Register /* new_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + %} + ins_pipe(pipe_slow); +%} + +instruct g1GetAndSetP(indirect mem, iRegP newval, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegPNoSp preval) +%{ + predicate(UseG1GC && n->as_LoadStore()->barrier_data() != 0); + match(Set preval (GetAndSetP mem newval)); + effect(TEMP preval, TEMP tmp1, TEMP tmp2); + ins_cost(2 * VOLATILE_REF_COST); + format %{ "atomic_xchg $preval, $newval, [$mem]" %} + ins_encode %{ + guarantee($mem$$disp == 0, "impossible encoding"); + assert_different_registers($mem$$Register, $newval$$Register); + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $preval$$Register /* pre_val (as a temporary register) */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */, + RegSet::of($mem$$Register, $preval$$Register, $newval$$Register) /* preserve */); + __ atomic_xchg($preval$$Register, $newval$$Register, $mem$$Register); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $newval$$Register /* new_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */); + %} + ins_pipe(pipe_serial); +%} + +instruct g1GetAndSetPAcq(indirect mem, iRegP newval, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegPNoSp preval) +%{ + predicate(UseG1GC && needs_acquiring_load_reserved(n) && n->as_LoadStore()->barrier_data() != 0); + match(Set preval (GetAndSetP mem newval)); + effect(TEMP preval, TEMP tmp1, TEMP tmp2); + ins_cost(VOLATILE_REF_COST); + format %{ "atomic_xchg_acq $preval, $newval, [$mem]" %} + ins_encode %{ + guarantee($mem$$disp == 0, "impossible encoding"); + assert_different_registers($mem$$Register, $newval$$Register); + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $preval$$Register /* pre_val (as a temporary register) */, + $tmp1$$Register 
/* tmp1 */, + $tmp2$$Register /* tmp2 */, + RegSet::of($mem$$Register, $preval$$Register, $newval$$Register) /* preserve */); + __ atomic_xchgal($preval$$Register, $newval$$Register, $mem$$Register); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $newval$$Register /* new_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */); + %} + ins_pipe(pipe_serial); +%} + +instruct g1GetAndSetN(indirect mem, iRegN newval, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegPNoSp tmp3, iRegNNoSp preval) +%{ + predicate(UseG1GC && n->as_LoadStore()->barrier_data() != 0); + match(Set preval (GetAndSetN mem newval)); + effect(TEMP preval, TEMP tmp1, TEMP tmp2, TEMP tmp3); + ins_cost(2 * VOLATILE_REF_COST); + format %{ "atomic_xchgwu $preval, $newval, [$mem]" %} + ins_encode %{ + guarantee($mem$$disp == 0, "impossible encoding"); + assert_different_registers($mem$$Register, $newval$$Register); + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */, + RegSet::of($mem$$Register, $preval$$Register, $newval$$Register) /* preserve */); + __ atomic_xchgwu($preval$$Register, $newval$$Register, $mem$$Register); + __ decode_heap_oop($tmp1$$Register, $newval$$Register); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $tmp1$$Register /* new_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + %} + ins_pipe(pipe_serial); +%} + +instruct g1GetAndSetNAcq(indirect mem, iRegN newval, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegPNoSp tmp3, iRegNNoSp preval) +%{ + predicate(UseG1GC && needs_acquiring_load_reserved(n) && n->as_LoadStore()->barrier_data() != 0); + match(Set preval (GetAndSetN mem newval)); + effect(TEMP preval, TEMP tmp1, TEMP tmp2, TEMP tmp3); + ins_cost(VOLATILE_REF_COST); + format %{ "atomic_xchgwu_acq $preval, $newval, [$mem]" %} + ins_encode %{ + guarantee($mem$$disp == 0, "impossible encoding"); + assert_different_registers($mem$$Register, $newval$$Register); + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */, + RegSet::of($mem$$Register, $preval$$Register, $newval$$Register) /* preserve */); + __ atomic_xchgalwu($preval$$Register, $newval$$Register, $mem$$Register); + __ decode_heap_oop($tmp1$$Register, $newval$$Register); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $tmp1$$Register /* new_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + %} + ins_pipe(pipe_serial); +%} + +instruct g1LoadP(iRegPNoSp dst, indirect mem, iRegPNoSp tmp1, iRegPNoSp tmp2) +%{ + predicate(UseG1GC && n->as_Load()->barrier_data() != 0); + match(Set dst (LoadP mem)); + effect(TEMP dst, TEMP tmp1, TEMP tmp2); + ins_cost(LOAD_COST + BRANCH_COST); + format %{ "ld $dst, $mem\t# ptr" %} + ins_encode %{ + guarantee($mem$$disp == 0, "impossible encoding"); + __ ld($dst$$Register, Address($mem$$Register)); + write_barrier_pre(masm, this, + noreg /* obj */, + $dst$$Register /* pre_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */); + %} + ins_pipe(iload_reg_mem); +%} + +instruct g1LoadN(iRegNNoSp dst, indirect mem, iRegPNoSp tmp1, iRegPNoSp tmp2, iRegPNoSp tmp3) +%{ + predicate(UseG1GC && n->as_Load()->barrier_data() != 0); + match(Set dst (LoadN mem)); + effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3); + ins_cost(LOAD_COST + BRANCH_COST); + format %{ "lwu $dst, $mem\t# compressed ptr" %} + ins_encode %{ + guarantee($mem$$disp 
== 0, "impossible encoding"); + __ lwu($dst$$Register, Address($mem$$Register)); + if ((barrier_data() & G1C2BarrierPre) != 0) { + __ decode_heap_oop($tmp1$$Register, $dst$$Register); + write_barrier_pre(masm, this, + noreg /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + } + %} + ins_pipe(iload_reg_mem); +%} diff --git a/src/hotspot/cpu/riscv/riscv.ad b/src/hotspot/cpu/riscv/riscv.ad index 05f55fd0da7..563dfd4cde9 100644 --- a/src/hotspot/cpu/riscv/riscv.ad +++ b/src/hotspot/cpu/riscv/riscv.ad @@ -2224,7 +2224,8 @@ bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) { assert_cond(m != nullptr); if (is_vshift_con_pattern(n, m) || // ShiftV src (ShiftCntV con) is_vector_bitwise_not_pattern(n, m) || - is_vector_scalar_bitwise_pattern(n, m)) { + is_vector_scalar_bitwise_pattern(n, m) || + is_encode_and_store_pattern(n, m)) { mstack.push(m, Visit); return true; } @@ -4785,6 +4786,7 @@ instruct loadP(iRegPNoSp dst, memory mem) // Load Compressed Pointer instruct loadN(iRegNNoSp dst, memory mem) %{ + predicate(n->as_Load()->barrier_data() == 0); match(Set dst (LoadN mem)); ins_cost(LOAD_COST); @@ -5220,6 +5222,7 @@ instruct storeimmP0(immP0 zero, memory mem) // Store Compressed Pointer instruct storeN(iRegN src, memory mem) %{ + predicate(n->as_Store()->barrier_data() == 0); match(Set mem (StoreN mem src)); ins_cost(STORE_COST); @@ -5234,6 +5237,7 @@ instruct storeN(iRegN src, memory mem) instruct storeImmN0(immN0 zero, memory mem) %{ + predicate(n->as_Store()->barrier_data() == 0); match(Set mem (StoreN mem zero)); ins_cost(STORE_COST); @@ -5424,6 +5428,7 @@ instruct compareAndSwapP(iRegINoSp res, indirect mem, iRegP oldval, iRegP newval instruct compareAndSwapN(iRegINoSp res, indirect mem, iRegN oldval, iRegN newval) %{ + predicate(n->as_LoadStore()->barrier_data() == 0); match(Set res (CompareAndSwapN mem (Binary oldval newval))); ins_cost(LOAD_COST + STORE_COST + ALU_COST * 8 + BRANCH_COST * 4); @@ -5545,7 +5550,7 @@ instruct compareAndSwapPAcq(iRegINoSp res, indirect mem, iRegP oldval, iRegP new instruct compareAndSwapNAcq(iRegINoSp res, indirect mem, iRegN oldval, iRegN newval) %{ - predicate(needs_acquiring_load_reserved(n)); + predicate(needs_acquiring_load_reserved(n) && n->as_LoadStore()->barrier_data() == 0); match(Set res (CompareAndSwapN mem (Binary oldval newval))); @@ -5653,6 +5658,7 @@ instruct compareAndExchangeL(iRegLNoSp res, indirect mem, iRegL oldval, iRegL ne instruct compareAndExchangeN(iRegNNoSp res, indirect mem, iRegN oldval, iRegN newval) %{ + predicate(n->as_LoadStore()->barrier_data() == 0); match(Set res (CompareAndExchangeN mem (Binary oldval newval))); ins_cost(LOAD_COST + STORE_COST + BRANCH_COST * 3 + ALU_COST * 3); @@ -5786,7 +5792,7 @@ instruct compareAndExchangeLAcq(iRegLNoSp res, indirect mem, iRegL oldval, iRegL instruct compareAndExchangeNAcq(iRegNNoSp res, indirect mem, iRegN oldval, iRegN newval) %{ - predicate(needs_acquiring_load_reserved(n)); + predicate(needs_acquiring_load_reserved(n) && n->as_LoadStore()->barrier_data() == 0); match(Set res (CompareAndExchangeN mem (Binary oldval newval))); @@ -5914,6 +5920,7 @@ instruct weakCompareAndSwapL(iRegINoSp res, indirect mem, iRegL oldval, iRegL ne instruct weakCompareAndSwapN(iRegINoSp res, indirect mem, iRegN oldval, iRegN newval) %{ + predicate(n->as_LoadStore()->barrier_data() == 0); match(Set res (WeakCompareAndSwapN mem (Binary oldval newval))); ins_cost(LOAD_COST + STORE_COST + BRANCH_COST * 2 + ALU_COST * 4); @@ -6045,7 
+6052,7 @@ instruct weakCompareAndSwapLAcq(iRegINoSp res, indirect mem, iRegL oldval, iRegL instruct weakCompareAndSwapNAcq(iRegINoSp res, indirect mem, iRegN oldval, iRegN newval) %{ - predicate(needs_acquiring_load_reserved(n)); + predicate(needs_acquiring_load_reserved(n) && n->as_LoadStore()->barrier_data() == 0); match(Set res (WeakCompareAndSwapN mem (Binary oldval newval))); @@ -6117,6 +6124,8 @@ instruct get_and_setL(indirect mem, iRegL newv, iRegLNoSp prev) instruct get_and_setN(indirect mem, iRegN newv, iRegINoSp prev) %{ + predicate(n->as_LoadStore()->barrier_data() == 0); + match(Set prev (GetAndSetN mem newv)); ins_cost(ALU_COST); @@ -6182,7 +6191,7 @@ instruct get_and_setLAcq(indirect mem, iRegL newv, iRegLNoSp prev) instruct get_and_setNAcq(indirect mem, iRegN newv, iRegINoSp prev) %{ - predicate(needs_acquiring_load_reserved(n)); + predicate(needs_acquiring_load_reserved(n) && n->as_LoadStore()->barrier_data() == 0); match(Set prev (GetAndSetN mem newv)); diff --git a/src/hotspot/cpu/s390/gc/g1/g1BarrierSetAssembler_s390.cpp b/src/hotspot/cpu/s390/gc/g1/g1BarrierSetAssembler_s390.cpp index 37631298920..544c82d34a7 100644 --- a/src/hotspot/cpu/s390/gc/g1/g1BarrierSetAssembler_s390.cpp +++ b/src/hotspot/cpu/s390/gc/g1/g1BarrierSetAssembler_s390.cpp @@ -1,6 +1,6 @@ /* - * Copyright (c) 2019, 2023, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2018, 2023 SAP SE. All rights reserved. + * Copyright (c) 2019, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024 SAP SE. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -42,11 +42,47 @@ #include "c1/c1_LIRAssembler.hpp" #include "c1/c1_MacroAssembler.hpp" #include "gc/g1/c1/g1BarrierSetC1.hpp" -#endif +#endif // COMPILER1 +#ifdef COMPILER2 +#include "gc/g1/c2/g1BarrierSetC2.hpp" +#endif // COMPILER2 #define __ masm-> -#define BLOCK_COMMENT(str) if (PrintAssembly) __ block_comment(str) +#define BLOCK_COMMENT(str) __ block_comment(str) + +static void generate_pre_barrier_fast_path(MacroAssembler* masm, + const Register thread, + const Register tmp1) { + Address in_progress(thread, in_bytes(G1ThreadLocalData::satb_mark_queue_active_offset())); + // Is marking active? + if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) { + __ load_and_test_int(tmp1, in_progress); + } else { + assert(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption"); + __ load_and_test_byte(tmp1, in_progress); + } +} + +static void generate_queue_test_and_insertion(MacroAssembler* masm, ByteSize index_offset, ByteSize buffer_offset, Label& runtime, + const Register Z_thread, const Register value, const Register temp) { + BLOCK_COMMENT("generate_queue_test_and_insertion {"); + + assert_different_registers(temp, value); + // Can we store a value in the given thread's buffer? + // (The index field is typed as size_t.) + + __ load_and_test_long(temp, Address(Z_thread, in_bytes(index_offset))); // temp := *(index address) + __ branch_optimized(Assembler::bcondEqual, runtime); // jump to runtime if index == 0 (full buffer) + + // The buffer is not full, store value into it. 
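+  // Worked example (illustrative only): with wordSize == 8 and a current
+  // index of 24, the index is lowered to 16 and the value is stored at
+  // buffer_address + 16; a zero index above means the buffer is full and
+  // the runtime path is taken instead.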
+ __ add2reg(temp, -wordSize); // temp := next index + __ z_stg(temp, in_bytes(index_offset), Z_thread); // *(index address) := next index + + __ z_ag(temp, Address(Z_thread, in_bytes(buffer_offset))); // temp := buffer address + next index + __ z_stg(value, 0, temp); // *(buffer address + next index) := value + BLOCK_COMMENT("} generate_queue_test_and_insertion"); +} void G1BarrierSetAssembler::gen_write_ref_array_pre_barrier(MacroAssembler* masm, DecoratorSet decorators, Register addr, Register count) { @@ -59,13 +95,8 @@ void G1BarrierSetAssembler::gen_write_ref_array_pre_barrier(MacroAssembler* masm assert_different_registers(addr, Z_R0_scratch); // would be destroyed by push_frame() assert_different_registers(count, Z_R0_scratch); // would be destroyed by push_frame() Register Rtmp1 = Z_R0_scratch; - const int active_offset = in_bytes(G1ThreadLocalData::satb_mark_queue_active_offset()); - if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) { - __ load_and_test_int(Rtmp1, Address(Z_thread, active_offset)); - } else { - guarantee(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption"); - __ load_and_test_byte(Rtmp1, Address(Z_thread, active_offset)); - } + + generate_pre_barrier_fast_path(masm, Z_thread, Rtmp1); __ z_bre(filtered); // Activity indicator is zero, so there is no marking going on currently. RegisterSaver::save_live_registers(masm, RegisterSaver::arg_registers); // Creates frame. @@ -100,6 +131,181 @@ void G1BarrierSetAssembler::gen_write_ref_array_post_barrier(MacroAssembler* mas } } +#if defined(COMPILER2) + +#undef __ +#define __ masm-> + +static void generate_c2_barrier_runtime_call(MacroAssembler* masm, G1BarrierStubC2* stub, const Register pre_val, const address runtime_path) { + BLOCK_COMMENT("generate_c2_barrier_runtime_call {"); + SaveLiveRegisters save_registers(masm, stub); + __ call_VM_leaf(runtime_path, pre_val, Z_thread); + BLOCK_COMMENT("} generate_c2_barrier_runtime_call"); +} + +void G1BarrierSetAssembler::g1_write_barrier_pre_c2(MacroAssembler* masm, + Register obj, + Register pre_val, + Register thread, + Register tmp1, + G1PreBarrierStubC2* stub) { + + BLOCK_COMMENT("g1_write_barrier_pre_c2 {"); + + assert(thread == Z_thread, "must be"); + assert_different_registers(obj, pre_val, tmp1); + assert(pre_val != noreg && tmp1 != noreg, "expecting a register"); + + stub->initialize_registers(obj, pre_val, thread, tmp1, noreg); + + generate_pre_barrier_fast_path(masm, thread, tmp1); + __ branch_optimized(Assembler::bcondNotEqual, *stub->entry()); // Activity indicator is zero, so there is no marking going on currently. 
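+  // The stub entry above is taken when the SATB activity indicator is
+  // non-zero, i.e. while concurrent marking is active; otherwise execution
+  // falls through to the continuation bound below.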
+ + __ bind(*stub->continuation()); + + BLOCK_COMMENT("} g1_write_barrier_pre_c2"); +} + +void G1BarrierSetAssembler::generate_c2_pre_barrier_stub(MacroAssembler* masm, + G1PreBarrierStubC2* stub) const { + + BLOCK_COMMENT("generate_c2_pre_barrier_stub {"); + + Assembler::InlineSkippedInstructionsCounter skip_counter(masm); + + Label runtime; + Register obj = stub->obj(); + Register pre_val = stub->pre_val(); + Register thread = stub->thread(); + Register tmp1 = stub->tmp1(); + + __ bind(*stub->entry()); + + BLOCK_COMMENT("generate_pre_val_not_null_test {"); + if (obj != noreg) { + __ load_heap_oop(pre_val, Address(obj), noreg, noreg, AS_RAW); + } + __ z_ltgr(pre_val, pre_val); + __ branch_optimized(Assembler::bcondEqual, *stub->continuation()); + BLOCK_COMMENT("} generate_pre_val_not_null_test"); + + generate_queue_test_and_insertion(masm, + G1ThreadLocalData::satb_mark_queue_index_offset(), + G1ThreadLocalData::satb_mark_queue_buffer_offset(), + runtime, + Z_thread, pre_val, tmp1); + + __ branch_optimized(Assembler::bcondAlways, *stub->continuation()); + + __ bind(runtime); + + generate_c2_barrier_runtime_call(masm, stub, pre_val, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_pre_entry)); + + __ branch_optimized(Assembler::bcondAlways, *stub->continuation()); + + BLOCK_COMMENT("} generate_c2_pre_barrier_stub"); +} + +void G1BarrierSetAssembler::g1_write_barrier_post_c2(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register thread, + Register tmp1, + Register tmp2, + G1PostBarrierStubC2* stub) { + BLOCK_COMMENT("g1_write_barrier_post_c2 {"); + + assert(thread == Z_thread, "must be"); + assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, Z_R1_scratch); + + assert(store_addr != noreg && new_val != noreg && tmp1 != noreg && tmp2 != noreg, "expecting a register"); + + stub->initialize_registers(thread, tmp1, tmp2); + + BLOCK_COMMENT("generate_region_crossing_test {"); + if (VM_Version::has_DistinctOpnds()) { + __ z_xgrk(tmp1, store_addr, new_val); + } else { + __ z_lgr(tmp1, store_addr); + __ z_xgr(tmp1, new_val); + } + __ z_srag(tmp1, tmp1, G1HeapRegion::LogOfHRGrainBytes); + __ branch_optimized(Assembler::bcondEqual, *stub->continuation()); + BLOCK_COMMENT("} generate_region_crossing_test"); + + // crosses regions, storing null? + if ((stub->barrier_data() & G1C2BarrierPostNotNull) == 0) { + __ z_ltgr(new_val, new_val); + __ branch_optimized(Assembler::bcondEqual, *stub->continuation()); + } + + BLOCK_COMMENT("generate_card_young_test {"); + CardTableBarrierSet* ct = barrier_set_cast<CardTableBarrierSet>(BarrierSet::barrier_set()); + // calculate address of card + __ load_const_optimized(tmp2, (address)ct->card_table()->byte_map_base()); // Card table base. + __ z_srlg(tmp1, store_addr, CardTable::card_shift()); // Index into card table. + __ z_algr(tmp1, tmp2); // Explicit calculation needed for cli. + + // Filter young. + __ z_cli(0, tmp1, G1CardTable::g1_young_card_val()); + + BLOCK_COMMENT("} generate_card_young_test"); + + // From here on, tmp1 holds the card address. 
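+  // Inline filtering so far: same-region stores, null stores (unless
+  // G1C2BarrierPostNotNull is set), and stores hitting a young card fall
+  // through to the continuation; only the remaining stores branch to the
+  // out-of-line stub, which re-checks the card and dirties/enqueues it.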
+ __ branch_optimized(Assembler::bcondNotEqual, *stub->entry()); + + __ bind(*stub->continuation()); + + BLOCK_COMMENT("} g1_write_barrier_post_c2"); +} + +void G1BarrierSetAssembler::generate_c2_post_barrier_stub(MacroAssembler* masm, + G1PostBarrierStubC2* stub) const { + + BLOCK_COMMENT("generate_c2_post_barrier_stub {"); + + Assembler::InlineSkippedInstructionsCounter skip_counter(masm); + Label runtime; + + Register thread = stub->thread(); + Register tmp1 = stub->tmp1(); // tmp1 holds the card address. + Register tmp2 = stub->tmp2(); + Register Rcard_addr = tmp1; + + __ bind(*stub->entry()); + + BLOCK_COMMENT("generate_card_clean_test {"); + __ z_sync(); // Required to support concurrent cleaning. + __ z_cli(0, Rcard_addr, 0); // Reload after membar. + __ branch_optimized(Assembler::bcondEqual, *stub->continuation()); + BLOCK_COMMENT("} generate_card_clean_test"); + + BLOCK_COMMENT("generate_dirty_card {"); + // Storing a region crossing, non-null oop, card is clean. + // Dirty card and log. + STATIC_ASSERT(CardTable::dirty_card_val() == 0); + __ z_mvi(0, Rcard_addr, CardTable::dirty_card_val()); + BLOCK_COMMENT("} generate_dirty_card"); + + generate_queue_test_and_insertion(masm, + G1ThreadLocalData::dirty_card_queue_index_offset(), + G1ThreadLocalData::dirty_card_queue_buffer_offset(), + runtime, + Z_thread, tmp1, tmp2); + + __ branch_optimized(Assembler::bcondAlways, *stub->continuation()); + + __ bind(runtime); + + generate_c2_barrier_runtime_call(masm, stub, tmp1, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry)); + + __ branch_optimized(Assembler::bcondAlways, *stub->continuation()); + + BLOCK_COMMENT("} generate_c2_post_barrier_stub"); +} + +#endif //COMPILER2 + void G1BarrierSetAssembler::load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, const Address& src, Register dst, Register tmp1, Register tmp2, Label *L_handle_null) { bool on_oop = is_reference_type(type); @@ -136,9 +342,6 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm, Decorator const Register Robj = obj ? obj->base() : noreg, Roff = obj ? obj->index() : noreg; - const int active_offset = in_bytes(G1ThreadLocalData::satb_mark_queue_active_offset()); - const int buffer_offset = in_bytes(G1ThreadLocalData::satb_mark_queue_buffer_offset()); - const int index_offset = in_bytes(G1ThreadLocalData::satb_mark_queue_index_offset()); assert_different_registers(Rtmp1, Rtmp2, Z_R0_scratch); // None of the Rtmp must be Z_R0!! assert_different_registers(Robj, Z_R0_scratch); // Used for addressing. Furthermore, push_frame destroys Z_R0!! assert_different_registers(Rval, Z_R0_scratch); // push_frame destroys Z_R0!! @@ -147,14 +350,7 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm, Decorator BLOCK_COMMENT("g1_write_barrier_pre {"); - // Is marking active? - // Note: value is loaded for test purposes only. No further use here. - if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) { - __ load_and_test_int(Rtmp1, Address(Z_thread, active_offset)); - } else { - guarantee(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption"); - __ load_and_test_byte(Rtmp1, Address(Z_thread, active_offset)); - } + generate_pre_barrier_fast_path(masm, Z_thread, Rtmp1); __ z_bre(filtered); // Activity indicator is zero, so there is no marking going on currently. 
assert(Rpre_val != noreg, "must have a real register"); @@ -194,24 +390,14 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm, Decorator // We can store the original value in the thread's buffer // only if index > 0. Otherwise, we need runtime to handle. // (The index field is typed as size_t.) - Register Rbuffer = Rtmp1, Rindex = Rtmp2; - assert_different_registers(Rbuffer, Rindex, Rpre_val); - __ z_lg(Rbuffer, buffer_offset, Z_thread); - - __ load_and_test_long(Rindex, Address(Z_thread, index_offset)); - __ z_bre(callRuntime); // If index == 0, goto runtime. - - __ add2reg(Rindex, -wordSize); // Decrement index. - __ z_stg(Rindex, index_offset, Z_thread); - - // Record the previous value. - __ z_stg(Rpre_val, 0, Rbuffer, Rindex); + generate_queue_test_and_insertion(masm, + G1ThreadLocalData::satb_mark_queue_index_offset(), + G1ThreadLocalData::satb_mark_queue_buffer_offset(), + callRuntime, + Z_thread, Rpre_val, Rtmp2); __ z_bru(filtered); // We are done. - Rbuffer = noreg; // end of life - Rindex = noreg; // end of life - __ bind(callRuntime); // Save some registers (inputs and result) over runtime call @@ -326,23 +512,16 @@ void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, Decorato Register Rcard_addr_x = Rcard_addr; Register Rqueue_index = (Rtmp2 != Z_R0_scratch) ? Rtmp2 : Rtmp1; - Register Rqueue_buf = (Rtmp3 != Z_R0_scratch) ? Rtmp3 : Rtmp1; - const int qidx_off = in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset()); - const int qbuf_off = in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset()); - if ((Rcard_addr == Rqueue_buf) || (Rcard_addr == Rqueue_index)) { + if (Rcard_addr == Rqueue_index) { Rcard_addr_x = Z_R0_scratch; // Register shortage. We have to use Z_R0. } __ lgr_if_needed(Rcard_addr_x, Rcard_addr); - __ load_and_test_long(Rqueue_index, Address(Z_thread, qidx_off)); - __ z_bre(callRuntime); // Index == 0 then jump to runtime. - - __ z_lg(Rqueue_buf, qbuf_off, Z_thread); - - __ add2reg(Rqueue_index, -wordSize); // Decrement index. - __ z_stg(Rqueue_index, qidx_off, Z_thread); - - __ z_stg(Rcard_addr_x, 0, Rqueue_index, Rqueue_buf); // Store card. + generate_queue_test_and_insertion(masm, + G1ThreadLocalData::dirty_card_queue_index_offset(), + G1ThreadLocalData::dirty_card_queue_buffer_offset(), + callRuntime, + Z_thread, Rcard_addr_x, Rqueue_index); __ z_bru(filtered); __ bind(callRuntime); diff --git a/src/hotspot/cpu/s390/gc/g1/g1BarrierSetAssembler_s390.hpp b/src/hotspot/cpu/s390/gc/g1/g1BarrierSetAssembler_s390.hpp index cc1d51d2fa1..0f0bdd8b83c 100644 --- a/src/hotspot/cpu/s390/gc/g1/g1BarrierSetAssembler_s390.hpp +++ b/src/hotspot/cpu/s390/gc/g1/g1BarrierSetAssembler_s390.hpp @@ -1,6 +1,6 @@ /* - * Copyright (c) 2018, 2023, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2018 SAP SE. All rights reserved. + * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2024 SAP SE. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -34,6 +34,8 @@ class LIR_Assembler; class StubAssembler; class G1PreBarrierStub; class G1PostBarrierStub; +class G1PreBarrierStubC2; +class G1PostBarrierStubC2; class G1BarrierSetAssembler: public ModRefBarrierSetAssembler { protected: @@ -62,7 +64,27 @@ class G1BarrierSetAssembler: public ModRefBarrierSetAssembler { void generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm); void generate_c1_post_barrier_runtime_stub(StubAssembler* sasm); -#endif +#endif // COMPILER1 + +#ifdef COMPILER2 + void g1_write_barrier_pre_c2(MacroAssembler* masm, + Register obj, + Register pre_val, + Register thread, + Register tmp1, + G1PreBarrierStubC2* c2_stub); + void generate_c2_pre_barrier_stub(MacroAssembler* masm, + G1PreBarrierStubC2* stub) const; + void g1_write_barrier_post_c2(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register thread, + Register tmp1, + Register tmp2, + G1PostBarrierStubC2* c2_stub); + void generate_c2_post_barrier_stub(MacroAssembler* masm, + G1PostBarrierStubC2* stub) const; +#endif // COMPILER2 virtual void load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, const Address& src, Register dst, Register tmp1, Register tmp2, Label *L_handle_null = nullptr); diff --git a/src/hotspot/cpu/s390/gc/g1/g1_s390.ad b/src/hotspot/cpu/s390/gc/g1/g1_s390.ad new file mode 100644 index 00000000000..31f60c4aeff --- /dev/null +++ b/src/hotspot/cpu/s390/gc/g1/g1_s390.ad @@ -0,0 +1,457 @@ +// +// Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. +// Copyright 2024 IBM Corporation. All rights reserved. +// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +// +// This code is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License version 2 only, as +// published by the Free Software Foundation. +// +// This code is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// version 2 for more details (a copy is included in the LICENSE file that +// accompanied this code). +// +// You should have received a copy of the GNU General Public License version +// 2 along with this work; if not, write to the Free Software Foundation, +// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. +// +// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA +// or visit www.oracle.com if you need additional information or have any +// questions. 
+// + +source_hpp %{ + +#include "gc/g1/c2/g1BarrierSetC2.hpp" +#include "gc/shared/gc_globals.hpp" + +%} + +source %{ + +#include "gc/g1/g1BarrierSetAssembler_s390.hpp" +#include "gc/g1/g1BarrierSetRuntime.hpp" + +static void write_barrier_pre(MacroAssembler* masm, + const MachNode* node, + Register obj, + Register pre_val, + Register tmp1, + RegSet preserve = RegSet(), + RegSet no_preserve = RegSet()) { + if (!G1PreBarrierStubC2::needs_barrier(node)) { + return; + } + Assembler::InlineSkippedInstructionsCounter skip_counter(masm); + G1BarrierSetAssembler* g1_asm = static_cast<G1BarrierSetAssembler*>(BarrierSet::barrier_set()->barrier_set_assembler()); + G1PreBarrierStubC2* const stub = G1PreBarrierStubC2::create(node); + for (RegSetIterator<Register> reg = preserve.begin(); *reg != noreg; ++reg) { + stub->preserve(*reg); + } + for (RegSetIterator<Register> reg = no_preserve.begin(); *reg != noreg; ++reg) { + stub->dont_preserve(*reg); + } + g1_asm->g1_write_barrier_pre_c2(masm, obj, pre_val, Z_thread, tmp1, stub); +} + +static void write_barrier_post(MacroAssembler* masm, + const MachNode* node, + Register store_addr, + Register new_val, + Register tmp1, + Register tmp2) { + if (!G1PostBarrierStubC2::needs_barrier(node)) { + return; + } + Assembler::InlineSkippedInstructionsCounter skip_counter(masm); + G1BarrierSetAssembler* g1_asm = static_cast<G1BarrierSetAssembler*>(BarrierSet::barrier_set()->barrier_set_assembler()); + G1PostBarrierStubC2* const stub = G1PostBarrierStubC2::create(node); + g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, Z_thread, tmp1, tmp2, stub); +} + +%} // source + +// store pointer +instruct g1StoreP(indirect dst, memoryRegP src, iRegL tmp1, iRegL tmp2, flagsReg cr) %{ + predicate(UseG1GC && n->as_Store()->barrier_data() != 0); + match(Set dst (StoreP dst src)); + effect(TEMP tmp1, TEMP tmp2, KILL cr); + ins_cost(MEMORY_REF_COST); + format %{ "STG $src,$dst\t # ptr" %} + ins_encode %{ + __ block_comment("g1StoreP {"); + write_barrier_pre(masm, this, + $dst$$Register /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + RegSet::of($dst$$Register, $src$$Register) /* preserve */); + + __ z_stg($src$$Register, Address($dst$$Register)); + + write_barrier_post(masm, this, + $dst$$Register, /* store_addr */ + $src$$Register /* new_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */); + __ block_comment("} g1StoreP"); + %} + ins_pipe(pipe_class_dummy); +%} + +// Store Compressed Pointer +instruct g1StoreN(indirect mem, iRegN_P2N src, iRegL tmp1, iRegL tmp2, iRegL tmp3, flagsReg cr) %{ + predicate(UseG1GC && n->as_Store()->barrier_data() != 0); + match(Set mem (StoreN mem src)); + effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + ins_cost(MEMORY_REF_COST); + format %{ "STY $src,$mem\t # (cOop)" %} + ins_encode %{ + __ block_comment("g1StoreN {"); + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + RegSet::of($mem$$Register, $src$$Register) /* preserve */); + + __ z_sty($src$$Register, Address($mem$$Register)); + + if ((barrier_data() & G1C2BarrierPost) != 0) { + if ((barrier_data() & G1C2BarrierPostNotNull) == 0) { + __ oop_decoder($tmp1$$Register, $src$$Register, true /* maybe_null */); + } else { + __ oop_decoder($tmp1$$Register, $src$$Register, false /* maybe_null */); + } + } + + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $tmp1$$Register /* new_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + __ block_comment("} g1StoreN"); + %} + + ins_pipe(pipe_class_dummy); +%} + 
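+// Rough shape of the code generated for the G1 store patterns above
+// (illustrative pseudo-code only, not emitted verbatim):
+//
+//   if (SATB marking active) {            // inline pre-barrier test
+//     pre_val = *mem;                     // out-of-line stub
+//     if (pre_val != null) enqueue(pre_val);
+//   }
+//   *mem = new_val;                       // the store itself
+//   if (region(mem) != region(new_val)    // inline post-barrier tests
+//       && new_val != null && card(mem) != young) {
+//     dirty card and enqueue it;          // out-of-line stub
+//   }
+//
+// Both stubs fall back to G1BarrierSetRuntime calls when the corresponding
+// per-thread queue is full.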
+instruct g1CompareAndSwapN(indirect mem_ptr, rarg5RegN oldval, iRegN_P2N newval, iRegI res, iRegL tmp1, iRegL tmp2, iRegL tmp3, flagsReg cr) %{ + predicate(UseG1GC && n->as_LoadStore()->barrier_data() != 0); + match(Set res (CompareAndSwapN mem_ptr (Binary oldval newval))); + match(Set res (WeakCompareAndSwapN mem_ptr (Binary oldval newval))); + effect(USE mem_ptr, TEMP res, TEMP tmp1, TEMP tmp2, TEMP tmp3, USE_KILL oldval, KILL cr); + format %{ "$res = CompareAndSwapN $oldval,$newval,$mem_ptr" %} + ins_encode %{ + assert_different_registers($oldval$$Register, $mem_ptr$$Register); + assert_different_registers($newval$$Register, $mem_ptr$$Register); + __ block_comment("g1compareAndSwapN {"); + + Register Rcomp = reg_to_register_object($oldval$$reg); + Register Rnew = reg_to_register_object($newval$$reg); + Register Raddr = reg_to_register_object($mem_ptr$$reg); + Register Rres = reg_to_register_object($res$$reg); + + write_barrier_pre(masm, this, + Raddr /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + RegSet::of(Raddr, Rcomp, Rnew) /* preserve */, + RegSet::of(Rres) /* no_preserve */); + + __ z_cs(Rcomp, Rnew, 0, Raddr); + + assert_different_registers(Rres, Raddr); + if (VM_Version::has_LoadStoreConditional()) { + __ load_const_optimized(Z_R0_scratch, 0L); // false (failed) + __ load_const_optimized(Rres, 1L); // true (succeed) + __ z_locgr(Rres, Z_R0_scratch, Assembler::bcondNotEqual); + } else { + Label done; + __ load_const_optimized(Rres, 0L); // false (failed) + __ z_brne(done); // Assume true to be the common case. + __ load_const_optimized(Rres, 1L); // true (succeed) + __ bind(done); + } + + __ oop_decoder($tmp3$$Register, Rnew, true /* maybe_null */); + + write_barrier_post(masm, this, + Raddr /* store_addr */, + $tmp3$$Register /* new_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */); + __ block_comment("} g1compareAndSwapN"); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct g1CompareAndExchangeN(iRegP mem_ptr, rarg5RegN oldval, iRegN_P2N newval, iRegN res, iRegL tmp1, iRegL tmp2, iRegL tmp3, flagsReg cr) %{ + predicate(UseG1GC && n->as_LoadStore()->barrier_data() != 0); + match(Set res (CompareAndExchangeN mem_ptr (Binary oldval newval))); + effect(USE mem_ptr, TEMP res, TEMP tmp1, TEMP tmp2, TEMP tmp3, USE_KILL oldval, KILL cr); + format %{ "$res = CompareAndExchangeN $oldval,$newval,$mem_ptr" %} + ins_encode %{ + assert_different_registers($oldval$$Register, $mem_ptr$$Register); + assert_different_registers($newval$$Register, $mem_ptr$$Register); + __ block_comment("g1CompareAndExchangeN {"); + write_barrier_pre(masm, this, + $mem_ptr$$Register /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + RegSet::of($mem_ptr$$Register, $oldval$$Register, $newval$$Register) /* preserve */, + RegSet::of($res$$Register) /* no_preserve */); + + Register Rcomp = reg_to_register_object($oldval$$reg); + Register Rnew = reg_to_register_object($newval$$reg); + Register Raddr = reg_to_register_object($mem_ptr$$reg); + + Register Rres = reg_to_register_object($res$$reg); + assert_different_registers(Rres, Raddr); + + __ z_lgr(Rres, Rcomp); // previous contents + __ z_csy(Rres, Rnew, 0, Raddr); // Try to store new value. 
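+    // Whether or not the CS succeeds, Rres now holds the previous memory
+    // contents, which is the value CompareAndExchangeN must return; the post
+    // barrier below is applied unconditionally, which is harmless (at worst a
+    // card is dirtied for a store that did not happen).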
+ + __ oop_decoder($tmp1$$Register, Rnew, true /* maybe_null */); + + write_barrier_post(masm, this, + Raddr /* store_addr */, + $tmp1$$Register /* new_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + __ block_comment("} g1CompareAndExchangeN"); + %} + ins_pipe(pipe_class_dummy); +%} + +// Load narrow oop +instruct g1LoadN(iRegN dst, indirect mem, iRegP tmp1, iRegP tmp2, flagsReg cr) %{ + predicate(UseG1GC && n->as_Load()->barrier_data() != 0); + match(Set dst (LoadN mem)); + effect(TEMP dst, TEMP tmp1, TEMP tmp2, KILL cr); + ins_cost(MEMORY_REF_COST); + format %{ "LoadN $dst,$mem\t # (cOop)" %} + ins_encode %{ + __ block_comment("g1LoadN {"); + __ z_llgf($dst$$Register, Address($mem$$Register)); + if ((barrier_data() & G1C2BarrierPre) != 0) { + __ oop_decoder($tmp1$$Register, $dst$$Register, true); + write_barrier_pre(masm, this, + noreg /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register ); + } + __ block_comment("} g1LoadN"); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct g1GetAndSetN(indirect mem, iRegN dst, iRegI tmp, iRegL tmp1, iRegL tmp2, iRegL tmp3, flagsReg cr) %{ + predicate(UseG1GC && n->as_LoadStore()->barrier_data() != 0); + match(Set dst (GetAndSetN mem dst)); + effect(KILL cr, TEMP tmp, TEMP tmp1, TEMP tmp2, TEMP tmp3); // USE_DEF dst by match rule. + format %{ "XCHGN $dst,[$mem]\t # EXCHANGE (coop, atomic), temp $tmp" %} + ins_encode %{ + __ block_comment("g1GetAndSetN {"); + assert_different_registers($mem$$Register, $dst$$Register); + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + RegSet::of($mem$$Register, $dst$$Register) /* preserve */); + + Register Rdst = reg_to_register_object($dst$$reg); + Register Rtmp = reg_to_register_object($tmp$$reg); + guarantee(Rdst != Rtmp, "Fix match rule to use TEMP_DEF"); + Label retry; + + // Iterate until swap succeeds. + __ z_llgf(Rtmp, Address($mem$$Register)); // current contents + __ bind(retry); + // Calculate incremented value. + __ z_csy(Rtmp, Rdst, Address($mem$$Register)); // Try to store new value. + __ z_brne(retry); // Yikes, concurrent update, need to retry. + + __ oop_decoder($tmp1$$Register, $dst$$Register, true /* maybe_null */); + + __ z_lgr(Rdst, Rtmp); // Exchanged value from memory is return value. 
+ + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $tmp1$$Register /* new_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + + __ block_comment("} g1GetAndSetN"); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct g1CompareAndSwapP(iRegP mem_ptr, rarg5RegP oldval, iRegP_N2P newval, iRegI res, iRegL tmp1, iRegL tmp2, flagsReg cr) %{ + predicate(UseG1GC && n->as_LoadStore()->barrier_data() != 0); + match(Set res (CompareAndSwapP mem_ptr (Binary oldval newval))); + match(Set res (WeakCompareAndSwapP mem_ptr (Binary oldval newval))); + effect(TEMP res, TEMP tmp1, TEMP tmp2, USE mem_ptr, USE_KILL oldval, KILL cr); + format %{ "$res = CompareAndSwapP $oldval,$newval,$mem_ptr" %} + ins_encode %{ + __ block_comment("g1CompareAndSwapP {"); + assert_different_registers($oldval$$Register, $mem_ptr$$Register); + assert_different_registers($newval$$Register, $mem_ptr$$Register); + + Register Rcomp = reg_to_register_object($oldval$$reg); + Register Rnew = reg_to_register_object($newval$$reg); + Register Raddr = reg_to_register_object($mem_ptr$$reg); + Register Rres = reg_to_register_object($res$$reg); + + write_barrier_pre(masm, this, + noreg /* obj */, + Rcomp /* pre_val */, + $tmp1$$Register /* tmp1 */, + RegSet::of(Raddr, Rcomp, Rnew) /* preserve */, + RegSet::of(Rres) /* no_preserve */); + + __ z_csg(Rcomp, Rnew, 0, Raddr); + + if (VM_Version::has_LoadStoreConditional()) { + __ load_const_optimized(Z_R0_scratch, 0L); // false (failed) + __ load_const_optimized(Rres, 1L); // true (succeed) + __ z_locgr(Rres, Z_R0_scratch, Assembler::bcondNotEqual); + } else { + Label done; + __ load_const_optimized(Rres, 0L); // false (failed) + __ z_brne(done); // Assume true to be the common case. + __ load_const_optimized(Rres, 1L); // true (succeed) + __ bind(done); + } + + write_barrier_post(masm, this, + Raddr /* store_addr */, + Rnew /* new_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */); + __ block_comment("} g1CompareAndSwapP"); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct g1CompareAndExchangeP(iRegP mem_ptr, rarg5RegP oldval, iRegP_N2P newval, iRegP res, iRegL tmp1, iRegL tmp2, flagsReg cr) %{ + predicate(UseG1GC && n->as_LoadStore()->barrier_data() != 0); + match(Set res (CompareAndExchangeP mem_ptr (Binary oldval newval))); + effect(TEMP res, TEMP tmp1, TEMP tmp2, USE mem_ptr, USE_KILL oldval, KILL cr); + format %{ "$res = CompareAndExchangeP $oldval,$newval,$mem_ptr" %} + ins_encode %{ + __ block_comment("g1CompareAndExchangeP {"); + assert_different_registers($oldval$$Register, $mem_ptr$$Register); + assert_different_registers($newval$$Register, $mem_ptr$$Register); + + // Pass $oldval to the pre-barrier (instead of loading from $mem), because + // $oldval is the only value that can be overwritten. + // The same holds for g1CompareAndSwapP. 
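+    // Unlike the narrow-oop variants above, the full-pointer variants can pass
+    // $newval to the post barrier directly; no decode into a temporary is
+    // needed before the region and card checks.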
+ write_barrier_pre(masm, this, + noreg /* obj */, + $oldval$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + RegSet::of($mem_ptr$$Register, $oldval$$Register, $newval$$Register) /* preserve */, + RegSet::of($res$$Register) /* no_preserve */); + + __ z_lgr($res$$Register, $oldval$$Register); // previous content + + __ z_csg($oldval$$Register, $newval$$Register, 0, $mem_ptr$$reg); + + write_barrier_post(masm, this, + $mem_ptr$$Register /* store_addr */, + $newval$$Register /* new_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */); + __ block_comment("} g1CompareAndExchangeP"); + %} + ins_pipe(pipe_class_dummy); +%} + +// Load Pointer +instruct g1LoadP(iRegP dst, memory mem, iRegL tmp1, flagsReg cr) %{ + predicate(UseG1GC && n->as_Load()->barrier_data() != 0); + match(Set dst (LoadP mem)); + effect(TEMP dst, TEMP tmp1, KILL cr); + ins_cost(MEMORY_REF_COST); + format %{ "LG $dst,$mem\t # ptr" %} + ins_encode %{ + __ block_comment("g1LoadP {"); + __ z_lg($dst$$Register, $mem$$Address); + write_barrier_pre(masm, this, + noreg /* obj */, + $dst$$Register /* pre_val */, + $tmp1$$Register ); + __ block_comment("} g1LoadP"); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct g1GetAndSetP(indirect mem, iRegP dst, iRegL tmp, iRegL tmp1, iRegL tmp2, flagsReg cr) %{ + predicate(UseG1GC && n->as_LoadStore()->barrier_data() != 0); + match(Set dst (GetAndSetP mem dst)); + effect(KILL cr, TEMP tmp, TEMP tmp1, TEMP tmp2); // USE_DEF dst by match rule. + format %{ "XCHGP $dst,[$mem]\t # EXCHANGE (oop, atomic), temp $tmp" %} + ins_encode %{ + __ block_comment("g1GetAndSetP {"); + + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $tmp$$Register /* pre_val (as a temporary register) */, + $tmp1$$Register /* tmp1 */, + RegSet::of($mem$$Register, $dst$$Register) /* preserve */); + + __ z_lgr($tmp1$$Register, $dst$$Register); + Register Rdst = reg_to_register_object($dst$$reg); + Register Rtmp = reg_to_register_object($tmp$$reg); + guarantee(Rdst != Rtmp, "Fix match rule to use TEMP_DEF"); + Label retry; + + // Iterate until swap succeeds. + __ z_lg(Rtmp, Address($mem$$Register)); // current contents + __ bind(retry); + // Calculate incremented value. + __ z_csg(Rtmp, Rdst, Address($mem$$Register)); // Try to store new value. + __ z_brne(retry); // Yikes, concurrent update, need to retry. + __ z_lgr(Rdst, Rtmp); // Exchanged value from memory is return value. 
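+    // $tmp1 still holds the new value (copied from $dst before the swap loop
+    // repurposed it), so it is what the post barrier sees as new_val.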
+ + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $tmp1$$Register /* new_val */, + $tmp2$$Register /* tmp1 */, + $tmp$$Register /* tmp2 */); + __ block_comment("} g1GetAndSetP"); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct g1EncodePAndStoreN(indirect mem, iRegP src, iRegL tmp1, iRegL tmp2, flagsReg cr) +%{ + predicate(UseG1GC && n->as_Store()->barrier_data() != 0); + match(Set mem (StoreN mem (EncodeP src))); + effect(TEMP tmp1, TEMP tmp2, KILL cr); + // ins_cost(INSN_COST); + format %{ "encode_heap_oop $tmp1, $src\n\t" + "st $tmp1, $mem\t# compressed ptr" %} + ins_encode %{ + __ block_comment("g1EncodePAndStoreN {"); + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp1 */, + RegSet::of($mem$$Register, $src$$Register) /* preserve */); + if ((barrier_data() & G1C2BarrierPostNotNull) == 0) { + __ oop_encoder($tmp1$$Register, $src$$Register, true /* maybe_null */); + } else { + __ oop_encoder($tmp1$$Register, $src$$Register, false /* maybe_null */); + } + __ z_st($tmp1$$Register, Address($mem$$Register)); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $src$$Register /* new_val */, + $tmp1$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */); + __ block_comment("} g1EncodePAndStoreN"); + %} + ins_pipe(pipe_class_dummy); +%} diff --git a/src/hotspot/cpu/s390/gc/shared/barrierSetAssembler_s390.cpp b/src/hotspot/cpu/s390/gc/shared/barrierSetAssembler_s390.cpp index 28892da6ca4..d826b4a06f3 100644 --- a/src/hotspot/cpu/s390/gc/shared/barrierSetAssembler_s390.cpp +++ b/src/hotspot/cpu/s390/gc/shared/barrierSetAssembler_s390.cpp @@ -33,6 +33,9 @@ #include "runtime/jniHandles.hpp" #include "runtime/stubRoutines.hpp" #include "utilities/macros.hpp" +#ifdef COMPILER2 +#include "gc/shared/c2/barrierSetC2.hpp" +#endif // COMPILER2 #define __ masm-> @@ -194,8 +197,93 @@ void BarrierSetAssembler::nmethod_entry_barrier(MacroAssembler* masm) { #ifdef COMPILER2 -OptoReg::Name BarrierSetAssembler::refine_register(const Node* node, OptoReg::Name opto_reg) { - Unimplemented(); // This must be implemented to support late barrier expansion. 
+OptoReg::Name BarrierSetAssembler::refine_register(const Node* node, OptoReg::Name opto_reg) const { + if (!OptoReg::is_reg(opto_reg)) { + return OptoReg::Bad; + } + + VMReg vm_reg = OptoReg::as_VMReg(opto_reg); + if ((vm_reg->is_Register() || vm_reg ->is_FloatRegister()) && (opto_reg & 1) != 0) { + return OptoReg::Bad; + } + + return opto_reg; +} + +#undef __ +#define __ _masm-> + +SaveLiveRegisters::SaveLiveRegisters(MacroAssembler *masm, BarrierStubC2 *stub) + : _masm(masm), _reg_mask(stub->preserve_set()) { + + const int register_save_size = iterate_over_register_mask(ACTION_COUNT_ONLY) * BytesPerWord; + + _frame_size = align_up(register_save_size, frame::alignment_in_bytes) + frame::z_abi_160_size; // FIXME: this could be restricted to argument only + + __ save_return_pc(); + __ push_frame(_frame_size, Z_R14); // FIXME: check if Z_R1_scaratch can do a job here; + + __ z_lg(Z_R14, _z_common_abi(return_pc) + _frame_size, Z_SP); + + iterate_over_register_mask(ACTION_SAVE, _frame_size); +} + +SaveLiveRegisters::~SaveLiveRegisters() { + iterate_over_register_mask(ACTION_RESTORE, _frame_size); + + __ pop_frame(); + + __ restore_return_pc(); +} + +int SaveLiveRegisters::iterate_over_register_mask(IterationAction action, int offset) { + int reg_save_index = 0; + RegMaskIterator live_regs_iterator(_reg_mask); + + while(live_regs_iterator.has_next()) { + const OptoReg::Name opto_reg = live_regs_iterator.next(); + + // Filter out stack slots (spilled registers, i.e., stack-allocated registers). + if (!OptoReg::is_reg(opto_reg)) { + continue; + } + + const VMReg vm_reg = OptoReg::as_VMReg(opto_reg); + if (vm_reg->is_Register()) { + Register std_reg = vm_reg->as_Register(); + + if (std_reg->encoding() >= Z_R2->encoding() && std_reg->encoding() <= Z_R15->encoding()) { + reg_save_index++; + + if (action == ACTION_SAVE) { + __ z_stg(std_reg, offset - reg_save_index * BytesPerWord, Z_SP); + } else if (action == ACTION_RESTORE) { + __ z_lg(std_reg, offset - reg_save_index * BytesPerWord, Z_SP); + } else { + assert(action == ACTION_COUNT_ONLY, "Sanity"); + } + } + } else if (vm_reg->is_FloatRegister()) { + FloatRegister fp_reg = vm_reg->as_FloatRegister(); + if (fp_reg->encoding() >= Z_F0->encoding() && fp_reg->encoding() <= Z_F15->encoding() + && fp_reg->encoding() != Z_F1->encoding()) { + reg_save_index++; + + if (action == ACTION_SAVE) { + __ z_std(fp_reg, offset - reg_save_index * BytesPerWord, Z_SP); + } else if (action == ACTION_RESTORE) { + __ z_ld(fp_reg, offset - reg_save_index * BytesPerWord, Z_SP); + } else { + assert(action == ACTION_COUNT_ONLY, "Sanity"); + } + } + } else if (false /* vm_reg->is_VectorRegister() */){ + fatal("Vector register support is not there yet!"); + } else { + fatal("Register type is not known"); + } + } + return reg_save_index; } #endif // COMPILER2 diff --git a/src/hotspot/cpu/s390/gc/shared/barrierSetAssembler_s390.hpp b/src/hotspot/cpu/s390/gc/shared/barrierSetAssembler_s390.hpp index de1de8a51a7..fb61adc55b5 100644 --- a/src/hotspot/cpu/s390/gc/shared/barrierSetAssembler_s390.hpp +++ b/src/hotspot/cpu/s390/gc/shared/barrierSetAssembler_s390.hpp @@ -32,7 +32,9 @@ #ifdef COMPILER2 #include "code/vmreg.hpp" #include "opto/optoreg.hpp" +#include "opto/regmask.hpp" +class BarrierStubC2; class Node; #endif // COMPILER2 @@ -62,8 +64,42 @@ public: #ifdef COMPILER2 OptoReg::Name refine_register(const Node* node, - OptoReg::Name opto_reg); + OptoReg::Name opto_reg) const; #endif // COMPILER2 }; +#ifdef COMPILER2 + +// This class saves and restores the registers that need 
to be preserved across +// the runtime call represented by a given C2 barrier stub. Use as follows: +// { +// SaveLiveRegisters save(masm, stub); +// .. +// __ call_VM_leaf(...); +// .. +// } + +class SaveLiveRegisters { + MacroAssembler* _masm; + RegMask _reg_mask; + Register _result_reg; + int _frame_size; + + public: + SaveLiveRegisters(MacroAssembler *masm, BarrierStubC2 *stub); + + ~SaveLiveRegisters(); + + private: + enum IterationAction : int { + ACTION_SAVE, + ACTION_RESTORE, + ACTION_COUNT_ONLY + }; + + int iterate_over_register_mask(IterationAction action, int offset = 0); +}; + +#endif // COMPILER2 + #endif // CPU_S390_GC_SHARED_BARRIERSETASSEMBLER_S390_HPP diff --git a/src/hotspot/cpu/s390/macroAssembler_s390.cpp b/src/hotspot/cpu/s390/macroAssembler_s390.cpp index af281345b14..e192bbab0de 100644 --- a/src/hotspot/cpu/s390/macroAssembler_s390.cpp +++ b/src/hotspot/cpu/s390/macroAssembler_s390.cpp @@ -2127,8 +2127,9 @@ unsigned int MacroAssembler::push_frame_abi160(unsigned int bytes) { // Pop current C frame. void MacroAssembler::pop_frame() { - BLOCK_COMMENT("pop_frame:"); + BLOCK_COMMENT("pop_frame {"); Assembler::z_lg(Z_SP, _z_abi(callers_sp), Z_SP); + BLOCK_COMMENT("} pop_frame"); } // Pop current C frame and restore return PC register (Z_R14). diff --git a/src/hotspot/cpu/s390/register_s390.hpp b/src/hotspot/cpu/s390/register_s390.hpp index 931e899257e..18af232e569 100644 --- a/src/hotspot/cpu/s390/register_s390.hpp +++ b/src/hotspot/cpu/s390/register_s390.hpp @@ -448,4 +448,12 @@ constexpr Register Z_R0_scratch = Z_R0; constexpr Register Z_R1_scratch = Z_R1; constexpr FloatRegister Z_fscratch_1 = Z_F1; +typedef AbstractRegSet<Register> RegSet; + +template <> +inline Register AbstractRegSet<Register>::first() { + if (_bitset == 0) { return noreg; } + return as_Register(count_trailing_zeros(_bitset)); +} + #endif // CPU_S390_REGISTER_S390_HPP diff --git a/src/hotspot/cpu/s390/s390.ad b/src/hotspot/cpu/s390/s390.ad index 1bc94842150..8181e96ecfc 100644 --- a/src/hotspot/cpu/s390/s390.ad +++ b/src/hotspot/cpu/s390/s390.ad @@ -1644,6 +1644,10 @@ const RegMask Matcher::method_handle_invoke_SP_save_mask() { // Should the matcher clone input 'm' of node 'n'? bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) { + if (is_encode_and_store_pattern(n, m)) { + mstack.push(m, Visit); + return true; + } return false; } @@ -3913,6 +3917,7 @@ instruct loadL_unaligned(iRegL dst, memory mem) %{ // Load Pointer instruct loadP(iRegP dst, memory mem) %{ match(Set dst (LoadP mem)); + predicate(n->as_Load()->barrier_data() == 0); ins_cost(MEMORY_REF_COST); size(Z_DISP3_SIZE); format %{ "LG $dst,$mem\t # ptr" %} @@ -3924,6 +3929,7 @@ instruct loadP(iRegP dst, memory mem) %{ // LoadP + CastP2L instruct castP2X_loadP(iRegL dst, memory mem) %{ match(Set dst (CastP2X (LoadP mem))); + predicate(n->as_Load()->barrier_data() == 0); ins_cost(MEMORY_REF_COST); size(Z_DISP3_SIZE); format %{ "LG $dst,$mem\t # ptr + p2x" %} @@ -4286,6 +4292,7 @@ instruct storeL(memory mem, iRegL src) %{ // Store Pointer instruct storeP(memory dst, memoryRegP src) %{ match(Set dst (StoreP dst src)); + predicate(n->as_Store()->barrier_data() == 0); ins_cost(MEMORY_REF_COST); size(Z_DISP3_SIZE); format %{ "STG $src,$dst\t # ptr" %} @@ -4388,6 +4395,7 @@ instruct memInitL(memoryRS mem, immL16 src) %{ // Move Immediate to 8-byte memory. 
instruct memInitP(memoryRS mem, immP16 src) %{ match(Set mem (StoreP mem src)); + predicate(n->as_Store()->barrier_data() == 0); ins_cost(MEMORY_REF_COST); size(6); format %{ "MVGHI $mem,$src\t # direct mem init 8" %} @@ -4417,6 +4425,7 @@ instruct negL_reg_reg(iRegL dst, immL_0 zero, iRegL src, flagsReg cr) %{ // Load narrow oop instruct loadN(iRegN dst, memory mem) %{ match(Set dst (LoadN mem)); + predicate(n->as_Load()->barrier_data() == 0); ins_cost(MEMORY_REF_COST); size(Z_DISP3_SIZE); format %{ "LoadN $dst,$mem\t # (cOop)" %} @@ -4480,7 +4489,7 @@ instruct loadConNKlass(iRegN dst, immNKlass src) %{ instruct decodeLoadN(iRegP dst, memory mem) %{ match(Set dst (DecodeN (LoadN mem))); - predicate(false && (CompressedOops::base()==nullptr)&&(CompressedOops::shift()==0)); + predicate(false && (CompressedOops::base()==nullptr) && (CompressedOops::shift()==0)); ins_cost(MEMORY_REF_COST); size(Z_DISP3_SIZE); format %{ "DecodeLoadN $dst,$mem\t # (cOop Load+Decode)" %} @@ -4735,6 +4744,7 @@ instruct encodeP_NN_Ex(iRegN dst, iRegP src, flagsReg cr) %{ // Store Compressed Pointer instruct storeN(memory mem, iRegN_P2N src) %{ match(Set mem (StoreN mem src)); + predicate(n->as_Store()->barrier_data() == 0); ins_cost(MEMORY_REF_COST); size(Z_DISP_SIZE); format %{ "ST $src,$mem\t # (cOop)" %} @@ -5146,6 +5156,7 @@ instruct compareAndSwapL_bool(iRegP mem_ptr, rarg5RegL oldval, iRegL newval, iRe instruct compareAndSwapP_bool(iRegP mem_ptr, rarg5RegP oldval, iRegP_N2P newval, iRegI res, flagsReg cr) %{ match(Set res (CompareAndSwapP mem_ptr (Binary oldval newval))); + predicate(n->as_LoadStore()->barrier_data() == 0); effect(USE mem_ptr, USE_KILL oldval, KILL cr); size(18); format %{ "$res = CompareAndSwapP $oldval,$newval,$mem_ptr" %} @@ -5156,6 +5167,7 @@ instruct compareAndSwapP_bool(iRegP mem_ptr, rarg5RegP oldval, iRegP_N2P newval, instruct compareAndSwapN_bool(iRegP mem_ptr, rarg5RegN oldval, iRegN_P2N newval, iRegI res, flagsReg cr) %{ match(Set res (CompareAndSwapN mem_ptr (Binary oldval newval))); + predicate(n->as_LoadStore()->barrier_data() == 0); effect(USE mem_ptr, USE_KILL oldval, KILL cr); size(16); format %{ "$res = CompareAndSwapN $oldval,$newval,$mem_ptr" %} @@ -5443,6 +5455,7 @@ instruct xchgL_reg_mem(memoryRSY mem, iRegL dst, iRegL tmp, flagsReg cr) %{ %} instruct xchgN_reg_mem(memoryRSY mem, iRegN dst, iRegI tmp, flagsReg cr) %{ + predicate(n->as_LoadStore()->barrier_data() == 0); match(Set dst (GetAndSetN mem dst)); effect(KILL cr, TEMP tmp); // USE_DEF dst by match rule. format %{ "XCHGN $dst,[$mem]\t # EXCHANGE (coop, atomic), temp $tmp" %} @@ -5452,6 +5465,7 @@ instruct xchgN_reg_mem(memoryRSY mem, iRegN dst, iRegI tmp, flagsReg cr) %{ instruct xchgP_reg_mem(memoryRSY mem, iRegP dst, iRegL tmp, flagsReg cr) %{ match(Set dst (GetAndSetP mem dst)); + predicate(n->as_LoadStore()->barrier_data() == 0); effect(KILL cr, TEMP tmp); // USE_DEF dst by match rule. 
format %{ "XCHGP $dst,[$mem]\t # EXCHANGE (oop, atomic), temp $tmp" %} ins_encode(z_enc_SwapL(mem, dst, tmp)); @@ -5926,7 +5940,7 @@ instruct addP_regN_reg_imm20(iRegP dst, iRegP_N2P src1, iRegL src2, immL20 con) instruct addP_mem_imm(memoryRSY mem, immL8 src, flagsReg cr) %{ match(Set mem (StoreP mem (AddP (LoadP mem) src))); effect(KILL cr); - predicate(VM_Version::has_MemWithImmALUOps()); + predicate(VM_Version::has_MemWithImmALUOps() && n->as_LoadStore()->barrier_data() == 0); ins_cost(MEMORY_REF_COST); size(6); format %{ "AGSI $mem,$src\t # direct mem add 8 (ptr)" %} diff --git a/src/hotspot/cpu/x86/gc/g1/g1BarrierSetAssembler_x86.cpp b/src/hotspot/cpu/x86/gc/g1/g1BarrierSetAssembler_x86.cpp index b52be627776..b6be4012519 100644 --- a/src/hotspot/cpu/x86/gc/g1/g1BarrierSetAssembler_x86.cpp +++ b/src/hotspot/cpu/x86/gc/g1/g1BarrierSetAssembler_x86.cpp @@ -38,7 +38,10 @@ #include "c1/c1_LIRAssembler.hpp" #include "c1/c1_MacroAssembler.hpp" #include "gc/g1/c1/g1BarrierSetC1.hpp" -#endif +#endif // COMPILER1 +#ifdef COMPILER2 +#include "gc/g1/c2/g1BarrierSetC2.hpp" +#endif // COMPILER2 #define __ masm-> @@ -160,6 +163,56 @@ void G1BarrierSetAssembler::load_at(MacroAssembler* masm, DecoratorSet decorator } } +static void generate_queue_insertion(MacroAssembler* masm, ByteSize index_offset, ByteSize buffer_offset, Label& runtime, + const Register thread, const Register value, const Register temp) { + // This code assumes that buffer index is pointer sized. + STATIC_ASSERT(in_bytes(SATBMarkQueue::byte_width_of_index()) == sizeof(intptr_t)); + // Can we store a value in the given thread's buffer? + // (The index field is typed as size_t.) + __ movptr(temp, Address(thread, in_bytes(index_offset))); // temp := *(index address) + __ testptr(temp, temp); // index == 0? + __ jcc(Assembler::zero, runtime); // jump to runtime if index == 0 (full buffer) + // The buffer is not full, store value into it. + __ subptr(temp, wordSize); // temp := next index + __ movptr(Address(thread, in_bytes(index_offset)), temp); // *(index address) := next index + __ addptr(temp, Address(thread, in_bytes(buffer_offset))); // temp := buffer address + next index + __ movptr(Address(temp, 0), value); // *(buffer address + next index) := value +} + +static void generate_pre_barrier_fast_path(MacroAssembler* masm, + const Register thread) { + Address in_progress(thread, in_bytes(G1ThreadLocalData::satb_mark_queue_active_offset())); + // Is marking active? + if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) { + __ cmpl(in_progress, 0); + } else { + assert(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption"); + __ cmpb(in_progress, 0); + } +} + +static void generate_pre_barrier_slow_path(MacroAssembler* masm, + const Register obj, + const Register pre_val, + const Register thread, + const Register tmp, + Label& done, + Label& runtime) { + // Do we need to load the previous value? + if (obj != noreg) { + __ load_heap_oop(pre_val, Address(obj, 0), noreg, noreg, AS_RAW); + } + // Is the previous value null? 
+ __ cmpptr(pre_val, NULL_WORD); + __ jcc(Assembler::equal, done); + generate_queue_insertion(masm, + G1ThreadLocalData::satb_mark_queue_index_offset(), + G1ThreadLocalData::satb_mark_queue_buffer_offset(), + runtime, + thread, pre_val, tmp); + __ jmp(done); +} + void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm, Register obj, Register pre_val, @@ -185,43 +238,10 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm, assert(pre_val != rax, "check this code"); } - Address in_progress(thread, in_bytes(G1ThreadLocalData::satb_mark_queue_active_offset())); - Address index(thread, in_bytes(G1ThreadLocalData::satb_mark_queue_index_offset())); - Address buffer(thread, in_bytes(G1ThreadLocalData::satb_mark_queue_buffer_offset())); - - // Is marking active? - if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) { - __ cmpl(in_progress, 0); - } else { - assert(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption"); - __ cmpb(in_progress, 0); - } + generate_pre_barrier_fast_path(masm, thread); + // If marking is not active (*(mark queue active address) == 0), jump to done __ jcc(Assembler::equal, done); - - // Do we need to load the previous value? - if (obj != noreg) { - __ load_heap_oop(pre_val, Address(obj, 0), noreg, noreg, AS_RAW); - } - - // Is the previous value null? - __ cmpptr(pre_val, NULL_WORD); - __ jcc(Assembler::equal, done); - - // Can we store original value in the thread's buffer? - // Is index == 0? - // (The index field is typed as size_t.) - - __ movptr(tmp, index); // tmp := *index_adr - __ cmpptr(tmp, 0); // tmp == 0? - __ jcc(Assembler::equal, runtime); // If yes, goto runtime - - __ subptr(tmp, wordSize); // tmp := tmp - wordSize - __ movptr(index, tmp); // *index_adr := tmp - __ addptr(tmp, buffer); // tmp := tmp + *buffer_adr - - // Record the previous value - __ movptr(Address(tmp, 0), pre_val); - __ jmp(done); + generate_pre_barrier_slow_path(masm, obj, pre_val, thread, tmp, done, runtime); __ bind(runtime); @@ -263,6 +283,54 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm, __ bind(done); } +static void generate_post_barrier_fast_path(MacroAssembler* masm, + const Register store_addr, + const Register new_val, + const Register tmp, + const Register tmp2, + Label& done, + bool new_val_may_be_null) { + CardTableBarrierSet* ct = barrier_set_cast<CardTableBarrierSet*>(BarrierSet::barrier_set()); + // Does store cross heap regions? + __ movptr(tmp, store_addr); // tmp := store address + __ xorptr(tmp, new_val); // tmp := store address ^ new value + __ shrptr(tmp, G1HeapRegion::LogOfHRGrainBytes); // ((store address ^ new value) >> LogOfHRGrainBytes) == 0? + __ jcc(Assembler::equal, done); + // Crosses regions, storing null? + if (new_val_may_be_null) { + __ cmpptr(new_val, NULL_WORD); // new value == null? + __ jcc(Assembler::equal, done); + } + // Storing region crossing non-null, is card young? + __ movptr(tmp, store_addr); // tmp := store address + __ shrptr(tmp, CardTable::card_shift()); // tmp := card address relative to card table base + // Do not use ExternalAddress to load 'byte_map_base', since 'byte_map_base' is NOT + // a valid address and therefore is not properly handled by the relocation code. + __ movptr(tmp2, (intptr_t)ct->card_table()->byte_map_base()); // tmp2 := card table base address + __ addptr(tmp, tmp2); // tmp := card address + __ cmpb(Address(tmp, 0), G1CardTable::g1_young_card_val()); // *(card address) == young_card_val?
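+ // Note: this helper ends by setting condition flags rather than branching; callers branch on the cmpb above (equal means the card is already young and the rest of the post-barrier can be skipped).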
+} + +static void generate_post_barrier_slow_path(MacroAssembler* masm, + const Register thread, + const Register tmp, + const Register tmp2, + Label& done, + Label& runtime) { + __ membar(Assembler::Membar_mask_bits(Assembler::StoreLoad)); // StoreLoad membar + __ cmpb(Address(tmp, 0), G1CardTable::dirty_card_val()); // *(card address) == dirty_card_val? + __ jcc(Assembler::equal, done); + // Storing a region crossing, non-null oop, card is clean. + // Dirty card and log. + __ movb(Address(tmp, 0), G1CardTable::dirty_card_val()); // *(card address) := dirty_card_val + generate_queue_insertion(masm, + G1ThreadLocalData::dirty_card_queue_index_offset(), + G1ThreadLocalData::dirty_card_queue_buffer_offset(), + runtime, + thread, tmp, tmp2); + __ jmp(done); +} + void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, Register store_addr, Register new_val, @@ -273,74 +341,125 @@ void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, assert(thread == r15_thread, "must be"); #endif // _LP64 - Address queue_index(thread, in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset())); - Address buffer(thread, in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset())); - - CardTableBarrierSet* ct = - barrier_set_cast(BarrierSet::barrier_set()); - Label done; Label runtime; - // Does store cross heap regions? - - __ movptr(tmp, store_addr); - __ xorptr(tmp, new_val); - __ shrptr(tmp, G1HeapRegion::LogOfHRGrainBytes); + generate_post_barrier_fast_path(masm, store_addr, new_val, tmp, tmp2, done, true /* new_val_may_be_null */); + // If card is young, jump to done __ jcc(Assembler::equal, done); - - // crosses regions, storing null? - - __ cmpptr(new_val, NULL_WORD); - __ jcc(Assembler::equal, done); - - // storing region crossing non-null, is card already dirty? - - const Register card_addr = tmp; - const Register cardtable = tmp2; - - __ movptr(card_addr, store_addr); - __ shrptr(card_addr, CardTable::card_shift()); - // Do not use ExternalAddress to load 'byte_map_base', since 'byte_map_base' is NOT - // a valid address and therefore is not properly handled by the relocation code. - __ movptr(cardtable, (intptr_t)ct->card_table()->byte_map_base()); - __ addptr(card_addr, cardtable); - - __ cmpb(Address(card_addr, 0), G1CardTable::g1_young_card_val()); - __ jcc(Assembler::equal, done); - - __ membar(Assembler::Membar_mask_bits(Assembler::StoreLoad)); - __ cmpb(Address(card_addr, 0), G1CardTable::dirty_card_val()); - __ jcc(Assembler::equal, done); - - - // storing a region crossing, non-null oop, card is clean. - // dirty card and log. - - __ movb(Address(card_addr, 0), G1CardTable::dirty_card_val()); - - // The code below assumes that buffer index is pointer sized. 
- STATIC_ASSERT(in_bytes(G1DirtyCardQueue::byte_width_of_index()) == sizeof(intptr_t)); - - __ movptr(tmp2, queue_index); - __ testptr(tmp2, tmp2); - __ jcc(Assembler::zero, runtime); - __ subptr(tmp2, wordSize); - __ movptr(queue_index, tmp2); - __ addptr(tmp2, buffer); - __ movptr(Address(tmp2, 0), card_addr); - __ jmp(done); + generate_post_barrier_slow_path(masm, thread, tmp, tmp2, done, runtime); __ bind(runtime); // save the live input values RegSet saved = RegSet::of(store_addr NOT_LP64(COMMA thread)); __ push_set(saved); - __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), card_addr, thread); + __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), tmp, thread); __ pop_set(saved); __ bind(done); } +#if defined(COMPILER2) + +static void generate_c2_barrier_runtime_call(MacroAssembler* masm, G1BarrierStubC2* stub, const Register arg, const address runtime_path) { +#ifdef _LP64 + SaveLiveRegisters save_registers(masm, stub); + if (c_rarg0 != arg) { + __ mov(c_rarg0, arg); + } + __ mov(c_rarg1, r15_thread); + // rax is a caller-saved, non-argument-passing register, so it does not + // interfere with c_rarg0 or c_rarg1. If it contained any live value before + // entering this stub, it is saved at this point, and restored after the + // call. If it did not contain any live value, it is free to be used. In + // either case, it is safe to use it here as a call scratch register. + __ call(RuntimeAddress(runtime_path), rax); +#else + Unimplemented(); +#endif // _LP64 +} + +void G1BarrierSetAssembler::g1_write_barrier_pre_c2(MacroAssembler* masm, + Register obj, + Register pre_val, + Register thread, + Register tmp, + G1PreBarrierStubC2* stub) { +#ifdef _LP64 + assert(thread == r15_thread, "must be"); +#endif // _LP64 + assert(pre_val != noreg, "check this code"); + if (obj != noreg) { + assert_different_registers(obj, pre_val, tmp); + } + + stub->initialize_registers(obj, pre_val, thread, tmp); + + generate_pre_barrier_fast_path(masm, thread); + // If marking is active (*(mark queue active address) != 0), jump to stub (slow path) + __ jcc(Assembler::notEqual, *stub->entry()); + + __ bind(*stub->continuation()); +} + +void G1BarrierSetAssembler::generate_c2_pre_barrier_stub(MacroAssembler* masm, + G1PreBarrierStubC2* stub) const { + Assembler::InlineSkippedInstructionsCounter skip_counter(masm); + Label runtime; + Register obj = stub->obj(); + Register pre_val = stub->pre_val(); + Register thread = stub->thread(); + Register tmp = stub->tmp1(); + assert(stub->tmp2() == noreg, "not needed in this platform"); + + __ bind(*stub->entry()); + generate_pre_barrier_slow_path(masm, obj, pre_val, thread, tmp, *stub->continuation(), runtime); + + __ bind(runtime); + generate_c2_barrier_runtime_call(masm, stub, pre_val, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_pre_entry)); + __ jmp(*stub->continuation()); +} + +void G1BarrierSetAssembler::g1_write_barrier_post_c2(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register thread, + Register tmp, + Register tmp2, + G1PostBarrierStubC2* stub) { +#ifdef _LP64 + assert(thread == r15_thread, "must be"); +#endif // _LP64 + + stub->initialize_registers(thread, tmp, tmp2); + + bool new_val_may_be_null = (stub->barrier_data() & G1C2BarrierPostNotNull) == 0; + generate_post_barrier_fast_path(masm, store_addr, new_val, tmp, tmp2, *stub->continuation(), new_val_may_be_null); + // If card is not young, jump to stub (slow path) + __ 
jcc(Assembler::notEqual, *stub->entry()); + + __ bind(*stub->continuation()); +} + +void G1BarrierSetAssembler::generate_c2_post_barrier_stub(MacroAssembler* masm, + G1PostBarrierStubC2* stub) const { + Assembler::InlineSkippedInstructionsCounter skip_counter(masm); + Label runtime; + Register thread = stub->thread(); + Register tmp = stub->tmp1(); // tmp holds the card address. + Register tmp2 = stub->tmp2(); + assert(stub->tmp3() == noreg, "not needed in this platform"); + + __ bind(*stub->entry()); + generate_post_barrier_slow_path(masm, thread, tmp, tmp2, *stub->continuation(), runtime); + + __ bind(runtime); + generate_c2_barrier_runtime_call(masm, stub, tmp, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry)); + __ jmp(*stub->continuation()); +} + +#endif // COMPILER2 + void G1BarrierSetAssembler::oop_store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, Address dst, Register val, Register tmp1, Register tmp2, Register tmp3) { bool in_heap = (decorators & IN_HEAP) != 0; diff --git a/src/hotspot/cpu/x86/gc/g1/g1BarrierSetAssembler_x86.hpp b/src/hotspot/cpu/x86/gc/g1/g1BarrierSetAssembler_x86.hpp index a5695f5657a..4dbb1efd885 100644 --- a/src/hotspot/cpu/x86/gc/g1/g1BarrierSetAssembler_x86.hpp +++ b/src/hotspot/cpu/x86/gc/g1/g1BarrierSetAssembler_x86.hpp @@ -32,6 +32,9 @@ class LIR_Assembler; class StubAssembler; class G1PreBarrierStub; class G1PostBarrierStub; +class G1BarrierStubC2; +class G1PreBarrierStubC2; +class G1PostBarrierStubC2; class G1BarrierSetAssembler: public ModRefBarrierSetAssembler { protected: @@ -65,6 +68,26 @@ class G1BarrierSetAssembler: public ModRefBarrierSetAssembler { virtual void load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, Register dst, Address src, Register tmp1, Register tmp_thread); + +#ifdef COMPILER2 + void g1_write_barrier_pre_c2(MacroAssembler* masm, + Register obj, + Register pre_val, + Register thread, + Register tmp, + G1PreBarrierStubC2* c2_stub); + void generate_c2_pre_barrier_stub(MacroAssembler* masm, + G1PreBarrierStubC2* stub) const; + void g1_write_barrier_post_c2(MacroAssembler* masm, + Register store_addr, + Register new_val, + Register thread, + Register tmp, + Register tmp2, + G1PostBarrierStubC2* c2_stub); + void generate_c2_post_barrier_stub(MacroAssembler* masm, + G1PostBarrierStubC2* stub) const; +#endif // COMPILER2 }; #endif // CPU_X86_GC_G1_G1BARRIERSETASSEMBLER_X86_HPP diff --git a/src/hotspot/cpu/x86/gc/g1/g1_x86_64.ad b/src/hotspot/cpu/x86/gc/g1/g1_x86_64.ad new file mode 100644 index 00000000000..8c1559f90f4 --- /dev/null +++ b/src/hotspot/cpu/x86/gc/g1/g1_x86_64.ad @@ -0,0 +1,371 @@ +// +// Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. +// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +// +// This code is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License version 2 only, as +// published by the Free Software Foundation. +// +// This code is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// version 2 for more details (a copy is included in the LICENSE file that +// accompanied this code). 
+// +// You should have received a copy of the GNU General Public License version +// 2 along with this work; if not, write to the Free Software Foundation, +// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. +// +// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA +// or visit www.oracle.com if you need additional information or have any +// questions. +// + +source_hpp %{ + +#include "gc/g1/c2/g1BarrierSetC2.hpp" +#include "gc/shared/gc_globals.hpp" + +%} + +source %{ + +#include "gc/g1/g1BarrierSetAssembler_x86.hpp" +#include "gc/g1/g1BarrierSetRuntime.hpp" + +static void write_barrier_pre(MacroAssembler* masm, + const MachNode* node, + Register obj, + Register pre_val, + Register tmp, + RegSet preserve = RegSet(), + RegSet no_preserve = RegSet()) { + if (!G1PreBarrierStubC2::needs_barrier(node)) { + return; + } + Assembler::InlineSkippedInstructionsCounter skip_counter(masm); + G1BarrierSetAssembler* g1_asm = static_cast<G1BarrierSetAssembler*>(BarrierSet::barrier_set()->barrier_set_assembler()); + G1PreBarrierStubC2* const stub = G1PreBarrierStubC2::create(node); + for (RegSetIterator<Register> reg = preserve.begin(); *reg != noreg; ++reg) { + stub->preserve(*reg); + } + for (RegSetIterator<Register> reg = no_preserve.begin(); *reg != noreg; ++reg) { + stub->dont_preserve(*reg); + } + g1_asm->g1_write_barrier_pre_c2(masm, obj, pre_val, r15_thread, tmp, stub); +} + +static void write_barrier_post(MacroAssembler* masm, + const MachNode* node, + Register store_addr, + Register new_val, + Register tmp1, + Register tmp2) { + if (!G1PostBarrierStubC2::needs_barrier(node)) { + return; + } + Assembler::InlineSkippedInstructionsCounter skip_counter(masm); + G1BarrierSetAssembler* g1_asm = static_cast<G1BarrierSetAssembler*>(BarrierSet::barrier_set()->barrier_set_assembler()); + G1PostBarrierStubC2* const stub = G1PostBarrierStubC2::create(node); + g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, r15_thread, tmp1, tmp2, stub); +} + +%} + +instruct g1StoreP(memory mem, any_RegP src, rRegP tmp1, rRegP tmp2, rRegP tmp3, rFlagsReg cr) +%{ + predicate(UseG1GC && n->as_Store()->barrier_data() != 0); + match(Set mem (StoreP mem src)); + effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + ins_cost(125); // XXX + format %{ "movq $mem, $src\t# ptr" %} + ins_encode %{ + // Materialize the store address internally (as opposed to defining 'mem' as + // an indirect memory operand) to reduce the overhead of LCM when processing + // large basic blocks with many stores. Such basic blocks arise, for + // instance, from static initializations of large String arrays. + // The same holds for g1StoreN and g1EncodePAndStoreN.
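+ // The pre- and post-barrier fast paths emitted below are conditional on the + // node's barrier data; each one branches to an out-of-line G1 barrier stub + // that performs the slow-path work and preserves the registers requested here.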
+ __ lea($tmp1$$Register, $mem$$Address); + write_barrier_pre(masm, this, + $tmp1$$Register /* obj */, + $tmp2$$Register /* pre_val */, + $tmp3$$Register /* tmp */, + RegSet::of($tmp1$$Register, $src$$Register) /* preserve */); + __ movq(Address($tmp1$$Register, 0), $src$$Register); + write_barrier_post(masm, this, + $tmp1$$Register /* store_addr */, + $src$$Register /* new_val */, + $tmp3$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */); + %} + ins_pipe(ialu_mem_reg); +%} + +instruct g1StoreN(memory mem, rRegN src, rRegP tmp1, rRegP tmp2, rRegP tmp3, rFlagsReg cr) +%{ + predicate(UseG1GC && n->as_Store()->barrier_data() != 0); + match(Set mem (StoreN mem src)); + effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + ins_cost(125); // XXX + format %{ "movl $mem, $src\t# ptr" %} + ins_encode %{ + __ lea($tmp1$$Register, $mem$$Address); + write_barrier_pre(masm, this, + $tmp1$$Register /* obj */, + $tmp2$$Register /* pre_val */, + $tmp3$$Register /* tmp */, + RegSet::of($tmp1$$Register, $src$$Register) /* preserve */); + __ movl(Address($tmp1$$Register, 0), $src$$Register); + if ((barrier_data() & G1C2BarrierPost) != 0) { + __ movl($tmp2$$Register, $src$$Register); + if ((barrier_data() & G1C2BarrierPostNotNull) == 0) { + __ decode_heap_oop($tmp2$$Register); + } else { + __ decode_heap_oop_not_null($tmp2$$Register); + } + } + write_barrier_post(masm, this, + $tmp1$$Register /* store_addr */, + $tmp2$$Register /* new_val */, + $tmp3$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */); + %} + ins_pipe(ialu_mem_reg); +%} + +instruct g1EncodePAndStoreN(memory mem, any_RegP src, rRegP tmp1, rRegP tmp2, rRegP tmp3, rFlagsReg cr) +%{ + predicate(UseG1GC && n->as_Store()->barrier_data() != 0); + match(Set mem (StoreN mem (EncodeP src))); + effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + ins_cost(125); // XXX + format %{ "encode_heap_oop $src\n\t" + "movl $mem, $src\t# ptr" %} + ins_encode %{ + __ lea($tmp1$$Register, $mem$$Address); + write_barrier_pre(masm, this, + $tmp1$$Register /* obj */, + $tmp2$$Register /* pre_val */, + $tmp3$$Register /* tmp */, + RegSet::of($tmp1$$Register, $src$$Register) /* preserve */); + __ movq($tmp2$$Register, $src$$Register); + if ((barrier_data() & G1C2BarrierPostNotNull) == 0) { + __ encode_heap_oop($tmp2$$Register); + } else { + __ encode_heap_oop_not_null($tmp2$$Register); + } + __ movl(Address($tmp1$$Register, 0), $tmp2$$Register); + write_barrier_post(masm, this, + $tmp1$$Register /* store_addr */, + $src$$Register /* new_val */, + $tmp3$$Register /* tmp1 */, + $tmp2$$Register /* tmp2 */); + %} + ins_pipe(ialu_mem_reg); +%} + +instruct g1CompareAndExchangeP(indirect mem, rRegP newval, rRegP tmp1, rRegP tmp2, rRegP tmp3, rax_RegP oldval, rFlagsReg cr) +%{ + predicate(UseG1GC && n->as_LoadStore()->barrier_data() != 0); + match(Set oldval (CompareAndExchangeP mem (Binary oldval newval))); + effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + format %{ "lock\n\t" + "cmpxchgq $newval, $mem" %} + ins_encode %{ + assert_different_registers($oldval$$Register, $mem$$Register); + // Pass $oldval to the pre-barrier (instead of loading from $mem), because + // $oldval is the only value that can be overwritten. + // The same holds for g1CompareAndSwapP. 
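+ // Note: cmpxchgq never writes its register source operand (only memory or rax), so the + // copy of $newval kept in $tmp1 below survives the exchange and is what the post-barrier + // sees as the new value.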
+ write_barrier_pre(masm, this, + noreg /* obj */, + $oldval$$Register /* pre_val */, + $tmp3$$Register /* tmp */, + RegSet::of($mem$$Register, $newval$$Register, $oldval$$Register) /* preserve */); + __ movq($tmp1$$Register, $newval$$Register); + __ lock(); + __ cmpxchgq($tmp1$$Register, Address($mem$$Register, 0)); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $tmp1$$Register /* new_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + %} + ins_pipe(pipe_cmpxchg); +%} + +instruct g1CompareAndExchangeN(indirect mem, rRegN newval, rRegP tmp1, rRegP tmp2, rRegP tmp3, rax_RegN oldval, rFlagsReg cr) +%{ + predicate(UseG1GC && n->as_LoadStore()->barrier_data() != 0); + match(Set oldval (CompareAndExchangeN mem (Binary oldval newval))); + effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + format %{ "lock\n\t" + "cmpxchgq $newval, $mem" %} + ins_encode %{ + assert_different_registers($oldval$$Register, $mem$$Register); + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $tmp2$$Register /* pre_val */, + $tmp3$$Register /* tmp */, + RegSet::of($mem$$Register, $newval$$Register, $oldval$$Register) /* preserve */); + __ movl($tmp1$$Register, $newval$$Register); + __ lock(); + __ cmpxchgl($tmp1$$Register, Address($mem$$Register, 0)); + __ decode_heap_oop($tmp1$$Register); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $tmp1$$Register /* new_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + %} + ins_pipe(pipe_cmpxchg); +%} + +instruct g1CompareAndSwapP(rRegI res, indirect mem, rRegP newval, rRegP tmp1, rRegP tmp2, rRegP tmp3, rax_RegP oldval, rFlagsReg cr) +%{ + predicate(UseG1GC && n->as_LoadStore()->barrier_data() != 0); + match(Set res (CompareAndSwapP mem (Binary oldval newval))); + match(Set res (WeakCompareAndSwapP mem (Binary oldval newval))); + effect(TEMP res, TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL oldval, KILL cr); + format %{ "lock\n\t" + "cmpxchgq $newval, $mem\n\t" + "sete $res\n\t" + "movzbl $res, $res" %} + ins_encode %{ + assert_different_registers($oldval$$Register, $mem$$Register); + write_barrier_pre(masm, this, + noreg /* obj */, + $oldval$$Register /* pre_val */, + $tmp3$$Register /* tmp */, + RegSet::of($mem$$Register, $newval$$Register, $oldval$$Register) /* preserve */, + RegSet::of($res$$Register) /* no_preserve */); + __ movq($tmp1$$Register, $newval$$Register); + __ lock(); + __ cmpxchgq($tmp1$$Register, Address($mem$$Register, 0)); + __ setb(Assembler::equal, $res$$Register); + __ movzbl($res$$Register, $res$$Register); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $tmp1$$Register /* new_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + %} + ins_pipe(pipe_cmpxchg); +%} + +instruct g1CompareAndSwapN(rRegI res, indirect mem, rRegN newval, rRegP tmp1, rRegP tmp2, rRegP tmp3, rax_RegN oldval, rFlagsReg cr) +%{ + predicate(UseG1GC && n->as_LoadStore()->barrier_data() != 0); + match(Set res (CompareAndSwapN mem (Binary oldval newval))); + match(Set res (WeakCompareAndSwapN mem (Binary oldval newval))); + effect(TEMP res, TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL oldval, KILL cr); + format %{ "lock\n\t" + "cmpxchgq $newval, $mem\n\t" + "sete $res\n\t" + "movzbl $res, $res" %} + ins_encode %{ + assert_different_registers($oldval$$Register, $mem$$Register); + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $tmp2$$Register /* pre_val */, + $tmp3$$Register /* tmp */, + RegSet::of($mem$$Register, $newval$$Register, $oldval$$Register) /* 
preserve */, + RegSet::of($res$$Register) /* no_preserve */); + __ movl($tmp1$$Register, $newval$$Register); + __ lock(); + __ cmpxchgl($tmp1$$Register, Address($mem$$Register, 0)); + __ setb(Assembler::equal, $res$$Register); + __ movzbl($res$$Register, $res$$Register); + __ decode_heap_oop($tmp1$$Register); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $tmp1$$Register /* new_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + %} + ins_pipe(pipe_cmpxchg); +%} + +instruct g1GetAndSetP(indirect mem, rRegP newval, rRegP tmp1, rRegP tmp2, rRegP tmp3, rFlagsReg cr) +%{ + predicate(UseG1GC && n->as_LoadStore()->barrier_data() != 0); + match(Set newval (GetAndSetP mem newval)); + effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + format %{ "xchgq $newval, $mem" %} + ins_encode %{ + assert_different_registers($mem$$Register, $newval$$Register); + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $tmp2$$Register /* pre_val */, + $tmp3$$Register /* tmp */, + RegSet::of($mem$$Register, $newval$$Register) /* preserve */); + __ movq($tmp1$$Register, $newval$$Register); + __ xchgq($newval$$Register, Address($mem$$Register, 0)); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $tmp1$$Register /* new_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + %} + ins_pipe(pipe_cmpxchg); +%} + +instruct g1GetAndSetN(indirect mem, rRegN newval, rRegP tmp1, rRegP tmp2, rRegP tmp3, rFlagsReg cr) +%{ + predicate(UseG1GC && n->as_LoadStore()->barrier_data() != 0); + match(Set newval (GetAndSetN mem newval)); + effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + format %{ "xchgq $newval, $mem" %} + ins_encode %{ + assert_different_registers($mem$$Register, $newval$$Register); + write_barrier_pre(masm, this, + $mem$$Register /* obj */, + $tmp2$$Register /* pre_val */, + $tmp3$$Register /* tmp */, + RegSet::of($mem$$Register, $newval$$Register) /* preserve */); + __ movl($tmp1$$Register, $newval$$Register); + __ decode_heap_oop($tmp1$$Register); + __ xchgl($newval$$Register, Address($mem$$Register, 0)); + write_barrier_post(masm, this, + $mem$$Register /* store_addr */, + $tmp1$$Register /* new_val */, + $tmp2$$Register /* tmp1 */, + $tmp3$$Register /* tmp2 */); + %} + ins_pipe(pipe_cmpxchg); +%} + +instruct g1LoadP(rRegP dst, memory mem, rRegP tmp, rFlagsReg cr) +%{ + predicate(UseG1GC && n->as_Load()->barrier_data() != 0); + match(Set dst (LoadP mem)); + effect(TEMP dst, TEMP tmp, KILL cr); + ins_cost(125); // XXX + format %{ "movq $dst, $mem\t# ptr" %} + ins_encode %{ + __ movq($dst$$Register, $mem$$Address); + write_barrier_pre(masm, this, + noreg /* obj */, + $dst$$Register /* pre_val */, + $tmp$$Register /* tmp */); + %} + ins_pipe(ialu_reg_mem); // XXX +%} + +instruct g1LoadN(rRegN dst, memory mem, rRegP tmp1, rRegP tmp2, rFlagsReg cr) +%{ + predicate(UseG1GC && n->as_Load()->barrier_data() != 0); + match(Set dst (LoadN mem)); + effect(TEMP dst, TEMP tmp1, TEMP tmp2, KILL cr); + ins_cost(125); // XXX + format %{ "movl $dst, $mem\t# compressed ptr" %} + ins_encode %{ + __ movl($dst$$Register, $mem$$Address); + __ movl($tmp1$$Register, $dst$$Register); + __ decode_heap_oop($tmp1$$Register); + write_barrier_pre(masm, this, + noreg /* obj */, + $tmp1$$Register /* pre_val */, + $tmp2$$Register /* tmp */); + %} + ins_pipe(ialu_reg_mem); // XXX +%} diff --git a/src/hotspot/cpu/x86/x86.ad b/src/hotspot/cpu/x86/x86.ad index 2b29dd14e4b..b55a1208cf2 100644 --- a/src/hotspot/cpu/x86/x86.ad +++ b/src/hotspot/cpu/x86/x86.ad @@ 
-2457,6 +2457,10 @@ bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) { mstack.push(m, Visit); // m = ShiftCntV return true; } + if (is_encode_and_store_pattern(n, m)) { + mstack.push(m, Visit); + return true; + } return false; } diff --git a/src/hotspot/cpu/x86/x86_64.ad b/src/hotspot/cpu/x86/x86_64.ad index 1b271683bd6..fee265473be 100644 --- a/src/hotspot/cpu/x86/x86_64.ad +++ b/src/hotspot/cpu/x86/x86_64.ad @@ -4341,6 +4341,7 @@ instruct loadP(rRegP dst, memory mem) // Load Compressed Pointer instruct loadN(rRegN dst, memory mem) %{ + predicate(n->as_Load()->barrier_data() == 0); match(Set dst (LoadN mem)); ins_cost(125); // XXX @@ -5126,6 +5127,7 @@ instruct storeImmP(memory mem, immP31 src) // Store Compressed Pointer instruct storeN(memory mem, rRegN src) %{ + predicate(n->as_Store()->barrier_data() == 0); match(Set mem (StoreN mem src)); ins_cost(125); // XXX @@ -5150,7 +5152,7 @@ instruct storeNKlass(memory mem, rRegN src) instruct storeImmN0(memory mem, immN0 zero) %{ - predicate(CompressedOops::base() == nullptr); + predicate(CompressedOops::base() == nullptr && n->as_Store()->barrier_data() == 0); match(Set mem (StoreN mem zero)); ins_cost(125); // XXX @@ -5163,6 +5165,7 @@ instruct storeImmN0(memory mem, immN0 zero) instruct storeImmN(memory mem, immN src) %{ + predicate(n->as_Store()->barrier_data() == 0); match(Set mem (StoreN mem src)); ins_cost(150); // XXX @@ -7162,6 +7165,7 @@ instruct compareAndSwapN(rRegI res, memory mem_ptr, rax_RegN oldval, rRegN newval, rFlagsReg cr) %{ + predicate(n->as_LoadStore()->barrier_data() == 0); match(Set res (CompareAndSwapN mem_ptr (Binary oldval newval))); match(Set res (WeakCompareAndSwapN mem_ptr (Binary oldval newval))); effect(KILL cr, KILL oldval); @@ -7249,6 +7253,7 @@ instruct compareAndExchangeN( memory mem_ptr, rax_RegN oldval, rRegN newval, rFlagsReg cr) %{ + predicate(n->as_LoadStore()->barrier_data() == 0); match(Set oldval (CompareAndExchangeN mem_ptr (Binary oldval newval))); effect(KILL cr); @@ -7470,6 +7475,7 @@ instruct xchgP( memory mem, rRegP newval) %{ %} instruct xchgN( memory mem, rRegN newval) %{ + predicate(n->as_LoadStore()->barrier_data() == 0); match(Set newval (GetAndSetN mem newval)); format %{ "XCHGL $newval,$mem]" %} ins_encode %{ @@ -11659,6 +11665,7 @@ instruct compN_rReg(rFlagsRegU cr, rRegN op1, rRegN op2) instruct compN_rReg_mem(rFlagsRegU cr, rRegN src, memory mem) %{ + predicate(n->in(2)->as_Load()->barrier_data() == 0); match(Set cr (CmpN src (LoadN mem))); format %{ "cmpl $src, $mem\t# compressed ptr" %} @@ -11680,6 +11687,7 @@ instruct compN_rReg_imm(rFlagsRegU cr, rRegN op1, immN op2) %{ instruct compN_mem_imm(rFlagsRegU cr, memory mem, immN src) %{ + predicate(n->in(2)->as_Load()->barrier_data() == 0); match(Set cr (CmpN src (LoadN mem))); format %{ "cmpl $mem, $src\t# compressed ptr" %} @@ -11720,7 +11728,8 @@ instruct testN_reg(rFlagsReg cr, rRegN src, immN0 zero) %{ instruct testN_mem(rFlagsReg cr, memory mem, immN0 zero) %{ - predicate(CompressedOops::base() != nullptr); + predicate(CompressedOops::base() != nullptr && + n->in(1)->as_Load()->barrier_data() == 0); match(Set cr (CmpN (LoadN mem) zero)); ins_cost(500); // XXX @@ -11733,7 +11742,8 @@ instruct testN_mem(rFlagsReg cr, memory mem, immN0 zero) instruct testN_mem_reg0(rFlagsReg cr, memory mem, immN0 zero) %{ - predicate(CompressedOops::base() == nullptr); + predicate(CompressedOops::base() == nullptr && + n->in(1)->as_Load()->barrier_data() == 0); match(Set cr (CmpN (LoadN mem) zero)); format %{ "cmpl R12, 
$mem\t# compressed ptr (R12_heapbase==0)" %} diff --git a/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.cpp b/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.cpp index 13b993546cd..8e17d1d2a7a 100644 --- a/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.cpp +++ b/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.cpp @@ -24,49 +24,32 @@ #include "precompiled.hpp" #include "classfile/javaClasses.hpp" +#include "code/vmreg.inline.hpp" #include "gc/g1/c2/g1BarrierSetC2.hpp" #include "gc/g1/g1BarrierSet.hpp" +#include "gc/g1/g1BarrierSetAssembler.hpp" #include "gc/g1/g1BarrierSetRuntime.hpp" #include "gc/g1/g1CardTable.hpp" #include "gc/g1/g1ThreadLocalData.hpp" #include "gc/g1/g1HeapRegion.hpp" #include "opto/arraycopynode.hpp" +#include "opto/block.hpp" #include "opto/compile.hpp" #include "opto/escape.hpp" #include "opto/graphKit.hpp" #include "opto/idealKit.hpp" +#include "opto/machnode.hpp" #include "opto/macro.hpp" +#include "opto/memnode.hpp" +#include "opto/node.hpp" +#include "opto/output.hpp" +#include "opto/regalloc.hpp" #include "opto/rootnode.hpp" +#include "opto/runtime.hpp" #include "opto/type.hpp" +#include "utilities/growableArray.hpp" #include "utilities/macros.hpp" -const TypeFunc *G1BarrierSetC2::write_ref_field_pre_entry_Type() { - const Type **fields = TypeTuple::fields(2); - fields[TypeFunc::Parms+0] = TypeInstPtr::NOTNULL; // original field value - fields[TypeFunc::Parms+1] = TypeRawPtr::NOTNULL; // thread - const TypeTuple *domain = TypeTuple::make(TypeFunc::Parms+2, fields); - - // create result type (range) - fields = TypeTuple::fields(0); - const TypeTuple *range = TypeTuple::make(TypeFunc::Parms+0, fields); - - return TypeFunc::make(domain, range); -} - -const TypeFunc *G1BarrierSetC2::write_ref_field_post_entry_Type() { - const Type **fields = TypeTuple::fields(2); - fields[TypeFunc::Parms+0] = TypeRawPtr::NOTNULL; // Card addr - fields[TypeFunc::Parms+1] = TypeRawPtr::NOTNULL; // thread - const TypeTuple *domain = TypeTuple::make(TypeFunc::Parms+2, fields); - - // create result type (range) - fields = TypeTuple::fields(0); - const TypeTuple *range = TypeTuple::make(TypeFunc::Parms, fields); - - return TypeFunc::make(domain, range); -} - -#define __ ideal. /* * Determine if the G1 pre-barrier can be removed. The pre-barrier is * required by SATB to make sure all objects live at the start of the @@ -84,8 +67,6 @@ const TypeFunc *G1BarrierSetC2::write_ref_field_post_entry_Type() { * The compiler needs to determine that the object in which a field is about * to be written is newly allocated, and that no prior store to the same field * has happened since the allocation. - * - * Returns true if the pre-barrier can be removed */ bool G1BarrierSetC2::g1_can_remove_pre_barrier(GraphKit* kit, PhaseValues* phase, @@ -97,34 +78,28 @@ bool G1BarrierSetC2::g1_can_remove_pre_barrier(GraphKit* kit, AllocateNode* alloc = AllocateNode::Ideal_allocation(base); if (offset == Type::OffsetBot) { - return false; // cannot unalias unless there are precise offsets + return false; // Cannot unalias unless there are precise offsets. } - if (alloc == nullptr) { - return false; // No allocation found + return false; // No allocation found. } intptr_t size_in_bytes = type2aelembytes(bt); - - Node* mem = kit->memory(adr_idx); // start searching here... + Node* mem = kit->memory(adr_idx); // Start searching here. 
for (int cnt = 0; cnt < 50; cnt++) { - if (mem->is_Store()) { - Node* st_adr = mem->in(MemNode::Address); intptr_t st_offset = 0; Node* st_base = AddPNode::Ideal_base_and_offset(st_adr, phase, st_offset); if (st_base == nullptr) { - break; // inscrutable pointer + break; // Inscrutable pointer. } - - // Break we have found a store with same base and offset as ours so break if (st_base == base && st_offset == offset) { + // We have found a store with same base and offset as ours. break; } - if (st_offset != offset && st_offset != Type::OffsetBot) { const int MAX_STORE = BytesPerLong; if (st_offset >= offset + size_in_bytes || @@ -136,20 +111,18 @@ bool G1BarrierSetC2::g1_can_remove_pre_barrier(GraphKit* kit, // in the same sequence of RawMem effects. We sometimes initialize // a whole 'tile' of array elements with a single jint or jlong.) mem = mem->in(MemNode::Memory); - continue; // advance through independent store memory + continue; // Advance through independent store memory. } } - if (st_base != base && MemNode::detect_ptr_independence(base, alloc, st_base, AllocateNode::Ideal_allocation(st_base), phase)) { - // Success: The bases are provably independent. + // Success: the bases are provably independent. mem = mem->in(MemNode::Memory); - continue; // advance through independent store memory + continue; // Advance through independent store memory. } } else if (mem->is_Proj() && mem->in(0)->is_Initialize()) { - InitializeNode* st_init = mem->in(0)->as_Initialize(); AllocateNode* st_alloc = st_init->allocation(); @@ -157,7 +130,7 @@ bool G1BarrierSetC2::g1_can_remove_pre_barrier(GraphKit* kit, // The alloc variable is guaranteed to not be null here from earlier check. if (alloc == st_alloc) { // Check that the initialization is storing null so that no previous store - // has been moved up and directly write a reference + // has been moved up and directly write a reference. Node* captured_store = st_init->find_captured_store(offset, type2aelembytes(T_OBJECT), phase); @@ -166,164 +139,55 @@ bool G1BarrierSetC2::g1_can_remove_pre_barrier(GraphKit* kit, } } } - // Unless there is an explicit 'continue', we must bail out here, // because 'mem' is an inscrutable memory state (e.g., a call). break; } - return false; } -// G1 pre/post barriers -void G1BarrierSetC2::pre_barrier(GraphKit* kit, - bool do_load, - Node* ctl, - Node* obj, - Node* adr, - uint alias_idx, - Node* val, - const TypeOopPtr* val_type, - Node* pre_val, - BasicType bt) const { - // Some sanity checks - // Note: val is unused in this routine. - - if (do_load) { - // We need to generate the load of the previous value - assert(obj != nullptr, "must have a base"); - assert(adr != nullptr, "where are loading from?"); - assert(pre_val == nullptr, "loaded already?"); - assert(val_type != nullptr, "need a type"); - - if (use_ReduceInitialCardMarks() - && g1_can_remove_pre_barrier(kit, &kit->gvn(), adr, bt, alias_idx)) { - return; - } - - } else { - // In this case both val_type and alias_idx are unused. - assert(pre_val != nullptr, "must be loaded already"); - // Nothing to be done if pre_val is null. 
- if (pre_val->bottom_type() == TypePtr::NULL_PTR) return; - assert(pre_val->bottom_type()->basic_type() == T_OBJECT, "or we shouldn't be here"); - } - assert(bt == T_OBJECT, "or we shouldn't be here"); - - IdealKit ideal(kit, true); - - Node* tls = __ thread(); // ThreadLocalStorage - - Node* no_base = __ top(); - Node* zero = __ ConI(0); - Node* zeroX = __ ConX(0); - - float likely = PROB_LIKELY(0.999); - float unlikely = PROB_UNLIKELY(0.999); - - BasicType active_type = in_bytes(SATBMarkQueue::byte_width_of_active()) == 4 ? T_INT : T_BYTE; - assert(in_bytes(SATBMarkQueue::byte_width_of_active()) == 4 || in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "flag width"); - - // Offsets into the thread - const int marking_offset = in_bytes(G1ThreadLocalData::satb_mark_queue_active_offset()); - const int index_offset = in_bytes(G1ThreadLocalData::satb_mark_queue_index_offset()); - const int buffer_offset = in_bytes(G1ThreadLocalData::satb_mark_queue_buffer_offset()); - - // Now the actual pointers into the thread - Node* marking_adr = __ AddP(no_base, tls, __ ConX(marking_offset)); - Node* buffer_adr = __ AddP(no_base, tls, __ ConX(buffer_offset)); - Node* index_adr = __ AddP(no_base, tls, __ ConX(index_offset)); - - // Now some of the values - Node* marking = __ load(__ ctrl(), marking_adr, TypeInt::INT, active_type, Compile::AliasIdxRaw); - - // if (!marking) - __ if_then(marking, BoolTest::ne, zero, unlikely); { - BasicType index_bt = TypeX_X->basic_type(); - assert(sizeof(size_t) == type2aelembytes(index_bt), "Loading G1 SATBMarkQueue::_index with wrong size."); - Node* index = __ load(__ ctrl(), index_adr, TypeX_X, index_bt, Compile::AliasIdxRaw); - - if (do_load) { - // load original value - pre_val = __ load(__ ctrl(), adr, val_type, bt, alias_idx, false, MemNode::unordered, LoadNode::Pinned); - } - - // if (pre_val != nullptr) - __ if_then(pre_val, BoolTest::ne, kit->null()); { - Node* buffer = __ load(__ ctrl(), buffer_adr, TypeRawPtr::NOTNULL, T_ADDRESS, Compile::AliasIdxRaw); - - // is the queue for this thread full? - __ if_then(index, BoolTest::ne, zeroX, likely); { - - // decrement the index - Node* next_index = kit->gvn().transform(new SubXNode(index, __ ConX(sizeof(intptr_t)))); - - // Now get the buffer location we will log the previous value into and store it - Node *log_addr = __ AddP(no_base, buffer, next_index); - __ store(__ ctrl(), log_addr, pre_val, T_OBJECT, Compile::AliasIdxRaw, MemNode::unordered); - // update the index - __ store(__ ctrl(), index_adr, next_index, index_bt, Compile::AliasIdxRaw, MemNode::unordered); - - } __ else_(); { - - // logging buffer is full, call the runtime - const TypeFunc *tf = write_ref_field_pre_entry_Type(); - __ make_leaf_call(tf, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_pre_entry), "write_ref_field_pre_entry", pre_val, tls); - } __ end_if(); // (!index) - } __ end_if(); // (pre_val != nullptr) - } __ end_if(); // (!marking) - - // Final sync IdealKit and GraphKit. - kit->final_sync(ideal); -} - /* - * G1 similar to any GC with a Young Generation requires a way to keep track of - * references from Old Generation to Young Generation to make sure all live + * G1, similar to any GC with a Young Generation, requires a way to keep track + * of references from Old Generation to Young Generation to make sure all live * objects are found. G1 also requires to keep track of object references * between different regions to enable evacuation of old regions, which is done - * as part of mixed collections. 
References are tracked in remembered sets and - * is continuously updated as reference are written to with the help of the - * post-barrier. + * as part of mixed collections. References are tracked in remembered sets, + * which are continuously updated as references are written to with the help of + * the post-barrier. * - * To reduce the number of updates to the remembered set the post-barrier - * filters updates to fields in objects located in the Young Generation, - * the same region as the reference, when the null is being written or - * if the card is already marked as dirty by an earlier write. + * To reduce the number of updates to the remembered set, the post-barrier + * filters out updates to fields in objects located in the Young Generation, the + * same region as the reference, when null is being written, or if the card is + * already marked as dirty by an earlier write. * * Under certain circumstances it is possible to avoid generating the - * post-barrier completely if it is possible during compile time to prove - * the object is newly allocated and that no safepoint exists between the - * allocation and the store. + * post-barrier completely, if it is possible during compile time to prove the + * object is newly allocated and that no safepoint exists between the allocation + * and the store. This can be seen as a compile-time version of the + * above-mentioned Young Generation filter. * - * In the case of slow allocation the allocation code must handle the barrier - * as part of the allocation in the case the allocated object is not located - * in the nursery; this would happen for humongous objects. - * - * Returns true if the post barrier can be removed + * In the case of a slow allocation, the allocation code must handle the barrier + * as part of the allocation if the allocated object is not located in the + * nursery; this would happen for humongous objects. */ bool G1BarrierSetC2::g1_can_remove_post_barrier(GraphKit* kit, - PhaseValues* phase, Node* store, + PhaseValues* phase, Node* store_ctrl, Node* adr) const { intptr_t offset = 0; Node* base = AddPNode::Ideal_base_and_offset(adr, phase, offset); AllocateNode* alloc = AllocateNode::Ideal_allocation(base); if (offset == Type::OffsetBot) { - return false; // cannot unalias unless there are precise offsets + return false; // Cannot unalias unless there are precise offsets. } - if (alloc == nullptr) { - return false; // No allocation found + return false; // No allocation found. } - // Start search from Store node - Node* mem = store->in(MemNode::Control); + Node* mem = store_ctrl; // Start search from Store node. if (mem->is_Proj() && mem->in(0)->is_Initialize()) { - InitializeNode* st_init = mem->in(0)->as_Initialize(); AllocateNode* st_alloc = st_init->allocation(); - // Make sure we are looking at the same allocation if (alloc == st_alloc) { return true; @@ -333,725 +197,361 @@ bool G1BarrierSetC2::g1_can_remove_post_barrier(GraphKit* kit, return false; } -// -// Update the card table and add card address to the queue -// -void G1BarrierSetC2::g1_mark_card(GraphKit* kit, - IdealKit& ideal, - Node* card_adr, - Node* oop_store, - uint oop_alias_idx, - Node* index, - Node* index_adr, - Node* buffer, - const TypeFunc* tf) const { - Node* zero = __ ConI(0); - Node* zeroX = __ ConX(0); - Node* no_base = __ top(); - BasicType card_bt = T_BYTE; - // Smash zero into card. 
MUST BE ORDERED WRT TO STORE - __ storeCM(__ ctrl(), card_adr, zero, oop_store, oop_alias_idx, card_bt, Compile::AliasIdxRaw); - - // Now do the queue work - __ if_then(index, BoolTest::ne, zeroX); { - - Node* next_index = kit->gvn().transform(new SubXNode(index, __ ConX(sizeof(intptr_t)))); - Node* log_addr = __ AddP(no_base, buffer, next_index); - - // Order, see storeCM. - __ store(__ ctrl(), log_addr, card_adr, T_ADDRESS, Compile::AliasIdxRaw, MemNode::unordered); - __ store(__ ctrl(), index_adr, next_index, TypeX_X->basic_type(), Compile::AliasIdxRaw, MemNode::unordered); - - } __ else_(); { - __ make_leaf_call(tf, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), "write_ref_field_post_entry", card_adr, __ thread()); - } __ end_if(); - -} - -void G1BarrierSetC2::post_barrier(GraphKit* kit, - Node* ctl, - Node* oop_store, - Node* obj, - Node* adr, - uint alias_idx, - Node* val, - BasicType bt, - bool use_precise) const { - // If we are writing a null then we need no post barrier - - if (val != nullptr && val->is_Con() && val->bottom_type() == TypePtr::NULL_PTR) { - // Must be null - const Type* t = val->bottom_type(); - assert(t == Type::TOP || t == TypePtr::NULL_PTR, "must be null"); - // No post barrier if writing null - return; - } - - if (use_ReduceInitialCardMarks() && obj == kit->just_allocated_object(kit->control())) { - // We can skip marks on a freshly-allocated object in Eden. - // Keep this code in sync with CardTableBarrierSet::on_slowpath_allocation_exit. - // That routine informs GC to take appropriate compensating steps, - // upon a slow-path allocation, so as to make this card-mark - // elision safe. - return; - } - - if (use_ReduceInitialCardMarks() - && g1_can_remove_post_barrier(kit, &kit->gvn(), oop_store, adr)) { - return; - } - - if (!use_precise) { - // All card marks for a (non-array) instance are in one place: - adr = obj; - } - // (Else it's an array (or unknown), and we want more precise card marks.) - assert(adr != nullptr, ""); - - IdealKit ideal(kit, true); - - Node* tls = __ thread(); // ThreadLocalStorage - - Node* no_base = __ top(); - float likely = PROB_LIKELY_MAG(3); - float unlikely = PROB_UNLIKELY_MAG(3); - Node* young_card = __ ConI((jint)G1CardTable::g1_young_card_val()); - Node* dirty_card = __ ConI((jint)G1CardTable::dirty_card_val()); - Node* zeroX = __ ConX(0); - - const TypeFunc *tf = write_ref_field_post_entry_Type(); - - // Offsets into the thread - const int index_offset = in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset()); - const int buffer_offset = in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset()); - - // Pointers into the thread - - Node* buffer_adr = __ AddP(no_base, tls, __ ConX(buffer_offset)); - Node* index_adr = __ AddP(no_base, tls, __ ConX(index_offset)); - - // Now some values - // Use ctrl to avoid hoisting these values past a safepoint, which could - // potentially reset these fields in the JavaThread. 
- Node* index = __ load(__ ctrl(), index_adr, TypeX_X, TypeX_X->basic_type(), Compile::AliasIdxRaw); - Node* buffer = __ load(__ ctrl(), buffer_adr, TypeRawPtr::NOTNULL, T_ADDRESS, Compile::AliasIdxRaw); - - // Convert the store obj pointer to an int prior to doing math on it - // Must use ctrl to prevent "integerized oop" existing across safepoint - Node* cast = __ CastPX(__ ctrl(), adr); - - // Divide pointer by card size - Node* card_offset = __ URShiftX( cast, __ ConI(CardTable::card_shift()) ); - - // Combine card table base and card offset - Node* card_adr = __ AddP(no_base, byte_map_base_node(kit), card_offset ); - - // If we know the value being stored does it cross regions? - - if (val != nullptr) { - // Does the store cause us to cross regions? - - // Should be able to do an unsigned compare of region_size instead of - // and extra shift. Do we have an unsigned compare?? - // Node* region_size = __ ConI(1 << G1HeapRegion::LogOfHRGrainBytes); - Node* xor_res = __ URShiftX ( __ XorX( cast, __ CastPX(__ ctrl(), val)), __ ConI(checked_cast(G1HeapRegion::LogOfHRGrainBytes))); - - // if (xor_res == 0) same region so skip - __ if_then(xor_res, BoolTest::ne, zeroX, likely); { - - // No barrier if we are storing a null. - __ if_then(val, BoolTest::ne, kit->null(), likely); { - - // Ok must mark the card if not already dirty - - // load the original value of the card - Node* card_val = __ load(__ ctrl(), card_adr, TypeInt::INT, T_BYTE, Compile::AliasIdxRaw); - - __ if_then(card_val, BoolTest::ne, young_card, unlikely); { - kit->sync_kit(ideal); - kit->insert_mem_bar(Op_MemBarVolatile, oop_store); - __ sync_kit(kit); - - Node* card_val_reload = __ load(__ ctrl(), card_adr, TypeInt::INT, T_BYTE, Compile::AliasIdxRaw); - __ if_then(card_val_reload, BoolTest::ne, dirty_card); { - g1_mark_card(kit, ideal, card_adr, oop_store, alias_idx, index, index_adr, buffer, tf); - } __ end_if(); - } __ end_if(); - } __ end_if(); - } __ end_if(); - } else { - // The Object.clone() intrinsic uses this path if !ReduceInitialCardMarks. - // We don't need a barrier here if the destination is a newly allocated object - // in Eden. Otherwise, GC verification breaks because we assume that cards in Eden - // are set to 'g1_young_gen' (see G1CardTable::verify_g1_young_region()). - assert(!use_ReduceInitialCardMarks(), "can only happen with card marking"); - Node* card_val = __ load(__ ctrl(), card_adr, TypeInt::INT, T_BYTE, Compile::AliasIdxRaw); - __ if_then(card_val, BoolTest::ne, young_card); { - g1_mark_card(kit, ideal, card_adr, oop_store, alias_idx, index, index_adr, buffer, tf); - } __ end_if(); - } - - // Final sync IdealKit and GraphKit. - kit->final_sync(ideal); -} - -// Helper that guards and inserts a pre-barrier. -void G1BarrierSetC2::insert_pre_barrier(GraphKit* kit, Node* base_oop, Node* offset, - Node* pre_val, bool need_mem_bar) const { - // We could be accessing the referent field of a reference object. If so, when G1 - // is enabled, we need to log the value in the referent field in an SATB buffer. - // This routine performs some compile time filters and generates suitable - // runtime filters that guard the pre-barrier code. - // Also add memory barrier for non volatile load from the referent field - // to prevent commoning of loads across safepoint. - - // Some compile time checks. - - // If offset is a constant, is it java_lang_ref_Reference::_reference_offset? 
- const TypeX* otype = offset->find_intptr_t_type(); - if (otype != nullptr && otype->is_con() && - otype->get_con() != java_lang_ref_Reference::referent_offset()) { - // Constant offset but not the reference_offset so just return - return; - } - - // We only need to generate the runtime guards for instances. - const TypeOopPtr* btype = base_oop->bottom_type()->isa_oopptr(); - if (btype != nullptr) { - if (btype->isa_aryptr()) { - // Array type so nothing to do - return; - } - - const TypeInstPtr* itype = btype->isa_instptr(); - if (itype != nullptr) { - // Can the klass of base_oop be statically determined to be - // _not_ a sub-class of Reference and _not_ Object? - ciKlass* klass = itype->instance_klass(); - if (klass->is_loaded() && - !klass->is_subtype_of(kit->env()->Reference_klass()) && - !kit->env()->Object_klass()->is_subtype_of(klass)) { - return; - } - } - } - - // The compile time filters did not reject base_oop/offset so - // we need to generate the following runtime filters - // - // if (offset == java_lang_ref_Reference::_reference_offset) { - // if (instance_of(base, java.lang.ref.Reference)) { - // pre_barrier(_, pre_val, ...); - // } - // } - - float likely = PROB_LIKELY( 0.999); - float unlikely = PROB_UNLIKELY(0.999); - - IdealKit ideal(kit); - - Node* referent_off = __ ConX(java_lang_ref_Reference::referent_offset()); - - __ if_then(offset, BoolTest::eq, referent_off, unlikely); { - // Update graphKit memory and control from IdealKit. - kit->sync_kit(ideal); - - Node* ref_klass_con = kit->makecon(TypeKlassPtr::make(kit->env()->Reference_klass())); - Node* is_instof = kit->gen_instanceof(base_oop, ref_klass_con); - - // Update IdealKit memory and control from graphKit. - __ sync_kit(kit); - - Node* one = __ ConI(1); - // is_instof == 0 if base_oop == nullptr - __ if_then(is_instof, BoolTest::eq, one, unlikely); { - - // Update graphKit from IdeakKit. - kit->sync_kit(ideal); - - // Use the pre-barrier to record the value in the referent field - pre_barrier(kit, false /* do_load */, - __ ctrl(), - nullptr /* obj */, nullptr /* adr */, max_juint /* alias_idx */, nullptr /* val */, nullptr /* val_type */, - pre_val /* pre_val */, - T_OBJECT); - if (need_mem_bar) { - // Add memory barrier to prevent commoning reads from this field - // across safepoint since GC can change its value. - kit->insert_mem_bar(Op_MemBarCPUOrder); - } - // Update IdealKit from graphKit. - __ sync_kit(kit); - - } __ end_if(); // _ref_type != ref_none - } __ end_if(); // offset == referent_offset - - // Final sync IdealKit and GraphKit. - kit->final_sync(ideal); -} - -#undef __ - Node* G1BarrierSetC2::load_at_resolved(C2Access& access, const Type* val_type) const { DecoratorSet decorators = access.decorators(); - Node* adr = access.addr().node(); - Node* obj = access.base(); - - bool anonymous = (decorators & C2_UNSAFE_ACCESS) != 0; - bool mismatched = (decorators & C2_MISMATCHED) != 0; - bool unknown = (decorators & ON_UNKNOWN_OOP_REF) != 0; - bool in_heap = (decorators & IN_HEAP) != 0; - bool in_native = (decorators & IN_NATIVE) != 0; bool on_weak = (decorators & ON_WEAK_OOP_REF) != 0; bool on_phantom = (decorators & ON_PHANTOM_OOP_REF) != 0; - bool is_unordered = (decorators & MO_UNORDERED) != 0; bool no_keepalive = (decorators & AS_NO_KEEPALIVE) != 0; - bool is_mixed = !in_heap && !in_native; - bool need_cpu_mem_bar = !is_unordered || mismatched || is_mixed; - - Node* top = Compile::current()->top(); - Node* offset = adr->is_AddP() ? 
adr->in(AddPNode::Offset) : top; - - // If we are reading the value of the referent field of a Reference - // object (either by using Unsafe directly or through reflection) - // then, if G1 is enabled, we need to record the referent in an - // SATB log buffer using the pre-barrier mechanism. - // Also we need to add memory barrier to prevent commoning reads - // from this field across safepoint since GC can change its value. - bool need_read_barrier = (((on_weak || on_phantom) && !no_keepalive) || - (in_heap && unknown && offset != top && obj != top)); - - if (!access.is_oop() || !need_read_barrier) { - return CardTableBarrierSetC2::load_at_resolved(access, val_type); + // If we are reading the value of the referent field of a Reference object, we + // need to record the referent in an SATB log buffer using the pre-barrier + // mechanism. Also we need to add a memory barrier to prevent commoning reads + // from this field across safepoints, since GC can change its value. + bool need_read_barrier = ((on_weak || on_phantom) && !no_keepalive); + if (access.is_oop() && need_read_barrier) { + access.set_barrier_data(G1C2BarrierPre); } - - assert(access.is_parse_access(), "entry not supported at optimization time"); - - C2ParseAccess& parse_access = static_cast(access); - GraphKit* kit = parse_access.kit(); - Node* load; - - Node* control = kit->control(); - const TypePtr* adr_type = access.addr().type(); - MemNode::MemOrd mo = access.mem_node_mo(); - bool requires_atomic_access = (decorators & MO_UNORDERED) == 0; - bool unaligned = (decorators & C2_UNALIGNED) != 0; - bool unsafe = (decorators & C2_UNSAFE_ACCESS) != 0; - // Pinned control dependency is the strictest. So it's ok to substitute it for any other. - load = kit->make_load(control, adr, val_type, access.type(), adr_type, mo, - LoadNode::Pinned, requires_atomic_access, unaligned, mismatched, unsafe, - access.barrier_data()); - - - if (on_weak || on_phantom) { - // Use the pre-barrier to record the value in the referent field - pre_barrier(kit, false /* do_load */, - kit->control(), - nullptr /* obj */, nullptr /* adr */, max_juint /* alias_idx */, nullptr /* val */, nullptr /* val_type */, - load /* pre_val */, T_OBJECT); - // Add memory barrier to prevent commoning reads from this field - // across safepoint since GC can change its value. - kit->insert_mem_bar(Op_MemBarCPUOrder); - } else if (unknown) { - // We do not require a mem bar inside pre_barrier if need_mem_bar - // is set: the barriers would be emitted by us. 
- insert_pre_barrier(kit, obj, offset, load, !need_cpu_mem_bar); - } - - return load; -} - -bool G1BarrierSetC2::is_gc_barrier_node(Node* node) const { - if (CardTableBarrierSetC2::is_gc_barrier_node(node)) { - return true; - } - if (node->Opcode() != Op_CallLeaf) { - return false; - } - CallLeafNode *call = node->as_CallLeaf(); - if (call->_name == nullptr) { - return false; - } - - return strcmp(call->_name, "write_ref_field_pre_entry") == 0 || strcmp(call->_name, "write_ref_field_post_entry") == 0; -} - -bool G1BarrierSetC2::is_g1_pre_val_load(Node* n) { - if (n->is_Load() && n->as_Load()->has_pinned_control_dependency()) { - // Make sure the only users of it are: CmpP, StoreP, and a call to write_ref_field_pre_entry - - // Skip possible decode - if (n->outcnt() == 1 && n->unique_out()->is_DecodeN()) { - n = n->unique_out(); - } - - if (n->outcnt() == 3) { - int found = 0; - for (SimpleDUIterator iter(n); iter.has_next(); iter.next()) { - Node* use = iter.get(); - if (use->is_Cmp() || use->is_Store()) { - ++found; - } else if (use->is_CallLeaf()) { - CallLeafNode* call = use->as_CallLeaf(); - if (strcmp(call->_name, "write_ref_field_pre_entry") == 0) { - ++found; - } - } - } - if (found == 3) { - return true; - } - } - } - return false; -} - -bool G1BarrierSetC2::is_gc_pre_barrier_node(Node *node) const { - return is_g1_pre_val_load(node); + return CardTableBarrierSetC2::load_at_resolved(access, val_type); } void G1BarrierSetC2::eliminate_gc_barrier(PhaseMacroExpand* macro, Node* node) const { - if (is_g1_pre_val_load(node)) { - macro->replace_node(node, macro->zerocon(node->as_Load()->bottom_type()->basic_type())); - } else { - assert(node->Opcode() == Op_CastP2X, "ConvP2XNode required"); - assert(node->outcnt() <= 2, "expects 1 or 2 users: Xor and URShift nodes"); - // It could be only one user, URShift node, in Object.clone() intrinsic - // but the new allocation is passed to arraycopy stub and it could not - // be scalar replaced. So we don't check the case. + eliminate_gc_barrier_data(node); +} - // An other case of only one user (Xor) is when the value check for null - // in G1 post barrier is folded after CCP so the code which used URShift - // is removed. - - // Take Region node before eliminating post barrier since it also - // eliminates CastP2X node when it has only one user. - Node* this_region = node->in(0); - assert(this_region != nullptr, ""); - - // Remove G1 post barrier. - - // Search for CastP2X->Xor->URShift->Cmp path which - // checks if the store done to a different from the value's region. - // And replace Cmp with #0 (false) to collapse G1 post barrier. - Node* xorx = node->find_out_with(Op_XorX); - if (xorx != nullptr) { - Node* shift = xorx->unique_out(); - Node* cmpx = shift->unique_out(); - assert(cmpx->is_Cmp() && cmpx->unique_out()->is_Bool() && - cmpx->unique_out()->as_Bool()->_test._test == BoolTest::ne, - "missing region check in G1 post barrier"); - macro->replace_node(cmpx, macro->makecon(TypeInt::CC_EQ)); - - // Remove G1 pre barrier. - - // Search "if (marking != 0)" check and set it to "false". - // There is no G1 pre barrier if previous stored value is null - // (for example, after initialization). 
- if (this_region->is_Region() && this_region->req() == 3) { - int ind = 1; - if (!this_region->in(ind)->is_IfFalse()) { - ind = 2; - } - if (this_region->in(ind)->is_IfFalse() && - this_region->in(ind)->in(0)->Opcode() == Op_If) { - Node* bol = this_region->in(ind)->in(0)->in(1); - assert(bol->is_Bool(), ""); - cmpx = bol->in(1); - if (bol->as_Bool()->_test._test == BoolTest::ne && - cmpx->is_Cmp() && cmpx->in(2) == macro->intcon(0) && - cmpx->in(1)->is_Load()) { - Node* adr = cmpx->in(1)->as_Load()->in(MemNode::Address); - const int marking_offset = in_bytes(G1ThreadLocalData::satb_mark_queue_active_offset()); - if (adr->is_AddP() && adr->in(AddPNode::Base) == macro->top() && - adr->in(AddPNode::Address)->Opcode() == Op_ThreadLocal && - adr->in(AddPNode::Offset) == macro->MakeConX(marking_offset)) { - macro->replace_node(cmpx, macro->makecon(TypeInt::CC_EQ)); - } - } - } - } - } else { - assert(!use_ReduceInitialCardMarks(), "can only happen with card marking"); - // This is a G1 post barrier emitted by the Object.clone() intrinsic. - // Search for the CastP2X->URShiftX->AddP->LoadB->Cmp path which checks if the card - // is marked as young_gen and replace the Cmp with 0 (false) to collapse the barrier. - Node* shift = node->find_out_with(Op_URShiftX); - assert(shift != nullptr, "missing G1 post barrier"); - Node* addp = shift->unique_out(); - Node* load = addp->find_out_with(Op_LoadB); - assert(load != nullptr, "missing G1 post barrier"); - Node* cmpx = load->unique_out(); - assert(cmpx->is_Cmp() && cmpx->unique_out()->is_Bool() && - cmpx->unique_out()->as_Bool()->_test._test == BoolTest::ne, - "missing card value check in G1 post barrier"); - macro->replace_node(cmpx, macro->makecon(TypeInt::CC_EQ)); - // There is no G1 pre barrier in this case - } - // Now CastP2X can be removed since it is used only on dead path - // which currently still alive until igvn optimize it. - assert(node->outcnt() == 0 || node->unique_out()->Opcode() == Op_URShiftX, ""); - macro->replace_node(node, macro->top()); +void G1BarrierSetC2::eliminate_gc_barrier_data(Node* node) const { + if (node->is_LoadStore()) { + LoadStoreNode* loadstore = node->as_LoadStore(); + loadstore->set_barrier_data(0); + } else if (node->is_Mem()) { + MemNode* mem = node->as_Mem(); + mem->set_barrier_data(0); } } -Node* G1BarrierSetC2::step_over_gc_barrier(Node* c) const { - if (!use_ReduceInitialCardMarks() && - c != nullptr && c->is_Region() && c->req() == 3) { - for (uint i = 1; i < c->req(); i++) { - if (c->in(i) != nullptr && c->in(i)->is_Region() && - c->in(i)->req() == 3) { - Node* r = c->in(i); - for (uint j = 1; j < r->req(); j++) { - if (r->in(j) != nullptr && r->in(j)->is_Proj() && - r->in(j)->in(0) != nullptr && - r->in(j)->in(0)->Opcode() == Op_CallLeaf && - r->in(j)->in(0)->as_Call()->entry_point() == CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry)) { - Node* call = r->in(j)->in(0); - c = c->in(i == 1 ? 
2 : 1); - if (c != nullptr && c->Opcode() != Op_Parm) { - c = c->in(0); - if (c != nullptr) { - c = c->in(0); - assert(call->in(0) == nullptr || - call->in(0)->in(0) == nullptr || - call->in(0)->in(0)->in(0) == nullptr || - call->in(0)->in(0)->in(0)->in(0) == nullptr || - call->in(0)->in(0)->in(0)->in(0)->in(0) == nullptr || - c == call->in(0)->in(0)->in(0)->in(0)->in(0), "bad barrier shape"); - return c; - } - } - } - } - } - } +static void refine_barrier_by_new_val_type(const Node* n) { + if (n->Opcode() != Op_StoreP && + n->Opcode() != Op_StoreN) { + return; } - return c; + MemNode* store = n->as_Mem(); + const Node* newval = n->in(MemNode::ValueIn); + assert(newval != nullptr, ""); + const Type* newval_bottom = newval->bottom_type(); + TypePtr::PTR newval_type = newval_bottom->make_ptr()->ptr(); + uint8_t barrier_data = store->barrier_data(); + if (!newval_bottom->isa_oopptr() && + !newval_bottom->isa_narrowoop() && + newval_type != TypePtr::Null) { + // newval is neither an OOP nor null, so there is no barrier to refine. + assert(barrier_data == 0, "non-OOP stores should have no barrier data"); + return; + } + if (barrier_data == 0) { + // No barrier to refine. + return; + } + if (newval_type == TypePtr::Null) { + // Simply elide post-barrier if writing null. + barrier_data &= ~G1C2BarrierPost; + barrier_data &= ~G1C2BarrierPostNotNull; + } else if (((barrier_data & G1C2BarrierPost) != 0) && + newval_type == TypePtr::NotNull) { + // If the post-barrier has not been elided yet (e.g. due to newval being + // freshly allocated), mark it as not-null (simplifies barrier tests and + // compressed OOPs logic). + barrier_data |= G1C2BarrierPostNotNull; + } + store->set_barrier_data(barrier_data); + return; } -#ifdef ASSERT -bool G1BarrierSetC2::has_cas_in_use_chain(Node *n) const { - Unique_Node_List visited; +// Refine (not really expand) G1 barriers by looking at the new value type +// (whether it is necessarily null or necessarily non-null). 
+bool G1BarrierSetC2::expand_barriers(Compile* C, PhaseIterGVN& igvn) const { + ResourceMark rm; + VectorSet visited; Node_List worklist; - worklist.push(n); + worklist.push(C->root()); while (worklist.size() > 0) { - Node* x = worklist.pop(); - if (visited.member(x)) { + Node* n = worklist.pop(); + if (visited.test_set(n->_idx)) { continue; - } else { - visited.push(x); } - - if (x->is_LoadStore()) { - int op = x->Opcode(); - if (op == Op_CompareAndExchangeP || op == Op_CompareAndExchangeN || - op == Op_CompareAndSwapP || op == Op_CompareAndSwapN || - op == Op_WeakCompareAndSwapP || op == Op_WeakCompareAndSwapN) { - return true; - } - } - if (!x->is_CFG()) { - for (SimpleDUIterator iter(x); iter.has_next(); iter.next()) { - Node* use = iter.get(); - worklist.push(use); + refine_barrier_by_new_val_type(n); + for (uint j = 0; j < n->req(); j++) { + Node* in = n->in(j); + if (in != nullptr) { + worklist.push(in); } } } return false; } -void G1BarrierSetC2::verify_pre_load(Node* marking_if, Unique_Node_List& loads /*output*/) const { - assert(loads.size() == 0, "Loads list should be empty"); - Node* pre_val_if = marking_if->find_out_with(Op_IfTrue)->find_out_with(Op_If); - if (pre_val_if != nullptr) { - Unique_Node_List visited; - Node_List worklist; - Node* pre_val = pre_val_if->in(1)->in(1)->in(1); - - worklist.push(pre_val); - while (worklist.size() > 0) { - Node* x = worklist.pop(); - if (visited.member(x)) { - continue; - } else { - visited.push(x); - } - - if (has_cas_in_use_chain(x)) { - loads.clear(); - return; - } - - if (x->is_Con()) { - continue; - } - if (x->is_EncodeP() || x->is_DecodeN()) { - worklist.push(x->in(1)); - continue; - } - if (x->is_Load() || x->is_LoadStore()) { - assert(x->in(0) != nullptr, "Pre-val load has to have a control"); - loads.push(x); - continue; - } - if (x->is_Phi()) { - for (uint i = 1; i < x->req(); i++) { - worklist.push(x->in(i)); - } - continue; - } - assert(false, "Pre-val anomaly"); - } +uint G1BarrierSetC2::estimated_barrier_size(const Node* node) const { + // These Ideal node counts are extracted from the pre-matching Ideal graph + // generated when compiling the following method with early barrier expansion: + // static void write(MyObject obj1, Object o) { + // obj1.o1 = o; + // } + uint8_t barrier_data = MemNode::barrier_data(node); + uint nodes = 0; + if ((barrier_data & G1C2BarrierPre) != 0) { + nodes += 50; } + if ((barrier_data & G1C2BarrierPost) != 0) { + nodes += 60; + } + return nodes; } -void G1BarrierSetC2::verify_no_safepoints(Compile* compile, Node* marking_check_if, const Unique_Node_List& loads) const { - if (loads.size() == 0) { +bool G1BarrierSetC2::can_initialize_object(const StoreNode* store) const { + assert(store->Opcode() == Op_StoreP || store->Opcode() == Op_StoreN, "OOP store expected"); + // It is OK to move the store across the object initialization boundary only + // if it does not have any barrier, or if it has barriers that can be safely + // elided (because of the compensation steps taken on the allocation slow path + // when ReduceInitialCardMarks is enabled). 
+ return (MemNode::barrier_data(store) == 0) || use_ReduceInitialCardMarks(); +} + +void G1BarrierSetC2::clone_at_expansion(PhaseMacroExpand* phase, ArrayCopyNode* ac) const { + if (ac->is_clone_inst() && !use_ReduceInitialCardMarks()) { + clone_in_runtime(phase, ac, G1BarrierSetRuntime::clone_addr(), "G1BarrierSetRuntime::clone"); return; } + BarrierSetC2::clone_at_expansion(phase, ac); +} - if (loads.size() == 1) { // Handle the typical situation when there a single pre-value load - // that is dominated by the marking_check_if, that's true when the - // barrier itself does the pre-val load. - Node *pre_val = loads.at(0); - if (pre_val->in(0)->in(0) == marking_check_if) { // IfTrue->If +Node* G1BarrierSetC2::store_at_resolved(C2Access& access, C2AccessValue& val) const { + DecoratorSet decorators = access.decorators(); + bool anonymous = (decorators & ON_UNKNOWN_OOP_REF) != 0; + bool in_heap = (decorators & IN_HEAP) != 0; + bool tightly_coupled_alloc = (decorators & C2_TIGHTLY_COUPLED_ALLOC) != 0; + bool need_store_barrier = !(tightly_coupled_alloc && use_ReduceInitialCardMarks()) && (in_heap || anonymous); + if (access.is_oop() && need_store_barrier) { + access.set_barrier_data(get_store_barrier(access)); + if (tightly_coupled_alloc) { + assert(!use_ReduceInitialCardMarks(), + "post-barriers are only needed for tightly-coupled initialization stores when ReduceInitialCardMarks is disabled"); + access.set_barrier_data(access.barrier_data() ^ G1C2BarrierPre); + } + } + return BarrierSetC2::store_at_resolved(access, val); +} + +Node* G1BarrierSetC2::atomic_cmpxchg_val_at_resolved(C2AtomicParseAccess& access, Node* expected_val, + Node* new_val, const Type* value_type) const { + GraphKit* kit = access.kit(); + if (!access.is_oop()) { + return BarrierSetC2::atomic_cmpxchg_val_at_resolved(access, expected_val, new_val, value_type); + } + access.set_barrier_data(G1C2BarrierPre | G1C2BarrierPost); + return BarrierSetC2::atomic_cmpxchg_val_at_resolved(access, expected_val, new_val, value_type); +} + +Node* G1BarrierSetC2::atomic_cmpxchg_bool_at_resolved(C2AtomicParseAccess& access, Node* expected_val, + Node* new_val, const Type* value_type) const { + GraphKit* kit = access.kit(); + if (!access.is_oop()) { + return BarrierSetC2::atomic_cmpxchg_bool_at_resolved(access, expected_val, new_val, value_type); + } + access.set_barrier_data(G1C2BarrierPre | G1C2BarrierPost); + return BarrierSetC2::atomic_cmpxchg_bool_at_resolved(access, expected_val, new_val, value_type); +} + +Node* G1BarrierSetC2::atomic_xchg_at_resolved(C2AtomicParseAccess& access, Node* new_val, const Type* value_type) const { + GraphKit* kit = access.kit(); + if (!access.is_oop()) { + return BarrierSetC2::atomic_xchg_at_resolved(access, new_val, value_type); + } + access.set_barrier_data(G1C2BarrierPre | G1C2BarrierPost); + return BarrierSetC2::atomic_xchg_at_resolved(access, new_val, value_type); +} + +class G1BarrierSetC2State : public BarrierSetC2State { +private: + GrowableArray* _stubs; + +public: + G1BarrierSetC2State(Arena* arena) + : BarrierSetC2State(arena), + _stubs(new (arena) GrowableArray(arena, 8, 0, nullptr)) {} + + GrowableArray* stubs() { + return _stubs; + } + + bool needs_liveness_data(const MachNode* mach) const { + return G1PreBarrierStubC2::needs_barrier(mach) || + G1PostBarrierStubC2::needs_barrier(mach); + } + + bool needs_livein_data() const { + return false; + } +}; + +static G1BarrierSetC2State* barrier_set_state() { + return reinterpret_cast(Compile::current()->barrier_set_state()); +} + 
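// Illustrative sketch only (not part of this patch): a platform barrier-emission
// helper typically creates one of these stubs, records the registers it needs,
// emits the inline fast-path check, and branches to the stub for the slow path.
// Register names below are placeholders; the exact checks are platform-specific:
//
//   G1PreBarrierStubC2* const stub = G1PreBarrierStubC2::create(node);
//   stub->initialize_registers(obj, pre_val, thread, tmp1, tmp2);
//   ... test whether SATB marking is active, using
//       G1ThreadLocalData::satb_mark_queue_active_offset() ...
//   ... if active, branch to *stub->entry() ...
//   __ bind(*stub->continuation());
//
// Stubs registered this way are emitted out of line later by
// G1BarrierSetC2::emit_stubs(), which calls each stub's emit_code().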
+G1BarrierStubC2::G1BarrierStubC2(const MachNode* node) : BarrierStubC2(node) {} + +G1PreBarrierStubC2::G1PreBarrierStubC2(const MachNode* node) : G1BarrierStubC2(node) {} + +bool G1PreBarrierStubC2::needs_barrier(const MachNode* node) { + return (node->barrier_data() & G1C2BarrierPre) != 0; +} + +G1PreBarrierStubC2* G1PreBarrierStubC2::create(const MachNode* node) { + G1PreBarrierStubC2* const stub = new (Compile::current()->comp_arena()) G1PreBarrierStubC2(node); + if (!Compile::current()->output()->in_scratch_emit_size()) { + barrier_set_state()->stubs()->append(stub); + } + return stub; +} + +void G1PreBarrierStubC2::initialize_registers(Register obj, Register pre_val, Register thread, Register tmp1, Register tmp2) { + _obj = obj; + _pre_val = pre_val; + _thread = thread; + _tmp1 = tmp1; + _tmp2 = tmp2; +} + +Register G1PreBarrierStubC2::obj() const { + return _obj; +} + +Register G1PreBarrierStubC2::pre_val() const { + return _pre_val; +} + +Register G1PreBarrierStubC2::thread() const { + return _thread; +} + +Register G1PreBarrierStubC2::tmp1() const { + return _tmp1; +} + +Register G1PreBarrierStubC2::tmp2() const { + return _tmp2; +} + +void G1PreBarrierStubC2::emit_code(MacroAssembler& masm) { + G1BarrierSetAssembler* bs = static_cast(BarrierSet::barrier_set()->barrier_set_assembler()); + bs->generate_c2_pre_barrier_stub(&masm, this); +} + +G1PostBarrierStubC2::G1PostBarrierStubC2(const MachNode* node) : G1BarrierStubC2(node) {} + +bool G1PostBarrierStubC2::needs_barrier(const MachNode* node) { + return (node->barrier_data() & G1C2BarrierPost) != 0; +} + +G1PostBarrierStubC2* G1PostBarrierStubC2::create(const MachNode* node) { + G1PostBarrierStubC2* const stub = new (Compile::current()->comp_arena()) G1PostBarrierStubC2(node); + if (!Compile::current()->output()->in_scratch_emit_size()) { + barrier_set_state()->stubs()->append(stub); + } + return stub; +} + +void G1PostBarrierStubC2::initialize_registers(Register thread, Register tmp1, Register tmp2, Register tmp3) { + _thread = thread; + _tmp1 = tmp1; + _tmp2 = tmp2; + _tmp3 = tmp3; +} + +Register G1PostBarrierStubC2::thread() const { + return _thread; +} + +Register G1PostBarrierStubC2::tmp1() const { + return _tmp1; +} + +Register G1PostBarrierStubC2::tmp2() const { + return _tmp2; +} + +Register G1PostBarrierStubC2::tmp3() const { + return _tmp3; +} + +void G1PostBarrierStubC2::emit_code(MacroAssembler& masm) { + G1BarrierSetAssembler* bs = static_cast(BarrierSet::barrier_set()->barrier_set_assembler()); + bs->generate_c2_post_barrier_stub(&masm, this); +} + +void* G1BarrierSetC2::create_barrier_state(Arena* comp_arena) const { + return new (comp_arena) G1BarrierSetC2State(comp_arena); +} + +int G1BarrierSetC2::get_store_barrier(C2Access& access) const { + if (!access.is_parse_access()) { + // Only support for eliding barriers at parse time for now. + return G1C2BarrierPre | G1C2BarrierPost; + } + GraphKit* kit = (static_cast(access)).kit(); + Node* ctl = kit->control(); + Node* adr = access.addr().node(); + uint adr_idx = kit->C->get_alias_index(access.addr().type()); + assert(adr_idx != Compile::AliasIdxTop, "use other store_to_memory factory"); + + bool can_remove_pre_barrier = g1_can_remove_pre_barrier(kit, &kit->gvn(), adr, access.type(), adr_idx); + + // We can skip marks on a freshly-allocated object in Eden. Keep this code in + // sync with CardTableBarrierSet::on_slowpath_allocation_exit. 
That routine + // informs GC to take appropriate compensating steps, upon a slow-path + // allocation, so as to make this card-mark elision safe. + // The post-barrier can also be removed if null is written. This case is + // handled by G1BarrierSetC2::expand_barriers, which runs at the end of C2's + // platform-independent optimizations to exploit stronger type information. + bool can_remove_post_barrier = use_ReduceInitialCardMarks() && + ((access.base() == kit->just_allocated_object(ctl)) || + g1_can_remove_post_barrier(kit, &kit->gvn(), ctl, adr)); + + int barriers = 0; + if (!can_remove_pre_barrier) { + barriers |= G1C2BarrierPre; + } + if (!can_remove_post_barrier) { + barriers |= G1C2BarrierPost; + } + + return barriers; +} + +void G1BarrierSetC2::late_barrier_analysis() const { + compute_liveness_at_stubs(); +} + +void G1BarrierSetC2::emit_stubs(CodeBuffer& cb) const { + MacroAssembler masm(&cb); + GrowableArray* const stubs = barrier_set_state()->stubs(); + for (int i = 0; i < stubs->length(); i++) { + // Make sure there is enough space in the code buffer + if (cb.insts()->maybe_expand_to_ensure_remaining(PhaseOutput::MAX_inst_size) && cb.blob() == nullptr) { + ciEnv::current()->record_failure("CodeCache is full"); return; } + stubs->at(i)->emit_code(masm); } - - // All other cases are when pre-value loads dominate the marking check. - Unique_Node_List controls; - for (uint i = 0; i < loads.size(); i++) { - Node *c = loads.at(i)->in(0); - controls.push(c); - } - - Unique_Node_List visited; - Unique_Node_List safepoints; - Node_List worklist; - uint found = 0; - - worklist.push(marking_check_if); - while (worklist.size() > 0 && found < controls.size()) { - Node* x = worklist.pop(); - if (x == nullptr || x == compile->top()) continue; - if (visited.member(x)) { - continue; - } else { - visited.push(x); - } - - if (controls.member(x)) { - found++; - } - if (x->is_Region()) { - for (uint i = 1; i < x->req(); i++) { - worklist.push(x->in(i)); - } - } else { - if (!x->is_SafePoint()) { - worklist.push(x->in(0)); - } else { - safepoints.push(x); - } - } - } - assert(found == controls.size(), "Pre-barrier structure anomaly or possible safepoint"); + masm.flush(); } -void G1BarrierSetC2::verify_gc_barriers(Compile* compile, CompilePhase phase) const { - if (phase != BarrierSetC2::BeforeCodeGen) { - return; +#ifndef PRODUCT +void G1BarrierSetC2::dump_barrier_data(const MachNode* mach, outputStream* st) const { + if ((mach->barrier_data() & G1C2BarrierPre) != 0) { + st->print("pre "); } - // Verify G1 pre-barriers - const int marking_offset = in_bytes(G1ThreadLocalData::satb_mark_queue_active_offset()); - - Unique_Node_List visited; - Node_List worklist; - // We're going to walk control flow backwards starting from the Root - worklist.push(compile->root()); - while (worklist.size() > 0) { - Node* x = worklist.pop(); - if (x == nullptr || x == compile->top()) continue; - if (visited.member(x)) { - continue; - } else { - visited.push(x); - } - - if (x->is_Region()) { - for (uint i = 1; i < x->req(); i++) { - worklist.push(x->in(i)); - } - } else { - worklist.push(x->in(0)); - // We are looking for the pattern: - // /->ThreadLocal - // If->Bool->CmpI->LoadB->AddP->ConL(marking_offset) - // \->ConI(0) - // We want to verify that the If and the LoadB have the same control - // See GraphKit::g1_write_barrier_pre() - if (x->is_If()) { - IfNode *iff = x->as_If(); - if (iff->in(1)->is_Bool() && iff->in(1)->in(1)->is_Cmp()) { - CmpNode *cmp = iff->in(1)->in(1)->as_Cmp(); - if (cmp->Opcode() == Op_CmpI 
&& cmp->in(2)->is_Con() && cmp->in(2)->bottom_type()->is_int()->get_con() == 0 - && cmp->in(1)->is_Load()) { - LoadNode* load = cmp->in(1)->as_Load(); - if (load->Opcode() == Op_LoadB && load->in(2)->is_AddP() && load->in(2)->in(2)->Opcode() == Op_ThreadLocal - && load->in(2)->in(3)->is_Con() - && load->in(2)->in(3)->bottom_type()->is_intptr_t()->get_con() == marking_offset) { - - Node* if_ctrl = iff->in(0); - Node* load_ctrl = load->in(0); - - if (if_ctrl != load_ctrl) { - // Skip possible CProj->NeverBranch in infinite loops - if ((if_ctrl->is_Proj() && if_ctrl->Opcode() == Op_CProj) - && if_ctrl->in(0)->is_NeverBranch()) { - if_ctrl = if_ctrl->in(0)->in(0); - } - } - assert(load_ctrl != nullptr && if_ctrl == load_ctrl, "controls must match"); - - Unique_Node_List loads; - verify_pre_load(iff, loads); - verify_no_safepoints(compile, iff, loads); - } - } - } - } - } + if ((mach->barrier_data() & G1C2BarrierPost) != 0) { + st->print("post "); + } + if ((mach->barrier_data() & G1C2BarrierPostNotNull) != 0) { + st->print("notnull "); } } -#endif - -bool G1BarrierSetC2::escape_add_to_con_graph(ConnectionGraph* conn_graph, PhaseGVN* gvn, Unique_Node_List* delayed_worklist, Node* n, uint opcode) const { - if (opcode == Op_StoreP) { - Node* adr = n->in(MemNode::Address); - const Type* adr_type = gvn->type(adr); - // Pointer stores in G1 barriers looks like unsafe access. - // Ignore such stores to be able scalar replace non-escaping - // allocations. - if (adr_type->isa_rawptr() && adr->is_AddP()) { - Node* base = conn_graph->get_addp_base(adr); - if (base->Opcode() == Op_LoadP && - base->in(MemNode::Address)->is_AddP()) { - adr = base->in(MemNode::Address); - Node* tls = conn_graph->get_addp_base(adr); - if (tls->Opcode() == Op_ThreadLocal) { - int offs = (int) gvn->find_intptr_t_con(adr->in(AddPNode::Offset), Type::OffsetBot); - const int buf_offset = in_bytes(G1ThreadLocalData::satb_mark_queue_buffer_offset()); - if (offs == buf_offset) { - return true; // G1 pre barrier previous oop value store. - } - if (offs == in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset())) { - return true; // G1 post barrier card address store. 
- } - } - } - } - } - return false; -} +#endif // !PRODUCT diff --git a/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.hpp b/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.hpp index c445a87d2e4..dc333d8c331 100644 --- a/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.hpp +++ b/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.hpp @@ -31,29 +31,62 @@ class PhaseTransform; class Type; class TypeFunc; +const int G1C2BarrierPre = 1; +const int G1C2BarrierPost = 2; +const int G1C2BarrierPostNotNull = 4; + +class G1BarrierStubC2 : public BarrierStubC2 { +public: + G1BarrierStubC2(const MachNode* node); + virtual void emit_code(MacroAssembler& masm) = 0; +}; + +class G1PreBarrierStubC2 : public G1BarrierStubC2 { +private: + Register _obj; + Register _pre_val; + Register _thread; + Register _tmp1; + Register _tmp2; + +protected: + G1PreBarrierStubC2(const MachNode* node); + +public: + static bool needs_barrier(const MachNode* node); + static G1PreBarrierStubC2* create(const MachNode* node); + void initialize_registers(Register obj, Register pre_val, Register thread, Register tmp1 = noreg, Register tmp2 = noreg); + Register obj() const; + Register pre_val() const; + Register thread() const; + Register tmp1() const; + Register tmp2() const; + virtual void emit_code(MacroAssembler& masm); +}; + +class G1PostBarrierStubC2 : public G1BarrierStubC2 { +private: + Register _thread; + Register _tmp1; + Register _tmp2; + Register _tmp3; + +protected: + G1PostBarrierStubC2(const MachNode* node); + +public: + static bool needs_barrier(const MachNode* node); + static G1PostBarrierStubC2* create(const MachNode* node); + void initialize_registers(Register thread, Register tmp1 = noreg, Register tmp2 = noreg, Register tmp3 = noreg); + Register thread() const; + Register tmp1() const; + Register tmp2() const; + Register tmp3() const; + virtual void emit_code(MacroAssembler& masm); +}; + class G1BarrierSetC2: public CardTableBarrierSetC2 { protected: - virtual void pre_barrier(GraphKit* kit, - bool do_load, - Node* ctl, - Node* obj, - Node* adr, - uint adr_idx, - Node* val, - const TypeOopPtr* val_type, - Node* pre_val, - BasicType bt) const; - - virtual void post_barrier(GraphKit* kit, - Node* ctl, - Node* store, - Node* obj, - Node* adr, - uint adr_idx, - Node* val, - BasicType bt, - bool use_precise) const; - bool g1_can_remove_pre_barrier(GraphKit* kit, PhaseValues* phase, Node* adr, @@ -64,44 +97,31 @@ protected: PhaseValues* phase, Node* store, Node* adr) const; - void g1_mark_card(GraphKit* kit, - IdealKit& ideal, - Node* card_adr, - Node* oop_store, - uint oop_alias_idx, - Node* index, - Node* index_adr, - Node* buffer, - const TypeFunc* tf) const; - - // Helper for unsafe accesses, that may or may not be on the referent field. - // Generates the guards that check whether the result of - // Unsafe.getReference should be recorded in an SATB log buffer. 
- void insert_pre_barrier(GraphKit* kit, Node* base_oop, Node* offset, Node* pre_val, bool need_mem_bar) const; - - static const TypeFunc* write_ref_field_pre_entry_Type(); - static const TypeFunc* write_ref_field_post_entry_Type(); + int get_store_barrier(C2Access& access) const; virtual Node* load_at_resolved(C2Access& access, const Type* val_type) const; + virtual Node* store_at_resolved(C2Access& access, C2AccessValue& val) const; + virtual Node* atomic_cmpxchg_val_at_resolved(C2AtomicParseAccess& access, Node* expected_val, + Node* new_val, const Type* value_type) const; + virtual Node* atomic_cmpxchg_bool_at_resolved(C2AtomicParseAccess& access, Node* expected_val, + Node* new_val, const Type* value_type) const; + virtual Node* atomic_xchg_at_resolved(C2AtomicParseAccess& access, Node* new_val, const Type* value_type) const; -#ifdef ASSERT - bool has_cas_in_use_chain(Node* x) const; - void verify_pre_load(Node* marking_check_if, Unique_Node_List& loads /*output*/) const; - void verify_no_safepoints(Compile* compile, Node* marking_load, const Unique_Node_List& loads) const; -#endif - - static bool is_g1_pre_val_load(Node* n); public: - virtual bool is_gc_pre_barrier_node(Node* node) const; - virtual bool is_gc_barrier_node(Node* node) const; virtual void eliminate_gc_barrier(PhaseMacroExpand* macro, Node* node) const; - virtual Node* step_over_gc_barrier(Node* c) const; + virtual void eliminate_gc_barrier_data(Node* node) const; + virtual bool expand_barriers(Compile* C, PhaseIterGVN& igvn) const; + virtual uint estimated_barrier_size(const Node* node) const; + virtual bool can_initialize_object(const StoreNode* store) const; + virtual void clone_at_expansion(PhaseMacroExpand* phase, + ArrayCopyNode* ac) const; + virtual void* create_barrier_state(Arena* comp_arena) const; + virtual void emit_stubs(CodeBuffer& cb) const; + virtual void late_barrier_analysis() const; -#ifdef ASSERT - virtual void verify_gc_barriers(Compile* compile, CompilePhase phase) const; +#ifndef PRODUCT + virtual void dump_barrier_data(const MachNode* mach, outputStream* st) const; #endif - - virtual bool escape_add_to_con_graph(ConnectionGraph* conn_graph, PhaseGVN* gvn, Unique_Node_List* delayed_worklist, Node* n, uint opcode) const; }; #endif // SHARE_GC_G1_C2_G1BARRIERSETC2_HPP diff --git a/src/hotspot/share/gc/g1/g1BarrierSetRuntime.cpp b/src/hotspot/share/gc/g1/g1BarrierSetRuntime.cpp index a0fce437807..2e247f46c93 100644 --- a/src/hotspot/share/gc/g1/g1BarrierSetRuntime.cpp +++ b/src/hotspot/share/gc/g1/g1BarrierSetRuntime.cpp @@ -61,3 +61,11 @@ JRT_LEAF(void, G1BarrierSetRuntime::write_ref_field_post_entry(volatile G1CardTa G1DirtyCardQueue& queue = G1ThreadLocalData::dirty_card_queue(thread); G1BarrierSet::dirty_card_queue_set().enqueue(queue, card_addr); JRT_END + +JRT_LEAF(void, G1BarrierSetRuntime::clone(oopDesc* src, oopDesc* dst, size_t size)) + HeapAccess<>::clone(src, dst, size); +JRT_END + +address G1BarrierSetRuntime::clone_addr() { + return reinterpret_cast
(clone); +} diff --git a/src/hotspot/share/gc/g1/g1BarrierSetRuntime.hpp b/src/hotspot/share/gc/g1/g1BarrierSetRuntime.hpp index 366679f032b..f98e94096e7 100644 --- a/src/hotspot/share/gc/g1/g1BarrierSetRuntime.hpp +++ b/src/hotspot/share/gc/g1/g1BarrierSetRuntime.hpp @@ -35,6 +35,8 @@ class oopDesc; class JavaThread; class G1BarrierSetRuntime: public AllStatic { +private: + static void clone(oopDesc* src, oopDesc* dst, size_t size); public: using CardValue = G1CardTable::CardValue; @@ -46,6 +48,8 @@ public: // C2 slow-path runtime calls. static void write_ref_field_pre_entry(oopDesc* orig, JavaThread *thread); static void write_ref_field_post_entry(volatile CardValue* card_addr, JavaThread* thread); + + static address clone_addr(); }; #endif // SHARE_GC_G1_G1BARRIERSETRUNTIME_HPP diff --git a/src/hotspot/share/gc/shared/c2/barrierSetC2.cpp b/src/hotspot/share/gc/shared/c2/barrierSetC2.cpp index 59e02452044..643a7936b9b 100644 --- a/src/hotspot/share/gc/shared/c2/barrierSetC2.cpp +++ b/src/hotspot/share/gc/shared/c2/barrierSetC2.cpp @@ -109,6 +109,10 @@ Label* BarrierStubC2::continuation() { return &_continuation; } +uint8_t BarrierStubC2::barrier_data() const { + return _node->barrier_data(); +} + void BarrierStubC2::preserve(Register r) { const VMReg vm_reg = r->as_VMReg(); assert(vm_reg->is_Register(), "r must be a general-purpose register"); diff --git a/src/hotspot/share/gc/shared/c2/barrierSetC2.hpp b/src/hotspot/share/gc/shared/c2/barrierSetC2.hpp index c1485c069c8..00fbf1f2c9f 100644 --- a/src/hotspot/share/gc/shared/c2/barrierSetC2.hpp +++ b/src/hotspot/share/gc/shared/c2/barrierSetC2.hpp @@ -254,6 +254,8 @@ public: Label* entry(); // Return point from the stub (typically end of barrier). Label* continuation(); + // High-level, GC-specific barrier flags. + uint8_t barrier_data() const; // Preserve the value in reg across runtime calls in this barrier. void preserve(Register reg); @@ -340,6 +342,8 @@ public: // Estimated size of the node barrier in number of C2 Ideal nodes. // This is used to guide heuristics in C2, e.g. whether to unroll a loop. virtual uint estimated_barrier_size(const Node* node) const { return 0; } + // Whether the given store can be used to initialize a newly allocated object. + virtual bool can_initialize_object(const StoreNode* store) const { return true; } enum CompilePhase { BeforeOptimize, diff --git a/src/hotspot/share/gc/shared/c2/cardTableBarrierSetC2.cpp b/src/hotspot/share/gc/shared/c2/cardTableBarrierSetC2.cpp index 87bb9f3cd51..11b742156a8 100644 --- a/src/hotspot/share/gc/shared/c2/cardTableBarrierSetC2.cpp +++ b/src/hotspot/share/gc/shared/c2/cardTableBarrierSetC2.cpp @@ -125,39 +125,10 @@ void CardTableBarrierSetC2::post_barrier(GraphKit* kit, kit->final_sync(ideal); } -void CardTableBarrierSetC2::clone(GraphKit* kit, Node* src, Node* dst, Node* size, bool is_array) const { - BarrierSetC2::clone(kit, src, dst, size, is_array); - const TypePtr* raw_adr_type = TypeRawPtr::BOTTOM; - - // If necessary, emit some card marks afterwards. (Non-arrays only.) - bool card_mark = !is_array && !use_ReduceInitialCardMarks(); - if (card_mark) { - assert(!is_array, ""); - // Put in store barrier for any and all oops we are sticking - // into this object. (We could avoid this if we could prove - // that the object type contains no oop fields at all.) 
- Node* no_particular_value = nullptr; - Node* no_particular_field = nullptr; - int raw_adr_idx = Compile::AliasIdxRaw; - post_barrier(kit, kit->control(), - kit->memory(raw_adr_type), - dst, - no_particular_field, - raw_adr_idx, - no_particular_value, - T_OBJECT, - false); - } -} - bool CardTableBarrierSetC2::use_ReduceInitialCardMarks() const { return ReduceInitialCardMarks; } -bool CardTableBarrierSetC2::is_gc_barrier_node(Node* node) const { - return ModRefBarrierSetC2::is_gc_barrier_node(node) || node->Opcode() == Op_StoreCM; -} - void CardTableBarrierSetC2::eliminate_gc_barrier(PhaseMacroExpand* macro, Node* node) const { assert(node->Opcode() == Op_CastP2X, "ConvP2XNode required"); Node *shift = node->unique_out(); diff --git a/src/hotspot/share/gc/shared/c2/cardTableBarrierSetC2.hpp b/src/hotspot/share/gc/shared/c2/cardTableBarrierSetC2.hpp index 9512f09ff8a..3bbf14892d3 100644 --- a/src/hotspot/share/gc/shared/c2/cardTableBarrierSetC2.hpp +++ b/src/hotspot/share/gc/shared/c2/cardTableBarrierSetC2.hpp @@ -42,8 +42,6 @@ protected: Node* byte_map_base_node(GraphKit* kit) const; public: - virtual void clone(GraphKit* kit, Node* src, Node* dst, Node* size, bool is_array) const; - virtual bool is_gc_barrier_node(Node* node) const; virtual void eliminate_gc_barrier(PhaseMacroExpand* macro, Node* node) const; virtual bool array_copy_requires_gc_barriers(bool tightly_coupled_alloc, BasicType type, bool is_clone, bool is_clone_instance, ArrayCopyPhase phase) const; diff --git a/src/hotspot/share/opto/buildOopMap.cpp b/src/hotspot/share/opto/buildOopMap.cpp index 4591e87da2d..b553cc6ea69 100644 --- a/src/hotspot/share/opto/buildOopMap.cpp +++ b/src/hotspot/share/opto/buildOopMap.cpp @@ -235,6 +235,13 @@ OopMap *OopFlow::build_oop_map( Node *n, int max_reg, PhaseRegAlloc *regalloc, i Node *def = _defs[reg]; // Get reaching def assert( def, "since live better have reaching def" ); + if (def->is_MachTemp()) { + assert(!def->bottom_type()->isa_oop_ptr(), + "ADLC only assigns OOP types to MachTemp defs corresponding to xRegN operands"); + // Exclude MachTemp definitions even if they are typed as oops. + continue; + } + // Classify the reaching def as oop, derived, callee-save, dead, or other const Type *t = def->bottom_type(); if( t->isa_oop_ptr() ) { // Oop or derived? diff --git a/src/hotspot/share/opto/lcm.cpp b/src/hotspot/share/opto/lcm.cpp index 9db94748ca2..3c6de96074a 100644 --- a/src/hotspot/share/opto/lcm.cpp +++ b/src/hotspot/share/opto/lcm.cpp @@ -161,6 +161,14 @@ void PhaseCFG::implicit_null_check(Block* block, Node *proj, Node *val, int allo Node *m = val->out(i); if( !m->is_Mach() ) continue; MachNode *mach = m->as_Mach(); + if (mach->barrier_data() != 0) { + // Using memory accesses with barriers to perform implicit null checks is + // not supported. These operations might expand into multiple assembly + // instructions during code emission, including new memory accesses (e.g. + // in G1's pre-barrier), which would invalidate the implicit null + // exception table. + continue; + } was_store = false; int iop = mach->ideal_Opcode(); switch( iop ) { diff --git a/src/hotspot/share/opto/matcher.cpp b/src/hotspot/share/opto/matcher.cpp index bf773d43d3d..6d96bff1c1c 100644 --- a/src/hotspot/share/opto/matcher.cpp +++ b/src/hotspot/share/opto/matcher.cpp @@ -1594,6 +1594,14 @@ static bool match_into_reg( const Node *n, Node *m, Node *control, int i, bool s // the same register. See find_shared_node. 
return false; } else { // Not a constant + if (!shared && Matcher::is_encode_and_store_pattern(n, m)) { + // Make it possible to match "encode and store" patterns with non-shared + // encode operations that are pinned to a control node (e.g. by CastPP + // node removal in final graph reshaping). The matched instruction cannot + // float above the encode's control node because it is pinned to the + // store's control node. + return false; + } // Stop recursion if they have different Controls. Node* m_control = m->in(0); // Control of load's memory can post-dominates load's control. @@ -2833,6 +2841,18 @@ bool Matcher::is_non_long_integral_vector(const Node* n) { return is_subword_type(bt) || bt == T_INT; } +bool Matcher::is_encode_and_store_pattern(const Node* n, const Node* m) { + if (n == nullptr || + m == nullptr || + n->Opcode() != Op_StoreN || + !m->is_EncodeP() || + n->as_Store()->barrier_data() == 0) { + return false; + } + assert(m == n->in(MemNode::ValueIn), "m should be input to n"); + return true; +} + #ifdef ASSERT bool Matcher::verify_after_postselect_cleanup() { assert(!C->failing(), "sanity"); diff --git a/src/hotspot/share/opto/matcher.hpp b/src/hotspot/share/opto/matcher.hpp index 84e48086f92..25762835088 100644 --- a/src/hotspot/share/opto/matcher.hpp +++ b/src/hotspot/share/opto/matcher.hpp @@ -385,6 +385,8 @@ public: return ((bt & BoolTest::unsigned_compare) == BoolTest::unsigned_compare); } + static bool is_encode_and_store_pattern(const Node* n, const Node* m); + // These calls are all generated by the ADLC // Java-Java calling convention diff --git a/src/hotspot/share/opto/memnode.cpp b/src/hotspot/share/opto/memnode.cpp index eee14e5ba03..66139188260 100644 --- a/src/hotspot/share/opto/memnode.cpp +++ b/src/hotspot/share/opto/memnode.cpp @@ -4644,6 +4644,11 @@ intptr_t InitializeNode::can_capture_store(StoreNode* st, PhaseGVN* phase, bool Node* mem = st->in(MemNode::Memory); if (!(mem->is_Proj() && mem->in(0) == this)) return FAIL; // must not be preceded by other stores + BarrierSetC2* bs = BarrierSet::barrier_set()->barrier_set_c2(); + if ((st->Opcode() == Op_StoreP || st->Opcode() == Op_StoreN) && + !bs->can_initialize_object(st)) { + return FAIL; + } Node* adr = st->in(MemNode::Address); intptr_t offset; AllocateNode* alloc = AllocateNode::Ideal_allocation(adr, phase, offset); diff --git a/src/hotspot/share/opto/output.cpp b/src/hotspot/share/opto/output.cpp index b3f251bb361..260f887347f 100644 --- a/src/hotspot/share/opto/output.cpp +++ b/src/hotspot/share/opto/output.cpp @@ -2022,6 +2022,8 @@ void PhaseOutput::FillExceptionTables(uint cnt, uint *call_returns, uint *inct_s // Handle implicit null exception table updates if (n->is_MachNullCheck()) { + assert(n->in(1)->as_Mach()->barrier_data() == 0, + "Implicit null checks on memory accesses with barriers are not yet supported"); uint block_num = block->non_connector_successor(0)->_pre_order; _inc_table.append(inct_starts[inct_cnt++], blk_labels[block_num].loc_pos()); continue; diff --git a/test/hotspot/jtreg/compiler/c2/aarch64/TestVolatiles.java b/test/hotspot/jtreg/compiler/c2/aarch64/TestVolatiles.java index 23b9321fc35..3f82c3e00b3 100644 --- a/test/hotspot/jtreg/compiler/c2/aarch64/TestVolatiles.java +++ b/test/hotspot/jtreg/compiler/c2/aarch64/TestVolatiles.java @@ -261,20 +261,11 @@ public class TestVolatiles { }; break; case "G1": - // a card mark volatile barrier should be generated - // before the card mark strb - // - // following the fix for 8225776 the G1 barrier is now - // scheduled out of line 
after the membar volatile and - // and subsequent return matches = new String[] { "membar_release \\(elided\\)", useCompressedOops ? "stlrw?" : "stlr", "membar_volatile \\(elided\\)", - "ret", - "membar_volatile", - "dmb ish", - "strb" + "ret" }; break; case "Shenandoah": @@ -332,20 +323,11 @@ public class TestVolatiles { }; break; case "G1": - // a card mark volatile barrier should be generated - // before the card mark strb - // - // following the fix for 8225776 the G1 barrier is now - // scheduled out of line after the membar acquire and - // and subsequent return matches = new String[] { "membar_release \\(elided\\)", useCompressedOops ? "cmpxchgw?_acq" : "cmpxchg_acq", "membar_acquire \\(elided\\)", - "ret", - "membar_volatile", - "dmb ish", - "strb" + "ret" }; break; case "Shenandoah": @@ -418,20 +400,11 @@ public class TestVolatiles { return; case "G1": - // a card mark volatile barrier should be generated - // before the card mark strb - // - // following the fix for 8225776 the G1 barrier is now - // scheduled out of line after the membar acquire and - // and subsequent return matches = new String[] { "membar_release \\(elided\\)", useCompressedOops ? "cmpxchgw?_acq" : "cmpxchg_acq", "membar_acquire \\(elided\\)", - "ret", - "membar_volatile", - "dmb ish", - "strb" + "ret" }; break; case "Shenandoah": @@ -484,20 +457,11 @@ public class TestVolatiles { }; break; case "G1": - // a card mark volatile barrier should be generated - // before the card mark strb - // - // following the fix for 8225776 the G1 barrier is now - // scheduled out of line after the membar acquire and - // and subsequent return matches = new String[] { "membar_release \\(elided\\)", useCompressedOops ? "atomic_xchgw?_acq" : "atomic_xchg_acq", "membar_acquire \\(elided\\)", - "ret", - "membar_volatile", - "dmb ish", - "strb" + "ret" }; break; case "Shenandoah": diff --git a/test/hotspot/jtreg/compiler/c2/irTests/scalarReplacement/AllocationMergesTests.java b/test/hotspot/jtreg/compiler/c2/irTests/scalarReplacement/AllocationMergesTests.java index cd3d5329771..69b3cb5274b 100644 --- a/test/hotspot/jtreg/compiler/c2/irTests/scalarReplacement/AllocationMergesTests.java +++ b/test/hotspot/jtreg/compiler/c2/irTests/scalarReplacement/AllocationMergesTests.java @@ -1355,9 +1355,12 @@ public class AllocationMergesTests { } @Test - @IR(counts = { IRNode.ALLOC, "1" }) - // The last allocation won't be reduced because it would cause the creation - // of a nested SafePointScalarMergeNode. + // Using G1, all allocations are reduced. + @IR(applyIf = {"UseG1GC", "true"}, failOn = { IRNode.ALLOC }) + // Otherwise, the last allocation won't be reduced because it would cause + // the creation of a nested SafePointScalarMergeNode. This is caused by the + // store barrier corresponding to 'C.other = B'. + @IR(applyIf = {"UseG1GC", "false"}, counts = { IRNode.ALLOC, "1" }) int testReReduce_C2(boolean cond1, int x, int y) { return testReReduce(cond1, x, y); } @DontCompile diff --git a/test/hotspot/jtreg/compiler/gcbarriers/TestG1BarrierGeneration.java b/test/hotspot/jtreg/compiler/gcbarriers/TestG1BarrierGeneration.java new file mode 100644 index 00000000000..36ad0bf84a4 --- /dev/null +++ b/test/hotspot/jtreg/compiler/gcbarriers/TestG1BarrierGeneration.java @@ -0,0 +1,639 @@ +/* + * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package compiler.gcbarriers; + +import compiler.lib.ir_framework.*; +import java.lang.invoke.VarHandle; +import java.lang.invoke.MethodHandles; +import java.lang.ref.Reference; +import java.lang.ref.ReferenceQueue; +import java.lang.ref.SoftReference; +import java.lang.ref.WeakReference; +import java.util.concurrent.ThreadLocalRandom; +import jdk.test.lib.Asserts; + +/** + * @test + * @summary Test that G1 barriers are generated and optimized as expected. + * @library /test/lib / + * @requires vm.gc.G1 + * @run driver compiler.gcbarriers.TestG1BarrierGeneration + */ + +public class TestG1BarrierGeneration { + static final String PRE_ONLY = "pre"; + static final String POST_ONLY = "post"; + static final String POST_ONLY_NOT_NULL = "post notnull"; + static final String PRE_AND_POST = "pre post"; + static final String PRE_AND_POST_NOT_NULL = "pre post notnull"; + + static class Outer { + Object f; + } + + static class OuterWithVolatileField { + volatile Object f; + } + + static class OuterWithFewFields implements Cloneable { + Object f1; + Object f2; + public Object clone() throws CloneNotSupportedException { + return super.clone(); + } + } + + static class OuterWithManyFields implements Cloneable { + Object f1; + Object f2; + Object f3; + Object f4; + Object f5; + Object f6; + Object f7; + Object f8; + Object f9; + Object f10; + public Object clone() throws CloneNotSupportedException { + return super.clone(); + } + } + + static final VarHandle fVarHandle; + static { + MethodHandles.Lookup l = MethodHandles.lookup(); + try { + fVarHandle = l.findVarHandle(Outer.class, "f", Object.class); + } catch (Exception e) { + throw new Error(e); + } + } + + public static void main(String[] args) { + TestFramework framework = new TestFramework(); + Scenario[] scenarios = new Scenario[2*2]; + int scenarioIndex = 0; + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 2; j++) { + scenarios[scenarioIndex] = + new Scenario(scenarioIndex, + "-XX:CompileCommand=inline,java.lang.ref.*::*", + "-XX:" + (i == 0 ? "-" : "+") + "UseCompressedOops", + "-XX:" + (j == 0 ? 
"-" : "+") + "ReduceInitialCardMarks"); + scenarioIndex++; + } + } + framework.addScenarios(scenarios); + framework.start(); + } + + @Test + @IR(applyIf = {"UseCompressedOops", "false"}, + counts = {IRNode.G1_STORE_P_WITH_BARRIER_FLAG, PRE_AND_POST, "1"}, + phase = CompilePhase.FINAL_CODE) + @IR(applyIf = {"UseCompressedOops", "true"}, + counts = {IRNode.G1_ENCODE_P_AND_STORE_N_WITH_BARRIER_FLAG, PRE_AND_POST, "1"}, + phase = CompilePhase.FINAL_CODE) + public static void testStore(Outer o, Object o1) { + o.f = o1; + } + + @Test + @IR(applyIf = {"UseCompressedOops", "false"}, + counts = {IRNode.G1_STORE_P_WITH_BARRIER_FLAG, PRE_ONLY, "1"}, + phase = CompilePhase.FINAL_CODE) + @IR(applyIf = {"UseCompressedOops", "true"}, + counts = {IRNode.G1_STORE_N_WITH_BARRIER_FLAG, PRE_ONLY, "1"}, + phase = CompilePhase.FINAL_CODE) + public static void testStoreNull(Outer o) { + o.f = null; + } + + @Test + @IR(applyIf = {"UseCompressedOops", "false"}, + counts = {IRNode.G1_STORE_P_WITH_BARRIER_FLAG, PRE_ONLY, "1"}, + phase = CompilePhase.FINAL_CODE) + @IR(applyIf = {"UseCompressedOops", "true"}, + counts = {IRNode.G1_STORE_N_WITH_BARRIER_FLAG, PRE_ONLY, "1"}, + phase = CompilePhase.FINAL_CODE) + public static void testStoreObfuscatedNull(Outer o, Object o1) { + Object o2 = o1; + for (int i = 0; i < 4; i++) { + if ((i % 2) == 0) { + o2 = null; + } + } + // o2 is null here, but this is only known to C2 after applying some + // optimizations (loop unrolling, IGVN). + o.f = o2; + } + + @Test + @IR(applyIf = {"UseCompressedOops", "false"}, + counts = {IRNode.G1_STORE_P_WITH_BARRIER_FLAG, PRE_AND_POST_NOT_NULL, "1"}, + phase = CompilePhase.FINAL_CODE) + @IR(applyIf = {"UseCompressedOops", "true"}, + counts = {IRNode.G1_ENCODE_P_AND_STORE_N_WITH_BARRIER_FLAG, PRE_AND_POST_NOT_NULL, "1"}, + phase = CompilePhase.FINAL_CODE) + public static void testStoreNotNull(Outer o, Object o1) { + if (o1.hashCode() == 42) { + return; + } + o.f = o1; + } + + @Test + @IR(applyIf = {"UseCompressedOops", "false"}, + counts = {IRNode.G1_STORE_P_WITH_BARRIER_FLAG, PRE_AND_POST, "2"}, + phase = CompilePhase.FINAL_CODE) + @IR(applyIf = {"UseCompressedOops", "true"}, + counts = {IRNode.G1_ENCODE_P_AND_STORE_N_WITH_BARRIER_FLAG, PRE_AND_POST, "2"}, + phase = CompilePhase.FINAL_CODE) + public static void testStoreTwice(Outer o, Outer p, Object o1) { + o.f = o1; + p.f = o1; + } + + @Test + @IR(applyIf = {"UseCompressedOops", "false"}, + counts = {IRNode.G1_STORE_P_WITH_BARRIER_FLAG, PRE_AND_POST, "1"}, + phase = CompilePhase.FINAL_CODE) + @IR(applyIf = {"UseCompressedOops", "true"}, + counts = {IRNode.G1_ENCODE_P_AND_STORE_N_WITH_BARRIER_FLAG, PRE_AND_POST, "1"}, + phase = CompilePhase.FINAL_CODE) + public static void testStoreVolatile(OuterWithVolatileField o, Object o1) { + o.f = o1; + } + + @Test + @IR(applyIfAnd = {"UseCompressedOops", "false", "ReduceInitialCardMarks", "false"}, + counts = {IRNode.G1_STORE_P_WITH_BARRIER_FLAG, POST_ONLY, "1"}, + phase = CompilePhase.FINAL_CODE) + @IR(applyIfAnd = {"UseCompressedOops", "true", "ReduceInitialCardMarks", "false"}, + counts = {IRNode.G1_ENCODE_P_AND_STORE_N_WITH_BARRIER_FLAG, POST_ONLY, "1"}, + phase = CompilePhase.FINAL_CODE) + @IR(applyIfAnd = {"UseCompressedOops", "false", "ReduceInitialCardMarks", "true"}, + failOn = {IRNode.G1_STORE_P}, + phase = CompilePhase.FINAL_CODE) + @IR(applyIfAnd = {"UseCompressedOops", "true", "ReduceInitialCardMarks", "true"}, + failOn = {IRNode.G1_STORE_N, IRNode.G1_ENCODE_P_AND_STORE_N}, + phase = CompilePhase.FINAL_CODE) + public static Outer 
testStoreOnNewObject(Object o1) { + Outer o = new Outer(); + o.f = o1; + return o; + } + + @Test + @IR(failOn = {IRNode.STORE_P, IRNode.STORE_N}, + phase = CompilePhase.BEFORE_MACRO_EXPANSION) + public static Outer testStoreNullOnNewObject() { + Outer o = new Outer(); + o.f = null; + return o; + } + + @Test + @IR(applyIfAnd = {"UseCompressedOops", "false", "ReduceInitialCardMarks", "false"}, + counts = {IRNode.G1_STORE_P_WITH_BARRIER_FLAG, POST_ONLY_NOT_NULL, "1"}, + phase = CompilePhase.FINAL_CODE) + @IR(applyIfAnd = {"UseCompressedOops", "true", "ReduceInitialCardMarks", "false"}, + counts = {IRNode.G1_ENCODE_P_AND_STORE_N_WITH_BARRIER_FLAG, POST_ONLY_NOT_NULL, "1"}, + phase = CompilePhase.FINAL_CODE) + @IR(applyIfAnd = {"UseCompressedOops", "false", "ReduceInitialCardMarks", "true"}, + failOn = {IRNode.G1_STORE_P}, + phase = CompilePhase.FINAL_CODE) + @IR(applyIfAnd = {"UseCompressedOops", "true", "ReduceInitialCardMarks", "true"}, + failOn = {IRNode.G1_STORE_N, IRNode.G1_ENCODE_P_AND_STORE_N}, + phase = CompilePhase.FINAL_CODE) + public static Outer testStoreNotNullOnNewObject(Object o1) { + if (o1.hashCode() == 42) { + return null; + } + Outer o = new Outer(); + o.f = o1; + return o; + } + + @Test + @IR(applyIfAnd = {"UseCompressedOops", "false", "ReduceInitialCardMarks", "false"}, + counts = {IRNode.G1_STORE_P_WITH_BARRIER_FLAG, POST_ONLY, "2"}, + phase = CompilePhase.FINAL_CODE) + @IR(applyIfAnd = {"UseCompressedOops", "true", "ReduceInitialCardMarks", "false"}, + counts = {IRNode.G1_ENCODE_P_AND_STORE_N_WITH_BARRIER_FLAG, POST_ONLY, "2"}, + phase = CompilePhase.FINAL_CODE) + @IR(applyIfAnd = {"UseCompressedOops", "false", "ReduceInitialCardMarks", "true"}, + failOn = {IRNode.G1_STORE_P}, + phase = CompilePhase.FINAL_CODE) + @IR(applyIfAnd = {"UseCompressedOops", "true", "ReduceInitialCardMarks", "true"}, + failOn = {IRNode.G1_STORE_N, IRNode.G1_ENCODE_P_AND_STORE_N}, + phase = CompilePhase.FINAL_CODE) + public static Outer testStoreOnNewObjectInTwoPaths(Object o1, boolean c) { + Outer o; + if (c) { + o = new Outer(); + o.f = o1; + } else { + o = new Outer(); + o.f = o1; + } + return o; + } + + @Run(test = {"testStore", + "testStoreNull", + "testStoreObfuscatedNull", + "testStoreNotNull", + "testStoreTwice", + "testStoreVolatile", + "testStoreOnNewObject", + "testStoreNullOnNewObject", + "testStoreNotNullOnNewObject", + "testStoreOnNewObjectInTwoPaths"}) + public void runStoreTests() { + { + Outer o = new Outer(); + Object o1 = new Object(); + testStore(o, o1); + Asserts.assertEquals(o1, o.f); + } + { + Outer o = new Outer(); + testStoreNull(o); + Asserts.assertNull(o.f); + } + { + Outer o = new Outer(); + Object o1 = new Object(); + testStoreObfuscatedNull(o, o1); + Asserts.assertNull(o.f); + } + { + Outer o = new Outer(); + Object o1 = new Object(); + testStoreNotNull(o, o1); + Asserts.assertEquals(o1, o.f); + } + { + Outer o = new Outer(); + Outer p = new Outer(); + Object o1 = new Object(); + testStoreTwice(o, p, o1); + Asserts.assertEquals(o1, o.f); + Asserts.assertEquals(o1, p.f); + } + { + OuterWithVolatileField o = new OuterWithVolatileField(); + Object o1 = new Object(); + testStoreVolatile(o, o1); + Asserts.assertEquals(o1, o.f); + } + { + Object o1 = new Object(); + Outer o = testStoreOnNewObject(o1); + Asserts.assertEquals(o1, o.f); + } + { + Outer o = testStoreNullOnNewObject(); + Asserts.assertNull(o.f); + } + { + Object o1 = new Object(); + Outer o = testStoreNotNullOnNewObject(o1); + Asserts.assertEquals(o1, o.f); + } + { + Object o1 = new Object(); + Outer o = 
testStoreOnNewObjectInTwoPaths(o1, ThreadLocalRandom.current().nextBoolean()); + Asserts.assertEquals(o1, o.f); + } + } + + @Test + @IR(applyIf = {"UseCompressedOops", "false"}, + counts = {IRNode.G1_STORE_P_WITH_BARRIER_FLAG, PRE_AND_POST, "1"}, + phase = CompilePhase.FINAL_CODE) + @IR(applyIf = {"UseCompressedOops", "true"}, + counts = {IRNode.G1_ENCODE_P_AND_STORE_N_WITH_BARRIER_FLAG, PRE_AND_POST, "1"}, + phase = CompilePhase.FINAL_CODE) + public static void testArrayStore(Object[] a, int index, Object o1) { + a[index] = o1; + } + + @Test + @IR(applyIf = {"UseCompressedOops", "false"}, + counts = {IRNode.G1_STORE_P_WITH_BARRIER_FLAG, PRE_ONLY, "1"}, + phase = CompilePhase.FINAL_CODE) + @IR(applyIf = {"UseCompressedOops", "true"}, + counts = {IRNode.G1_STORE_N_WITH_BARRIER_FLAG, PRE_ONLY, "1"}, + phase = CompilePhase.FINAL_CODE) + public static void testArrayStoreNull(Object[] a, int index) { + a[index] = null; + } + + @Test + @IR(applyIf = {"UseCompressedOops", "false"}, + counts = {IRNode.G1_STORE_P_WITH_BARRIER_FLAG, PRE_AND_POST_NOT_NULL, "1"}, + phase = CompilePhase.FINAL_CODE) + @IR(applyIf = {"UseCompressedOops", "true"}, + counts = {IRNode.G1_ENCODE_P_AND_STORE_N_WITH_BARRIER_FLAG, PRE_AND_POST_NOT_NULL, "1"}, + phase = CompilePhase.FINAL_CODE) + public static void testArrayStoreNotNull(Object[] a, int index, Object o1) { + if (o1.hashCode() == 42) { + return; + } + a[index] = o1; + } + + @Test + @IR(applyIf = {"UseCompressedOops", "false"}, + counts = {IRNode.G1_STORE_P_WITH_BARRIER_FLAG, PRE_AND_POST, "2"}, + phase = CompilePhase.FINAL_CODE) + @IR(applyIf = {"UseCompressedOops", "true"}, + counts = {IRNode.G1_ENCODE_P_AND_STORE_N_WITH_BARRIER_FLAG, PRE_AND_POST, "2"}, + phase = CompilePhase.FINAL_CODE) + public static void testArrayStoreTwice(Object[] a, Object[] b, int index, Object o1) { + a[index] = o1; + b[index] = o1; + } + + @Test + @IR(applyIfAnd = {"UseCompressedOops", "false", "ReduceInitialCardMarks", "true"}, + failOn = {IRNode.G1_STORE_P}, + phase = CompilePhase.FINAL_CODE) + @IR(applyIfAnd = {"UseCompressedOops", "true", "ReduceInitialCardMarks", "true"}, + failOn = {IRNode.G1_STORE_N, IRNode.G1_ENCODE_P_AND_STORE_N}, + phase = CompilePhase.FINAL_CODE) + public static Object[] testStoreOnNewArray(Object o1) { + Object[] a = new Object[10]; + // The index needs to be concrete for C2 to detect that it is safe to + // remove the pre-barrier. 
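        // (That is, the pre-barrier is only elided when C2 can prove that the
        //  previous value of the stored-to slot is null; with a non-constant
        //  index it cannot prove that the slot of the fresh array is still
        //  unwritten.)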
+        a[4] = o1;
+        return a;
+    }
+
+    @Run(test = {"testArrayStore",
+                 "testArrayStoreNull",
+                 "testArrayStoreNotNull",
+                 "testArrayStoreTwice",
+                 "testStoreOnNewArray"})
+    public void runArrayStoreTests() {
+        {
+            Object[] a = new Object[10];
+            Object o1 = new Object();
+            testArrayStore(a, 4, o1);
+            Asserts.assertEquals(o1, a[4]);
+        }
+        {
+            Object[] a = new Object[10];
+            testArrayStoreNull(a, 4);
+            Asserts.assertNull(a[4]);
+        }
+        {
+            Object[] a = new Object[10];
+            Object o1 = new Object();
+            testArrayStoreNotNull(a, 4, o1);
+            Asserts.assertEquals(o1, a[4]);
+        }
+        {
+            Object[] a = new Object[10];
+            Object[] b = new Object[10];
+            Object o1 = new Object();
+            testArrayStoreTwice(a, b, 4, o1);
+            Asserts.assertEquals(o1, a[4]);
+            Asserts.assertEquals(o1, b[4]);
+        }
+        {
+            Object o1 = new Object();
+            Object[] a = testStoreOnNewArray(o1);
+            Asserts.assertEquals(o1, a[4]);
+        }
+    }
+
+    @Test
+    public static Object[] testCloneArrayOfObjects(Object[] a) {
+        Object[] a1 = null;
+        try {
+            a1 = a.clone();
+        } catch (Exception e) {}
+        return a1;
+    }
+
+    @Test
+    @IR(applyIf = {"ReduceInitialCardMarks", "true"},
+        failOn = {IRNode.G1_STORE_P, IRNode.G1_STORE_N, IRNode.G1_ENCODE_P_AND_STORE_N},
+        phase = CompilePhase.FINAL_CODE)
+    @IR(applyIfAnd = {"ReduceInitialCardMarks", "false", "UseCompressedOops", "false"},
+        counts = {IRNode.G1_STORE_P_WITH_BARRIER_FLAG, POST_ONLY, "2"},
+        phase = CompilePhase.FINAL_CODE)
+    @IR(applyIfAnd = {"ReduceInitialCardMarks", "false", "UseCompressedOops", "true"},
+        counts = {IRNode.G1_STORE_N_WITH_BARRIER_FLAG, POST_ONLY, "2"},
+        phase = CompilePhase.FINAL_CODE)
+    public static OuterWithFewFields testCloneObjectWithFewFields(OuterWithFewFields o) {
+        Object o1 = null;
+        try {
+            o1 = o.clone();
+        } catch (Exception e) {}
+        return (OuterWithFewFields)o1;
+    }
+
+    @Test
+    @IR(applyIf = {"ReduceInitialCardMarks", "true"},
+        counts = {IRNode.CALL_OF, "jlong_disjoint_arraycopy", "1"})
+    @IR(applyIf = {"ReduceInitialCardMarks", "false"},
+        counts = {IRNode.CALL_OF, "G1BarrierSetRuntime::clone", "1"})
+    public static OuterWithManyFields testCloneObjectWithManyFields(OuterWithManyFields o) {
+        Object o1 = null;
+        try {
+            o1 = o.clone();
+        } catch (Exception e) {}
+        return (OuterWithManyFields)o1;
+    }
+
+    @Run(test = {"testCloneArrayOfObjects",
+                 "testCloneObjectWithFewFields",
+                 "testCloneObjectWithManyFields"})
+    public void runCloneTests() {
+        {
+            Object o1 = new Object();
+            Object[] a = new Object[4];
+            for (int i = 0; i < 4; i++) {
+                a[i] = o1;
+            }
+            Object[] a1 = testCloneArrayOfObjects(a);
+            for (int i = 0; i < 4; i++) {
+                Asserts.assertEquals(o1, a1[i]);
+            }
+        }
+        {
+            Object a = new Object();
+            Object b = new Object();
+            OuterWithFewFields o = new OuterWithFewFields();
+            o.f1 = a;
+            o.f2 = b;
+            OuterWithFewFields o1 = testCloneObjectWithFewFields(o);
+            Asserts.assertEquals(a, o1.f1);
+            Asserts.assertEquals(b, o1.f2);
+        }
+        {
+            Object a = new Object();
+            Object b = new Object();
+            Object c = new Object();
+            Object d = new Object();
+            Object e = new Object();
+            Object f = new Object();
+            Object g = new Object();
+            Object h = new Object();
+            Object i = new Object();
+            Object j = new Object();
+            OuterWithManyFields o = new OuterWithManyFields();
+            o.f1 = a;
+            o.f2 = b;
+            o.f3 = c;
+            o.f4 = d;
+            o.f5 = e;
+            o.f6 = f;
+            o.f7 = g;
+            o.f8 = h;
+            o.f9 = i;
+            o.f10 = j;
+            OuterWithManyFields o1 = testCloneObjectWithManyFields(o);
+            Asserts.assertEquals(a, o1.f1);
+            Asserts.assertEquals(b, o1.f2);
+            Asserts.assertEquals(c, o1.f3);
+            Asserts.assertEquals(d, o1.f4);
+            Asserts.assertEquals(e, o1.f5);
+            Asserts.assertEquals(f, o1.f6);
+            Asserts.assertEquals(g, o1.f7);
+            Asserts.assertEquals(h, o1.f8);
+            Asserts.assertEquals(i, o1.f9);
+            Asserts.assertEquals(j, o1.f10);
+        }
+    }
+
+    @Test
+    @IR(applyIf = {"UseCompressedOops", "false"},
+        counts = {IRNode.G1_COMPARE_AND_EXCHANGE_P_WITH_BARRIER_FLAG, PRE_AND_POST, "1"},
+        phase = CompilePhase.FINAL_CODE)
+    @IR(applyIf = {"UseCompressedOops", "true"},
+        counts = {IRNode.G1_COMPARE_AND_EXCHANGE_N_WITH_BARRIER_FLAG, PRE_AND_POST, "1"},
+        phase = CompilePhase.FINAL_CODE)
+    static Object testCompareAndExchange(Outer o, Object oldVal, Object newVal) {
+        return fVarHandle.compareAndExchange(o, oldVal, newVal);
+    }
+
+    @Test
+    @IR(applyIf = {"UseCompressedOops", "false"},
+        counts = {IRNode.G1_COMPARE_AND_SWAP_P_WITH_BARRIER_FLAG, PRE_AND_POST, "1"},
+        phase = CompilePhase.FINAL_CODE)
+    @IR(applyIf = {"UseCompressedOops", "true"},
+        counts = {IRNode.G1_COMPARE_AND_SWAP_N_WITH_BARRIER_FLAG, PRE_AND_POST, "1"},
+        phase = CompilePhase.FINAL_CODE)
+    static boolean testCompareAndSwap(Outer o, Object oldVal, Object newVal) {
+        return fVarHandle.compareAndSet(o, oldVal, newVal);
+    }
+
+    @Test
+    @IR(applyIf = {"UseCompressedOops", "false"},
+        counts = {IRNode.G1_GET_AND_SET_P_WITH_BARRIER_FLAG, PRE_AND_POST, "1"},
+        phase = CompilePhase.FINAL_CODE)
+    @IR(applyIf = {"UseCompressedOops", "true"},
+        counts = {IRNode.G1_GET_AND_SET_N_WITH_BARRIER_FLAG, PRE_AND_POST, "1"},
+        phase = CompilePhase.FINAL_CODE)
+    static Object testGetAndSet(Outer o, Object newVal) {
+        return fVarHandle.getAndSet(o, newVal);
+    }
+
+    @Run(test = {"testCompareAndExchange",
+                 "testCompareAndSwap",
+                 "testGetAndSet"})
+    public void runAtomicTests() {
+        {
+            Outer o = new Outer();
+            Object oldVal = new Object();
+            o.f = oldVal;
+            Object newVal = new Object();
+            Object oldVal2 = testCompareAndExchange(o, oldVal, newVal);
+            Asserts.assertEquals(oldVal, oldVal2);
+            Asserts.assertEquals(o.f, newVal);
+        }
+        {
+            Outer o = new Outer();
+            Object oldVal = new Object();
+            o.f = oldVal;
+            Object newVal = new Object();
+            boolean b = testCompareAndSwap(o, oldVal, newVal);
+            Asserts.assertTrue(b);
+            Asserts.assertEquals(o.f, newVal);
+        }
+        {
+            Outer o = new Outer();
+            Object oldVal = new Object();
+            o.f = oldVal;
+            Object newVal = new Object();
+            Object oldVal2 = testGetAndSet(o, newVal);
+            Asserts.assertEquals(oldVal, oldVal2);
+            Asserts.assertEquals(o.f, newVal);
+        }
+    }
+
+    @Test
+    @IR(applyIf = {"UseCompressedOops", "false"},
+        counts = {IRNode.G1_LOAD_P_WITH_BARRIER_FLAG, PRE_ONLY, "1"},
+        phase = CompilePhase.FINAL_CODE)
+    @IR(applyIf = {"UseCompressedOops", "true"},
+        counts = {IRNode.G1_LOAD_N_WITH_BARRIER_FLAG, PRE_ONLY, "1"},
+        phase = CompilePhase.FINAL_CODE)
+    static Object testLoadSoftReference(SoftReference ref) {
+        return ref.get();
+    }
+
+    @Test
+    @IR(applyIf = {"UseCompressedOops", "false"},
+        counts = {IRNode.G1_LOAD_P_WITH_BARRIER_FLAG, PRE_ONLY, "1"},
+        phase = CompilePhase.FINAL_CODE)
+    @IR(applyIf = {"UseCompressedOops", "true"},
+        counts = {IRNode.G1_LOAD_N_WITH_BARRIER_FLAG, PRE_ONLY, "1"},
+        phase = CompilePhase.FINAL_CODE)
+    static Object testLoadWeakReference(WeakReference ref) {
+        return ref.get();
+    }
+
+    @Run(test = {"testLoadSoftReference",
+                 "testLoadWeakReference"})
+    public void runReferenceTests() {
+        {
+            Object o1 = new Object();
+            SoftReference sref = new SoftReference(o1);
+            Object o2 = testLoadSoftReference(sref);
+            Asserts.assertTrue(o2 == o1 || o2 == null);
+        }
+        {
+            Object o1 = new Object();
+            WeakReference wref = new WeakReference(o1);
+            Object o2 = testLoadWeakReference(wref);
+            Asserts.assertTrue(o2 == o1 || o2 == null);
+        }
+    }
+}
diff --git a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java
index 16f56012d3d..a7c61f71050 100644
--- a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java
+++ b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java
@@ -358,6 +358,11 @@ public class IRNode {
         beforeMatchingNameRegex(CALL, "Call.*Java");
     }
 
+    public static final String CALL_OF = COMPOSITE_PREFIX + "CALL_OF" + POSTFIX;
+    static {
+        callOfNodes(CALL_OF, "Call.*");
+    }
+
     public static final String CALL_OF_METHOD = COMPOSITE_PREFIX + "CALL_OF_METHOD" + POSTFIX;
     static {
         callOfNodes(CALL_OF_METHOD, "Call.*Java");
@@ -581,6 +586,92 @@ public class IRNode {
         vectorNode(FMA_VD, "FmaVD", TYPE_DOUBLE);
     }
 
+    public static final String G1_COMPARE_AND_EXCHANGE_N_WITH_BARRIER_FLAG = COMPOSITE_PREFIX + "G1_COMPARE_AND_EXCHANGE_N_WITH_BARRIER_FLAG" + POSTFIX;
+    static {
+        String regex = START + "g1CompareAndExchangeN\\S*" + MID + "barrier\\(\\s*" + IS_REPLACED + "\\s*\\)" + END;
+        machOnly(G1_COMPARE_AND_EXCHANGE_N_WITH_BARRIER_FLAG, regex);
+    }
+
+    public static final String G1_COMPARE_AND_EXCHANGE_P_WITH_BARRIER_FLAG = COMPOSITE_PREFIX + "G1_COMPARE_AND_EXCHANGE_P_WITH_BARRIER_FLAG" + POSTFIX;
+    static {
+        String regex = START + "g1CompareAndExchangeP\\S*" + MID + "barrier\\(\\s*" + IS_REPLACED + "\\s*\\)" + END;
+        machOnly(G1_COMPARE_AND_EXCHANGE_P_WITH_BARRIER_FLAG, regex);
+    }
+
+    public static final String G1_COMPARE_AND_SWAP_N_WITH_BARRIER_FLAG = COMPOSITE_PREFIX + "G1_COMPARE_AND_SWAP_N_WITH_BARRIER_FLAG" + POSTFIX;
+    static {
+        String regex = START + "g1CompareAndSwapN\\S*" + MID + "barrier\\(\\s*" + IS_REPLACED + "\\s*\\)" + END;
+        machOnly(G1_COMPARE_AND_SWAP_N_WITH_BARRIER_FLAG, regex);
+    }
+
+    public static final String G1_COMPARE_AND_SWAP_P_WITH_BARRIER_FLAG = COMPOSITE_PREFIX + "G1_COMPARE_AND_SWAP_P_WITH_BARRIER_FLAG" + POSTFIX;
+    static {
+        String regex = START + "g1CompareAndSwapP\\S*" + MID + "barrier\\(\\s*" + IS_REPLACED + "\\s*\\)" + END;
+        machOnly(G1_COMPARE_AND_SWAP_P_WITH_BARRIER_FLAG, regex);
+    }
+
+    public static final String G1_ENCODE_P_AND_STORE_N = PREFIX + "G1_ENCODE_P_AND_STORE_N" + POSTFIX;
+    static {
+        machOnlyNameRegex(G1_ENCODE_P_AND_STORE_N, "g1EncodePAndStoreN");
+    }
+
+    public static final String G1_ENCODE_P_AND_STORE_N_WITH_BARRIER_FLAG = COMPOSITE_PREFIX + "G1_ENCODE_P_AND_STORE_N_WITH_BARRIER_FLAG" + POSTFIX;
+    static {
+        String regex = START + "g1EncodePAndStoreN\\S*" + MID + "barrier\\(\\s*" + IS_REPLACED + "\\s*\\)" + END;
+        machOnly(G1_ENCODE_P_AND_STORE_N_WITH_BARRIER_FLAG, regex);
+    }
+
+    public static final String G1_GET_AND_SET_N_WITH_BARRIER_FLAG = COMPOSITE_PREFIX + "G1_GET_AND_SET_N_WITH_BARRIER_FLAG" + POSTFIX;
+    static {
+        String regex = START + "g1GetAndSetN\\S*" + MID + "barrier\\(\\s*" + IS_REPLACED + "\\s*\\)" + END;
+        machOnly(G1_GET_AND_SET_N_WITH_BARRIER_FLAG, regex);
+    }
+
+    public static final String G1_GET_AND_SET_P_WITH_BARRIER_FLAG = COMPOSITE_PREFIX + "G1_GET_AND_SET_P_WITH_BARRIER_FLAG" + POSTFIX;
+    static {
+        String regex = START + "g1GetAndSetP\\S*" + MID + "barrier\\(\\s*" + IS_REPLACED + "\\s*\\)" + END;
+        machOnly(G1_GET_AND_SET_P_WITH_BARRIER_FLAG, regex);
+    }
+
+    public static final String G1_LOAD_N = PREFIX + "G1_LOAD_N" + POSTFIX;
+    static {
+        machOnlyNameRegex(G1_LOAD_N, "g1LoadN");
+    }
+
+    public static final String G1_LOAD_N_WITH_BARRIER_FLAG = COMPOSITE_PREFIX + "G1_LOAD_N_WITH_BARRIER_FLAG" + POSTFIX;
+    static {
+        String regex = START + "g1LoadN\\S*" + MID + "barrier\\(\\s*" + IS_REPLACED + "\\s*\\)" + END;
+        machOnly(G1_LOAD_N_WITH_BARRIER_FLAG, regex);
+    }
+
+    public static final String G1_LOAD_P_WITH_BARRIER_FLAG = COMPOSITE_PREFIX + "G1_LOAD_P_WITH_BARRIER_FLAG" + POSTFIX;
+    static {
+        String regex = START + "g1LoadP\\S*" + MID + "barrier\\(\\s*" + IS_REPLACED + "\\s*\\)" + END;
+        machOnly(G1_LOAD_P_WITH_BARRIER_FLAG, regex);
+    }
+
+    public static final String G1_STORE_N = PREFIX + "G1_STORE_N" + POSTFIX;
+    static {
+        machOnlyNameRegex(G1_STORE_N, "g1StoreN");
+    }
+
+    public static final String G1_STORE_N_WITH_BARRIER_FLAG = COMPOSITE_PREFIX + "G1_STORE_N_WITH_BARRIER_FLAG" + POSTFIX;
+    static {
+        String regex = START + "g1StoreN\\S*" + MID + "barrier\\(\\s*" + IS_REPLACED + "\\s*\\)" + END;
+        machOnly(G1_STORE_N_WITH_BARRIER_FLAG, regex);
+    }
+
+    public static final String G1_STORE_P = PREFIX + "G1_STORE_P" + POSTFIX;
+    static {
+        machOnlyNameRegex(G1_STORE_P, "g1StoreP");
+    }
+
+    public static final String G1_STORE_P_WITH_BARRIER_FLAG = COMPOSITE_PREFIX + "G1_STORE_P_WITH_BARRIER_FLAG" + POSTFIX;
+    static {
+        String regex = START + "g1StoreP\\S*" + MID + "barrier\\(\\s*" + IS_REPLACED + "\\s*\\)" + END;
+        machOnly(G1_STORE_P_WITH_BARRIER_FLAG, regex);
+    }
+
     public static final String IF = PREFIX + "IF" + POSTFIX;
     static {
         beforeMatchingNameRegex(IF, "If\\b");
@@ -852,6 +943,11 @@ public class IRNode {
         vectorNode(LSHIFT_VL, "LShiftVL", TYPE_LONG);
     }
 
+    public static final String MACH_TEMP = PREFIX + "MACH_TEMP" + POSTFIX;
+    static {
+        machOnlyNameRegex(MACH_TEMP, "MachTemp");
+    }
+
     public static final String MACRO_LOGIC_V = PREFIX + "MACRO_LOGIC_V" + POSTFIX;
     static {
         afterBarrierExpansionToBeforeMatching(MACRO_LOGIC_V, "MacroLogicV");
@@ -1148,6 +1244,12 @@ public class IRNode {
         trapNodes(NULL_CHECK_TRAP, "null_check");
     }
 
+    public static final String OOPMAP_WITH = COMPOSITE_PREFIX + "OOPMAP_WITH" + POSTFIX;
+    static {
+        String regex = "(#\\s*OopMap\\s*\\{.*" + IS_REPLACED + ".*\\})";
+        optoOnly(OOPMAP_WITH, regex);
+    }
+
     public static final String OR_VB = VECTOR_PREFIX + "OR_VB" + POSTFIX;
     static {
         vectorNode(OR_VB, "OrV", TYPE_BYTE);
diff --git a/test/hotspot/jtreg/compiler/runtime/safepoints/TestMachTempsAcrossSafepoints.java b/test/hotspot/jtreg/compiler/runtime/safepoints/TestMachTempsAcrossSafepoints.java
new file mode 100644
index 00000000000..ecd8f58c5ed
--- /dev/null
+++ b/test/hotspot/jtreg/compiler/runtime/safepoints/TestMachTempsAcrossSafepoints.java
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package compiler.runtime.safepoints;
+
+import compiler.lib.ir_framework.*;
+import java.lang.ref.SoftReference;
+
+/**
+ * @test
+ * @summary Test that undefined values generated by MachTemp nodes (in this
+ *          case, derived from G1 barriers) are not included in OopMaps.
+ *          Extracted from java.lang.invoke.LambdaFormEditor::getInCache.
+ * @key randomness
+ * @library /test/lib /
+ * @requires vm.gc.G1 & vm.bits == 64 & vm.opt.final.UseCompressedOops == true
+ * @run driver compiler.runtime.safepoints.TestMachTempsAcrossSafepoints
+ */
+
+public class TestMachTempsAcrossSafepoints {
+
+    static class RefWithKey extends SoftReference {
+        final int key;
+
+        public RefWithKey(int key) {
+            super(new Object());
+            this.key = key;
+        }
+
+        @DontInline
+        @Override
+        public boolean equals(Object obj) {
+            return obj instanceof RefWithKey that && this.key == that.key;
+        }
+    }
+
+    public static void main(String[] args) throws Exception {
+        String inlineCmd = "-XX:CompileCommand=inline,java.lang.ref.SoftReference::get";
+        TestFramework.runWithFlags(inlineCmd, "-XX:+StressGCM", "-XX:+StressLCM", "-XX:StressSeed=1");
+        TestFramework.runWithFlags(inlineCmd, "-XX:+StressGCM", "-XX:+StressLCM");
+    }
+
+    @Test
+    @IR(counts = {IRNode.G1_LOAD_N, "1"}, phase = CompilePhase.FINAL_CODE)
+    @IR(counts = {IRNode.MACH_TEMP, ">= 1"}, phase = CompilePhase.FINAL_CODE)
+    @IR(counts = {IRNode.STATIC_CALL_OF_METHOD, "equals", "2"})
+    @IR(failOn = {IRNode.OOPMAP_WITH, "NarrowOop"})
+    static private Object test(RefWithKey key, RefWithKey[] refs) {
+        RefWithKey k = null;
+        // This loop causes the register allocator to not "rematerialize" all
+        // MachTemp nodes generated for the reference g1LoadN instruction below.
+        for (int i = 0; i < refs.length; i++) {
+            RefWithKey k0 = refs[0];
+            if (k0.equals(key)) {
+                k = k0;
+            }
+        }
+        if (k != null && !key.equals(k)) {
+            return null;
+        }
+        // The MachTemp node implementing the dst TEMP operand in the g1LoadN
+        // instruction corresponding to k.get() can be scheduled across the
+        // above call to RefWithKey::equals(), due to an unfortunate interaction
+        // of inaccurate basic block frequency estimation (emulated in this test
+        // by randomizing the GCM and LCM heuristics) and call-catch cleanup.
+        // Since narrow pointer MachTemp nodes are typed as narrow OOPs, this
+        // causes the oopmap builder to include the MachTemp node definition in
+        // the RefWithKey::equals() return oopmap.
+        return (k != null) ? k.get() : null;
+    }
+
+    @Run(test = "test")
+    @Warmup(0)
+    public void run() {
+        RefWithKey ref = new RefWithKey(42);
+        test(ref, new RefWithKey[]{ref});
+    }
+}
diff --git a/test/hotspot/jtreg/testlibrary/ctw/src/sun/hotspot/tools/ctw/CtwRunner.java b/test/hotspot/jtreg/testlibrary/ctw/src/sun/hotspot/tools/ctw/CtwRunner.java
index df4f9063586..d62e286c68d 100644
--- a/test/hotspot/jtreg/testlibrary/ctw/src/sun/hotspot/tools/ctw/CtwRunner.java
+++ b/test/hotspot/jtreg/testlibrary/ctw/src/sun/hotspot/tools/ctw/CtwRunner.java
@@ -304,7 +304,10 @@ public class CtwRunner {
                 "-XX:+StressMacroExpansion",
                 "-XX:+StressIncrementalInlining",
                 // StressSeed is uint
-                "-XX:StressSeed=" + rng.nextInt(Integer.MAX_VALUE)));
+                "-XX:StressSeed=" + rng.nextInt(Integer.MAX_VALUE),
+                // Do not fail on huge methods where StressGCM makes register
+                // allocation allocate lots of memory
+                "-XX:CompileCommand=memlimit,*.*,0"));
 
         for (String arg : CTW_EXTRA_ARGS.split(",")) {
             Args.add(arg);
diff --git a/test/jdk/java/lang/invoke/BigArityTest.java b/test/jdk/java/lang/invoke/BigArityTest.java
index 338903f3163..2dba056a183 100644
--- a/test/jdk/java/lang/invoke/BigArityTest.java
+++ b/test/jdk/java/lang/invoke/BigArityTest.java
@@ -24,7 +24,7 @@
 /* @test
  * @summary High arity invocations
  * @compile BigArityTest.java
- * @run junit/othervm/timeout=2500 -XX:+IgnoreUnrecognizedVMOptions -XX:-VerifyDependencies -esa -DBigArityTest.ITERATION_COUNT=1 test.java.lang.invoke.BigArityTest
+ * @run junit/othervm/timeout=2500 -XX:+IgnoreUnrecognizedVMOptions -XX:-VerifyDependencies -XX:CompileCommand=memlimit,*.*,0 -esa -DBigArityTest.ITERATION_COUNT=1 test.java.lang.invoke.BigArityTest
  */
 
 package test.java.lang.invoke;
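
For reference, the IRNode constants added above are intended to be used from @IR annotations in the same way TestG1BarrierGeneration uses them. The sketch below (not part of the patch) illustrates the pattern; it assumes the Outer helper class, its reference field f, and the PRE_AND_POST barrier-flag string are the helpers defined earlier in that test.

    // Illustrative sketch only: checks that the matched G1 store instruction
    // carries a barrier flag indicating both pre- and post-barrier were kept
    // after late (post-matching) barrier expansion.
    @Test
    @IR(applyIf = {"UseCompressedOops", "false"},
        counts = {IRNode.G1_STORE_P_WITH_BARRIER_FLAG, PRE_AND_POST, "1"},
        phase = CompilePhase.FINAL_CODE)
    @IR(applyIf = {"UseCompressedOops", "true"},
        counts = {IRNode.G1_ENCODE_P_AND_STORE_N_WITH_BARRIER_FLAG, PRE_AND_POST, "1"},
        phase = CompilePhase.FINAL_CODE)
    public static void exampleFieldStore(Outer o, Object o1) {
        o.f = o1;
    }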