From 4b5ac3abacee0a4b06a9ed0ea57377ff903a90c3 Mon Sep 17 00:00:00 2001 From: Jatin Bhateja Date: Sat, 10 Oct 2020 06:29:38 +0000 Subject: [PATCH] 8252847: Optimize primitive arrayCopy stubs using AVX-512 masked instructions Reviewed-by: neliasso, kvn --- src/hotspot/cpu/x86/assembler_x86.cpp | 39 ++ src/hotspot/cpu/x86/assembler_x86.hpp | 8 +- src/hotspot/cpu/x86/macroAssembler_x86.cpp | 1 + src/hotspot/cpu/x86/macroAssembler_x86.hpp | 29 + .../x86/macroAssembler_x86_arrayCopy_avx3.cpp | 249 ++++++++ src/hotspot/cpu/x86/stubGenerator_x86_64.cpp | 602 +++++++++++++++--- src/hotspot/cpu/x86/stubRoutines_x86.hpp | 2 +- src/hotspot/cpu/x86/vm_version_x86.cpp | 2 + .../arraycopy/TestArrayCopyConjoint.java | 269 ++++++++ .../arraycopy/TestArrayCopyDisjoint.java | 226 +++++++ .../bench/java/lang/ArrayCopyObject.java | 121 ++++ 11 files changed, 1449 insertions(+), 99 deletions(-) create mode 100644 src/hotspot/cpu/x86/macroAssembler_x86_arrayCopy_avx3.cpp create mode 100644 test/hotspot/jtreg/compiler/arraycopy/TestArrayCopyConjoint.java create mode 100644 test/hotspot/jtreg/compiler/arraycopy/TestArrayCopyDisjoint.java create mode 100644 test/micro/org/openjdk/bench/java/lang/ArrayCopyObject.java diff --git a/src/hotspot/cpu/x86/assembler_x86.cpp b/src/hotspot/cpu/x86/assembler_x86.cpp index ef04d33c7f4..3168567e069 100644 --- a/src/hotspot/cpu/x86/assembler_x86.cpp +++ b/src/hotspot/cpu/x86/assembler_x86.cpp @@ -2589,6 +2589,38 @@ void Assembler::evmovdqub(XMMRegister dst, KRegister mask, Address src, int vect emit_operand(dst, src); } +void Assembler::evmovdqu(XMMRegister dst, KRegister mask, Address src, int vector_len, int type) { + assert(VM_Version::supports_avx512vlbw(), ""); + assert(type == T_BYTE || type == T_SHORT || type == T_CHAR || type == T_INT || type == T_LONG, ""); + InstructionMark im(this); + bool wide = type == T_SHORT || type == T_CHAR || type == T_LONG; + int prefix = (type == T_BYTE || type == T_SHORT || type == T_CHAR) ? VEX_SIMD_F2 : VEX_SIMD_F3; + InstructionAttr attributes(vector_len, /* vex_w */ wide, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); + attributes.set_embedded_opmask_register_specifier(mask); + attributes.set_is_evex_instruction(); + vex_prefix(src, 0, dst->encoding(), (Assembler::VexSimdPrefix)prefix, VEX_OPCODE_0F, &attributes); + emit_int8(0x6F); + emit_operand(dst, src); +} + +void Assembler::evmovdqu(Address dst, KRegister mask, XMMRegister src, int vector_len, int type) { + assert(VM_Version::supports_avx512vlbw(), ""); + assert(src != xnoreg, "sanity"); + assert(type == T_BYTE || type == T_SHORT || type == T_CHAR || type == T_INT || type == T_LONG, ""); + InstructionMark im(this); + bool wide = type == T_SHORT || type == T_CHAR || type == T_LONG; + int prefix = (type == T_BYTE || type == T_SHORT || type == T_CHAR) ? 
VEX_SIMD_F2 : VEX_SIMD_F3; + InstructionAttr attributes(vector_len, /* vex_w */ wide, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); + attributes.reset_is_clear_context(); + attributes.set_embedded_opmask_register_specifier(mask); + attributes.set_is_evex_instruction(); + vex_prefix(dst, 0, src->encoding(), (Assembler::VexSimdPrefix)prefix, VEX_OPCODE_0F, &attributes); + emit_int8(0x7F); + emit_operand(src, dst); +} + void Assembler::evmovdquw(XMMRegister dst, Address src, int vector_len) { assert(VM_Version::supports_evex(), ""); InstructionMark im(this); @@ -7803,6 +7835,13 @@ void Assembler::shlxq(Register dst, Register src1, Register src2) { emit_int16((unsigned char)0xF7, (0xC0 | encode)); } +void Assembler::shrxq(Register dst, Register src1, Register src2) { + assert(VM_Version::supports_bmi2(), ""); + InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src1->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_38, &attributes); + emit_int16((unsigned char)0xF7, (0xC0 | encode)); +} + #ifndef _LP64 void Assembler::incl(Register dst) { diff --git a/src/hotspot/cpu/x86/assembler_x86.hpp b/src/hotspot/cpu/x86/assembler_x86.hpp index 283285dc347..23d8db40164 100644 --- a/src/hotspot/cpu/x86/assembler_x86.hpp +++ b/src/hotspot/cpu/x86/assembler_x86.hpp @@ -794,7 +794,6 @@ private: void decl(Register dst); void decl(Address dst); - void decq(Register dst); void decq(Address dst); void incl(Register dst); @@ -879,6 +878,7 @@ private: void popa_uncached(); #endif void vzeroupper_uncached(); + void decq(Register dst); void pusha(); void popa(); @@ -1487,6 +1487,10 @@ private: void evmovdquq(XMMRegister dst, Address src, int vector_len); void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len); + // Generic move instructions. 
+ void evmovdqu(Address dst, KRegister mask, XMMRegister src, int vector_len, int type); + void evmovdqu(XMMRegister dst, KRegister mask, Address src, int vector_len, int type); + // Move lower 64bit to high 64bit in 128bit register void movlhps(XMMRegister dst, XMMRegister src); @@ -1989,6 +1993,8 @@ private: void shlxl(Register dst, Register src1, Register src2); void shlxq(Register dst, Register src1, Register src2); + void shrxq(Register dst, Register src1, Register src2); + //====================VECTOR ARITHMETIC===================================== diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.cpp b/src/hotspot/cpu/x86/macroAssembler_x86.cpp index 0a8d11f9764..84cb5eb9ad7 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp @@ -7964,6 +7964,7 @@ void MacroAssembler::cache_wbsync(bool is_pre) sfence(); } } + #endif // _LP64 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) { diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.hpp b/src/hotspot/cpu/x86/macroAssembler_x86.hpp index 325bdf912bc..b052985708a 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp @@ -1037,6 +1037,18 @@ public: Register rax, Register rcx, Register rdx, Register tmp); #endif +#ifdef _LP64 + void arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask, Register from, + Register to, Register count, int shift, + Register index, Register temp, + bool use64byteVector, Label& L_entry, Label& L_exit); + + void arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from, + Register to, Register start_index, Register end_index, + Register count, int shift, Register temp, + bool use64byteVector, Label& L_entry, Label& L_exit); +#endif + private: // these are private because users should be doing movflt/movdbl @@ -1725,6 +1737,23 @@ public: void cache_wb(Address line); void cache_wbsync(bool is_pre); + + void copy64_masked_avx(Register dst, Register src, XMMRegister xmm, + KRegister mask, Register length, Register index, + Register temp, int shift = Address::times_1, int offset = 0, + bool use64byteVector = false); + + void copy32_masked_avx(Register dst, Register src, XMMRegister xmm, + KRegister mask, Register length, Register index, + Register temp, int shift = Address::times_1, int offset = 0); + + void copy32_avx(Register dst, Register src, Register index, XMMRegister xmm, + int shift = Address::times_1, int offset = 0); + + void copy64_avx(Register dst, Register src, Register index, XMMRegister xmm, + bool conjoint, int shift = Address::times_1, int offset = 0, + bool use64byteVector = false); + #endif // _LP64 void vallones(XMMRegister dst, int vector_len); diff --git a/src/hotspot/cpu/x86/macroAssembler_x86_arrayCopy_avx3.cpp b/src/hotspot/cpu/x86/macroAssembler_x86_arrayCopy_avx3.cpp new file mode 100644 index 00000000000..9a6d10db6cc --- /dev/null +++ b/src/hotspot/cpu/x86/macroAssembler_x86_arrayCopy_avx3.cpp @@ -0,0 +1,249 @@ +/* +* Copyright (c) 2020, Intel Corporation. +* +* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +* +* This code is free software; you can redistribute it and/or modify it +* under the terms of the GNU General Public License version 2 only, as +* published by the Free Software Foundation. +* +* This code is distributed in the hope that it will be useful, but WITHOUT +* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +* FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +* version 2 for more details (a copy is included in the LICENSE file that +* accompanied this code). +* +* You should have received a copy of the GNU General Public License version +* 2 along with this work; if not, write to the Free Software Foundation, +* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. +* +* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA +* or visit www.oracle.com if you need additional information or have any +* questions. +* +*/ + +#include "precompiled.hpp" +#include "asm/macroAssembler.hpp" +#include "asm/macroAssembler.inline.hpp" + +#ifdef PRODUCT +#define BLOCK_COMMENT(str) /* nothing */ +#else +#define BLOCK_COMMENT(str) block_comment(str) +#endif + +#define BIND(label) bind(label); BLOCK_COMMENT(#label ":") + +#ifdef _LP64 + +void MacroAssembler::arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask, Register from, + Register to, Register count, int shift, + Register index, Register temp, + bool use64byteVector, Label& L_entry, Label& L_exit) { + Label L_entry_64, L_entry_96, L_entry_128; + Label L_entry_160, L_entry_192; + + int size_mat[][6] = { + /* T_BYTE */ {32 , 64, 96 , 128 , 160 , 192 }, + /* T_SHORT*/ {16 , 32, 48 , 64 , 80 , 96 }, + /* T_INT */ {8 , 16, 24 , 32 , 40 , 48 }, + /* T_LONG */ {4 , 8, 12 , 16 , 20 , 24 } + }; + + // Case A) Special case for length less than equal to 32 bytes. + cmpq(count, size_mat[shift][0]); + jccb(Assembler::greater, L_entry_64); + copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift); + jmp(L_exit); + + // Case B) Special case for length less than equal to 64 bytes. + BIND(L_entry_64); + cmpq(count, size_mat[shift][1]); + jccb(Assembler::greater, L_entry_96); + copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, use64byteVector); + jmp(L_exit); + + // Case C) Special case for length less than equal to 96 bytes. + BIND(L_entry_96); + cmpq(count, size_mat[shift][2]); + jccb(Assembler::greater, L_entry_128); + copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector); + subq(count, 64 >> shift); + copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64); + jmp(L_exit); + + // Case D) Special case for length less than equal to 128 bytes. + BIND(L_entry_128); + cmpq(count, size_mat[shift][3]); + jccb(Assembler::greater, L_entry_160); + copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector); + copy32_avx(to, from, index, xmm, shift, 64); + subq(count, 96 >> shift); + copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 96); + jmp(L_exit); + + // Case E) Special case for length less than equal to 160 bytes. + BIND(L_entry_160); + cmpq(count, size_mat[shift][4]); + jccb(Assembler::greater, L_entry_192); + copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector); + copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector); + subq(count, 128 >> shift); + copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128); + jmp(L_exit); + + // Case F) Special case for length less than equal to 192 bytes. 
+ BIND(L_entry_192); + cmpq(count, size_mat[shift][5]); + jcc(Assembler::greater, L_entry); + copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector); + copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector); + copy32_avx(to, from, index, xmm, shift, 128); + subq(count, 160 >> shift); + copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 160); + jmp(L_exit); +} + +void MacroAssembler::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from, + Register to, Register start_index, Register end_index, + Register count, int shift, Register temp, + bool use64byteVector, Label& L_entry, Label& L_exit) { + Label L_entry_64, L_entry_96, L_entry_128; + Label L_entry_160, L_entry_192; + bool avx3 = MaxVectorSize > 32 && AVX3Threshold == 0; + + int size_mat[][6] = { + /* T_BYTE */ {32 , 64, 96 , 128 , 160 , 192 }, + /* T_SHORT*/ {16 , 32, 48 , 64 , 80 , 96 }, + /* T_INT */ {8 , 16, 24 , 32 , 40 , 48 }, + /* T_LONG */ {4 , 8, 12 , 16 , 20 , 24 } + }; + + // Case A) Special case for length less than equal to 32 bytes. + cmpq(count, size_mat[shift][0]); + jccb(Assembler::greater, L_entry_64); + copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift); + jmp(L_exit); + + // Case B) Special case for length less than equal to 64 bytes. + BIND(L_entry_64); + cmpq(count, size_mat[shift][1]); + jccb(Assembler::greater, L_entry_96); + if (avx3) { + copy64_masked_avx(to, from, xmm, mask, count, start_index, temp, shift, 0, true); + } else { + copy32_avx(to, from, end_index, xmm, shift, -32); + subq(count, 32 >> shift); + copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift); + } + jmp(L_exit); + + // Case C) Special case for length less than equal to 96 bytes. + BIND(L_entry_96); + cmpq(count, size_mat[shift][2]); + jccb(Assembler::greater, L_entry_128); + copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector); + subq(count, 64 >> shift); + copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift); + jmp(L_exit); + + // Case D) Special case for length less than equal to 128 bytes. + BIND(L_entry_128); + cmpq(count, size_mat[shift][3]); + jccb(Assembler::greater, L_entry_160); + copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector); + copy32_avx(to, from, end_index, xmm, shift, -96); + subq(count, 96 >> shift); + copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift); + jmp(L_exit); + + // Case E) Special case for length less than equal to 160 bytes. + BIND(L_entry_160); + cmpq(count, size_mat[shift][4]); + jccb(Assembler::greater, L_entry_192); + copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector); + copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector); + subq(count, 128 >> shift); + copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift); + jmp(L_exit); + + // Case F) Special case for length less than equal to 192 bytes. 
+ BIND(L_entry_192); + cmpq(count, size_mat[shift][5]); + jcc(Assembler::greater, L_entry); + copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector); + copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector); + copy32_avx(to, from, end_index, xmm, shift, -160); + subq(count, 160 >> shift); + copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift); + jmp(L_exit); +} + +void MacroAssembler::copy64_masked_avx(Register dst, Register src, XMMRegister xmm, + KRegister mask, Register length, Register index, + Register temp, int shift, int offset, + bool use64byteVector) { + BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG}; + assert(MaxVectorSize >= 32, "vector length should be >= 32"); + if (!use64byteVector) { + copy32_avx(dst, src, index, xmm, shift, offset); + subptr(length, 32 >> shift); + copy32_masked_avx(dst, src, xmm, mask, length, index, temp, shift, offset+32); + } else { + Address::ScaleFactor scale = (Address::ScaleFactor)(shift); + assert(MaxVectorSize == 64, "vector length != 64"); + negptr(length); + addq(length, 64); + mov64(temp, -1); + shrxq(temp, temp, length); + kmovql(mask, temp); + evmovdqu(xmm, mask, Address(src, index, scale, offset), Assembler::AVX_512bit, type[shift]); + evmovdqu(Address(dst, index, scale, offset), mask, xmm, Assembler::AVX_512bit, type[shift]); + } +} + + +void MacroAssembler::copy32_masked_avx(Register dst, Register src, XMMRegister xmm, + KRegister mask, Register length, Register index, + Register temp, int shift, int offset) { + assert(MaxVectorSize >= 32, "vector length should be >= 32"); + BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG}; + Address::ScaleFactor scale = (Address::ScaleFactor)(shift); + mov64(temp, 1); + shlxq(temp, temp, length); + decq(temp); + kmovql(mask, temp); + evmovdqu(xmm, mask, Address(src, index, scale, offset), Assembler::AVX_256bit, type[shift]); + evmovdqu(Address(dst, index, scale, offset), mask, xmm, Assembler::AVX_256bit, type[shift]); +} + + +void MacroAssembler::copy32_avx(Register dst, Register src, Register index, XMMRegister xmm, + int shift, int offset) { + assert(MaxVectorSize >= 32, "vector length should be >= 32"); + Address::ScaleFactor scale = (Address::ScaleFactor)(shift); + vmovdqu(xmm, Address(src, index, scale, offset)); + vmovdqu(Address(dst, index, scale, offset), xmm); +} + + +void MacroAssembler::copy64_avx(Register dst, Register src, Register index, XMMRegister xmm, + bool conjoint, int shift, int offset, bool use64byteVector) { + assert(MaxVectorSize == 64 || MaxVectorSize == 32, "vector length mismatch"); + if (!use64byteVector) { + if (conjoint) { + copy32_avx(dst, src, index, xmm, shift, offset+32); + copy32_avx(dst, src, index, xmm, shift, offset); + } else { + copy32_avx(dst, src, index, xmm, shift, offset); + copy32_avx(dst, src, index, xmm, shift, offset+32); + } + } else { + Address::ScaleFactor scale = (Address::ScaleFactor)(shift); + evmovdquq(xmm, Address(src, index, scale, offset), Assembler::AVX_512bit); + evmovdquq(Address(dst, index, scale, offset), xmm, Assembler::AVX_512bit); + } +} + +#endif diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp index 3d2c7671304..e47e3baeea4 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp @@ -1124,59 +1124,28 @@ class StubGenerator: public StubCodeGenerator { __ align(OptoLoopAlignment); if (UseUnalignedLoadStores) { Label L_end; - // Copy 64-bytes per iteration - if (UseAVX > 
2) { - Label L_loop_avx512, L_loop_avx2, L_32_byte_head, L_above_threshold, L_below_threshold; - - __ BIND(L_copy_bytes); - __ cmpptr(qword_count, (-1 * AVX3Threshold / 8)); - __ jccb(Assembler::less, L_above_threshold); - __ jmpb(L_below_threshold); - - __ bind(L_loop_avx512); - __ evmovdqul(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit); - __ evmovdqul(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit); - __ bind(L_above_threshold); - __ addptr(qword_count, 8); - __ jcc(Assembler::lessEqual, L_loop_avx512); - __ jmpb(L_32_byte_head); - - __ bind(L_loop_avx2); + __ BIND(L_loop); + if (UseAVX >= 2) { __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56)); __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0); __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24)); __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1); - __ bind(L_below_threshold); - __ addptr(qword_count, 8); - __ jcc(Assembler::lessEqual, L_loop_avx2); - - __ bind(L_32_byte_head); - __ subptr(qword_count, 4); // sub(8) and add(4) - __ jccb(Assembler::greater, L_end); } else { - __ BIND(L_loop); - if (UseAVX == 2) { - __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56)); - __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0); - __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24)); - __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1); - } else { - __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56)); - __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0); - __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40)); - __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1); - __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24)); - __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2); - __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8)); - __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3); - } - - __ BIND(L_copy_bytes); - __ addptr(qword_count, 8); - __ jcc(Assembler::lessEqual, L_loop); - __ subptr(qword_count, 4); // sub(8) and add(4) - __ jccb(Assembler::greater, L_end); + __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56)); + __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0); + __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40)); + __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1); + __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24)); + __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2); + __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8)); + __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3); } + + __ BIND(L_copy_bytes); + __ addptr(qword_count, 8); + __ jcc(Assembler::lessEqual, L_loop); + __ subptr(qword_count, 4); // sub(8) and add(4) + __ jccb(Assembler::greater, L_end); // Copy trailing 32 bytes if (UseAVX >= 2) { __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24)); @@ -1232,60 +1201,29 @@ class StubGenerator: public StubCodeGenerator { __ align(OptoLoopAlignment); if (UseUnalignedLoadStores) { Label L_end; - // Copy 64-bytes per iteration - if (UseAVX > 2) { - Label L_loop_avx512, L_loop_avx2, L_32_byte_head, L_above_threshold, L_below_threshold; - - __ BIND(L_copy_bytes); - __ cmpptr(qword_count, (AVX3Threshold 
/ 8)); - __ jccb(Assembler::greater, L_above_threshold); - __ jmpb(L_below_threshold); - - __ BIND(L_loop_avx512); - __ evmovdqul(xmm0, Address(from, qword_count, Address::times_8, 0), Assembler::AVX_512bit); - __ evmovdqul(Address(dest, qword_count, Address::times_8, 0), xmm0, Assembler::AVX_512bit); - __ bind(L_above_threshold); - __ subptr(qword_count, 8); - __ jcc(Assembler::greaterEqual, L_loop_avx512); - __ jmpb(L_32_byte_head); - - __ bind(L_loop_avx2); + __ BIND(L_loop); + if (UseAVX >= 2) { __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32)); __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0); - __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0)); - __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1); - __ bind(L_below_threshold); - __ subptr(qword_count, 8); - __ jcc(Assembler::greaterEqual, L_loop_avx2); - - __ bind(L_32_byte_head); - __ addptr(qword_count, 4); // add(8) and sub(4) - __ jccb(Assembler::less, L_end); + __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0)); + __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1); } else { - __ BIND(L_loop); - if (UseAVX == 2) { - __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32)); - __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0); - __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0)); - __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1); - } else { - __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48)); - __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0); - __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32)); - __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1); - __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16)); - __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2); - __ movdqu(xmm3, Address(from, qword_count, Address::times_8, 0)); - __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm3); - } - - __ BIND(L_copy_bytes); - __ subptr(qword_count, 8); - __ jcc(Assembler::greaterEqual, L_loop); - - __ addptr(qword_count, 4); // add(8) and sub(4) - __ jccb(Assembler::less, L_end); + __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48)); + __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0); + __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32)); + __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1); + __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16)); + __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2); + __ movdqu(xmm3, Address(from, qword_count, Address::times_8, 0)); + __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm3); } + + __ BIND(L_copy_bytes); + __ subptr(qword_count, 8); + __ jcc(Assembler::greaterEqual, L_loop); + + __ addptr(qword_count, 4); // add(8) and sub(4) + __ jccb(Assembler::less, L_end); // Copy trailing 32 bytes if (UseAVX >= 2) { __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0)); @@ -1323,6 +1261,442 @@ class StubGenerator: public StubCodeGenerator { __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords } +#ifndef PRODUCT + int& get_profile_ctr(int shift) { + if ( 0 == shift) + return SharedRuntime::_jbyte_array_copy_ctr; + else if(1 == shift) + return SharedRuntime::_jshort_array_copy_ctr; + else if(2 == shift) + return SharedRuntime::_jint_array_copy_ctr; + else + return SharedRuntime::_jlong_array_copy_ctr; + } +#endif + + void 
setup_argument_regs(BasicType type) { + if (type == T_BYTE || type == T_SHORT) { + setup_arg_regs(); // from => rdi, to => rsi, count => rdx + // r9 and r10 may be used to save non-volatile registers + } else { + setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx + // r9 is used to save r15_thread + } + } + + void restore_argument_regs(BasicType type) { + if (type == T_BYTE || type == T_SHORT) { + restore_arg_regs(); + } else { + restore_arg_regs_using_thread(); + } + } + + // Note: Following rules apply to AVX3 optimized arraycopy stubs:- + // - If target supports AVX3 features (BW+VL+F) then implementation uses 32 byte vectors (YMMs) + // for both special cases (various small block sizes) and aligned copy loop. This is the + // default configuration. + // - If copy length is above AVX3Threshold, then implementation use 64 byte vectors (ZMMs) + // for main copy loop (and subsequent tail) since bulk of the cycles will be consumed in it. + // - If user forces MaxVectorSize=32 then above 4096 bytes its seen that REP MOVs shows a + // better performance for disjoint copies. For conjoint/backward copy vector based + // copy performs better. + // - If user sets AVX3Threshold=0, then special cases for small blocks sizes operate over + // 64 byte vector registers (ZMMs). + + // Inputs: + // c_rarg0 - source array address + // c_rarg1 - destination array address + // c_rarg2 - element count, treated as ssize_t, can be zero + // + // + // Side Effects: + // disjoint_copy_avx3_masked is set to the no-overlap entry point + // used by generate_conjoint_[byte/int/short/long]_copy(). + // + + address generate_disjoint_copy_avx3_masked(address* entry, const char *name, int shift, + bool aligned, bool is_oop, bool dest_uninitialized) { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", name); + address start = __ pc(); + + bool use64byteVector = MaxVectorSize > 32 && AVX3Threshold == 0; + Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry; + Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64; + const Register from = rdi; // source array address + const Register to = rsi; // destination array address + const Register count = rdx; // elements count + const Register temp1 = r8; + const Register temp2 = r11; + const Register temp3 = rax; + const Register temp4 = rcx; + // End pointers are inclusive, and if count is not zero they point + // to the last unit copied: end_to[0] := end_from[0] + + __ enter(); // required for proper stackwalking of RuntimeStub frame + assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. + + if (entry != NULL) { + *entry = __ pc(); + // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) + BLOCK_COMMENT("Entry:"); + } + + BasicType type_vec[] = { T_BYTE, T_SHORT, T_INT, T_LONG}; + BasicType type = is_oop ? 
T_OBJECT : type_vec[shift]; + + setup_argument_regs(type); + + DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; + if (dest_uninitialized) { + decorators |= IS_DEST_UNINITIALIZED; + } + if (aligned) { + decorators |= ARRAYCOPY_ALIGNED; + } + BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); + bs->arraycopy_prologue(_masm, decorators, type, from, to, count); + + { + // Type(shift) byte(0), short(1), int(2), long(3) + int loop_size[] = { 192, 96, 48, 24}; + int threshold[] = { 4096, 2048, 1024, 512}; + + // UnsafeCopyMemory page error: continue after ucm + UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true); + // 'from', 'to' and 'count' are now valid + + // temp1 holds remaining count and temp4 holds running count used to compute + // next address offset for start of to/from addresses (temp4 * scale). + __ mov64(temp4, 0); + __ movq(temp1, count); + + // Zero length check. + __ BIND(L_tail); + __ cmpq(temp1, 0); + __ jcc(Assembler::lessEqual, L_exit); + + // Special cases using 32 byte [masked] vector copy operations. + __ arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift, + temp4, temp3, use64byteVector, L_entry, L_exit); + + // PRE-MAIN-POST loop for aligned copy. + __ BIND(L_entry); + + if (AVX3Threshold != 0) { + __ cmpq(count, threshold[shift]); + if (MaxVectorSize == 64) { + // Copy using 64 byte vectors. + __ jcc(Assembler::greaterEqual, L_pre_main_post_64); + } else { + assert(MaxVectorSize < 64, "vector size should be < 64 bytes"); + // REP MOVS offer a faster copy path. + __ jcc(Assembler::greaterEqual, L_repmovs); + } + } + + if (MaxVectorSize < 64 || AVX3Threshold != 0) { + // Partial copy to make dst address 32 byte aligned. + __ movq(temp2, to); + __ andq(temp2, 31); + __ jcc(Assembler::equal, L_main_pre_loop); + + __ negptr(temp2); + __ addq(temp2, 32); + if (shift) { + __ shrq(temp2, shift); + } + __ movq(temp3, temp2); + __ copy32_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift); + __ movq(temp4, temp2); + __ movq(temp1, count); + __ subq(temp1, temp2); + + __ cmpq(temp1, loop_size[shift]); + __ jcc(Assembler::less, L_tail); + + __ BIND(L_main_pre_loop); + __ subq(temp1, loop_size[shift]); + + // Main loop with aligned copy block size of 192 bytes at 32 byte granularity. + __ BIND(L_main_loop); + __ copy64_avx(to, from, temp4, xmm1, false, shift, 0); + __ copy64_avx(to, from, temp4, xmm1, false, shift, 64); + __ copy64_avx(to, from, temp4, xmm1, false, shift, 128); + __ addptr(temp4, loop_size[shift]); + __ subq(temp1, loop_size[shift]); + __ jcc(Assembler::greater, L_main_loop); + + __ addq(temp1, loop_size[shift]); + + // Tail loop. + __ jmp(L_tail); + + __ BIND(L_repmovs); + __ movq(temp2, temp1); + // Swap to(RSI) and from(RDI) addresses to comply with REP MOVs semantics. + __ movq(temp3, to); + __ movq(to, from); + __ movq(from, temp3); + // Save to/from for restoration post rep_mov. + __ movq(temp1, to); + __ movq(temp3, from); + if(shift < 3) { + __ shrq(temp2, 3-shift); // quad word count + } + __ movq(temp4 , temp2); // move quad ward count into temp4(RCX). + __ rep_mov(); + __ shlq(temp2, 3); // convert quad words into byte count. + if(shift) { + __ shrq(temp2, shift); // type specific count. + } + // Restore original addresses in to/from. + __ movq(to, temp3); + __ movq(from, temp1); + __ movq(temp4, temp2); + __ movq(temp1, count); + __ subq(temp1, temp2); // tailing part (less than a quad ward size). 
+ __ jmp(L_tail); + } + + if (MaxVectorSize > 32) { + __ BIND(L_pre_main_post_64); + // Partial copy to make dst address 64 byte aligned. + __ movq(temp2, to); + __ andq(temp2, 63); + __ jcc(Assembler::equal, L_main_pre_loop_64bytes); + + __ negptr(temp2); + __ addq(temp2, 64); + if (shift) { + __ shrq(temp2, shift); + } + __ movq(temp3, temp2); + __ copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0 , true); + __ movq(temp4, temp2); + __ movq(temp1, count); + __ subq(temp1, temp2); + + __ cmpq(temp1, loop_size[shift]); + __ jcc(Assembler::less, L_tail64); + + __ BIND(L_main_pre_loop_64bytes); + __ subq(temp1, loop_size[shift]); + + // Main loop with aligned copy block size of 192 bytes at + // 64 byte copy granularity. + __ BIND(L_main_loop_64bytes); + __ copy64_avx(to, from, temp4, xmm1, false, shift, 0 , true); + __ copy64_avx(to, from, temp4, xmm1, false, shift, 64, true); + __ copy64_avx(to, from, temp4, xmm1, false, shift, 128, true); + __ addptr(temp4, loop_size[shift]); + __ subq(temp1, loop_size[shift]); + __ jcc(Assembler::greater, L_main_loop_64bytes); + + __ addq(temp1, loop_size[shift]); + // Zero length check. + __ jcc(Assembler::lessEqual, L_exit); + + __ BIND(L_tail64); + + // Tail handling using 64 byte [masked] vector copy operations. + use64byteVector = true; + __ arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift, + temp4, temp3, use64byteVector, L_entry, L_exit); + } + __ BIND(L_exit); + } + + address ucme_exit_pc = __ pc(); + // When called from generic_arraycopy r11 contains specific values + // used during arraycopy epilogue, re-initializing r11. + if (is_oop) { + __ movq(r11, shift == 3 ? count : to); + } + bs->arraycopy_epilogue(_masm, decorators, type, from, to, count); + restore_argument_regs(type); + inc_counter_np(get_profile_ctr(shift)); // Update counter after rscratch1 is free + __ xorptr(rax, rax); // return 0 + __ vzeroupper(); + __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(0); + return start; + } + + // Inputs: + // c_rarg0 - source array address + // c_rarg1 - destination array address + // c_rarg2 - element count, treated as ssize_t, can be zero + // + // + address generate_conjoint_copy_avx3_masked(address* entry, const char *name, int shift, + address nooverlap_target, bool aligned, bool is_oop, + bool dest_uninitialized) { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", name); + address start = __ pc(); + + bool use64byteVector = MaxVectorSize > 32 && AVX3Threshold == 0; + + Label L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64; + Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry; + const Register from = rdi; // source array address + const Register to = rsi; // destination array address + const Register count = rdx; // elements count + const Register temp1 = r8; + const Register temp2 = rcx; + const Register temp3 = r11; + const Register temp4 = rax; + // End pointers are inclusive, and if count is not zero they point + // to the last unit copied: end_to[0] := end_from[0] + + __ enter(); // required for proper stackwalking of RuntimeStub frame + assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. + + if (entry != NULL) { + *entry = __ pc(); + // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) + BLOCK_COMMENT("Entry:"); + } + + array_overlap_test(nooverlap_target, (Address::ScaleFactor)(shift)); + + BasicType type_vec[] = { T_BYTE, T_SHORT, T_INT, T_LONG}; + BasicType type = is_oop ? 
T_OBJECT : type_vec[shift]; + + setup_argument_regs(type); + + DecoratorSet decorators = IN_HEAP | IS_ARRAY; + if (dest_uninitialized) { + decorators |= IS_DEST_UNINITIALIZED; + } + if (aligned) { + decorators |= ARRAYCOPY_ALIGNED; + } + BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); + bs->arraycopy_prologue(_masm, decorators, type, from, to, count); + { + // Type(shift) byte(0), short(1), int(2), long(3) + int loop_size[] = { 192, 96, 48, 24}; + int threshold[] = { 4096, 2048, 1024, 512}; + + // UnsafeCopyMemory page error: continue after ucm + UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true); + // 'from', 'to' and 'count' are now valid + + // temp1 holds remaining count. + __ movq(temp1, count); + + // Zero length check. + __ BIND(L_tail); + __ cmpq(temp1, 0); + __ jcc(Assembler::lessEqual, L_exit); + + __ mov64(temp2, 0); + __ movq(temp3, temp1); + // Special cases using 32 byte [masked] vector copy operations. + __ arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift, + temp4, use64byteVector, L_entry, L_exit); + + // PRE-MAIN-POST loop for aligned copy. + __ BIND(L_entry); + + if (MaxVectorSize > 32 && AVX3Threshold != 0) { + __ cmpq(temp1, threshold[shift]); + __ jcc(Assembler::greaterEqual, L_pre_main_post_64); + } + + if (MaxVectorSize < 64 || AVX3Threshold != 0) { + // Partial copy to make dst address 32 byte aligned. + __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0)); + __ andq(temp2, 31); + __ jcc(Assembler::equal, L_main_pre_loop); + + if (shift) { + __ shrq(temp2, shift); + } + __ subq(temp1, temp2); + __ copy32_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift); + + __ cmpq(temp1, loop_size[shift]); + __ jcc(Assembler::less, L_tail); + + __ BIND(L_main_pre_loop); + + // Main loop with aligned copy block size of 192 bytes at 32 byte granularity. + __ BIND(L_main_loop); + __ copy64_avx(to, from, temp1, xmm1, true, shift, -64); + __ copy64_avx(to, from, temp1, xmm1, true, shift, -128); + __ copy64_avx(to, from, temp1, xmm1, true, shift, -192); + __ subptr(temp1, loop_size[shift]); + __ cmpq(temp1, loop_size[shift]); + __ jcc(Assembler::greater, L_main_loop); + + // Tail loop. + __ jmp(L_tail); + } + + if (MaxVectorSize > 32) { + __ BIND(L_pre_main_post_64); + // Partial copy to make dst address 64 byte aligned. + __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0)); + __ andq(temp2, 63); + __ jcc(Assembler::equal, L_main_pre_loop_64bytes); + + if (shift) { + __ shrq(temp2, shift); + } + __ subq(temp1, temp2); + __ copy64_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift, 0 , true); + + __ cmpq(temp1, loop_size[shift]); + __ jcc(Assembler::less, L_tail64); + + __ BIND(L_main_pre_loop_64bytes); + + // Main loop with aligned copy block size of 192 bytes at + // 64 byte copy granularity. + __ BIND(L_main_loop_64bytes); + __ copy64_avx(to, from, temp1, xmm1, true, shift, -64 , true); + __ copy64_avx(to, from, temp1, xmm1, true, shift, -128, true); + __ copy64_avx(to, from, temp1, xmm1, true, shift, -192, true); + __ subq(temp1, loop_size[shift]); + __ cmpq(temp1, loop_size[shift]); + __ jcc(Assembler::greater, L_main_loop_64bytes); + + // Zero length check. + __ cmpq(temp1, 0); + __ jcc(Assembler::lessEqual, L_exit); + + __ BIND(L_tail64); + + // Tail handling using 64 byte [masked] vector copy operations. 
+ use64byteVector = true; + __ mov64(temp2, 0); + __ movq(temp3, temp1); + __ arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift, + temp4, use64byteVector, L_entry, L_exit); + } + __ BIND(L_exit); + } + address ucme_exit_pc = __ pc(); + // When called from generic_arraycopy r11 contains specific values + // used during arraycopy epilogue, re-initializing r11. + if(is_oop) { + __ movq(r11, count); + } + bs->arraycopy_epilogue(_masm, decorators, type, from, to, count); + restore_argument_regs(type); + inc_counter_np(get_profile_ctr(shift)); // Update counter after rscratch1 is free + __ xorptr(rax, rax); // return 0 + __ vzeroupper(); + __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(0); + return start; + } + + // Arguments: // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary // ignored @@ -1343,6 +1717,10 @@ class StubGenerator: public StubCodeGenerator { // used by generate_conjoint_byte_copy(). // address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { + if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) { + return generate_disjoint_copy_avx3_masked(entry, "jbyte_disjoint_arraycopy_avx3", 0, + aligned, false, false); + } __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", name); address start = __ pc(); @@ -1453,6 +1831,10 @@ class StubGenerator: public StubCodeGenerator { // address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, address* entry, const char *name) { + if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) { + return generate_conjoint_copy_avx3_masked(entry, "jbyte_conjoint_arraycopy_avx3", 0, + nooverlap_target, aligned, false, false); + } __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", name); address start = __ pc(); @@ -1558,6 +1940,11 @@ class StubGenerator: public StubCodeGenerator { // used by generate_conjoint_short_copy(). 
// address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) { + if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) { + return generate_disjoint_copy_avx3_masked(entry, "jshort_disjoint_arraycopy_avx3", 1, + aligned, false, false); + } + __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", name); address start = __ pc(); @@ -1682,6 +2069,10 @@ class StubGenerator: public StubCodeGenerator { // address generate_conjoint_short_copy(bool aligned, address nooverlap_target, address *entry, const char *name) { + if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) { + return generate_conjoint_copy_avx3_masked(entry, "jshort_conjoint_arraycopy_avx3", 1, + nooverlap_target, aligned, false, false); + } __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", name); address start = __ pc(); @@ -1780,6 +2171,11 @@ class StubGenerator: public StubCodeGenerator { // address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry, const char *name, bool dest_uninitialized = false) { + if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) { + return generate_disjoint_copy_avx3_masked(entry, "jint_disjoint_arraycopy_avx3", 2, + aligned, is_oop, dest_uninitialized); + } + __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", name); address start = __ pc(); @@ -1884,6 +2280,10 @@ class StubGenerator: public StubCodeGenerator { address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target, address *entry, const char *name, bool dest_uninitialized = false) { + if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) { + return generate_conjoint_copy_avx3_masked(entry, "jint_conjoint_arraycopy_avx3", 2, + nooverlap_target, aligned, is_oop, dest_uninitialized); + } __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", name); address start = __ pc(); @@ -1991,6 +2391,10 @@ class StubGenerator: public StubCodeGenerator { // address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry, const char *name, bool dest_uninitialized = false) { + if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) { + return generate_disjoint_copy_avx3_masked(entry, "jlong_disjoint_arraycopy_avx3", 3, + aligned, is_oop, dest_uninitialized); + } __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", name); address start = __ pc(); @@ -2095,6 +2499,10 @@ class StubGenerator: public StubCodeGenerator { address generate_conjoint_long_oop_copy(bool aligned, bool is_oop, address nooverlap_target, address *entry, const char *name, bool dest_uninitialized = false) { + if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) { + return generate_conjoint_copy_avx3_masked(entry, "jlong_conjoint_arraycopy_avx3", 3, + nooverlap_target, aligned, is_oop, dest_uninitialized); + } __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", name); address start = __ pc(); diff --git a/src/hotspot/cpu/x86/stubRoutines_x86.hpp b/src/hotspot/cpu/x86/stubRoutines_x86.hpp index a23ee3666a6..ff4ed9c737a 100644 --- a/src/hotspot/cpu/x86/stubRoutines_x86.hpp +++ b/src/hotspot/cpu/x86/stubRoutines_x86.hpp @@ -33,7 +33,7 @@ static bool returns_to_call_stub(address return_pc) { return return_pc == _call_ enum platform_dependent_constants { code_size1 = 20000 LP64_ONLY(+10000), // simply increase if too small (assembler will crash if too small) - code_size2 = 35300 LP64_ONLY(+11400) // simply increase if too small (assembler will crash if too 
small) + code_size2 = 35300 LP64_ONLY(+21400) // simply increase if too small (assembler will crash if too small) }; class x86 { diff --git a/src/hotspot/cpu/x86/vm_version_x86.cpp b/src/hotspot/cpu/x86/vm_version_x86.cpp index 089d720e88e..5b4d4dbfc7b 100644 --- a/src/hotspot/cpu/x86/vm_version_x86.cpp +++ b/src/hotspot/cpu/x86/vm_version_x86.cpp @@ -763,6 +763,8 @@ void VM_Version::get_processor_features() { if (is_intel()) { // Intel cpus specific settings if (is_knights_family()) { _features &= ~CPU_VZEROUPPER; + _features &= ~CPU_AVX512BW; + _features &= ~CPU_AVX512VL; } } diff --git a/test/hotspot/jtreg/compiler/arraycopy/TestArrayCopyConjoint.java b/test/hotspot/jtreg/compiler/arraycopy/TestArrayCopyConjoint.java new file mode 100644 index 00000000000..9a39807b078 --- /dev/null +++ b/test/hotspot/jtreg/compiler/arraycopy/TestArrayCopyConjoint.java @@ -0,0 +1,269 @@ +/* + * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package compiler.arraycopy; +import java.util.Random; + +/** + * @test + * @bug 8251871 + * @summary Optimize arrayCopy using AVX-512 masked instructions. 
+ * + * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions + * -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=0 -XX:MaxVectorSize=32 -XX:+UnlockDiagnosticVMOptions + * compiler.arraycopy.TestArrayCopyConjoint + * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions + * -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=0 -XX:MaxVectorSize=64 + * compiler.arraycopy.TestArrayCopyConjoint + * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions + * -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=32 -XX:+UnlockDiagnosticVMOptions -XX:MaxVectorSize=32 -XX:+UnlockDiagnosticVMOption + * compiler.arraycopy.TestArrayCopyConjoint + * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions + * -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=32 -XX:+UnlockDiagnosticVMOptions -XX:MaxVectorSize=64 + * compiler.arraycopy.TestArrayCopyConjoint + * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions + * -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=64 -XX:MaxVectorSize=64 + * compiler.arraycopy.TestArrayCopyConjoint + * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions + * -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=32 -XX:+UnlockDiagnosticVMOptions -XX:MaxVectorSize=32 -XX:+UnlockDiagnosticVMOption -XX:ArrayCopyLoadStoreMaxElem=16 + * compiler.arraycopy.TestArrayCopyConjoint + * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions + * -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=64 -XX:MaxVectorSize=64 -XX:ArrayCopyLoadStoreMaxElem=16 + * compiler.arraycopy.TestArrayCopyConjoint + * + */ + +public class TestArrayCopyConjoint { + + public static final int SIZE = 4096; + public static byte[] fromByteArr, toByteArr, valByteArr; + public static char[] fromCharArr, toCharArr, valCharArr; + public static int[] fromIntArr, toIntArr, valIntArr; + public static long[] fromLongArr, toLongArr, valLongArr; + + static public void reinit(Class c) { + if (c == byte.class) { + for (int i = 0 ; i < SIZE ; i++) { + fromByteArr[i] = (byte)i; + } + } else if (c == char.class) { + for (int i = 0 ; i < SIZE ; i++) { + fromCharArr[i] = (char)i; + } + } else if (c == int.class) { + for (int i = 0 ; i < SIZE ; i++) { + fromIntArr[i] = i; + } + } else { + assert c == long.class; + for (int i = 0 ; i < SIZE ; i++) { + fromLongArr[i] = i; + } + } + } + + static public void setup() { + // Both positions aligned + fromByteArr = new byte[SIZE]; + valByteArr = new byte[SIZE]; + toByteArr = fromByteArr; + fromCharArr = new char[SIZE]; + valCharArr = new char[SIZE]; + toCharArr = fromCharArr; + fromIntArr = new int[SIZE]; + valIntArr = new int[SIZE]; + toIntArr = fromIntArr; + fromLongArr = new long[SIZE]; + valLongArr = new long[SIZE]; + toLongArr = fromLongArr; + + for (int i = 0 ; i < SIZE ; i++) { + fromByteArr[i] = (byte)i; + valByteArr[i] = (byte)i; + fromCharArr[i] = (char)i; + valCharArr[i] = (char)i; + fromIntArr[i] = i; + valIntArr[i] = i; + fromLongArr[i] = i; + valLongArr[i] = i; + } + } + + public static int validate_ctr = 0; + public static void validate(String msg, E arr, int length, int fromPos, int toPos) { + validate_ctr++; + if (arr instanceof 
byte []) { + byte [] barr = (byte [])arr; + for(int i = 0 ; i < length; i++) + if (valByteArr[i+fromPos] != barr[i+toPos]) { + System.out.println(msg + "[" + arr.getClass() + "] Result mismtach at i = " + i + + " expected = " + valByteArr[i+fromPos] + + " actual = " + barr[i+toPos] + + " fromPos = " + fromPos + + " toPos = " + toPos); + throw new Error("Fail"); + + } + } + else if (arr instanceof char []) { + char [] carr = (char [])arr; + for(int i = 0 ; i < length; i++) + if (valCharArr[i+fromPos] != carr[i+toPos]) { + System.out.println(msg + "[" + arr.getClass() + "] Result mismtach at i = " + i + + " expected = " + valCharArr[i+fromPos] + + " actual = " + carr[i+toPos] + + " fromPos = " + fromPos + + " toPos = " + toPos); + throw new Error("Fail"); + } + } + else if (arr instanceof int []) { + int [] iarr = (int [])arr; + for(int i = 0 ; i < length; i++) + if (valIntArr[i+fromPos] != iarr[i+toPos]) { + System.out.println(msg + "[" + arr.getClass() + "] Result mismtach at i = " + i + + " expected = " + valIntArr[i+fromPos] + + " actual = " + iarr[i+toPos] + + " fromPos = " + fromPos + + " toPos = " + toPos); + throw new Error("Fail"); + } + } + else if (arr instanceof long []) { + long [] larr = (long [])arr; + for(int i = 0 ; i < length; i++) + if (valLongArr[i+fromPos] != larr[i+toPos]) { + System.out.println(msg + "[" + arr.getClass() + "] Result mismtach at i = " + i + + " expected = " + valLongArr[i+fromPos] + + " actual = " + larr[i+toPos] + + " fromPos = " + fromPos + + " toPos = " + toPos); + throw new Error("Fail"); + } + } + } + + public static void testByte(int length, int fromPos, int toPos) { + System.arraycopy(fromByteArr, fromPos, toByteArr, toPos, length); + validate(" Test ByteArr ", toByteArr, length, fromPos, toPos); + } + + public static void testChar(int length, int fromPos, int toPos) { + System.arraycopy(fromCharArr, fromPos, toCharArr, toPos, length); + validate(" Test CharArr ", toCharArr, length, fromPos, toPos); + } + + public static void testInt(int length, int fromPos, int toPos) { + System.arraycopy(fromIntArr, fromPos, toIntArr, toPos, length); + validate(" Test IntArr ", toIntArr, length, fromPos, toPos); + } + + public static void testLong(int length, int fromPos, int toPos) { + System.arraycopy(fromLongArr, fromPos, toLongArr, toPos, length); + validate(" Test LongArr ", toLongArr, length, fromPos, toPos); + } + + public static void testByte_constant_LT32B(int fromPos, int toPos) { + System.arraycopy(fromByteArr, fromPos, toByteArr, toPos, 7); + validate(" Test Byte constant length 7 ", toByteArr, 7, fromPos, toPos); + } + public static void testByte_constant_LT64B(int fromPos, int toPos) { + System.arraycopy(fromByteArr, fromPos, toByteArr, toPos, 45); + validate(" Test Byte constant length 45 ", toByteArr, 45, fromPos, toPos); + } + + public static void testChar_constant_LT32B(int fromPos, int toPos) { + System.arraycopy(fromCharArr, fromPos, toCharArr, toPos, 7); + validate(" Test Char constant length 7 ", toCharArr, 7, fromPos, toPos); + } + public static void testChar_constant_LT64B(int fromPos, int toPos) { + System.arraycopy(fromCharArr, fromPos, toCharArr, toPos, 22); + validate(" Test Char constant length 22 ", toCharArr, 22, fromPos, toPos); + } + + public static void testInt_constant_LT32B(int fromPos, int toPos) { + System.arraycopy(fromIntArr, fromPos, toIntArr, toPos, 7); + validate(" Test Int constant length 7 ", toIntArr, 7, fromPos, toPos); + } + public static void testInt_constant_LT64B(int fromPos, int toPos) { + 
System.arraycopy(fromIntArr, fromPos, toIntArr, toPos, 11); + validate(" Test Int constant length 11 ", toIntArr, 11, fromPos, toPos); + } + + public static void testLong_constant_LT32B(int fromPos, int toPos) { + System.arraycopy(fromLongArr, fromPos, toLongArr, toPos, 3); + validate(" Test Long constant length 3 ", toLongArr, 3, fromPos, toPos); + } + public static void testLong_constant_LT64B(int fromPos, int toPos) { + System.arraycopy(fromLongArr, fromPos, toLongArr, toPos, 6); + validate(" Test Long constant length 6 ", toLongArr, 6, fromPos, toPos); + } + + + public static void main(String [] args) { + // Cases to test each new optimized stub special blocks. + // Cases to test new PI handling (PI32 and PI64). + // Cases to test vectorized constant array copies for all primitive types. + // LT32B LT64B LT96B LT128B LT160B LT192B LOOP1 LOOP2 + int [] lengths = { 29, 59, 89, 125, 159, 189, 194, 1024 }; + Random r = new Random(1024); + + setup(); + + try { + for (int i = 0 ; i < 1000000 ; i++ ) { + int index = r.nextInt(2048); + testByte(lengths[i % lengths.length], index , index+2); + reinit(byte.class); + testByte_constant_LT32B (index , index+2); + reinit(byte.class); + testByte_constant_LT64B (index , index+2); + reinit(byte.class); + + testChar(lengths[i % lengths.length] >> 1, index , index+2); + reinit(char.class); + testChar_constant_LT32B (index , index+2); + reinit(char.class); + testChar_constant_LT64B (index , index+2); + reinit(char.class); + + testInt(lengths[i % lengths.length] >> 2, index , index+2); + reinit(int.class); + testInt_constant_LT32B (index , index+2); + reinit(int.class); + testInt_constant_LT64B (index , index+2); + reinit(int.class); + + testLong(lengths[i % lengths.length] >> 3, index , index+2); + reinit(long.class); + testLong_constant_LT32B (index , index+2); + reinit(long.class); + testLong_constant_LT64B (index , index+2); + reinit(long.class); + } + System.out.println("PASS : " + validate_ctr); + } catch (Exception e) { + System.out.println(e.getMessage()); + } + } +} diff --git a/test/hotspot/jtreg/compiler/arraycopy/TestArrayCopyDisjoint.java b/test/hotspot/jtreg/compiler/arraycopy/TestArrayCopyDisjoint.java new file mode 100644 index 00000000000..7bf676d3cae --- /dev/null +++ b/test/hotspot/jtreg/compiler/arraycopy/TestArrayCopyDisjoint.java @@ -0,0 +1,226 @@ +/* + * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ + +package compiler.arraycopy; +import java.util.Random; + +/** + * @test + * @bug 8251871 + * @summary Optimize arrayCopy using AVX-512 masked instructions. + * + * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions + * -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=0 -XX:MaxVectorSize=32 -XX:+UnlockDiagnosticVMOptions + * compiler.arraycopy.TestArrayCopyDisjoint + * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions + * -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=0 -XX:MaxVectorSize=64 + * compiler.arraycopy.TestArrayCopyDisjoint + * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions + * -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=32 -XX:+UnlockDiagnosticVMOptions -XX:MaxVectorSize=32 -XX:+UnlockDiagnosticVMOption + * compiler.arraycopy.TestArrayCopyDisjoint + * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions + * -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=32 -XX:+UnlockDiagnosticVMOptions -XX:MaxVectorSize=64 + * compiler.arraycopy.TestArrayCopyDisjoint + * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions + * -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=64 -XX:MaxVectorSize=64 + * compiler.arraycopy.TestArrayCopyDisjoint + * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions + * -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=32 -XX:+UnlockDiagnosticVMOptions -XX:MaxVectorSize=32 -XX:+UnlockDiagnosticVMOption -XX:ArrayCopyLoadStoreMaxElem=16 + * compiler.arraycopy.TestArrayCopyDisjoint + * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch -XX:+IgnoreUnrecognizedVMOptions + * -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:ArrayCopyPartialInlineSize=64 -XX:MaxVectorSize=64 -XX:ArrayCopyLoadStoreMaxElem=16 + * compiler.arraycopy.TestArrayCopyDisjoint + * + */ + +public class TestArrayCopyDisjoint { + + public static final int SIZE = 4096; + public static byte[] fromByteArr, toByteArr; + public static char[] fromCharArr, toCharArr; + public static int[] fromIntArr, toIntArr; + public static long[] fromLongArr, toLongArr; + + static public void setup() { + // Both positions aligned + fromByteArr = new byte[SIZE]; + toByteArr = new byte[SIZE]; + fromCharArr = new char[SIZE]; + toCharArr = new char[SIZE]; + fromIntArr = new int[SIZE]; + toIntArr = new int[SIZE]; + fromLongArr = new long[SIZE]; + toLongArr = new long[SIZE]; + + for (int i = 0 ; i < SIZE ; i++) { + fromByteArr[i] = (byte)i; + fromCharArr[i] = (char)i; + fromIntArr[i] = i; + fromLongArr[i] = i; + } + } + + public static int validate_ctr = 0; + public static void validate(String msg, E arr, int length, int fromPos, int toPos) { + validate_ctr++; + if (arr instanceof byte []) { + byte [] barr = (byte [])arr; + for(int i = 0 ; i < length; i++) + if (fromByteArr[i+fromPos] != barr[i+toPos]) { + System.out.println(msg + "[" + arr.getClass() + "] Result mismtach at i = " + i + + " expected = " + fromByteArr[i+fromPos] + + " actual = " + barr[i+toPos] + + " fromPos = " + fromPos + + " toPos = " + toPos); + throw new Error("Fail"); + } + } + else if (arr instanceof char []) { + char [] carr = (char [])arr; + for(int i = 0 ; i < length; i++) + if (fromCharArr[i+fromPos] != carr[i+toPos]) { 
+                    System.out.println(msg + "[" + arr.getClass() + "] Result mismatch at i = " + i +
+                                       " expected = " + fromCharArr[i + fromPos] +
+                                       " actual = " + carr[i + toPos] +
+                                       " fromPos = " + fromPos +
+                                       " toPos = " + toPos);
+                    throw new Error("Fail");
+                }
+        }
+        else if (arr instanceof int[]) {
+            int[] iarr = (int[]) arr;
+            for (int i = 0; i < length; i++)
+                if (fromIntArr[i + fromPos] != iarr[i + toPos]) {
+                    System.out.println(msg + "[" + arr.getClass() + "] Result mismatch at i = " + i +
+                                       " expected = " + fromIntArr[i + fromPos] +
+                                       " actual = " + iarr[i + toPos] +
+                                       " fromPos = " + fromPos +
+                                       " toPos = " + toPos);
+                    throw new Error("Fail");
+                }
+        }
+        else if (arr instanceof long[]) {
+            long[] larr = (long[]) arr;
+            for (int i = 0; i < length; i++)
+                if (fromLongArr[i + fromPos] != larr[i + toPos]) {
+                    System.out.println(msg + "[" + arr.getClass() + "] Result mismatch at i = " + i +
+                                       " expected = " + fromLongArr[i + fromPos] +
+                                       " actual = " + larr[i + toPos] +
+                                       " fromPos = " + fromPos +
+                                       " toPos = " + toPos);
+                    throw new Error("Fail");
+                }
+        }
+    }
+
+    public static void testByte(int length, int fromPos, int toPos) {
+        System.arraycopy(fromByteArr, fromPos, toByteArr, toPos, length);
+        validate(" Test ByteArr ", toByteArr, length, fromPos, toPos);
+    }
+
+    public static void testChar(int length, int fromPos, int toPos) {
+        System.arraycopy(fromCharArr, fromPos, toCharArr, toPos, length);
+        validate(" Test CharArr ", toCharArr, length, fromPos, toPos);
+    }
+
+    public static void testInt(int length, int fromPos, int toPos) {
+        System.arraycopy(fromIntArr, fromPos, toIntArr, toPos, length);
+        validate(" Test IntArr ", toIntArr, length, fromPos, toPos);
+    }
+
+    public static void testLong(int length, int fromPos, int toPos) {
+        System.arraycopy(fromLongArr, fromPos, toLongArr, toPos, length);
+        validate(" Test LongArr ", toLongArr, length, fromPos, toPos);
+    }
+
+    public static void testByte_constant_LT32B(int fromPos, int toPos) {
+        System.arraycopy(fromByteArr, fromPos, toByteArr, toPos, 7);
+        validate(" Test Byte constant length 7 ", toByteArr, 7, fromPos, toPos);
+    }
+
+    public static void testByte_constant_LT64B(int fromPos, int toPos) {
+        System.arraycopy(fromByteArr, fromPos, toByteArr, toPos, 45);
+        validate(" Test Byte constant length 45 ", toByteArr, 45, fromPos, toPos);
+    }
+
+    public static void testChar_constant_LT32B(int fromPos, int toPos) {
+        System.arraycopy(fromCharArr, fromPos, toCharArr, toPos, 7);
+        validate(" Test Char constant length 7 ", toCharArr, 7, fromPos, toPos);
+    }
+
+    public static void testChar_constant_LT64B(int fromPos, int toPos) {
+        System.arraycopy(fromCharArr, fromPos, toCharArr, toPos, 22);
+        validate(" Test Char constant length 22 ", toCharArr, 22, fromPos, toPos);
+    }
+
+    public static void testInt_constant_LT32B(int fromPos, int toPos) {
+        System.arraycopy(fromIntArr, fromPos, toIntArr, toPos, 7);
+        validate(" Test Int constant length 7 ", toIntArr, 7, fromPos, toPos);
+    }
+
+    public static void testInt_constant_LT64B(int fromPos, int toPos) {
+        System.arraycopy(fromIntArr, fromPos, toIntArr, toPos, 11);
+        validate(" Test Int constant length 11 ", toIntArr, 11, fromPos, toPos);
+    }
+
+    public static void testLong_constant_LT32B(int fromPos, int toPos) {
+        System.arraycopy(fromLongArr, fromPos, toLongArr, toPos, 3);
+        validate(" Test Long constant length 3 ", toLongArr, 3, fromPos, toPos);
+    }
+
+    public static void testLong_constant_LT64B(int fromPos, int toPos) {
+        System.arraycopy(fromLongArr, fromPos, toLongArr, toPos, 6);
+        validate(" Test Long constant length 6 ", toLongArr, 6, fromPos, toPos);
+    }
+
+    public static void main(String[] args) {
+        // Cases to test each special block of the new optimized stubs.
+        // Cases to test the new partial-inline handling (PI32 and PI64).
+        // Cases to test vectorized constant-length array copies for all primitive types.
+        //                LT32B LT64B LT96B LT128B LT160B LT192B LOOP1 LOOP2
+        int[] lengths = { 29,   59,   89,   125,   159,   189,   194,  1024 };
+        Random r = new Random(1024);
+
+        setup();
+
+        try {
+            for (int i = 0; i < 1000000; i++) {
+                testByte(lengths[i % lengths.length], r.nextInt(2048), r.nextInt(2048));
+                testByte_constant_LT32B(r.nextInt(2048), r.nextInt(2048));
+                testByte_constant_LT64B(r.nextInt(2048), r.nextInt(2048));
+
+                testChar(lengths[i % lengths.length] >> 1, r.nextInt(2048), r.nextInt(2048));
+                testChar_constant_LT32B(r.nextInt(2048), r.nextInt(2048));
+                testChar_constant_LT64B(r.nextInt(2048), r.nextInt(2048));
+
+                testInt(lengths[i % lengths.length] >> 2, r.nextInt(2048), r.nextInt(2048));
+                testInt_constant_LT32B(r.nextInt(2048), r.nextInt(2048));
+                testInt_constant_LT64B(r.nextInt(2048), r.nextInt(2048));
+
+                testLong(lengths[i % lengths.length] >> 3, r.nextInt(2048), r.nextInt(2048));
+                testLong_constant_LT32B(r.nextInt(2048), r.nextInt(2048));
+                testLong_constant_LT64B(r.nextInt(2048), r.nextInt(2048));
+            }
+            System.out.println("PASS : " + validate_ctr);
+        } catch (Exception e) {
+            // Propagate unexpected exceptions so the test fails instead of passing silently.
+            throw new Error(e.getMessage(), e);
+        }
+    }
+}
diff --git a/test/micro/org/openjdk/bench/java/lang/ArrayCopyObject.java b/test/micro/org/openjdk/bench/java/lang/ArrayCopyObject.java
new file mode 100644
index 00000000000..ff33d451fdb
--- /dev/null
+++ b/test/micro/org/openjdk/bench/java/lang/ArrayCopyObject.java
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2020, Arm Limited. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package org.openjdk.bench.java.lang;
+
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.results.Result;
+import org.openjdk.jmh.results.RunResult;
+import org.openjdk.jmh.runner.Runner;
+import org.openjdk.jmh.runner.RunnerException;
+import org.openjdk.jmh.runner.options.Options;
+import org.openjdk.jmh.runner.options.OptionsBuilder;
+import org.openjdk.jmh.runner.options.TimeValue;
+
+import java.util.concurrent.TimeUnit;
+import java.util.Arrays;
+
+class MyClass {
+    public int field1;
+    public int field2;
+    public int field3;
+
+    public MyClass(int val) {
+        field1 = val;
+        field2 = val;
+        field3 = val;
+    }
+}
+
+@State(Scope.Benchmark)
+@BenchmarkMode(Mode.Throughput)
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+public class ArrayCopyObject {
+    @Param({"31", "63", "127", "2047", "4095", "8191"}) private int size;
+
+    private MyClass[] src;
+    private MyClass[] dst;
+
+    @Setup
+    public void setup() {
+        src = new MyClass[size];
+        dst = new MyClass[size];
+        for (int i = 0; i < src.length; i++) {
+            src[i] = new MyClass(i);
+            dst[i] = new MyClass(0);
+        }
+    }
+
+    @Benchmark
+    public void disjoint_micro() {
+        System.arraycopy(src, 0, dst, 0, size);
+    }
+
+    @Benchmark
+    public void conjoint_micro() {
+        System.arraycopy(src, 0, src, 10, size - 10);
+    }
+
+    public static void main(String[] args) throws RunnerException {
+        String[] base_opts =
+            { "-XX:+UnlockDiagnosticVMOptions",
+              "-XX:+IgnoreUnrecognizedVMOptions",
+              "-XX:UseAVX=3" };
+        String[] opts_str1 = {"-XX:-UseCompressedOops"};
+        String[] opts_str2 = {"-XX:+UseCompressedOops"};
+
+        Options baseOpts = new OptionsBuilder()
+            .include(ArrayCopyObject.class.getName())
+            .warmupTime(TimeValue.seconds(30))
+            .measurementTime(TimeValue.seconds(10))
+            .warmupIterations(1)
+            .measurementIterations(2)
+            .jvmArgs(base_opts)
+            .forks(1)
+            .build();
+
+        RunResult r1 = new Runner(new OptionsBuilder()
+            .parent(baseOpts)
+            .jvmArgs(opts_str1)
+            .build()).runSingle();
+
+        RunResult r2 = new Runner(new OptionsBuilder()
+            .parent(baseOpts)
+            .jvmArgs(opts_str2)
+            .build()).runSingle();
+
+        System.out.println(r1.getPrimaryResult().getScore() + r2.getPrimaryResult().getScore());
+    }
+}
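---

Note on the length buckets in TestArrayCopyDisjoint: the element counts { 29, 59, 89, 125, 159, 189, 194, 1024 } are right-shifted per element type (>> 1 for char, >> 2 for int, >> 3 for long), so copies of every primitive type land in roughly the same byte range. The mapping of each bucket name (LT32B, LT64B, ...) to a particular region of the new masked copy stubs is an assumption drawn from the comment in main(); the 32/64-byte boundaries are taken to correspond to the YMM/ZMM copy widths. The following standalone sketch (not part of the patch; the class name CopyLengthBuckets is hypothetical) prints the byte footprint each test length produces per type:

public class CopyLengthBuckets {
    // Element counts used in TestArrayCopyDisjoint.main(); the bucket names
    // follow the comment there: LT32B LT64B LT96B LT128B LT160B LT192B LOOP1 LOOP2.
    static final int[] LENGTHS = { 29, 59, 89, 125, 159, 189, 194, 1024 };

    public static void main(String[] args) {
        for (int len : LENGTHS) {
            // Byte footprint per primitive type after the per-type right shift.
            System.out.printf("len=%4d -> byte=%4dB char=%4dB int=%4dB long=%4dB%n",
                              len,
                              len,              // 1 byte per element
                              (len >> 1) * 2,   // 2 bytes per element
                              (len >> 2) * 4,   // 4 bytes per element
                              (len >> 3) * 8);  // 8 bytes per element
        }
    }
}

For example, the 29-element bucket copies 29, 28, 28, and 24 bytes for byte/char/int/long respectively, keeping every type under the 32-byte boundary, while 1024 elements exercise the main copy loop for all types.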