Merge
commit ac6fa27965
@@ -46,6 +46,7 @@ BUILD_HOTSPOT_JTREG_NATIVE_SRC := \
   $(HOTSPOT_TOPDIR)/test/runtime/jni/8033445 \
   $(HOTSPOT_TOPDIR)/test/runtime/jni/ToStringInInterfaceTest \
   $(HOTSPOT_TOPDIR)/test/runtime/SameObject \
+  $(HOTSPOT_TOPDIR)/test/compiler/floatingpoint/ \
   #

   # Add conditional directories here when needed.
@@ -28,6 +28,10 @@

 const int StackAlignmentInBytes = 16;

+// Indicates whether the C calling conventions require that
+// 32-bit integer argument values are extended to 64 bits.
+const bool CCallingConventionRequiresIntsAsLongs = false;
+
 #define SUPPORTS_NATIVE_CX8

 // The maximum B/BL offset range on AArch64 is 128MB.
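The 128MB figure in the last context line above falls straight out of the encoding: B and BL carry a 26-bit signed word offset, so the reach is

    // offset = imm26 * 4, imm26 in [-2^25, 2^25 - 1]
    //   => branch range of roughly +/- 2^25 * 4 bytes = +/- 128 MB

which is what bounds how far apart mutually branching code may be placed.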
@@ -182,6 +182,11 @@ void VM_Version::get_processor_features() {
     FLAG_SET_DEFAULT(UseAdler32Intrinsics, true);
   }

+  if (UseVectorizedMismatchIntrinsic) {
+    warning("UseVectorizedMismatchIntrinsic specified, but not available on this CPU.");
+    FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false);
+  }
+
   if (auxv & HWCAP_AES) {
     UseAES = UseAES || FLAG_IS_DEFAULT(UseAES);
     UseAESIntrinsics =
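This guard idiom repeats in the ppc and sparc hunks below: a port without the vectorizedMismatch intrinsic warns and resets the flag instead of silently ignoring it. A minimal self-contained C++ sketch of the pattern, with warning() and FLAG_SET_DEFAULT() (HotSpot utilities) stubbed for illustration:

    #include <cstdio>

    static bool UseVectorizedMismatchIntrinsic = true;  // as if set on the command line

    static void warning(const char* msg) { std::fprintf(stderr, "warning: %s\n", msg); }
    #define FLAG_SET_DEFAULT(flag, value) ((flag) = (value))

    // The port lacks the intrinsic: tell the user once, then restore the
    // default so compilation falls back to the ordinary code path.
    static void check_vectorized_mismatch_flag() {
      if (UseVectorizedMismatchIntrinsic) {
        warning("UseVectorizedMismatchIntrinsic specified, but not available on this CPU.");
        FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false);
      }
    }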
@@ -31,6 +31,10 @@ const int BytesPerInstWord = 4;

 const int StackAlignmentInBytes = 16;

+// Indicates whether the C calling conventions require that
+// 32-bit integer argument values are extended to 64 bits.
+const bool CCallingConventionRequiresIntsAsLongs = true;
+
 #define SUPPORTS_NATIVE_CX8

 // The PPC CPUs are NOT multiple-copy-atomic.
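The "NOT multiple-copy-atomic" comment is shorthand for the IRIW litmus test. With x = y = 0 shared:

    // T1: x = 1;             T2: y = 1;
    // T3: r1 = x; r2 = y;    T4: r3 = y; r4 = x;
    //
    // On a multiple-copy-atomic machine the outcome r1==1, r2==0, r3==1,
    // r4==0 is forbidden; PPC permits it unless heavyweight sync is used,
    // which is why shared runtime code needs this per-platform constant.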
@@ -3486,6 +3486,7 @@ encode %{
     call->_jvmadj = _jvmadj;
     call->_in_rms = _in_rms;
     call->_nesting = _nesting;
+    call->_override_symbolic_info = _override_symbolic_info;

     // New call needs all inputs of old call.
     // Req...
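One-line merges like this are easy to lose: when the encode block clones a call node, every field later added to the node class must be copied here too, or the clone silently drops state. A hypothetical illustration (not the C2 node class) of the safer copy-the-whole-object alternative:

    struct CallState {
      int  _nesting;
      bool _override_symbolic_info;  // the field added by this change
    };

    // Copy-constructing the whole object cannot miss a member, unlike the
    // field-by-field copy in the .ad encode block above.
    CallState* clone_call(const CallState& s) { return new CallState(s); }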
@@ -223,6 +223,11 @@ void VM_Version::initialize() {
     UseMultiplyToLenIntrinsic = true;
   }

+  if (UseVectorizedMismatchIntrinsic) {
+    warning("UseVectorizedMismatchIntrinsic specified, but not available on this CPU.");
+    FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false);
+  }
+
   // Adjust RTM (Restricted Transactional Memory) flags.
   if (!has_tcheck() && UseRTMLocking) {
     // Can't continue because UseRTMLocking affects UseBiasedLocking flag
@@ -30,6 +30,10 @@ const int BytesPerInstWord = 4;

 const int StackAlignmentInBytes = (2*wordSize);

+// Indicates whether the C calling conventions require that
+// 32-bit integer argument values are extended to 64 bits.
+const bool CCallingConventionRequiresIntsAsLongs = false;
+
 #define SUPPORTS_NATIVE_CX8

 // The expected size in bytes of a cache line, used to pad data structures.
@@ -356,6 +356,11 @@ void VM_Version::initialize() {
     FLAG_SET_DEFAULT(UseCRC32Intrinsics, false);
   }

+  if (UseVectorizedMismatchIntrinsic) {
+    warning("UseVectorizedMismatchIntrinsic specified, but not available on this CPU.");
+    FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false);
+  }
+
   if (FLAG_IS_DEFAULT(ContendedPaddingWidth) &&
       (cache_line_size > ContendedPaddingWidth))
     ContendedPaddingWidth = cache_line_size;
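Worked through with assumed numbers: on a part reporting 128-byte cache lines while ContendedPaddingWidth still holds its 64-byte default, the guard above widens the padding so @Contended fields cannot share a line:

    // FLAG_IS_DEFAULT(ContendedPaddingWidth) -> true (still the default)
    // cache_line_size (128) > ContendedPaddingWidth (64)
    //   => ContendedPaddingWidth = 128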
@@ -2152,33 +2152,64 @@ void Assembler::movddup(XMMRegister dst, XMMRegister src) {
   emit_int8(0xC0 | encode);
 }

-void Assembler::kmovwl(KRegister dst, Register src) {
-  NOT_LP64(assert(VM_Version::supports_evex(), ""));
+void Assembler::kmovbl(KRegister dst, Register src) {
+  assert(VM_Version::supports_avx512dq(), "");
   InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
-  int encode = kreg_prefix_and_encode(dst, knoreg, src, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0x92);
   emit_int8((unsigned char)(0xC0 | encode));
 }

+void Assembler::kmovbl(Register dst, KRegister src) {
+  assert(VM_Version::supports_avx512dq(), "");
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8((unsigned char)0x93);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::kmovwl(KRegister dst, Register src) {
+  assert(VM_Version::supports_evex(), "");
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  emit_int8((unsigned char)0x92);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::kmovwl(Register dst, KRegister src) {
+  assert(VM_Version::supports_evex(), "");
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  emit_int8((unsigned char)0x93);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
 void Assembler::kmovdl(KRegister dst, Register src) {
-  NOT_LP64(assert(VM_Version::supports_evex(), ""));
-  VexSimdPrefix pre = !_legacy_mode_bw ? VEX_SIMD_F2 : VEX_SIMD_NONE;
+  assert(VM_Version::supports_avx512bw(), "");
   InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
-  int encode = kreg_prefix_and_encode(dst, knoreg, src, pre, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0x92);
   emit_int8((unsigned char)(0xC0 | encode));
 }

+void Assembler::kmovdl(Register dst, KRegister src) {
+  assert(VM_Version::supports_avx512bw(), "");
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
+  emit_int8((unsigned char)0x93);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
 void Assembler::kmovql(KRegister dst, KRegister src) {
-  NOT_LP64(assert(VM_Version::supports_evex(), ""));
+  assert(VM_Version::supports_avx512bw(), "");
   InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
-  int encode = kreg_prefix_and_encode(dst, knoreg, src, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0x90);
   emit_int8((unsigned char)(0xC0 | encode));
 }

 void Assembler::kmovql(KRegister dst, Address src) {
-  NOT_LP64(assert(VM_Version::supports_evex(), ""));
+  assert(VM_Version::supports_avx512bw(), "");
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
   vex_prefix(src, 0, dst->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
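Taken together the kmov family now covers all four AVX-512 mask widths, with opcode 0x92 moving GPR to k-register, 0x93 the reverse, and 0x90 k to k. A hedged usage sketch, assuming the usual '#define __ masm->' convention and registers in scope:

    __ kmovbl(k1, rax);  // low 8 mask bits from rax   (needs AVX512DQ)
    __ kmovwl(k2, rbx);  // low 16 mask bits           (baseline EVEX/AVX512F)
    __ kmovdl(k3, rcx);  // low 32 mask bits           (needs AVX512BW)
    __ kmovql(k4, k3);   // full 64-bit k-to-k copy    (opcode 0x90)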
@@ -2187,7 +2218,7 @@ void Assembler::kmovql(KRegister dst, Address src) {
 }

 void Assembler::kmovql(Address dst, KRegister src) {
-  NOT_LP64(assert(VM_Version::supports_evex(), ""));
+  assert(VM_Version::supports_avx512bw(), "");
   InstructionMark im(this);
   InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
   vex_prefix(dst, 0, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
@@ -2196,46 +2227,53 @@ void Assembler::kmovql(Address dst, KRegister src) {
 }

 void Assembler::kmovql(KRegister dst, Register src) {
-  NOT_LP64(assert(VM_Version::supports_evex(), ""));
-  VexSimdPrefix pre = !_legacy_mode_bw ? VEX_SIMD_F2 : VEX_SIMD_NONE;
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ !_legacy_mode_bw, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
-  int encode = kreg_prefix_and_encode(dst, knoreg, src, pre, VEX_OPCODE_0F, &attributes);
+  assert(VM_Version::supports_avx512bw(), "");
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0x92);
   emit_int8((unsigned char)(0xC0 | encode));
 }

+void Assembler::kmovql(Register dst, KRegister src) {
+  assert(VM_Version::supports_avx512bw(), "");
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
+  emit_int8((unsigned char)0x93);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
 // This instruction produces ZF or CF flags
 void Assembler::kortestbl(KRegister src1, KRegister src2) {
-  NOT_LP64(assert(VM_Version::supports_avx512dq(), ""));
+  assert(VM_Version::supports_avx512dq(), "");
   InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
-  int encode = kreg_prefix_and_encode(src1, knoreg, src2, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(src1->encoding(), 0, src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0x98);
   emit_int8((unsigned char)(0xC0 | encode));
 }

 // This instruction produces ZF or CF flags
 void Assembler::kortestwl(KRegister src1, KRegister src2) {
-  NOT_LP64(assert(VM_Version::supports_evex(), ""));
+  assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
-  int encode = kreg_prefix_and_encode(src1, knoreg, src2, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(src1->encoding(), 0, src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0x98);
   emit_int8((unsigned char)(0xC0 | encode));
 }

 // This instruction produces ZF or CF flags
 void Assembler::kortestdl(KRegister src1, KRegister src2) {
-  NOT_LP64(assert(VM_Version::supports_avx512bw(), ""));
+  assert(VM_Version::supports_avx512bw(), "");
   InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
-  int encode = kreg_prefix_and_encode(src1, knoreg, src2, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(src1->encoding(), 0, src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0x98);
   emit_int8((unsigned char)(0xC0 | encode));
 }

 // This instruction produces ZF or CF flags
 void Assembler::kortestql(KRegister src1, KRegister src2) {
-  NOT_LP64(assert(VM_Version::supports_avx512bw(), ""));
+  assert(VM_Version::supports_avx512bw(), "");
   InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
-  int encode = kreg_prefix_and_encode(src1, knoreg, src2, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  int encode = vex_prefix_and_encode(src1->encoding(), 0, src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0x98);
   emit_int8((unsigned char)(0xC0 | encode));
 }
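kortest ORs two mask registers and writes only flags: ZF when the OR is all zeroes, CF when it is all ones, exactly what a vectorized search loop wants to branch on. Hedged sketch of a typical consumer (Label found assumed to exist):

    __ kortestql(k2, k2);               // OR k2 with itself; sets ZF/CF only
    __ jcc(Assembler::notZero, found);  // some mask bit set => some lane matched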
@@ -2375,7 +2413,7 @@ void Assembler::vmovdqu(Address dst, XMMRegister src) {
 // Move Unaligned EVEX enabled Vector (programmable : 8,16,32,64)
 void Assembler::evmovdqub(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x6F);
   emit_int8((unsigned char)(0xC0 | encode));
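The recurring one-bit change in this and the following hunks is the fourth InstructionAttr argument: no_mask_reg flips from false to true for instructions that never take an EVEX merge mask, which appears to let the encoder treat the k-register (aaa) field as unused. The parameter order, as the inline comments throughout the file label it:

    InstructionAttr attributes(vector_len,        // AVX_128bit .. AVX_512bit
                               /* vex_w */ false, // VEX/EVEX.W bit
                               /* legacy_mode */ _legacy_mode_bw,  // drop to VEX when AVX512BW is absent
                               /* no_mask_reg */ true,             // no k-register merge masking
                               /* uses_vl */ true);                // participates in vector-length encoding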
@@ -2395,7 +2433,7 @@ void Assembler::evmovdqub(Address dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   assert(src != xnoreg, "sanity");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   vex_prefix(dst, 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x7F);
@@ -2404,7 +2442,7 @@ void Assembler::evmovdqub(Address dst, XMMRegister src, int vector_len) {

 void Assembler::evmovdquw(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
-  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x6F);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -2424,7 +2462,7 @@ void Assembler::evmovdquw(Address dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   assert(src != xnoreg, "sanity");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   vex_prefix(dst, 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x7F);
@@ -3069,7 +3107,7 @@ void Assembler::packuswb(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit);
   simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x67);
@@ -3078,7 +3116,7 @@ void Assembler::packuswb(XMMRegister dst, Address src) {

 void Assembler::packuswb(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x67);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -3086,7 +3124,7 @@ void Assembler::packuswb(XMMRegister dst, XMMRegister src) {

 void Assembler::vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "some form of AVX must be enabled");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x67);
@@ -3128,7 +3166,7 @@ void Assembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {

 // In this context, the dst vector contains the components that are equal, non equal components are zeroed in dst
 void Assembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
-  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  assert(VM_Version::supports_sse2(), "");
   InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x74);
@@ -3148,16 +3186,28 @@ void Assembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int
 // In this context, kdst is written the mask used to process the equal components
 void Assembler::evpcmpeqb(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx512bw(), "");
-  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(kdst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x74);
   emit_int8((unsigned char)(0xC0 | encode));
 }

+void Assembler::evpcmpeqb(KRegister kdst, XMMRegister nds, Address src, int vector_len) {
+  assert(VM_Version::supports_avx512bw(), "");
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
+  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
+  int dst_enc = kdst->encoding();
+  vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8(0x74);
+  emit_operand(as_Register(dst_enc), src);
+}
+
 // In this context, the dst vector contains the components that are equal, non equal components are zeroed in dst
 void Assembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
-  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  assert(VM_Version::supports_sse2(), "");
   InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x75);
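The new Address overload differs from the register form in how ModRM is produced: register forms close with emit_int8(0xC0 | encode) (mod = 11, register direct), while the memory form calls emit_operand() to emit ModRM/SIB/displacement, with EVEX_FVM recording the full-vector-memory tuple. Hedged usage sketch (Label differ assumed):

    __ evpcmpeqb(k1, xmm1, Address(rdi, 0), Assembler::AVX_512bit); // compare 64 bytes from memory
    __ kortestql(k1, k1);
    __ jcc(Assembler::carryClear, differ); // CF==1 only if all 64 lanes compared equal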
@@ -3177,16 +3227,28 @@ void Assembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int
 // In this context, kdst is written the mask used to process the equal components
 void Assembler::evpcmpeqw(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx512bw(), "");
-  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(kdst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x75);
   emit_int8((unsigned char)(0xC0 | encode));
 }

+void Assembler::evpcmpeqw(KRegister kdst, XMMRegister nds, Address src, int vector_len) {
+  assert(VM_Version::supports_avx512bw(), "");
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
+  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
+  int dst_enc = kdst->encoding();
+  vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8(0x75);
+  emit_operand(as_Register(dst_enc), src);
+}
+
 // In this context, the dst vector contains the components that are equal, non equal components are zeroed in dst
 void Assembler::pcmpeqd(XMMRegister dst, XMMRegister src) {
-  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  assert(VM_Version::supports_sse2(), "");
   InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x76);
@@ -3213,9 +3275,21 @@ void Assembler::evpcmpeqd(KRegister kdst, XMMRegister nds, XMMRegister src, int
   emit_int8((unsigned char)(0xC0 | encode));
 }

+void Assembler::evpcmpeqd(KRegister kdst, XMMRegister nds, Address src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit);
+  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
+  int dst_enc = kdst->encoding();
+  vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8(0x76);
+  emit_operand(as_Register(dst_enc), src);
+}
+
 // In this context, the dst vector contains the components that are equal, non equal components are zeroed in dst
 void Assembler::pcmpeqq(XMMRegister dst, XMMRegister src) {
-  NOT_LP64(assert(VM_Version::supports_sse4_1(), ""));
+  assert(VM_Version::supports_sse4_1(), "");
   InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x29);
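Note the evpcmpeqd variant uses tuple EVEX_FV with EVEX_32bit instead of EVEX_FVM/EVEX_NObit: the tuple and element size feed AVX-512 compressed disp8*N addressing. Roughly, a full-vector 512-bit access has N = 64, so

    // Address(rsi, 128) -> compressed disp8 = 128 / 64 = 2  (one byte, not four)

while the recorded 32-bit element size is what lets broadcast forms shrink N to 4.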
@@ -3328,7 +3402,7 @@ void Assembler::pinsrw(XMMRegister dst, Register src, int imm8) {
 void Assembler::pmovzxbw(XMMRegister dst, Address src) {
   assert(VM_Version::supports_sse4_1(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_HVM, /* input_size_in_bits */ EVEX_NObit);
   simd_prefix(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x30);
@@ -3337,7 +3411,7 @@ void Assembler::pmovzxbw(XMMRegister dst, Address src) {

 void Assembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_sse4_1(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x30);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -3347,7 +3421,7 @@ void Assembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionMark im(this);
   assert(dst != xnoreg, "sanity");
-  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_HVM, /* input_size_in_bits */ EVEX_NObit);
   vex_prefix(src, 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x30);
@@ -3452,7 +3526,7 @@ void Assembler::prefix(Prefix p) {

 void Assembler::pshufb(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_ssse3(), "");
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x00);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -3461,7 +3535,7 @@ void Assembler::pshufb(XMMRegister dst, XMMRegister src) {
 void Assembler::pshufb(XMMRegister dst, Address src) {
   assert(VM_Version::supports_ssse3(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x00);
@@ -3495,7 +3569,7 @@ void Assembler::pshufd(XMMRegister dst, Address src, int mode) {
 void Assembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
   assert(isByte(mode), "invalid value");
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x70);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -3507,7 +3581,7 @@ void Assembler::pshuflw(XMMRegister dst, Address src, int mode) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   simd_prefix(dst, xnoreg, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x70);
@@ -4723,7 +4797,7 @@ void Assembler::vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int v

 void Assembler::paddb(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xFC);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4731,7 +4805,7 @@ void Assembler::paddb(XMMRegister dst, XMMRegister src) {

 void Assembler::paddw(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xFD);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4771,7 +4845,7 @@ void Assembler::phaddd(XMMRegister dst, XMMRegister src) {

 void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xFC);
@@ -4780,7 +4854,7 @@ void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int ve

 void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xFD);
@@ -4808,7 +4882,7 @@ void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, int ve
 void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
@@ -4819,7 +4893,7 @@ void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector
 void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
@@ -4851,7 +4925,7 @@ void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, Address src, int vector

 void Assembler::psubb(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xF8);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4859,7 +4933,7 @@ void Assembler::psubb(XMMRegister dst, XMMRegister src) {

 void Assembler::psubw(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xF9);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4882,7 +4956,7 @@ void Assembler::psubq(XMMRegister dst, XMMRegister src) {

 void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xF8);
@@ -4891,7 +4965,7 @@ void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int ve

 void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xF9);
@@ -4919,7 +4993,7 @@ void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, int ve
 void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
@@ -4930,7 +5004,7 @@ void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector
 void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
@@ -4962,7 +5036,7 @@ void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, Address src, int vector

 void Assembler::pmullw(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xD5);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4978,7 +5052,7 @@ void Assembler::pmulld(XMMRegister dst, XMMRegister src) {

 void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xD5);
@@ -5006,7 +5080,7 @@ void Assembler::vpmullq(XMMRegister dst, XMMRegister nds, XMMRegister src, int v
 void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
@@ -5039,7 +5113,7 @@ void Assembler::vpmullq(XMMRegister dst, XMMRegister nds, Address src, int vecto
 // Shift packed integers left by specified number of bits.
 void Assembler::psllw(XMMRegister dst, int shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   // XMM6 is for /6 encoding: 66 0F 71 /6 ib
   int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x71);
@@ -5069,7 +5143,7 @@ void Assembler::psllq(XMMRegister dst, int shift) {
 
 void Assembler::psllw(XMMRegister dst, XMMRegister shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, shift, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xF1);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5093,7 +5167,7 @@ void Assembler::psllq(XMMRegister dst, XMMRegister shift) {
 
 void Assembler::vpsllw(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   // XMM6 is for /6 encoding: 66 0F 71 /6 ib
   int encode = vex_prefix_and_encode(xmm6->encoding(), dst->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x71);
@@ -5124,7 +5198,7 @@ void Assembler::vpsllq(XMMRegister dst, XMMRegister src, int shift, int vector_l
 
 void Assembler::vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), src->encoding(), shift->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xF1);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5149,7 +5223,7 @@ void Assembler::vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, int
 // Shift packed integers logically right by specified number of bits.
 void Assembler::psrlw(XMMRegister dst, int shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   // XMM2 is for /2 encoding: 66 0F 71 /2 ib
   int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x71);
@@ -5181,7 +5255,7 @@ void Assembler::psrlq(XMMRegister dst, int shift) {
 
 void Assembler::psrlw(XMMRegister dst, XMMRegister shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, shift, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xD1);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5205,7 +5279,7 @@ void Assembler::psrlq(XMMRegister dst, XMMRegister shift) {
 
 void Assembler::vpsrlw(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   // XMM2 is for /2 encoding: 66 0F 71 /2 ib
   int encode = vex_prefix_and_encode(xmm2->encoding(), dst->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x71);
@@ -5235,7 +5309,7 @@ void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, int shift, int vector_l
 
 void Assembler::vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), src->encoding(), shift->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xD1);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5260,7 +5334,7 @@ void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, int
 // Shift packed integers arithmetically right by specified number of bits.
 void Assembler::psraw(XMMRegister dst, int shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   // XMM4 is for /4 encoding: 66 0F 71 /4 ib
   int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x71);
@@ -5280,7 +5354,7 @@ void Assembler::psrad(XMMRegister dst, int shift) {
 
 void Assembler::psraw(XMMRegister dst, XMMRegister shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, dst, shift, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xE1);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5296,7 +5370,7 @@ void Assembler::psrad(XMMRegister dst, XMMRegister shift) {
 
 void Assembler::vpsraw(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   // XMM4 is for /4 encoding: 66 0F 71 /4 ib
   int encode = vex_prefix_and_encode(xmm4->encoding(), dst->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8(0x71);
@@ -5316,7 +5390,7 @@ void Assembler::vpsrad(XMMRegister dst, XMMRegister src, int shift, int vector_l
 
 void Assembler::vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), src->encoding(), shift->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
   emit_int8((unsigned char)0xE1);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5706,7 +5780,7 @@ void Assembler::vpbroadcastd(XMMRegister dst, XMMRegister src) {
 // duplicate 2-bytes integer data from src into 16 locations in dest
 void Assembler::vpbroadcastw(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_avx2(), "");
-  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x79);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -6573,18 +6647,6 @@ int Assembler::simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegis
   }
 }
 
-int Assembler::kreg_prefix_and_encode(KRegister dst, KRegister nds, KRegister src, VexSimdPrefix pre,
-                                      VexOpcode opc, InstructionAttr *attributes) {
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  return vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), pre, opc, attributes);
-}
-
-int Assembler::kreg_prefix_and_encode(KRegister dst, KRegister nds, Register src, VexSimdPrefix pre,
-                                      VexOpcode opc, InstructionAttr *attributes) {
-  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  return vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), pre, opc, attributes);
-}
-
 void Assembler::cmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   assert(!VM_Version::supports_evex(), "");
@@ -655,12 +655,6 @@ private:
   int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src, VexSimdPrefix pre,
                              VexOpcode opc, InstructionAttr *attributes);
 
-  int kreg_prefix_and_encode(KRegister dst, KRegister nds, KRegister src, VexSimdPrefix pre,
-                             VexOpcode opc, InstructionAttr *attributes);
-
-  int kreg_prefix_and_encode(KRegister dst, KRegister nds, Register src, VexSimdPrefix pre,
-                             VexOpcode opc, InstructionAttr *attributes);
-
   // Helper functions for groups of instructions
   void emit_arith_b(int op1, int op2, Register dst, int imm8);
 
@@ -1331,12 +1325,17 @@ private:
 
   void movddup(XMMRegister dst, XMMRegister src);
 
+  void kmovbl(KRegister dst, Register src);
+  void kmovbl(Register dst, KRegister src);
   void kmovwl(KRegister dst, Register src);
+  void kmovwl(Register dst, KRegister src);
   void kmovdl(KRegister dst, Register src);
+  void kmovdl(Register dst, KRegister src);
   void kmovql(KRegister dst, KRegister src);
-  void kmovql(KRegister dst, Register src);
   void kmovql(Address dst, KRegister src);
   void kmovql(KRegister dst, Address src);
+  void kmovql(KRegister dst, Register src);
+  void kmovql(Register dst, KRegister src);
 
   void kortestbl(KRegister dst, KRegister src);
   void kortestwl(KRegister dst, KRegister src);
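Note on the mask-register moves added above: each width transfers a different number of opmask bits (the b/w/d/q forms move 8/16/32/64 bits), and the widths have different ISA requirements. To my understanding (from the Intel SDM, not this patch), kmovw needs only baseline AVX-512F while the dword/qword GPR forms need AVX512BW, which is why later hunks switch the k1 setup in the stubs to kmovwl. A minimal standalone sketch of the lane arithmetic behind those widths (my names, illustrative only):

    #include <cstdint>

    // One opmask bit per vector lane: a 512-bit vector has 16 dword lanes,
    // so a 16-bit mask move (kmovw) covers them; 64 byte lanes need kmovq.
    static_assert(512 / 32 == 16, "full dword-lane mask fits in 16 bits");
    static_assert(512 / 8  == 64, "full byte-lane mask needs 64 bits");
    constexpr uint16_t full_dword_mask = 0xffff;  // the constant the stubs load into k1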
@@ -1521,14 +1520,17 @@ private:
   void pcmpeqb(XMMRegister dst, XMMRegister src);
   void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void evpcmpeqb(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
+  void evpcmpeqb(KRegister kdst, XMMRegister nds, Address src, int vector_len);
 
   void pcmpeqw(XMMRegister dst, XMMRegister src);
   void vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void evpcmpeqw(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
+  void evpcmpeqw(KRegister kdst, XMMRegister nds, Address src, int vector_len);
 
   void pcmpeqd(XMMRegister dst, XMMRegister src);
   void vpcmpeqd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void evpcmpeqd(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
+  void evpcmpeqd(KRegister kdst, XMMRegister nds, Address src, int vector_len);
 
   void pcmpeqq(XMMRegister dst, XMMRegister src);
   void vpcmpeqq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
@@ -27,6 +27,10 @@
 
 const int StackAlignmentInBytes = 16;
 
+// Indicates whether the C calling conventions require that
+// 32-bit integer argument values are extended to 64 bits.
+const bool CCallingConventionRequiresIntsAsLongs = false;
+
 #define SUPPORTS_NATIVE_CX8
 
 // The expected size in bytes of a cache line, used to pad data structures.
@@ -7999,9 +7999,15 @@ void MacroAssembler::string_compare(Register str1, Register str2,
                                     XMMRegister vec1, int ae) {
   ShortBranchVerifier sbv(this);
   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
+  Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
+  int stride2x2 = 0x40;
   Address::ScaleFactor scale, scale1, scale2;
 
+  if (ae != StrIntrinsicNode::LL) {
+    stride2x2 = 0x20;
+  }
+
   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
     shrl(cnt2, 1);
  }
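A quick note on the new stride2x2 variable: it counts characters consumed per iteration of the AVX-512 fast loop added further down, and each iteration always touches 64 bytes of the wider string. A hedged sketch of that arithmetic (standalone C++, my names, not from the patch):

    // Latin-1 (LL): 64 one-byte chars per 64-byte step; UTF-16 paths: 32 two-byte chars.
    constexpr int bytes_per_step = 64;
    constexpr int stride2x2_ll = bytes_per_step / 1;  // 0x40
    constexpr int stride2x2_u  = bytes_per_step / 2;  // 0x20
    static_assert(stride2x2_ll == 0x40 && stride2x2_u == 0x20, "matches the values set above");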
@@ -8011,15 +8017,15 @@ void MacroAssembler::string_compare(Register str1, Register str2,
   movl(result, cnt1);
   subl(cnt1, cnt2);
   push(cnt1);
-  cmov32(Assembler::lessEqual, cnt2, result);
+  cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
 
   // Is the minimum length zero?
   testl(cnt2, cnt2);
   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
   if (ae == StrIntrinsicNode::LL) {
     // Load first bytes
-    load_unsigned_byte(result, Address(str1, 0));
-    load_unsigned_byte(cnt1, Address(str2, 0));
+    load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
+    load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1 = str2[0]
   } else if (ae == StrIntrinsicNode::UU) {
     // Load first characters
     load_unsigned_short(result, Address(str1, 0));
@@ -8060,7 +8066,10 @@ void MacroAssembler::string_compare(Register str1, Register str2,
     assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
+    Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
     Label COMPARE_TAIL_LONG;
+    Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
+
     int pcmpmask = 0x19;
     if (ae == StrIntrinsicNode::LL) {
       pcmpmask &= ~0x01;
@@ -8123,11 +8132,40 @@ void MacroAssembler::string_compare(Register str1, Register str2,
     }
     subl(result, stride2);
     subl(cnt2, stride2);
-    jccb(Assembler::zero, COMPARE_WIDE_TAIL);
+    jcc(Assembler::zero, COMPARE_WIDE_TAIL);
     negptr(result);
 
     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
     bind(COMPARE_WIDE_VECTORS_LOOP);
+
+#ifdef _LP64
+    if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
+      cmpl(cnt2, stride2x2);
+      jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
+      testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
+      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
+
+      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
+      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
+        evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
+        evpcmpeqb(k7, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
+      } else {
+        vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
+        evpcmpeqb(k7, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
+      }
+      kortestql(k7, k7);
+      jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
+      addptr(result, stride2x2);  // update since we already compared at this addr
+      subl(cnt2, stride2x2);      // and sub the size too
+      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
+
+      vpxor(vec1, vec1);
+      jmpb(COMPARE_WIDE_TAIL);
+    }//if (VM_Version::supports_avx512vlbw())
+#endif // _LP64
+
+
+    bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
       vmovdqu(vec1, Address(str1, result, scale));
       vpxor(vec1, Address(str2, result, scale));
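For readers tracing the new fast loop: evpcmpeqb writes one bit per byte lane into k7 (1 means the lanes are equal), and kortestql sets the carry flag only when the mask is all ones; jcc(aboveEqual, ...) is taken when CF is clear, i.e. on the first 64-byte block that contains a difference. A hedged scalar model of the comparison step (standalone C++, my names, not part of the patch):

    #include <cstdint>

    // The equality mask an evpcmpeqb over 64 byte lanes produces.
    static uint64_t eq_mask_64(const uint8_t* a, const uint8_t* b) {
      uint64_t mask = 0;
      for (int lane = 0; lane < 64; lane++) {
        mask |= (uint64_t)(a[lane] == b[lane]) << lane;
      }
      return mask;  // kortestql(k7, k7) sets CF iff mask == ~0ULL
    }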
@@ -8136,7 +8174,7 @@ void MacroAssembler::string_compare(Register str1, Register str2,
       vpxor(vec1, Address(str2, result, scale2));
     }
     vptest(vec1, vec1);
-    jccb(Assembler::notZero, VECTOR_NOT_EQUAL);
+    jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
     addptr(result, stride2);
     subl(cnt2, stride2);
     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
@@ -8151,7 +8189,7 @@ void MacroAssembler::string_compare(Register str1, Register str2,
     movl(result, stride2);
     movl(cnt2, result);
     negptr(result);
-    jmpb(COMPARE_WIDE_VECTORS_LOOP);
+    jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
 
     // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
     bind(VECTOR_NOT_EQUAL);
@@ -8295,6 +8333,34 @@ void MacroAssembler::string_compare(Register str1, Register str2,
     }
     jmpb(DONE_LABEL);
 
+#ifdef _LP64
+    if (VM_Version::supports_avx512vlbw()) {
+
+      bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
+
+      kmovql(cnt1, k7);
+      notq(cnt1);
+      bsfq(cnt2, cnt1);
+      if (ae != StrIntrinsicNode::LL) {
+        // Divide diff by 2 to get number of chars
+        sarl(cnt2, 1);
+      }
+      addq(result, cnt2);
+      if (ae == StrIntrinsicNode::LL) {
+        load_unsigned_byte(cnt1, Address(str2, result));
+        load_unsigned_byte(result, Address(str1, result));
+      } else if (ae == StrIntrinsicNode::UU) {
+        load_unsigned_short(cnt1, Address(str2, result, scale));
+        load_unsigned_short(result, Address(str1, result, scale));
+      } else {
+        load_unsigned_short(cnt1, Address(str2, result, scale2));
+        load_unsigned_byte(result, Address(str1, result, scale1));
+      }
+      subl(result, cnt1);
+      jmpb(POP_LABEL);
+    }//if (VM_Version::supports_avx512vlbw())
+#endif // _LP64
+
     // Discard the stored length difference
     bind(POP_LABEL);
     pop(cnt1);
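The failure path above turns the mask into a character index: kmovql copies k7 into a GPR, notq flips equal-bits into mismatch-bits, bsfq finds the lowest set bit, and for two-byte chars the byte lane is halved (the sarl(cnt2, 1)). A hedged standalone model (uses a GCC/Clang builtin; the loop guarantees at least one mismatch, so the inverted mask is never zero):

    #include <cstdint>

    static int first_mismatch_index(uint64_t eq_mask, bool two_byte_chars) {
      uint64_t ne_mask = ~eq_mask;               // notq(cnt1)
      int byte_lane = __builtin_ctzll(ne_mask);  // bsfq(cnt2, cnt1)
      return two_byte_chars ? byte_lane >> 1 : byte_lane;
    }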
@@ -8304,6 +8370,7 @@ void MacroAssembler::string_compare(Register str1, Register str2,
   if(ae == StrIntrinsicNode::UL) {
     negl(result);
   }
+
 }
 
 // Search for Non-ASCII character (Negative byte value) in a byte array,
@@ -9439,13 +9506,184 @@ void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Regi
   pop(tmp1);
 }
 
+void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
+  Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
+  assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
+  Label VECTOR32_LOOP, VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
+  Label VECTOR16_TAIL, VECTOR8_TAIL, VECTOR4_TAIL;
+  Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
+  Label SAME_TILL_END, DONE;
+  Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;
+
+  //scale is in rcx in both Win64 and Unix
+  ShortBranchVerifier sbv(this);
+
+  shlq(length);
+  xorq(result, result);
+
+  cmpq(length, 8);
+  jcc(Assembler::equal, VECTOR8_LOOP);
+  jcc(Assembler::less, VECTOR4_TAIL);
+
+  if (UseAVX >= 2){
+
+    cmpq(length, 16);
+    jcc(Assembler::equal, VECTOR16_LOOP);
+    jcc(Assembler::less, VECTOR8_LOOP);
+
+    cmpq(length, 32);
+    jccb(Assembler::less, VECTOR16_TAIL);
+
+    subq(length, 32);
+    bind(VECTOR32_LOOP);
+    vmovdqu(rymm0, Address(obja, result));
+    vmovdqu(rymm1, Address(objb, result));
+    vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
+    vptest(rymm2, rymm2);
+    jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
+    addq(result, 32);
+    subq(length, 32);
+    jccb(Assembler::greaterEqual, VECTOR32_LOOP);
+    addq(length, 32);
+    jcc(Assembler::equal, SAME_TILL_END);
+    //falling through if less than 32 bytes left //close the branch here.
+
+    bind(VECTOR16_TAIL);
+    cmpq(length, 16);
+    jccb(Assembler::less, VECTOR8_TAIL);
+    bind(VECTOR16_LOOP);
+    movdqu(rymm0, Address(obja, result));
+    movdqu(rymm1, Address(objb, result));
+    vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
+    ptest(rymm2, rymm2);
+    jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
+    addq(result, 16);
+    subq(length, 16);
+    jcc(Assembler::equal, SAME_TILL_END);
+    //falling through if less than 16 bytes left
+  } else {//regular intrinsics
+
+    cmpq(length, 16);
+    jccb(Assembler::less, VECTOR8_TAIL);
+
+    subq(length, 16);
+    bind(VECTOR16_LOOP);
+    movdqu(rymm0, Address(obja, result));
+    movdqu(rymm1, Address(objb, result));
+    pxor(rymm0, rymm1);
+    ptest(rymm0, rymm0);
+    jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
+    addq(result, 16);
+    subq(length, 16);
+    jccb(Assembler::greaterEqual, VECTOR16_LOOP);
+    addq(length, 16);
+    jcc(Assembler::equal, SAME_TILL_END);
+    //falling through if less than 16 bytes left
+  }
+
+  bind(VECTOR8_TAIL);
+  cmpq(length, 8);
+  jccb(Assembler::less, VECTOR4_TAIL);
+  bind(VECTOR8_LOOP);
+  movq(tmp1, Address(obja, result));
+  movq(tmp2, Address(objb, result));
+  xorq(tmp1, tmp2);
+  testq(tmp1, tmp1);
+  jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found
+  addq(result, 8);
+  subq(length, 8);
+  jcc(Assembler::equal, SAME_TILL_END);
+  //falling through if less than 8 bytes left
+
+  bind(VECTOR4_TAIL);
+  cmpq(length, 4);
+  jccb(Assembler::less, BYTES_TAIL);
+  bind(VECTOR4_LOOP);
+  movl(tmp1, Address(obja, result));
+  xorl(tmp1, Address(objb, result));
+  testl(tmp1, tmp1);
+  jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found
+  addq(result, 4);
+  subq(length, 4);
+  jcc(Assembler::equal, SAME_TILL_END);
+  //falling through if less than 4 bytes left
+
+  bind(BYTES_TAIL);
+  bind(BYTES_LOOP);
+  load_unsigned_byte(tmp1, Address(obja, result));
+  load_unsigned_byte(tmp2, Address(objb, result));
+  xorl(tmp1, tmp2);
+  testl(tmp1, tmp1);
+  jccb(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
+  decq(length);
+  jccb(Assembler::zero, SAME_TILL_END);
+  incq(result);
+  load_unsigned_byte(tmp1, Address(obja, result));
+  load_unsigned_byte(tmp2, Address(objb, result));
+  xorl(tmp1, tmp2);
+  testl(tmp1, tmp1);
+  jccb(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
+  decq(length);
+  jccb(Assembler::zero, SAME_TILL_END);
+  incq(result);
+  load_unsigned_byte(tmp1, Address(obja, result));
+  load_unsigned_byte(tmp2, Address(objb, result));
+  xorl(tmp1, tmp2);
+  testl(tmp1, tmp1);
+  jccb(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
+  jmpb(SAME_TILL_END);
+
+  if (UseAVX >= 2){
+    bind(VECTOR32_NOT_EQUAL);
+    vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
+    vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
+    vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
+    vpmovmskb(tmp1, rymm0);
+    bsfq(tmp1, tmp1);
+    addq(result, tmp1);
+    shrq(result);
+    jmpb(DONE);
+  }
+
+  bind(VECTOR16_NOT_EQUAL);
+  if (UseAVX >= 2){
+    vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
+    vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
+    pxor(rymm0, rymm2);
+  } else {
+    pcmpeqb(rymm2, rymm2);
+    pxor(rymm0, rymm1);
+    pcmpeqb(rymm0, rymm1);
+    pxor(rymm0, rymm2);
+  }
+  pmovmskb(tmp1, rymm0);
+  bsfq(tmp1, tmp1);
+  addq(result, tmp1);
+  shrq(result);
+  jmpb(DONE);
+
+  bind(VECTOR8_NOT_EQUAL);
+  bind(VECTOR4_NOT_EQUAL);
+  bsfq(tmp1, tmp1);
+  shrq(tmp1, 3);
+  addq(result, tmp1);
+  bind(BYTES_NOT_EQUAL);
+  shrq(result);
+  jmpb(DONE);
+
+  bind(SAME_TILL_END);
+  mov64(result, -1);
+
+  bind(DONE);
+}
+
+
 //Helper functions for square_to_len()
 
 /**
  * Store the squares of x[], right shifted one bit (divided by 2) into z[]
  * Preserves x and z and modifies rest of the registers.
  */
 
 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
   // Perform square and right shift by 1
   // Handle odd xlen case first, then for even xlen do the following
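A hedged reference model of what the new vectorized_mismatch computes, useful when auditing the tail handling above: length is an element count and log2_array_indxscale is log2 of the element size, so the byte count is length << scale (the shlq(length) at entry); a mismatching byte index is scaled back to elements by shrq(result), and SAME_TILL_END yields -1. Standalone C++, my names, not part of the patch:

    #include <cstdint>

    static int64_t mismatch_model(const uint8_t* a, const uint8_t* b,
                                  int64_t length, unsigned log2_scale) {
      int64_t bytes = length << log2_scale;   // shlq(length)
      for (int64_t i = 0; i < bytes; i++) {
        if (a[i] != b[i]) {
          return i >> log2_scale;             // shrq(result): bytes back to elements
        }
      }
      return -1;                              // SAME_TILL_END: mov64(result, -1)
    }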
@@ -1346,7 +1346,6 @@ public:
                        Register carry2);
   void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
                        Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5);
-
   void square_rshift(Register x, Register len, Register z, Register tmp1, Register tmp3,
                      Register tmp4, Register tmp5, Register rdxReg, Register raxReg);
   void multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry,
@@ -1365,6 +1364,9 @@ public:
   void mul_add(Register out, Register in, Register offset, Register len, Register k, Register tmp1,
                Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
                Register raxReg);
+  void vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
+                           Register result, Register tmp1, Register tmp2,
+                           XMMRegister vec1, XMMRegister vec2, XMMRegister vec3);
 #endif
 
   // CRC32 code for java.util.zip.CRC32::updateBytes() intrinsic.
@@ -189,7 +189,7 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
   }
   // Save full ZMM registes(16..num_xmm_regs)
   base_addr = XSAVE_AREA_UPPERBANK;
-  int off = 0;
+  off = 0;
   int vector_len = Assembler::AVX_512bit;
   for (int n = 16; n < num_xmm_regs; n++) {
     __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
@@ -199,7 +199,7 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
   if (VM_Version::supports_evex()) {
     // Save upper bank of ZMM registers(16..31) for double/float usage
     int base_addr = XSAVE_AREA_UPPERBANK;
-    int off = 0;
+    off = 0;
     for (int n = 16; n < num_xmm_regs; n++) {
       __ movsd(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n));
     }
@@ -325,7 +325,7 @@ void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_ve
     assert(MaxVectorSize == 64, "only 512bit vectors are supported now");
   }
 #else
-  assert(!save_vectors, "vectors are generated only by C2");
+  assert(!restore_vectors, "vectors are generated only by C2");
 #endif
 
   // On EVEX enabled targets everything is handled in pop fpu state
@@ -170,7 +170,7 @@ class StubGenerator: public StubCodeGenerator {
     // provide initial value for required masks
     if (UseAVX > 2) {
       __ movl(rbx, 0xffff);
-      __ kmovdl(k1, rbx);
+      __ kmovwl(k1, rbx);
     }
 
     // save and initialize %mxcsr
@@ -798,7 +798,7 @@ class StubGenerator: public StubCodeGenerator {
     if (UseAVX > 2) {
       __ push(rbx);
       __ movl(rbx, 0xffff);
-      __ kmovdl(k1, rbx);
+      __ kmovwl(k1, rbx);
       __ pop(rbx);
     }
     // Copy 64-byte chunks
@@ -266,7 +266,7 @@ class StubGenerator: public StubCodeGenerator {
     __ movptr(r15_save, r15);
     if (UseAVX > 2) {
      __ movl(rbx, 0xffff);
-      __ kmovql(k1, rbx);
+      __ kmovwl(k1, rbx);
     }
 #ifdef _WIN64
     int last_reg = 15;
@@ -1350,7 +1350,7 @@ class StubGenerator: public StubCodeGenerator {
     Label L_end;
     if (UseAVX > 2) {
       __ movl(to, 0xffff);
-      __ kmovql(k1, to);
+      __ kmovwl(k1, to);
     }
     // Copy 64-bytes per iteration
     __ BIND(L_loop);
@@ -1434,7 +1434,7 @@ class StubGenerator: public StubCodeGenerator {
     Label L_end;
     if (UseAVX > 2) {
       __ movl(to, 0xffff);
-      __ kmovql(k1, to);
+      __ kmovwl(k1, to);
     }
     // Copy 64-bytes per iteration
     __ BIND(L_loop);
@@ -4054,6 +4054,54 @@ class StubGenerator: public StubCodeGenerator {
     return start;
   }
 
+  /**
+   *  Arguments:
+   *
+   *  Input:
+   *    c_rarg0   - obja     address
+   *    c_rarg1   - objb     address
+   *    c_rarg3   - length   length
+   *    c_rarg4   - scale    log2_array_indxscale
+   */
+  address generate_vectorizedMismatch() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "vectorizedMismatch");
+    address start = __ pc();
+
+    BLOCK_COMMENT("Entry:");
+    __ enter();
+
+#ifdef _WIN64  // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
+    const Register scale = c_rarg0;  //rcx, will exchange with r9
+    const Register objb = c_rarg1;   //rdx
+    const Register length = c_rarg2; //r8
+    const Register obja = c_rarg3;   //r9
+    __ xchgq(obja, scale);  //now obja and scale contains the correct contents
+
+    const Register tmp1 = r10;
+    const Register tmp2 = r11;
+#endif
+#ifndef _WIN64 // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
+    const Register obja = c_rarg0;   //U:rdi
+    const Register objb = c_rarg1;   //U:rsi
+    const Register length = c_rarg2; //U:rdx
+    const Register scale = c_rarg3;  //U:rcx
+    const Register tmp1 = r8;
+    const Register tmp2 = r9;
+#endif
+    const Register result = rax; //return value
+    const XMMRegister vec0 = xmm0;
+    const XMMRegister vec1 = xmm1;
+    const XMMRegister vec2 = xmm2;
+
+    __ vectorized_mismatch(obja, objb, length, scale, result, tmp1, tmp2, vec0, vec1, vec2);
+
+    __ leave();
+    __ ret(0);
+
+    return start;
+  }
+
 /**
  *  Arguments:
  *
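Why the Win64 xchgq in the stub above: vectorized_mismatch computes the byte count with variable shifts (shlq/shrq), and x86 variable shifts take their count in CL, so scale has to end up in rcx. On System V the fourth argument already arrives in rcx; on Win64 the first one does, hence obja and scale are swapped into place. A one-line hedged model of the shift semantics (the hardware masks 64-bit shift counts to 6 bits):

    #include <cstdint>

    static uint64_t shl_by_cl(uint64_t value, uint8_t cl) {
      return value << (cl & 63);  // what shlq(reg) does with the count taken from CL
    }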
@@ -4505,7 +4553,9 @@ class StubGenerator: public StubCodeGenerator {
     if (UseMulAddIntrinsic) {
       StubRoutines::_mulAdd = generate_mulAdd();
     }
+    if (UseVectorizedMismatchIntrinsic) {
+      StubRoutines::_vectorizedMismatch = generate_vectorizedMismatch();
+    }
 #ifndef _WINDOWS
     if (UseMontgomeryMultiplyIntrinsic) {
       StubRoutines::_montgomeryMultiply
@@ -1039,6 +1039,25 @@ void VM_Version::get_processor_features() {
     }
   }
 
+#ifdef _LP64
+  if (UseSSE42Intrinsics) {
+    if (FLAG_IS_DEFAULT(UseVectorizedMismatchIntrinsic)) {
+      UseVectorizedMismatchIntrinsic = true;
+    }
+  } else if (UseVectorizedMismatchIntrinsic) {
+    if (!FLAG_IS_DEFAULT(UseVectorizedMismatchIntrinsic))
+      warning("vectorizedMismatch intrinsics are not available on this CPU");
+    FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false);
+  }
+#else
+  if (UseVectorizedMismatchIntrinsic) {
+    if (!FLAG_IS_DEFAULT(UseVectorizedMismatchIntrinsic)) {
+      warning("vectorizedMismatch intrinsic is not available in 32-bit VM");
+    }
+    FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false);
+  }
+#endif // _LP64
+
   // Use count leading zeros count instruction if available.
   if (supports_lzcnt()) {
     if (FLAG_IS_DEFAULT(UseCountLeadingZerosInstruction)) {
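The net effect of the flag handling above, as a hedged standalone model (my names): the intrinsic defaults to enabled only in a 64-bit VM with SSE4.2; in every other configuration an explicit -XX:+UseVectorizedMismatchIntrinsic draws a warning and is reset to false.

    static bool vectorized_mismatch_enabled(bool lp64, bool sse42,
                                            bool user_set, bool user_value) {
      if (lp64 && sse42) {
        return user_set ? user_value : true;  // default on; the user may still disable it
      }
      return false;                           // forced off (with a warning when user_set)
    }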
@@ -28,4 +28,8 @@
 
 #include <ffi.h>
 
+// Indicates whether the C calling conventions require that
+// 32-bit integer argument values are extended to 64 bits.
+const bool CCallingConventionRequiresIntsAsLongs = false;
+
 #endif // CPU_ZERO_VM_GLOBALDEFINITIONS_ZERO_HPP
@@ -3055,13 +3055,16 @@ void LIRGenerator::do_IfOp(IfOp* x) {
   __ cmove(lir_cond(x->cond()), t_val.result(), f_val.result(), reg, as_BasicType(x->x()->type()));
 }
 
-void LIRGenerator::do_RuntimeCall(address routine, int expected_arguments, Intrinsic* x) {
-  assert(x->number_of_arguments() == expected_arguments, "wrong type");
-  LIR_Opr reg = result_register_for(x->type());
-  __ call_runtime_leaf(routine, getThreadTemp(),
-                       reg, new LIR_OprList());
-  LIR_Opr result = rlock_result(x);
-  __ move(reg, result);
+void LIRGenerator::do_RuntimeCall(address routine, Intrinsic* x) {
+  assert(x->number_of_arguments() == 0, "wrong type");
+  // Enforce computation of _reserved_argument_area_size which is required on some platforms.
+  BasicTypeList signature;
+  CallingConvention* cc = frame_map()->c_calling_convention(&signature);
+  LIR_Opr reg = result_register_for(x->type());
+  __ call_runtime_leaf(routine, getThreadTemp(),
+                       reg, new LIR_OprList());
+  LIR_Opr result = rlock_result(x);
+  __ move(reg, result);
 }
 
 #ifdef TRACE_HAVE_INTRINSICS
@@ -3115,16 +3118,16 @@ void LIRGenerator::do_Intrinsic(Intrinsic* x) {
   case vmIntrinsics::_threadID: do_ThreadIDIntrinsic(x); break;
   case vmIntrinsics::_classID: do_ClassIDIntrinsic(x); break;
   case vmIntrinsics::_counterTime:
-    do_RuntimeCall(CAST_FROM_FN_PTR(address, TRACE_TIME_METHOD), 0, x);
+    do_RuntimeCall(CAST_FROM_FN_PTR(address, TRACE_TIME_METHOD), x);
     break;
 #endif
 
   case vmIntrinsics::_currentTimeMillis:
-    do_RuntimeCall(CAST_FROM_FN_PTR(address, os::javaTimeMillis), 0, x);
+    do_RuntimeCall(CAST_FROM_FN_PTR(address, os::javaTimeMillis), x);
     break;
 
   case vmIntrinsics::_nanoTime:
-    do_RuntimeCall(CAST_FROM_FN_PTR(address, os::javaTimeNanos), 0, x);
+    do_RuntimeCall(CAST_FROM_FN_PTR(address, os::javaTimeNanos), x);
     break;
 
   case vmIntrinsics::_Object_init:  do_RegisterFinalizer(x); break;
@@ -157,8 +157,8 @@ class LIRGenerator: public InstructionVisitor, public BlockClosure {
  private:
   void* operator new(size_t size) throw();
   void* operator new[](size_t size) throw();
-  void operator delete(void* p);
-  void operator delete[](void* p);
+  void operator delete(void* p) { ShouldNotReachHere(); }
+  void operator delete[](void* p) { ShouldNotReachHere(); }
 
   Compilation*  _compilation;
   ciMethod*     _method;    // method that we are compiling
@@ -439,7 +439,7 @@ class LIRGenerator: public InstructionVisitor, public BlockClosure {
   SwitchRangeArray* create_lookup_ranges(LookupSwitch* x);
   void do_SwitchRanges(SwitchRangeArray* x, LIR_Opr value, BlockBegin* default_sux);
 
-  void do_RuntimeCall(address routine, int expected_arguments, Intrinsic* x);
+  void do_RuntimeCall(address routine, Intrinsic* x);
 #ifdef TRACE_HAVE_INTRINSICS
   void do_ThreadIDIntrinsic(Intrinsic* x);
   void do_ClassIDIntrinsic(Intrinsic* x);
@@ -50,8 +50,8 @@ private:
  private:
   void* operator new(size_t size) throw();
   void* operator new[](size_t size) throw();
-  void operator delete(void* p);
-  void operator delete[](void* p);
+  void operator delete(void* p) { ShouldNotReachHere(); }
+  void operator delete[](void* p) { ShouldNotReachHere(); }
 
   IR *_ir;
   boolArray _used;
@@ -681,6 +681,9 @@ bool vmIntrinsics::is_disabled_by_flags(const methodHandle& method) {
   case vmIntrinsics::_montgomerySquare:
     if (!UseMontgomerySquareIntrinsic) return true;
     break;
+  case vmIntrinsics::_vectorizedMismatch:
+    if (!UseVectorizedMismatchIntrinsic) return true;
+    break;
   case vmIntrinsics::_addExactI:
   case vmIntrinsics::_addExactL:
   case vmIntrinsics::_decrementExactI:
@@ -957,6 +957,11 @@
   do_name(     montgomerySquare_name,             "implMontgomerySquare")                                              \
   do_signature(montgomerySquare_signature,        "([I[IIJ[I)[I")                                                      \
                                                                                                                        \
+  do_class(java_util_ArraysSupport, "java/util/ArraysSupport")                                                         \
+  do_intrinsic(_vectorizedMismatch, java_util_ArraysSupport, vectorizedMismatch_name, vectorizedMismatch_signature, F_S)\
+  do_name(vectorizedMismatch_name, "vectorizedMismatch")                                                               \
+  do_signature(vectorizedMismatch_signature, "(Ljava/lang/Object;JLjava/lang/Object;JII)I")                            \
+                                                                                                                       \
   /* java/lang/ref/Reference */                                                                                        \
   do_intrinsic(_Reference_get, java_lang_ref_Reference, get_name, void_object_signature, F_R)                          \
                                                                                                                        \
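For reference, the descriptor registered above, "(Ljava/lang/Object;JLjava/lang/Object;JII)I", decodes as: object, long offset, object, long offset, int length, int log2 element scale, returning int. A hedged C-style mirror of that shape (hypothetical typedef, not part of the patch):

    #include <cstdint>

    typedef int32_t (*vectorized_mismatch_fn)(void* obja, int64_t a_offset,
                                              void* objb, int64_t b_offset,
                                              int32_t length, int32_t log2_scale);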
@@ -346,7 +346,6 @@ void Dependencies::assert_common_2(DepType dept,
       }
     }
   } else {
-    assert(dep_implicit_context_arg(dept) == 0, "sanity");
     if (note_dep_seen(dept, x0) && note_dep_seen(dept, x1)) {
       // look in this bucket for redundant assertions
       const int stride = 2;
@@ -56,6 +56,7 @@
 #if INCLUDE_JVMCI
 #include "jvmci/jvmciCompiler.hpp"
 #include "jvmci/jvmciRuntime.hpp"
+#include "jvmci/jvmciJavaClasses.hpp"
 #include "runtime/vframe.hpp"
 #endif
 #ifdef COMPILER2
@@ -498,7 +499,7 @@ CompilerCounters::CompilerCounters() {
 // CompileBroker::compilation_init
 //
 // Initialize the Compilation object
-void CompileBroker::compilation_init() {
+void CompileBroker::compilation_init(TRAPS) {
   _last_method_compiled[0] = '\0';
 
   // No need to initialize compilation system if we do not use it.
@ -529,6 +530,17 @@ void CompileBroker::compilation_init() {
|
|||||||
} else {
|
} else {
|
||||||
c1_count = JVMCIHostThreads;
|
c1_count = JVMCIHostThreads;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!UseInterpreter) {
|
||||||
|
// Force initialization of JVMCI compiler otherwise JVMCI
|
||||||
|
// compilations will not block until JVMCI is initialized
|
||||||
|
ResourceMark rm;
|
||||||
|
TempNewSymbol getCompiler = SymbolTable::new_symbol("getCompiler", CHECK);
|
||||||
|
TempNewSymbol sig = SymbolTable::new_symbol("()Ljdk/vm/ci/runtime/JVMCICompiler;", CHECK);
|
||||||
|
Handle jvmciRuntime = JVMCIRuntime::get_HotSpotJVMCIRuntime(CHECK);
|
||||||
|
JavaValue result(T_OBJECT);
|
||||||
|
JavaCalls::call_virtual(&result, jvmciRuntime, HotSpotJVMCIRuntime::klass(), getCompiler, sig, CHECK);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif // INCLUDE_JVMCI
|
#endif // INCLUDE_JVMCI
|
||||||
|
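A rough Java-level analogue of the forced initialization above, assuming the JDK 9 era jdk.vm.ci.runtime API is accessible to the caller and the VM runs with -XX:+EnableJVMCI (ForceJVMCIInit is an illustrative name):

import jdk.vm.ci.runtime.JVMCI;
import jdk.vm.ci.runtime.JVMCICompiler;

public class ForceJVMCIInit {
    public static void main(String[] args) {
        // Asking for the compiler forces JVMCI initialization up front,
        // mirroring the eager getCompiler() call compilation_init() now
        // performs when the interpreter is disabled.
        JVMCICompiler compiler = JVMCI.getRuntime().getCompiler();
        System.out.println(compiler != null ? "JVMCI compiler initialized"
                                            : "no JVMCI compiler");
    }
}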
@ -276,7 +276,7 @@ public:
|
|||||||
CompileQueue *q = compile_queue(comp_level);
|
CompileQueue *q = compile_queue(comp_level);
|
||||||
return q != NULL ? q->size() : 0;
|
return q != NULL ? q->size() : 0;
|
||||||
}
|
}
|
||||||
static void compilation_init();
|
static void compilation_init(TRAPS);
|
||||||
static void init_compiler_thread_log();
|
static void init_compiler_thread_log();
|
||||||
static nmethod* compile_method(const methodHandle& method,
|
static nmethod* compile_method(const methodHandle& method,
|
||||||
int osr_bci,
|
int osr_bci,
|
||||||
|
@ -441,6 +441,7 @@ bool C2Compiler::is_intrinsic_supported(const methodHandle& method, bool is_virt
|
|||||||
case vmIntrinsics::_mulAdd:
|
case vmIntrinsics::_mulAdd:
|
||||||
case vmIntrinsics::_montgomeryMultiply:
|
case vmIntrinsics::_montgomeryMultiply:
|
||||||
case vmIntrinsics::_montgomerySquare:
|
case vmIntrinsics::_montgomerySquare:
|
||||||
|
case vmIntrinsics::_vectorizedMismatch:
|
||||||
case vmIntrinsics::_ghash_processBlocks:
|
case vmIntrinsics::_ghash_processBlocks:
|
||||||
case vmIntrinsics::_updateCRC32:
|
case vmIntrinsics::_updateCRC32:
|
||||||
case vmIntrinsics::_updateBytesCRC32:
|
case vmIntrinsics::_updateBytesCRC32:
|
||||||
|
@ -987,7 +987,8 @@ void ConnectionGraph::process_call_arguments(CallNode *call) {
|
|||||||
strcmp(call->as_CallLeaf()->_name, "squareToLen") == 0 ||
|
strcmp(call->as_CallLeaf()->_name, "squareToLen") == 0 ||
|
||||||
strcmp(call->as_CallLeaf()->_name, "mulAdd") == 0 ||
|
strcmp(call->as_CallLeaf()->_name, "mulAdd") == 0 ||
|
||||||
strcmp(call->as_CallLeaf()->_name, "montgomery_multiply") == 0 ||
|
strcmp(call->as_CallLeaf()->_name, "montgomery_multiply") == 0 ||
|
||||||
strcmp(call->as_CallLeaf()->_name, "montgomery_square") == 0)
|
strcmp(call->as_CallLeaf()->_name, "montgomery_square") == 0 ||
|
||||||
|
strcmp(call->as_CallLeaf()->_name, "vectorizedMismatch") == 0)
|
||||||
))) {
|
))) {
|
||||||
call->dump();
|
call->dump();
|
||||||
fatal("EA unexpected CallLeaf %s", call->as_CallLeaf()->_name);
|
fatal("EA unexpected CallLeaf %s", call->as_CallLeaf()->_name);
|
||||||
|
@ -72,16 +72,18 @@ void GraphKit::gen_stub(address C_function,
|
|||||||
|
|
||||||
// Make up the parameters
|
// Make up the parameters
|
||||||
uint i;
|
uint i;
|
||||||
for( i = 0; i < parm_cnt; i++ )
|
for (i = 0; i < parm_cnt; i++) {
|
||||||
map()->init_req(i, _gvn.transform(new ParmNode(start, i)));
|
map()->init_req(i, _gvn.transform(new ParmNode(start, i)));
|
||||||
for( ; i<map()->req(); i++ )
|
}
|
||||||
|
for (; i < map()->req(); i++) {
|
||||||
map()->init_req(i, top()); // For nicer debugging
|
map()->init_req(i, top()); // For nicer debugging
|
||||||
|
}
|
||||||
|
|
||||||
// GraphKit requires memory to be a MergeMemNode:
|
// GraphKit requires memory to be a MergeMemNode:
|
||||||
set_all_memory(map()->memory());
|
set_all_memory(map()->memory());
|
||||||
|
|
||||||
// Get base of thread-local storage area
|
// Get base of thread-local storage area
|
||||||
Node* thread = _gvn.transform( new ThreadLocalNode() );
|
Node* thread = _gvn.transform(new ThreadLocalNode());
|
||||||
|
|
||||||
const int NoAlias = Compile::AliasIdxBot;
|
const int NoAlias = Compile::AliasIdxBot;
|
||||||
|
|
||||||
@ -113,21 +115,27 @@ void GraphKit::gen_stub(address C_function,
|
|||||||
|
|
||||||
//-----------------------------
|
//-----------------------------
|
||||||
// Compute signature for C call. Varies from the Java signature!
|
// Compute signature for C call. Varies from the Java signature!
|
||||||
|
|
||||||
const Type **fields = TypeTuple::fields(2*parm_cnt+2);
|
const Type **fields = TypeTuple::fields(2*parm_cnt+2);
|
||||||
uint cnt = TypeFunc::Parms;
|
uint cnt = TypeFunc::Parms;
|
||||||
// The C routines gets the base of thread-local storage passed in as an
|
// The C routines gets the base of thread-local storage passed in as an
|
||||||
// extra argument. Not all calls need it, but its cheap to add here.
|
// extra argument. Not all calls need it, but it is cheap to add here.
|
||||||
for (uint pcnt = cnt; pcnt < parm_cnt; pcnt++, cnt++) {
|
for (uint pcnt = cnt; pcnt < parm_cnt; pcnt++, cnt++) {
|
||||||
fields[cnt] = jdomain->field_at(pcnt);
|
const Type *f = jdomain->field_at(pcnt);
|
||||||
|
if (CCallingConventionRequiresIntsAsLongs && f->isa_int()) {
|
||||||
|
fields[cnt++] = TypeLong::LONG;
|
||||||
|
fields[cnt] = Type::HALF; // Must add an additional half for a long.
|
||||||
|
} else {
|
||||||
|
fields[cnt] = f;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fields[cnt++] = TypeRawPtr::BOTTOM; // Thread-local storage
|
fields[cnt++] = TypeRawPtr::BOTTOM; // Thread-local storage
|
||||||
// Also pass in the caller's PC, if asked for.
|
// Also pass in the caller's PC, if asked for.
|
||||||
if (return_pc) {
|
if (return_pc) {
|
||||||
fields[cnt++] = TypeRawPtr::BOTTOM; // Return PC
|
fields[cnt++] = TypeRawPtr::BOTTOM; // Return PC
|
||||||
}
|
}
|
||||||
|
const TypeTuple* domain = TypeTuple::make(cnt, fields);
|
||||||
|
|
||||||
const TypeTuple* domain = TypeTuple::make(cnt,fields);
|
|
||||||
// The C routine we are about to call cannot return an oop; it can block on
|
// The C routine we are about to call cannot return an oop; it can block on
|
||||||
// exit and a GC will trash the oop while it sits in C-land. Instead, we
|
// exit and a GC will trash the oop while it sits in C-land. Instead, we
|
||||||
// return the oop through TLS for runtime calls.
|
// return the oop through TLS for runtime calls.
|
||||||
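The widened C signature matters because 32-bit arithmetic that wraps in Java must not leak its untruncated 64-bit intermediate into C code. A small, self-contained illustration of the values involved, using the same constants as the regression test added at the end of this change (IntWideningSketch is an illustrative name):

public class IntWideningSketch {
    public static void main(String[] args) {
        int mod = Integer.MAX_VALUE;
        // 32-bit arithmetic wraps: 8 + 2 * (2^31 - 1) == 6 as an int.
        int pos = 8 + mod + mod;
        // If the same computation is kept in a 64-bit register and handed to
        // C code without truncation or sign extension, the callee sees
        // 0x1_0000_0006 instead of 6.
        long unwrapped = 8L + mod + mod;
        System.out.println(pos);                         // 6
        System.out.println(Long.toHexString(unwrapped)); // 100000006
        System.out.println((int) unwrapped);             // 6 after truncation
    }
}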
@ -155,37 +163,44 @@ void GraphKit::gen_stub(address C_function,
|
|||||||
rfields[TypeFunc::Parms+1] = jrange->field_at(TypeFunc::Parms+1);
|
rfields[TypeFunc::Parms+1] = jrange->field_at(TypeFunc::Parms+1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
const TypeTuple* range = TypeTuple::make(jrange->cnt(),rfields);
|
const TypeTuple* range = TypeTuple::make(jrange->cnt(), rfields);
|
||||||
|
|
||||||
// Final C signature
|
// Final C signature
|
||||||
const TypeFunc *c_sig = TypeFunc::make(domain,range);
|
const TypeFunc *c_sig = TypeFunc::make(domain, range);
|
||||||
|
|
||||||
//-----------------------------
|
//-----------------------------
|
||||||
// Make the call node
|
// Make the call node.
|
||||||
CallRuntimeNode *call = new CallRuntimeNode(c_sig, C_function, name, TypePtr::BOTTOM);
|
CallRuntimeNode *call = new CallRuntimeNode(c_sig, C_function, name, TypePtr::BOTTOM);
|
||||||
//-----------------------------
|
//-----------------------------
|
||||||
|
|
||||||
// Fix-up the debug info for the call
|
// Fix-up the debug info for the call.
|
||||||
call->set_jvms( new (C) JVMState(0) );
|
call->set_jvms(new (C) JVMState(0));
|
||||||
call->jvms()->set_bci(0);
|
call->jvms()->set_bci(0);
|
||||||
call->jvms()->set_offsets(cnt);
|
call->jvms()->set_offsets(cnt);
|
||||||
|
|
||||||
// Set fixed predefined input arguments
|
// Set fixed predefined input arguments.
|
||||||
cnt = 0;
|
cnt = 0;
|
||||||
for (i = 0; i < TypeFunc::Parms; i++)
|
for (i = 0; i < TypeFunc::Parms; i++) {
|
||||||
call->init_req(cnt++, map()->in(i));
|
|
||||||
// A little too aggressive on the parm copy; return address is not an input
|
|
||||||
call->set_req(TypeFunc::ReturnAdr, top());
|
|
||||||
for (; i < parm_cnt; i++) { // Regular input arguments
|
|
||||||
call->init_req(cnt++, map()->in(i));
|
call->init_req(cnt++, map()->in(i));
|
||||||
}
|
}
|
||||||
|
// A little too aggressive on the parm copy; return address is not an input.
|
||||||
|
call->set_req(TypeFunc::ReturnAdr, top());
|
||||||
|
for (; i < parm_cnt; i++) { // Regular input arguments.
|
||||||
|
const Type *f = jdomain->field_at(i);
|
||||||
|
if (CCallingConventionRequiresIntsAsLongs && f->isa_int()) {
|
||||||
|
call->init_req(cnt++, _gvn.transform(new ConvI2LNode(map()->in(i))));
|
||||||
|
call->init_req(cnt++, top());
|
||||||
|
} else {
|
||||||
|
call->init_req(cnt++, map()->in(i));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
call->init_req(cnt++, thread);
|
||||||
|
if (return_pc) { // Return PC, if asked for.
|
||||||
|
call->init_req(cnt++, returnadr());
|
||||||
|
}
|
||||||
|
|
||||||
call->init_req( cnt++, thread );
|
|
||||||
if( return_pc ) // Return PC, if asked for
|
|
||||||
call->init_req( cnt++, returnadr() );
|
|
||||||
_gvn.transform_no_reclaim(call);
|
_gvn.transform_no_reclaim(call);
|
||||||
|
|
||||||
|
|
||||||
//-----------------------------
|
//-----------------------------
|
||||||
// Now set up the return results
|
// Now set up the return results
|
||||||
set_control( _gvn.transform( new ProjNode(call,TypeFunc::Control)) );
|
set_control( _gvn.transform( new ProjNode(call,TypeFunc::Control)) );
|
||||||
|
@ -312,6 +312,7 @@ class LibraryCallKit : public GraphKit {
|
|||||||
bool inline_mulAdd();
|
bool inline_mulAdd();
|
||||||
bool inline_montgomeryMultiply();
|
bool inline_montgomeryMultiply();
|
||||||
bool inline_montgomerySquare();
|
bool inline_montgomerySquare();
|
||||||
|
bool inline_vectorizedMismatch();
|
||||||
|
|
||||||
bool inline_profileBoolean();
|
bool inline_profileBoolean();
|
||||||
bool inline_isCompileConstant();
|
bool inline_isCompileConstant();
|
||||||
@ -720,6 +721,9 @@ bool LibraryCallKit::try_to_inline(int predicate) {
|
|||||||
case vmIntrinsics::_montgomerySquare:
|
case vmIntrinsics::_montgomerySquare:
|
||||||
return inline_montgomerySquare();
|
return inline_montgomerySquare();
|
||||||
|
|
||||||
|
case vmIntrinsics::_vectorizedMismatch:
|
||||||
|
return inline_vectorizedMismatch();
|
||||||
|
|
||||||
case vmIntrinsics::_ghash_processBlocks:
|
case vmIntrinsics::_ghash_processBlocks:
|
||||||
return inline_ghash_processBlocks();
|
return inline_ghash_processBlocks();
|
||||||
|
|
||||||
@ -5581,6 +5585,50 @@ bool LibraryCallKit::inline_montgomerySquare() {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//-------------inline_vectorizedMismatch------------------------------
|
||||||
|
bool LibraryCallKit::inline_vectorizedMismatch() {
|
||||||
|
assert(UseVectorizedMismatchIntrinsic, "not implemented on this platform");
|
||||||
|
|
||||||
|
address stubAddr = StubRoutines::vectorizedMismatch();
|
||||||
|
if (stubAddr == NULL) {
|
||||||
|
return false; // Intrinsic's stub is not implemented on this platform
|
||||||
|
}
|
||||||
|
const char* stubName = "vectorizedMismatch";
|
||||||
|
int size_l = callee()->signature()->size();
|
||||||
|
assert(size_l == 8, "vectorizedMismatch has 6 parameters");
|
||||||
|
|
||||||
|
Node* obja = argument(0);
|
||||||
|
Node* aoffset = argument(1);
|
||||||
|
Node* objb = argument(3);
|
||||||
|
Node* boffset = argument(4);
|
||||||
|
Node* length = argument(6);
|
||||||
|
Node* scale = argument(7);
|
||||||
|
|
||||||
|
const Type* a_type = obja->Value(&_gvn);
|
||||||
|
const Type* b_type = objb->Value(&_gvn);
|
||||||
|
const TypeAryPtr* top_a = a_type->isa_aryptr();
|
||||||
|
const TypeAryPtr* top_b = b_type->isa_aryptr();
|
||||||
|
if (top_a == NULL || top_a->klass() == NULL ||
|
||||||
|
top_b == NULL || top_b->klass() == NULL) {
|
||||||
|
// failed array check
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
Node* call;
|
||||||
|
jvms()->set_should_reexecute(true);
|
||||||
|
|
||||||
|
Node* obja_adr = make_unsafe_address(obja, aoffset);
|
||||||
|
Node* objb_adr = make_unsafe_address(objb, boffset);
|
||||||
|
|
||||||
|
call = make_runtime_call(RC_LEAF,
|
||||||
|
OptoRuntime::vectorizedMismatch_Type(),
|
||||||
|
stubAddr, stubName, TypePtr::BOTTOM,
|
||||||
|
obja_adr, objb_adr, length, scale);
|
||||||
|
|
||||||
|
Node* result = _gvn.transform(new ProjNode(call, TypeFunc::Parms));
|
||||||
|
set_result(result);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Calculate CRC32 for byte.
|
* Calculate CRC32 for byte.
|
||||||
|
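On the Java side the intrinsic is reached through code that funnels into ArraysSupport, for example java.util.Arrays.mismatch in JDK 9. A minimal exerciser (whether the stub is actually taken depends on the platform and on the UseVectorizedMismatchIntrinsic flag introduced below; MismatchDemo is an illustrative name):

import java.util.Arrays;

public class MismatchDemo {
    public static void main(String[] args) {
        byte[] a = new byte[1024];
        byte[] b = new byte[1024];
        b[777] = 1; // introduce a single difference

        // Arrays.mismatch returns the index of the first difference,
        // or -1 if the arrays are identical.
        System.out.println(Arrays.mismatch(a, b)); // 777

        b[777] = 0;
        System.out.println(Arrays.mismatch(a, b)); // -1
    }
}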
@ -1103,6 +1103,26 @@ const TypeFunc* OptoRuntime::montgomerySquare_Type() {
|
|||||||
return TypeFunc::make(domain, range);
|
return TypeFunc::make(domain, range);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const TypeFunc* OptoRuntime::vectorizedMismatch_Type() {
|
||||||
|
// create input type (domain)
|
||||||
|
int num_args = 4;
|
||||||
|
int argcnt = num_args;
|
||||||
|
const Type** fields = TypeTuple::fields(argcnt);
|
||||||
|
int argp = TypeFunc::Parms;
|
||||||
|
fields[argp++] = TypePtr::NOTNULL; // obja
|
||||||
|
fields[argp++] = TypePtr::NOTNULL; // objb
|
||||||
|
fields[argp++] = TypeInt::INT; // length, number of elements
|
||||||
|
fields[argp++] = TypeInt::INT; // log2scale, element size
|
||||||
|
assert(argp == TypeFunc::Parms + argcnt, "correct decoding");
|
||||||
|
const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms + argcnt, fields);
|
||||||
|
|
||||||
|
// return mismatch index (int)
|
||||||
|
fields = TypeTuple::fields(1);
|
||||||
|
fields[TypeFunc::Parms + 0] = TypeInt::INT;
|
||||||
|
const TypeTuple* range = TypeTuple::make(TypeFunc::Parms + 1, fields);
|
||||||
|
return TypeFunc::make(domain, range);
|
||||||
|
}
|
||||||
|
|
||||||
// GHASH block processing
|
// GHASH block processing
|
||||||
const TypeFunc* OptoRuntime::ghash_processBlocks_Type() {
|
const TypeFunc* OptoRuntime::ghash_processBlocks_Type() {
|
||||||
int argcnt = 4;
|
int argcnt = 4;
|
||||||
|
@ -299,6 +299,8 @@ private:
|
|||||||
|
|
||||||
static const TypeFunc* mulAdd_Type();
|
static const TypeFunc* mulAdd_Type();
|
||||||
|
|
||||||
|
static const TypeFunc* vectorizedMismatch_Type();
|
||||||
|
|
||||||
static const TypeFunc* ghash_processBlocks_Type();
|
static const TypeFunc* ghash_processBlocks_Type();
|
||||||
|
|
||||||
static const TypeFunc* updateBytesCRC32_Type();
|
static const TypeFunc* updateBytesCRC32_Type();
|
||||||
|
@ -855,6 +855,9 @@ public:
|
|||||||
product(bool, UseAdler32Intrinsics, false, \
|
product(bool, UseAdler32Intrinsics, false, \
|
||||||
"use intrinsics for java.util.zip.Adler32") \
|
"use intrinsics for java.util.zip.Adler32") \
|
||||||
\
|
\
|
||||||
|
product(bool, UseVectorizedMismatchIntrinsic, false, \
|
||||||
|
"Enables intrinsification of ArraysSupport.vectorizedMismatch()") \
|
||||||
|
\
|
||||||
diagnostic(ccstrlist, DisableIntrinsic, "", \
|
diagnostic(ccstrlist, DisableIntrinsic, "", \
|
||||||
"do not expand intrinsics whose (internal) names appear here") \
|
"do not expand intrinsics whose (internal) names appear here") \
|
||||||
\
|
\
|
||||||
|
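The effective value of the new flag can be checked at run time through the standard diagnostic MXBean; a hedged sketch (FlagProbe is an illustrative name, and the option may be reset to false during startup on CPUs without a matching stub):

import com.sun.management.HotSpotDiagnosticMXBean;
import java.lang.management.ManagementFactory;

public class FlagProbe {
    public static void main(String[] args) {
        HotSpotDiagnosticMXBean bean =
            ManagementFactory.getPlatformMXBean(HotSpotDiagnosticMXBean.class);
        // Prints the effective value of the flag as reported by the VM.
        System.out.println(bean.getVMOption("UseVectorizedMismatchIntrinsic"));
    }
}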
@ -148,6 +148,8 @@ address StubRoutines::_mulAdd = NULL;
|
|||||||
address StubRoutines::_montgomeryMultiply = NULL;
|
address StubRoutines::_montgomeryMultiply = NULL;
|
||||||
address StubRoutines::_montgomerySquare = NULL;
|
address StubRoutines::_montgomerySquare = NULL;
|
||||||
|
|
||||||
|
address StubRoutines::_vectorizedMismatch = NULL;
|
||||||
|
|
||||||
address StubRoutines::_dexp = NULL;
|
address StubRoutines::_dexp = NULL;
|
||||||
address StubRoutines::_dlog = NULL;
|
address StubRoutines::_dlog = NULL;
|
||||||
|
|
||||||
|
@ -207,6 +207,8 @@ class StubRoutines: AllStatic {
|
|||||||
static address _montgomeryMultiply;
|
static address _montgomeryMultiply;
|
||||||
static address _montgomerySquare;
|
static address _montgomerySquare;
|
||||||
|
|
||||||
|
static address _vectorizedMismatch;
|
||||||
|
|
||||||
static address _dexp;
|
static address _dexp;
|
||||||
static address _dlog;
|
static address _dlog;
|
||||||
|
|
||||||
@ -376,6 +378,8 @@ class StubRoutines: AllStatic {
|
|||||||
static address montgomeryMultiply() { return _montgomeryMultiply; }
|
static address montgomeryMultiply() { return _montgomeryMultiply; }
|
||||||
static address montgomerySquare() { return _montgomerySquare; }
|
static address montgomerySquare() { return _montgomerySquare; }
|
||||||
|
|
||||||
|
static address vectorizedMismatch() { return _vectorizedMismatch; }
|
||||||
|
|
||||||
static address dexp() { return _dexp; }
|
static address dexp() { return _dexp; }
|
||||||
static address dlog() { return _dlog; }
|
static address dlog() { return _dlog; }
|
||||||
|
|
||||||
|
@ -3628,7 +3628,7 @@ jint Threads::create_vm(JavaVMInitArgs* args, bool* canTryAgain) {
|
|||||||
|
|
||||||
// initialize compiler(s)
|
// initialize compiler(s)
|
||||||
#if defined(COMPILER1) || defined(COMPILER2) || defined(SHARK) || INCLUDE_JVMCI
|
#if defined(COMPILER1) || defined(COMPILER2) || defined(SHARK) || INCLUDE_JVMCI
|
||||||
CompileBroker::compilation_init();
|
CompileBroker::compilation_init(CHECK_JNI_ERR);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Pre-initialize some JSR292 core classes to avoid deadlock during class loading.
|
// Pre-initialize some JSR292 core classes to avoid deadlock during class loading.
|
||||||
|
@ -860,6 +860,7 @@ typedef CompactHashtable<Symbol*, char> SymbolCompactHashTable;
|
|||||||
static_field(StubRoutines, _mulAdd, address) \
|
static_field(StubRoutines, _mulAdd, address) \
|
||||||
static_field(StubRoutines, _dexp, address) \
|
static_field(StubRoutines, _dexp, address) \
|
||||||
static_field(StubRoutines, _dlog, address) \
|
static_field(StubRoutines, _dlog, address) \
|
||||||
|
static_field(StubRoutines, _vectorizedMismatch, address) \
|
||||||
static_field(StubRoutines, _jbyte_arraycopy, address) \
|
static_field(StubRoutines, _jbyte_arraycopy, address) \
|
||||||
static_field(StubRoutines, _jshort_arraycopy, address) \
|
static_field(StubRoutines, _jshort_arraycopy, address) \
|
||||||
static_field(StubRoutines, _jint_arraycopy, address) \
|
static_field(StubRoutines, _jint_arraycopy, address) \
|
||||||
|
@ -0,0 +1,69 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2015 SAP SE. All rights reserved.
|
||||||
|
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||||
|
*
|
||||||
|
* This code is free software; you can redistribute it and/or modify it
|
||||||
|
* under the terms of the GNU General Public License version 2 only, as
|
||||||
|
* published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||||
|
* version 2 for more details (a copy is included in the LICENSE file that
|
||||||
|
* accompanied this code).
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License version
|
||||||
|
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||||
|
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
*
|
||||||
|
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||||
|
* or visit www.oracle.com if you need additional information or have any
|
||||||
|
* questions.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* @test
|
||||||
|
* @summary Test that overflowed integers passed to arraycopy don't do any harm. This might
|
||||||
|
* be the case on platforms where C code expects that ints passed to a call
|
||||||
|
* are properly sign extended to 64 bits (e.g., PPC64, s390x). This can fail
|
||||||
|
* if slow_arraycopy_C() is compiled by the C compiler without any implicit
|
||||||
|
* casts (such as spill stores to the stack that are done with 4-byte instructions).
|
||||||
|
* @run main/othervm -XX:-BackgroundCompilation -XX:-UseOnStackReplacement TestArrayCopyOverflowArguments
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
public class TestArrayCopyOverflowArguments {
|
||||||
|
|
||||||
|
// Without volatile, the overflowing computation was moved up and then
|
||||||
|
// spilled to the stack. The 32-bit spill store caused proper truncation.
|
||||||
|
static volatile int mod = Integer.MAX_VALUE;
|
||||||
|
|
||||||
|
public static int[] m1(Object src) {
|
||||||
|
if (src == null) return null;
|
||||||
|
int[] dest = new int[10];
|
||||||
|
try {
|
||||||
|
// PPC C calling conventions require that ints are properly expanded
|
||||||
|
// to longs when passed to a function.
|
||||||
|
int pos = 8 + mod + mod; // = 0x1_0000_0006.
|
||||||
|
int start = 2 + mod + mod; // = 0x1_0000_0000.
|
||||||
|
int len = 12 + mod + mod; // = 0x1_0000_000A.
|
||||||
|
// This is supposed to call SharedRuntime::slow_arraycopy_C().
|
||||||
|
System.arraycopy(src, pos, dest, 0, 10);
|
||||||
|
} catch (ArrayStoreException e) { // Defensive; not expected for the int[] src passed from main().
|
||||||
|
}
|
||||||
|
return dest;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) throws Exception {
|
||||||
|
int[] src = new int[20];
|
||||||
|
|
||||||
|
for (int i = 0; i < 20; ++i) {
|
||||||
|
src[i] = i * (i-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < 20000; i++) {
|
||||||
|
m1(src);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
61
hotspot/test/compiler/floatingpoint/Test15FloatJNIArgs.java
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2015 SAP SE. All rights reserved.
|
||||||
|
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||||
|
*
|
||||||
|
* This code is free software; you can redistribute it and/or modify it
|
||||||
|
* under the terms of the GNU General Public License version 2 only, as
|
||||||
|
* published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||||
|
* version 2 for more details (a copy is included in the LICENSE file that
|
||||||
|
* accompanied this code).
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License version
|
||||||
|
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||||
|
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
*
|
||||||
|
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||||
|
* or visit www.oracle.com if you need additional information or have any
|
||||||
|
* questions.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* @test
|
||||||
|
* @bug 8139258
|
||||||
|
* @summary Regression test for 8139258, where float args were not properly passed
|
||||||
|
* to a JNI function on ppc64le.
|
||||||
|
* @run main/othervm -Xint Test15FloatJNIArgs
|
||||||
|
* @run main/othervm -XX:+TieredCompilation -Xcomp Test15FloatJNIArgs
|
||||||
|
* @run main/othervm -XX:-TieredCompilation -Xcomp Test15FloatJNIArgs
|
||||||
|
*/
|
||||||
|
|
||||||
|
public class Test15FloatJNIArgs {
|
||||||
|
static {
|
||||||
|
try {
|
||||||
|
System.loadLibrary("Test15FloatJNIArgs");
|
||||||
|
} catch (UnsatisfiedLinkError e) {
|
||||||
|
System.out.println("could not load native lib: " + e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static native float add15floats(
|
||||||
|
float f1, float f2, float f3, float f4,
|
||||||
|
float f5, float f6, float f7, float f8,
|
||||||
|
float f9, float f10, float f11, float f12,
|
||||||
|
float f13, float f14, float f15);
|
||||||
|
|
||||||
|
static void test() throws Exception {
|
||||||
|
float sum = Test15FloatJNIArgs.add15floats(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
|
||||||
|
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f);
|
||||||
|
if (sum != 15.0f) {
|
||||||
|
throw new Error("Passed 15 times 1.0f to jni function which didn't add them properly: " + sum);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) throws Exception {
|
||||||
|
for (int i = 0; i < 200; ++i) {
|
||||||
|
test();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
41
hotspot/test/compiler/floatingpoint/libTest15FloatJNIArgs.c
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2015. All rights reserved.
|
||||||
|
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||||
|
*
|
||||||
|
* This code is free software; you can redistribute it and/or modify it
|
||||||
|
* under the terms of the GNU General Public License version 2 only, as
|
||||||
|
* published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||||
|
* version 2 for more details (a copy is included in the LICENSE file that
|
||||||
|
* accompanied this code).
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License version
|
||||||
|
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||||
|
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
*
|
||||||
|
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||||
|
* or visit www.oracle.com if you need additional information or have any
|
||||||
|
* questions.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <jni.h>
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
JNIEXPORT jfloat JNICALL Java_Test15FloatJNIArgs_add15floats
|
||||||
|
(JNIEnv *env, jclass cls,
|
||||||
|
jfloat f1, jfloat f2, jfloat f3, jfloat f4,
|
||||||
|
jfloat f5, jfloat f6, jfloat f7, jfloat f8,
|
||||||
|
jfloat f9, jfloat f10, jfloat f11, jfloat f12,
|
||||||
|
jfloat f13, jfloat f14, jfloat f15) {
|
||||||
|
return f1 + f2 + f3 + f4 + f5 + f6 + f7 + f8 + f9 + f10 + f11 + f12 + f13 + f14 + f15;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|