8211251: Default mask register for avx512 instructions

Encode AVX-512 instructions as unmasked instructions when no mask register is specified.

Reviewed-by: kvn
Sandhya Viswanathan 2018-10-01 11:54:34 -07:00 committed by Vladimir Kozlov
parent 5985805474
commit b0ea3a49d2
6 changed files with 252 additions and 437 deletions

File diff suppressed because it is too large.
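Background for the change (not part of the commit): in EVEX encoding the write-mask travels in the three low "aaa" bits of the final prefix byte, where 000 means "no masking" and 001..111 select k1..k7. Defaulting the specifier to 0 therefore lets every instruction without an explicit mask be emitted unmasked, instead of silently depending on k1 holding all ones. A minimal self-contained C++ sketch of that byte (illustrative field packing only, not the HotSpot encoder):

    #include <cassert>
    #include <cstdint>

    // Model of the EVEX P2 payload byte: z | L'L | b | V' | aaa.
    // aaa is the embedded opmask specifier: 0 = unmasked, 1..7 = k1..k7.
    uint8_t evex_p2(bool zero_mask, int ll, bool bcast, bool vprime, int aaa) {
      return (uint8_t)((zero_mask << 7) | ((ll & 3) << 5) | (bcast << 4) |
                       (vprime << 3) | (aaa & 7));
    }

    int main() {
      // Unmasked 512-bit operation (the new default): aaa = 0, no k1 setup needed.
      assert((evex_p2(false, 2, false, true, 0) & 7) == 0);
      // Explicitly masked operation, e.g. {k3}: aaa = 3.
      assert((evex_p2(false, 2, false, true, 3) & 7) == 3);
      return 0;
    }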

@@ -871,11 +871,6 @@ private:
void clear_managed(void) { _is_managed = false; }
bool is_managed(void) { return _is_managed; }
// Following functions are for stub code use only
void set_vector_masking(void) { _vector_masking = true; }
void clear_vector_masking(void) { _vector_masking = false; }
bool is_vector_masking(void) { return _vector_masking; }
void lea(Register dst, Address src);
void mov(Register dst, Register src);
@@ -2210,7 +2205,7 @@ public:
int vector_len, // The length of vector to be applied in encoding - for both AVX and EVEX
bool rex_vex_w, // Width of data: if 32-bits or less, false, else if 64-bit or specially defined, true
bool legacy_mode, // Details if either this instruction is conditionally encoded to AVX or earlier if true else possibly EVEX
bool no_reg_mask, // when true, k0 is used when EVEX encoding is chosen, else k1 is used under the same condition
bool no_reg_mask, // when true, k0 is used when EVEX encoding is chosen, else embedded_opmask_register_specifier is used
bool uses_vl) // This instruction may have legacy constraints based on vector length for EVEX
:
_avx_vector_len(vector_len),
@@ -2225,7 +2220,7 @@ public:
_evex_encoding(0),
_is_clear_context(true),
_is_extended_context(false),
_embedded_opmask_register_specifier(1), // hard code k1, it will be initialized for now
_embedded_opmask_register_specifier(0), // hard code k0
_current_assembler(NULL) {
if (UseAVX < 3) _legacy_mode = true;
}
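With the default now 0, a masked emit presumably overrides the specifier only when a k register is actually passed in. A self-contained toy model of that behaviour (hypothetical names, not the assembler's real API):

    #include <cassert>

    struct AttrModel {
      int embedded_opmask = 0;               // new default: 0 = k0 = unmasked
      void set_embedded_opmask(int kreg) {   // 1..7 select k1..k7
        assert(kreg >= 1 && kreg <= 7);
        embedded_opmask = kreg;
      }
    };

    int main() {
      AttrModel plain;                       // e.g. an unmasked evmovdqul
      AttrModel masked;                      // e.g. a masked evmovdquw with k3
      masked.set_embedded_opmask(3);
      assert(plain.embedded_opmask == 0 && masked.embedded_opmask == 3);
      return 0;
    }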

@@ -3267,6 +3267,7 @@ void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) {
}
void MacroAssembler::setvectmask(Register dst, Register src) {
guarantee(PostLoopMultiversioning == true, "must be");
Assembler::movl(dst, 1);
Assembler::shlxl(dst, dst, src);
Assembler::decl(dst);
@@ -3275,6 +3276,7 @@ void MacroAssembler::setvectmask(Register dst, Register src) {
}
void MacroAssembler::restorevectmask() {
guarantee(PostLoopMultiversioning == true, "must be");
Assembler::knotwl(k1, k0);
}
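setvectmask builds a value with src low bits set through movl/shlxl/decl; restorevectmask then reloads k1 for the post-loop-multiversioning code. A quick sketch of the mask computation (assuming src < 32, since shlx masks the shift count):

    #include <cassert>
    #include <cstdint>

    uint32_t vect_mask(uint32_t n) {   // n = remaining iteration count
      uint32_t m = 1;                  // movl(dst, 1)
      m <<= n;                         // shlxl(dst, dst, src)
      return m - 1;                    // decl(dst)
    }

    int main() {
      assert(vect_mask(5) == 0x1f);    // five low bits set
      assert(vect_mask(0) == 0);       // empty mask
      return 0;
    }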
@@ -5026,7 +5028,7 @@ void MacroAssembler::restore_cpu_control_state_after_jni() {
// Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
vzeroupper();
// Reset k1 to 0xffff.
if (VM_Version::supports_evex()) {
if (PostLoopMultiversioning && VM_Version::supports_evex()) {
push(rcx);
movl(rcx, 0xffff);
kmovwl(k1, rcx);
@@ -6681,8 +6683,6 @@ void MacroAssembler::has_negatives(Register ary1, Register len,
VM_Version::supports_avx512vlbw() &&
VM_Version::supports_bmi2()) {
set_vector_masking(); // opening of the stub context for programming mask registers
Label test_64_loop, test_tail;
Register tmp3_aliased = len;
@@ -6711,15 +6711,12 @@ void MacroAssembler::has_negatives(Register ary1, Register len,
testl(tmp1, -1);
jcc(Assembler::zero, FALSE_LABEL);
// Save k1
kmovql(k3, k1);
// ~(~0 << len) applied up to two times (for 32-bit scenario)
#ifdef _LP64
mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
shlxq(tmp3_aliased, tmp3_aliased, tmp1);
notq(tmp3_aliased);
kmovql(k1, tmp3_aliased);
kmovql(k3, tmp3_aliased);
#else
Label k_init;
jmp(k_init);
@@ -6728,7 +6725,7 @@ void MacroAssembler::has_negatives(Register ary1, Register len,
// data required to compose 64 1's to the instruction stream
// We emit 64 byte wide series of elements from 0..63 which later on would
// be used as a compare targets with tail count contained in tmp1 register.
// Result would be a k1 register having tmp1 consecutive number or 1
// Result would be a k register having tmp1 consecutive number or 1
// counting from least significant bit.
address tmp = pc();
emit_int64(0x0706050403020100);
@@ -6744,18 +6741,14 @@ void MacroAssembler::has_negatives(Register ary1, Register len,
lea(len, InternalAddress(tmp));
// create mask to test for negative byte inside a vector
evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
evpcmpgtb(k1, vec1, Address(len, 0), Assembler::AVX_512bit);
evpcmpgtb(k3, vec1, Address(len, 0), Assembler::AVX_512bit);
#endif
evpcmpgtb(k2, k1, vec2, Address(ary1, 0), Assembler::AVX_512bit);
ktestq(k2, k1);
// Restore k1
kmovql(k1, k3);
evpcmpgtb(k2, k3, vec2, Address(ary1, 0), Assembler::AVX_512bit);
ktestq(k2, k3);
jcc(Assembler::notZero, TRUE_LABEL);
jmp(FALSE_LABEL);
clear_vector_masking(); // closing of the stub context for programming mask registers
} else {
movl(result, len); // copy
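Both tail-mask constructions above produce the same thing: a word whose low len bits are set, which is then loaded into k3 so the masked evpcmpgtb only examines the remaining bytes. A self-contained sketch of the two variants (the 64-bit shlx path and the 32-bit compare-against-0..63 path), assuming 0 < len < 64:

    #include <cassert>
    #include <cstdint>

    // 64-bit path: ~(~0 << len), matching the mov64/shlxq/notq sequence.
    uint64_t tail_mask_shift(unsigned len) {
      return ~(~0ULL << len);
    }

    // 32-bit path: broadcast len and compare-greater against the bytes 0..63,
    // so bit i is set exactly when i < len (evpbroadcastb + evpcmpgtb).
    uint64_t tail_mask_compare(unsigned len) {
      uint64_t m = 0;
      for (unsigned i = 0; i < 64; i++) {
        if (i < len) m |= (uint64_t{1} << i);
      }
      return m;
    }

    int main() {
      for (unsigned len = 1; len < 64; len++) {
        assert(tail_mask_shift(len) == tail_mask_compare(len));
      }
      return 0;
    }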
@@ -7197,10 +7190,6 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
{
assert( UseSSE >= 2, "supported cpu only" );
Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
if (UseAVX > 2) {
movl(rtmp, 0xffff);
kmovwl(k1, rtmp);
}
movdl(xtmp, value);
if (UseAVX > 2 && UseUnalignedLoadStores) {
// Fill 64-byte chunks
@@ -7945,7 +7934,6 @@ void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register
VM_Version::supports_avx512vlbw()) {
Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
set_vector_masking(); // opening of the stub context for programming mask registers
cmpq(length, 64);
jcc(Assembler::less, VECTOR32_TAIL);
movq(tmp1, length);
@@ -7968,19 +7956,15 @@ void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register
//bind(VECTOR64_TAIL);
// AVX512 code to compare up to 63 byte vectors.
// Save k1
kmovql(k3, k1);
mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
shlxq(tmp2, tmp2, tmp1);
notq(tmp2);
kmovql(k1, tmp2);
kmovql(k3, tmp2);
evmovdqub(rymm0, k1, Address(obja, result), Assembler::AVX_512bit);
evpcmpeqb(k7, k1, rymm0, Address(objb, result), Assembler::AVX_512bit);
evmovdqub(rymm0, k3, Address(obja, result), Assembler::AVX_512bit);
evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit);
ktestql(k7, k1);
// Restore k1
kmovql(k1, k3);
ktestql(k7, k3);
jcc(Assembler::below, SAME_TILL_END); // not mismatch
bind(VECTOR64_NOT_EQUAL);
@@ -7991,7 +7975,6 @@ void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register
shrq(result);
jmp(DONE);
bind(VECTOR32_TAIL);
clear_vector_masking(); // closing of the stub context for programming mask registers
}
cmpq(length, 8);
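A scalar restatement (sketch only) of the masked 64-byte tail compare above: with the tail mask in k3, the evmovdqub/evpcmpeqb/ktestql sequence amounts to comparing just the remaining (< 64) bytes of the two buffers and ignoring the rest:

    #include <cstddef>
    #include <cstdint>

    bool tail_equal(const uint8_t* a, const uint8_t* b, size_t len) {
      for (size_t i = 0; i < len; i++) {
        if (a[i] != b[i]) return false;   // a differing byte: VECTOR64_NOT_EQUAL path
      }
      return true;                        // no difference under the mask: SAME_TILL_END
    }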
@@ -8752,11 +8735,6 @@ void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Regi
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
movl(tmp, 0xffff);
kmovwl(k1, tmp);
}
lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
notl(crc); // ~crc
cmpl(len, 16);
@@ -9418,9 +9396,7 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
VM_Version::supports_avx512vlbw() &&
VM_Version::supports_bmi2()) {
set_vector_masking(); // opening of the stub context for programming mask registers
Label copy_32_loop, copy_loop_tail, restore_k1_return_zero, below_threshold;
Label copy_32_loop, copy_loop_tail, below_threshold;
// alignment
Label post_alignment;
@@ -9434,9 +9410,6 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
movl(result, 0x00FF);
evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);
// Save k1
kmovql(k3, k1);
testl(len, -64);
jcc(Assembler::zero, post_alignment);
@@ -9453,14 +9426,14 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
movl(result, 0xFFFFFFFF);
shlxl(result, result, tmp5);
notl(result);
kmovdl(k1, result);
kmovdl(k3, result);
evmovdquw(tmp1Reg, k1, Address(src, 0), Assembler::AVX_512bit);
evpcmpuw(k2, k1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
ktestd(k2, k1);
jcc(Assembler::carryClear, restore_k1_return_zero);
evmovdquw(tmp1Reg, k3, Address(src, 0), Assembler::AVX_512bit);
evpcmpuw(k2, k3, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
ktestd(k2, k3);
jcc(Assembler::carryClear, return_zero);
evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
evpmovwb(Address(dst, 0), k3, tmp1Reg, Assembler::AVX_512bit);
addptr(src, tmp5);
addptr(src, tmp5);
@@ -9483,7 +9456,7 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
evpcmpuw(k2, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
kortestdl(k2, k2);
jcc(Assembler::carryClear, restore_k1_return_zero);
jcc(Assembler::carryClear, return_zero);
// All elements in current processed chunk are valid candidates for
// compression. Write a truncated byte elements to the memory.
@@ -9494,8 +9467,6 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
bind(copy_loop_tail);
// bail out when there is nothing to be done
testl(tmp5, 0xFFFFFFFF);
// Restore k1
kmovql(k1, k3);
jcc(Assembler::zero, return_length);
movl(len, tmp5);
@@ -9505,25 +9476,16 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
shlxl(result, result, len);
notl(result);
kmovdl(k1, result);
kmovdl(k3, result);
evmovdquw(tmp1Reg, k1, Address(src, 0), Assembler::AVX_512bit);
evpcmpuw(k2, k1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
ktestd(k2, k1);
jcc(Assembler::carryClear, restore_k1_return_zero);
evmovdquw(tmp1Reg, k3, Address(src, 0), Assembler::AVX_512bit);
evpcmpuw(k2, k3, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
ktestd(k2, k3);
jcc(Assembler::carryClear, return_zero);
evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
// Restore k1
kmovql(k1, k3);
evpmovwb(Address(dst, 0), k3, tmp1Reg, Assembler::AVX_512bit);
jmp(return_length);
bind(restore_k1_return_zero);
// Restore k1
kmovql(k1, k3);
jmp(return_zero);
clear_vector_masking(); // closing of the stub context for programming mask registers
bind(below_threshold);
}
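Element for element, the compression loop above performs the following scalar check: every char must fit in a byte (<= 0xFF, the value broadcast into tmp2Reg) or the stub bails out through return_zero; otherwise the truncated bytes are stored and the original length is returned. A sketch of that contract, not the stub itself:

    #include <cstddef>
    #include <cstdint>

    size_t compress_chars(const uint16_t* src, uint8_t* dst, size_t len) {
      for (size_t i = 0; i < len; i++) {
        if (src[i] > 0xFF) return 0;   // mirrors evpcmpuw/ktestd + jcc(return_zero)
        dst[i] = (uint8_t) src[i];     // mirrors the truncating evpmovwb store
      }
      return len;                      // mirrors the return_length path
    }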
@@ -9637,8 +9599,6 @@ void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len
VM_Version::supports_avx512vlbw() &&
VM_Version::supports_bmi2()) {
set_vector_masking(); // opening of the stub context for programming mask registers
Label copy_32_loop, copy_tail;
Register tmp3_aliased = len;
@@ -9670,22 +9630,15 @@ void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len
testl(tmp2, -1); // we don't destroy the contents of tmp2 here
jcc(Assembler::zero, done);
// Save k1
kmovql(k2, k1);
// ~(~0 << length), where length is the # of remaining elements to process
movl(tmp3_aliased, -1);
shlxl(tmp3_aliased, tmp3_aliased, tmp2);
notl(tmp3_aliased);
kmovdl(k1, tmp3_aliased);
evpmovzxbw(tmp1, k1, Address(src, 0), Assembler::AVX_512bit);
evmovdquw(Address(dst, 0), k1, tmp1, Assembler::AVX_512bit);
kmovdl(k2, tmp3_aliased);
evpmovzxbw(tmp1, k2, Address(src, 0), Assembler::AVX_512bit);
evmovdquw(Address(dst, 0), k2, tmp1, Assembler::AVX_512bit);
// Restore k1
kmovql(k1, k2);
jmp(done);
clear_vector_masking(); // closing of the stub context for programming mask registers
}
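The masked tail above is the inverse operation of the compress stub: the last few bytes are zero-extended to chars under mask k2 (evpmovzxbw) and stored with evmovdquw. A scalar sketch of that tail, assuming len < 32:

    #include <cstddef>
    #include <cstdint>

    void inflate_tail(const uint8_t* src, uint16_t* dst, size_t len) {
      for (size_t i = 0; i < len; i++) {
        dst[i] = src[i];               // zero-extend byte -> char
      }
    }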
if (UseSSE42Intrinsics) {
Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;

@@ -153,12 +153,6 @@ class StubGenerator: public StubCodeGenerator {
__ movptr(saved_rsi, rsi);
__ movptr(saved_rbx, rbx);
// provide initial value for required masks
if (UseAVX > 2) {
__ movl(rbx, 0xffff);
__ kmovwl(k1, rbx);
}
// save and initialize %mxcsr
if (sse_save) {
Label skip_ldmx;
@@ -679,12 +673,7 @@ class StubGenerator: public StubCodeGenerator {
void xmm_copy_forward(Register from, Register to_from, Register qword_count) {
assert( UseSSE >= 2, "supported cpu only" );
Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
if (UseAVX > 2) {
__ push(rbx);
__ movl(rbx, 0xffff);
__ kmovwl(k1, rbx);
__ pop(rbx);
}
// Copy 64-byte chunks
__ jmpb(L_copy_64_bytes);
__ align(OptoLoopAlignment);
@@ -2115,14 +2104,6 @@ class StubGenerator: public StubCodeGenerator {
__ enter(); // required for proper stackwalking of RuntimeStub frame
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rdx, 0xffff);
__ kmovdl(k1, rdx);
}
__ movptr(from, from_param);
__ movptr(key, key_param);
@@ -2222,14 +2203,6 @@ class StubGenerator: public StubCodeGenerator {
__ enter(); // required for proper stackwalking of RuntimeStub frame
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rdx, 0xffff);
__ kmovdl(k1, rdx);
}
__ movptr(from, from_param);
__ movptr(key, key_param);
@@ -2356,14 +2329,6 @@ class StubGenerator: public StubCodeGenerator {
__ enter(); // required for proper stackwalking of RuntimeStub frame
handleSOERegisters(true /*saving*/);
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rdx, 0xffff);
__ kmovdl(k1, rdx);
}
// load registers from incoming parameters
const Address from_param(rbp, 8+0);
const Address to_param (rbp, 8+4);
@@ -2532,14 +2497,6 @@ class StubGenerator: public StubCodeGenerator {
__ enter(); // required for proper stackwalking of RuntimeStub frame
handleSOERegisters(true /*saving*/);
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rdx, 0xffff);
__ kmovdl(k1, rdx);
}
// load registers from incoming parameters
const Address from_param(rbp, 8+0);
const Address to_param (rbp, 8+4);
@@ -2693,14 +2650,6 @@ class StubGenerator: public StubCodeGenerator {
__ enter(); // required for proper stackwalking of RuntimeStub frame
handleSOERegisters(true /*saving*/); // save rbx, rsi, rdi
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rdx, 0xffff);
__ kmovdl(k1, rdx);
}
// load registers from incoming parameters
const Address from_param(rbp, 8+0);
const Address to_param (rbp, 8+4);
@@ -3154,14 +3103,6 @@ class StubGenerator: public StubCodeGenerator {
__ enter();
handleSOERegisters(true); // Save registers
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rdx, 0xffff);
__ kmovdl(k1, rdx);
}
__ movptr(state, state_param);
__ movptr(subkeyH, subkeyH_param);
__ movptr(data, data_param);

@@ -254,10 +254,7 @@ class StubGenerator: public StubCodeGenerator {
__ movptr(r13_save, r13);
__ movptr(r14_save, r14);
__ movptr(r15_save, r15);
if (UseAVX > 2) {
__ movl(rbx, 0xffff);
__ kmovwl(k1, rbx);
}
#ifdef _WIN64
int last_reg = 15;
if (UseAVX > 2) {
@@ -1257,10 +1254,6 @@ class StubGenerator: public StubCodeGenerator {
__ align(OptoLoopAlignment);
if (UseUnalignedLoadStores) {
Label L_end;
if (UseAVX > 2) {
__ movl(to, 0xffff);
__ kmovwl(k1, to);
}
// Copy 64-bytes per iteration
__ BIND(L_loop);
if (UseAVX > 2) {
@@ -1341,10 +1334,6 @@ class StubGenerator: public StubCodeGenerator {
__ align(OptoLoopAlignment);
if (UseUnalignedLoadStores) {
Label L_end;
if (UseAVX > 2) {
__ movl(to, 0xffff);
__ kmovwl(k1, to);
}
// Copy 64-bytes per iteration
__ BIND(L_loop);
if (UseAVX > 2) {
@@ -3005,14 +2994,6 @@ class StubGenerator: public StubCodeGenerator {
__ enter(); // required for proper stackwalking of RuntimeStub frame
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rax, 0xffff);
__ kmovql(k1, rax);
}
// keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
__ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
@@ -3107,14 +3088,6 @@ class StubGenerator: public StubCodeGenerator {
__ enter(); // required for proper stackwalking of RuntimeStub frame
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rax, 0xffff);
__ kmovql(k1, rax);
}
// keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
__ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
@@ -3227,14 +3200,6 @@ class StubGenerator: public StubCodeGenerator {
__ enter(); // required for proper stackwalking of RuntimeStub frame
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rax, 0xffff);
__ kmovql(k1, rax);
}
#ifdef _WIN64
// on win64, fill len_reg from stack position
__ movl(len_reg, len_mem);
@@ -3428,14 +3393,6 @@ class StubGenerator: public StubCodeGenerator {
__ enter(); // required for proper stackwalking of RuntimeStub frame
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rax, 0xffff);
__ kmovql(k1, rax);
}
#ifdef _WIN64
// on win64, fill len_reg from stack position
__ movl(len_reg, len_mem);
@@ -3902,14 +3859,6 @@ class StubGenerator: public StubCodeGenerator {
__ enter(); // required for proper stackwalking of RuntimeStub frame
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rax, 0xffff);
__ kmovql(k1, rax);
}
#ifdef _WIN64
// allocate spill slots for r13, r14
enum {
@@ -4484,14 +4433,6 @@ address generate_cipherBlockChaining_decryptVectorAESCrypt() {
__ enter();
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rax, 0xffff);
__ kmovql(k1, rax);
}
__ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
__ movdqu(xmm_temp0, Address(state, 0));
@@ -4761,7 +4702,6 @@ address generate_cipherBlockChaining_decryptVectorAESCrypt() {
__ push(r13);
__ push(r14);
__ push(r15);
__ push(rbx);
// arguments
const Register source = c_rarg0; // Source Array
@@ -4790,8 +4730,6 @@ address generate_cipherBlockChaining_decryptVectorAESCrypt() {
__ cmpl(length, 0);
__ jcc(Assembler::lessEqual, L_exit);
// Save k1 value in rbx
__ kmovql(rbx, k1);
__ lea(r11, ExternalAddress(StubRoutines::x86::base64_charset_addr()));
// check if base64 charset(isURL=0) or base64 url charset(isURL=1) needs to be loaded
__ cmpl(isURL, 0);
@@ -4802,7 +4740,7 @@ address generate_cipherBlockChaining_decryptVectorAESCrypt() {
__ BIND(L_processdata);
__ movdqu(xmm16, ExternalAddress(StubRoutines::x86::base64_gather_mask_addr()));
// Set 64 bits of K register.
__ evpcmpeqb(k1, xmm16, xmm16, Assembler::AVX_512bit);
__ evpcmpeqb(k3, xmm16, xmm16, Assembler::AVX_512bit);
__ evmovdquq(xmm12, ExternalAddress(StubRoutines::x86::base64_bswap_mask_addr()), Assembler::AVX_256bit, r13);
__ evmovdquq(xmm13, ExternalAddress(StubRoutines::x86::base64_right_shift_mask_addr()), Assembler::AVX_512bit, r13);
__ evmovdquq(xmm14, ExternalAddress(StubRoutines::x86::base64_left_shift_mask_addr()), Assembler::AVX_512bit, r13);
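Two details are worth spelling out here: evpcmpeqb comparing xmm16 with itself is true in every byte lane, so it is simply a way to set all 64 bits of a mask register (now k3 rather than k1), and evpgatherdd consumes its mask, clearing it as elements complete, which is why the all-ones value is copied from k3 back into k2 before every gather below. A small model of that consumption, with hypothetical names:

    #include <cassert>
    #include <cstdint>

    struct MaskReg { uint64_t bits; };

    // Model only: a gather uses the working mask and leaves it cleared, so the
    // saved all-ones mask is copied back in before the next gather.
    void gather_step(MaskReg& work, const MaskReg& saved) {
      work = saved;        // kmovql(k2, k3)
      // evpgatherdd(..., k2, ...) gathers the lanes whose mask bit is set ...
      work.bits = 0;       // ... and zeroes the mask on completion
    }

    int main() {
      MaskReg k3{~0ULL}, k2{0};
      gather_step(k2, k3);
      assert(k2.bits == 0 && k3.bits == ~0ULL);
      return 0;
    }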
@@ -4881,17 +4819,17 @@ address generate_cipherBlockChaining_decryptVectorAESCrypt() {
__ vextracti64x4(xmm4, xmm5, 1);
__ vpmovzxwd(xmm7, xmm4, Assembler::AVX_512bit);
__ kmovql(k2, k1);
__ kmovql(k2, k3);
__ evpgatherdd(xmm4, k2, Address(r11, xmm0, Address::times_4, 0), Assembler::AVX_512bit);
__ kmovql(k2, k1);
__ kmovql(k2, k3);
__ evpgatherdd(xmm5, k2, Address(r11, xmm1, Address::times_4, 0), Assembler::AVX_512bit);
__ kmovql(k2, k1);
__ kmovql(k2, k3);
__ evpgatherdd(xmm8, k2, Address(r11, xmm2, Address::times_4, 0), Assembler::AVX_512bit);
__ kmovql(k2, k1);
__ kmovql(k2, k3);
__ evpgatherdd(xmm9, k2, Address(r11, xmm3, Address::times_4, 0), Assembler::AVX_512bit);
__ kmovql(k2, k1);
__ kmovql(k2, k3);
__ evpgatherdd(xmm10, k2, Address(r11, xmm6, Address::times_4, 0), Assembler::AVX_512bit);
__ kmovql(k2, k1);
__ kmovql(k2, k3);
__ evpgatherdd(xmm11, k2, Address(r11, xmm7, Address::times_4, 0), Assembler::AVX_512bit);
//Down convert dword to byte. Final output is 16*6 = 96 bytes long
@@ -4927,9 +4865,9 @@ address generate_cipherBlockChaining_decryptVectorAESCrypt() {
__ vpmovzxwd(xmm6, xmm9, Assembler::AVX_512bit);
__ vextracti64x4(xmm9, xmm1, 1);
__ vpmovzxwd(xmm5, xmm9, Assembler::AVX_512bit);
__ kmovql(k2, k1);
__ kmovql(k2, k3);
__ evpgatherdd(xmm8, k2, Address(r11, xmm6, Address::times_4, 0), Assembler::AVX_512bit);
__ kmovql(k2, k1);
__ kmovql(k2, k3);
__ evpgatherdd(xmm10, k2, Address(r11, xmm5, Address::times_4, 0), Assembler::AVX_512bit);
__ evpmovdb(Address(dest, dp, Address::times_1, 0), xmm8, Assembler::AVX_512bit);
__ evpmovdb(Address(dest, dp, Address::times_1, 16), xmm10, Assembler::AVX_512bit);
@@ -4985,9 +4923,6 @@ address generate_cipherBlockChaining_decryptVectorAESCrypt() {
__ addq(source, 3);
__ jmp(L_process3);
__ BIND(L_exit);
// restore k1 register value
__ kmovql(k1, rbx);
__ pop(rbx);
__ pop(r15);
__ pop(r14);
__ pop(r13);

@@ -401,8 +401,6 @@ class VM_Version_StubGenerator: public StubCodeGenerator {
// load value into all 64 bytes of zmm7 register
__ movl(rcx, VM_Version::ymm_test_value());
__ movdl(xmm0, rcx);
__ movl(rcx, 0xffff);
__ kmovwl(k1, rcx);
__ vpbroadcastd(xmm0, xmm0, Assembler::AVX_512bit);
__ evmovdqul(xmm7, xmm0, Assembler::AVX_512bit);
#ifdef _LP64