8211251: Default mask register for avx512 instructions

Encode AVX-512 instructions as unmasked instructions when no mask register is specified.

Reviewed-by: kvn
Sandhya Viswanathan 2018-10-01 11:54:34 -07:00 committed by Vladimir Kozlov
parent 5985805474
commit b0ea3a49d2
6 changed files with 252 additions and 437 deletions

File diff suppressed because it is too large.
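Background for the change (not part of the commit): in EVEX encoding the write-mask travels in the three low "aaa" bits of the final prefix byte, where 000 means "no masking" and 001..111 select k1..k7. Defaulting the specifier to 0 therefore lets every instruction without an explicit mask be emitted unmasked, instead of silently depending on k1 holding all ones. A minimal self-contained C++ sketch of that byte (illustrative field packing only, not the HotSpot encoder):

    #include <cassert>
    #include <cstdint>

    // Model of the EVEX P2 payload byte: z | L'L | b | V' | aaa.
    // aaa is the embedded opmask specifier: 0 = unmasked, 1..7 = k1..k7.
    uint8_t evex_p2(bool zero_mask, int ll, bool bcast, bool vprime, int aaa) {
      return (uint8_t)((zero_mask << 7) | ((ll & 3) << 5) | (bcast << 4) |
                       (vprime << 3) | (aaa & 7));
    }

    int main() {
      // Unmasked 512-bit operation (the new default): aaa = 0, no k1 setup needed.
      assert((evex_p2(false, 2, false, true, 0) & 7) == 0);
      // Explicitly masked operation, e.g. {k3}: aaa = 3.
      assert((evex_p2(false, 2, false, true, 3) & 7) == 3);
      return 0;
    }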

@@ -871,11 +871,6 @@ private:
void clear_managed(void) { _is_managed = false; }
bool is_managed(void) { return _is_managed; }
// Following functions are for stub code use only
void set_vector_masking(void) { _vector_masking = true; }
void clear_vector_masking(void) { _vector_masking = false; }
bool is_vector_masking(void) { return _vector_masking; }
void lea(Register dst, Address src);
void mov(Register dst, Register src);
@@ -2210,7 +2205,7 @@ public:
int vector_len, // The length of vector to be applied in encoding - for both AVX and EVEX
bool rex_vex_w, // Width of data: if 32-bits or less, false, else if 64-bit or specially defined, true
bool legacy_mode, // Details if either this instruction is conditionally encoded to AVX or earlier if true else possibly EVEX
bool no_reg_mask, // when true, k0 is used when EVEX encoding is chosen, else k1 is used under the same condition
bool no_reg_mask, // when true, k0 is used when EVEX encoding is chosen, else embedded_opmask_register_specifier is used
bool uses_vl) // This instruction may have legacy constraints based on vector length for EVEX
:
_avx_vector_len(vector_len),
@@ -2225,7 +2220,7 @@ public:
_evex_encoding(0),
_is_clear_context(true),
_is_extended_context(false),
_embedded_opmask_register_specifier(1), // hard code k1, it will be initialized for now
_embedded_opmask_register_specifier(0), // hard code k0
_current_assembler(NULL) {
if (UseAVX < 3) _legacy_mode = true;
}
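With the default now 0, a masked emit presumably overrides the specifier only when a k register is actually passed in. A self-contained toy model of that behaviour (hypothetical names, not the assembler's real API):

    #include <cassert>

    struct AttrModel {
      int embedded_opmask = 0;               // new default: 0 = k0 = unmasked
      void set_embedded_opmask(int kreg) {   // 1..7 select k1..k7
        assert(kreg >= 1 && kreg <= 7);
        embedded_opmask = kreg;
      }
    };

    int main() {
      AttrModel plain;                       // e.g. an unmasked evmovdqul
      AttrModel masked;                      // e.g. a masked evmovdquw with k3
      masked.set_embedded_opmask(3);
      assert(plain.embedded_opmask == 0 && masked.embedded_opmask == 3);
      return 0;
    }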

@@ -3267,6 +3267,7 @@ void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) {
}
void MacroAssembler::setvectmask(Register dst, Register src) {
guarantee(PostLoopMultiversioning == true, "must be");
Assembler::movl(dst, 1);
Assembler::shlxl(dst, dst, src);
Assembler::decl(dst);
@@ -3275,6 +3276,7 @@ void MacroAssembler::setvectmask(Register dst, Register src) {
}
void MacroAssembler::restorevectmask() {
guarantee(PostLoopMultiversioning == true, "must be");
Assembler::knotwl(k1, k0);
}
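setvectmask builds a value with src low bits set through movl/shlxl/decl; restorevectmask then reloads k1 for the post-loop-multiversioning code. A quick sketch of the mask computation (assuming src < 32, since shlx masks the shift count):

    #include <cassert>
    #include <cstdint>

    uint32_t vect_mask(uint32_t n) {   // n = remaining iteration count
      uint32_t m = 1;                  // movl(dst, 1)
      m <<= n;                         // shlxl(dst, dst, src)
      return m - 1;                    // decl(dst)
    }

    int main() {
      assert(vect_mask(5) == 0x1f);    // five low bits set
      assert(vect_mask(0) == 0);       // empty mask
      return 0;
    }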
@@ -5026,7 +5028,7 @@ void MacroAssembler::restore_cpu_control_state_after_jni() {
// Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
vzeroupper();
// Reset k1 to 0xffff.
if (VM_Version::supports_evex()) {
if (PostLoopMultiversioning && VM_Version::supports_evex()) {
push(rcx);
movl(rcx, 0xffff);
kmovwl(k1, rcx);
@@ -6681,8 +6683,6 @@ void MacroAssembler::has_negatives(Register ary1, Register len,
VM_Version::supports_avx512vlbw() &&
VM_Version::supports_bmi2()) {
set_vector_masking(); // opening of the stub context for programming mask registers
Label test_64_loop, test_tail;
Register tmp3_aliased = len;
@@ -6711,15 +6711,12 @@ void MacroAssembler::has_negatives(Register ary1, Register len,
testl(tmp1, -1);
jcc(Assembler::zero, FALSE_LABEL);
// Save k1
kmovql(k3, k1);
// ~(~0 << len) applied up to two times (for 32-bit scenario)
#ifdef _LP64
mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
shlxq(tmp3_aliased, tmp3_aliased, tmp1);
notq(tmp3_aliased);
kmovql(k1, tmp3_aliased);
kmovql(k3, tmp3_aliased);
#else
Label k_init;
jmp(k_init);
@@ -6728,7 +6725,7 @@ void MacroAssembler::has_negatives(Register ary1, Register len,
// data required to compose 64 1's to the instruction stream
// We emit 64 byte wide series of elements from 0..63 which later on would
// be used as a compare targets with tail count contained in tmp1 register.
// Result would be a k1 register having tmp1 consecutive number or 1
// Result would be a k register having tmp1 consecutive number or 1
// counting from least significant bit.
address tmp = pc();
emit_int64(0x0706050403020100);
@@ -6744,18 +6741,14 @@ void MacroAssembler::has_negatives(Register ary1, Register len,
lea(len, InternalAddress(tmp));
// create mask to test for negative byte inside a vector
evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
evpcmpgtb(k1, vec1, Address(len, 0), Assembler::AVX_512bit);
evpcmpgtb(k3, vec1, Address(len, 0), Assembler::AVX_512bit);
#endif
evpcmpgtb(k2, k1, vec2, Address(ary1, 0), Assembler::AVX_512bit);
ktestq(k2, k1);
// Restore k1
kmovql(k1, k3);
evpcmpgtb(k2, k3, vec2, Address(ary1, 0), Assembler::AVX_512bit);
ktestq(k2, k3);
jcc(Assembler::notZero, TRUE_LABEL);
jmp(FALSE_LABEL);
clear_vector_masking(); // closing of the stub context for programming mask registers
} else {
movl(result, len); // copy
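Both tail-mask constructions above produce the same thing: a word whose low len bits are set, which is then loaded into k3 so the masked evpcmpgtb only examines the remaining bytes. A self-contained sketch of the two variants (the 64-bit shlx path and the 32-bit compare-against-0..63 path), assuming 0 < len < 64:

    #include <cassert>
    #include <cstdint>

    // 64-bit path: ~(~0 << len), matching the mov64/shlxq/notq sequence.
    uint64_t tail_mask_shift(unsigned len) {
      return ~(~0ULL << len);
    }

    // 32-bit path: broadcast len and compare-greater against the bytes 0..63,
    // so bit i is set exactly when i < len (evpbroadcastb + evpcmpgtb).
    uint64_t tail_mask_compare(unsigned len) {
      uint64_t m = 0;
      for (unsigned i = 0; i < 64; i++) {
        if (i < len) m |= (uint64_t{1} << i);
      }
      return m;
    }

    int main() {
      for (unsigned len = 1; len < 64; len++) {
        assert(tail_mask_shift(len) == tail_mask_compare(len));
      }
      return 0;
    }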
@@ -7197,10 +7190,6 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
{
assert( UseSSE >= 2, "supported cpu only" );
Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
if (UseAVX > 2) {
movl(rtmp, 0xffff);
kmovwl(k1, rtmp);
}
movdl(xtmp, value);
if (UseAVX > 2 && UseUnalignedLoadStores) {
// Fill 64-byte chunks
@@ -7945,7 +7934,6 @@ void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register
VM_Version::supports_avx512vlbw()) {
Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
set_vector_masking(); // opening of the stub context for programming mask registers
cmpq(length, 64);
jcc(Assembler::less, VECTOR32_TAIL);
movq(tmp1, length);
@@ -7968,19 +7956,15 @@ void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register
//bind(VECTOR64_TAIL);
// AVX512 code to compare up to 63 byte vectors.
// Save k1
kmovql(k3, k1);
mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
shlxq(tmp2, tmp2, tmp1);
notq(tmp2);
kmovql(k1, tmp2);
kmovql(k3, tmp2);
evmovdqub(rymm0, k1, Address(obja, result), Assembler::AVX_512bit);
evpcmpeqb(k7, k1, rymm0, Address(objb, result), Assembler::AVX_512bit);
evmovdqub(rymm0, k3, Address(obja, result), Assembler::AVX_512bit);
evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit);
ktestql(k7, k1);
// Restore k1
kmovql(k1, k3);
ktestql(k7, k3);
jcc(Assembler::below, SAME_TILL_END); // not mismatch
bind(VECTOR64_NOT_EQUAL);
@@ -7991,7 +7975,6 @@ void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register
shrq(result);
jmp(DONE);
bind(VECTOR32_TAIL);
clear_vector_masking(); // closing of the stub context for programming mask registers
}
cmpq(length, 8);
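A scalar restatement (sketch only) of the masked 64-byte tail compare above: with the tail mask in k3, the evmovdqub/evpcmpeqb/ktestql sequence amounts to comparing just the remaining (< 64) bytes of the two buffers and ignoring the rest:

    #include <cstddef>
    #include <cstdint>

    bool tail_equal(const uint8_t* a, const uint8_t* b, size_t len) {
      for (size_t i = 0; i < len; i++) {
        if (a[i] != b[i]) return false;   // a differing byte: VECTOR64_NOT_EQUAL path
      }
      return true;                        // no difference under the mask: SAME_TILL_END
    }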
@@ -8752,11 +8735,6 @@ void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Regi
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
movl(tmp, 0xffff);
kmovwl(k1, tmp);
}
lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
notl(crc); // ~crc
cmpl(len, 16);
@@ -9418,9 +9396,7 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
VM_Version::supports_avx512vlbw() &&
VM_Version::supports_bmi2()) {
set_vector_masking(); // opening of the stub context for programming mask registers
Label copy_32_loop, copy_loop_tail, restore_k1_return_zero, below_threshold;
Label copy_32_loop, copy_loop_tail, below_threshold;
// alignment
Label post_alignment;
@@ -9434,9 +9410,6 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
movl(result, 0x00FF);
evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);
// Save k1
kmovql(k3, k1);
testl(len, -64);
jcc(Assembler::zero, post_alignment);
@@ -9453,14 +9426,14 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
movl(result, 0xFFFFFFFF);
shlxl(result, result, tmp5);
notl(result);
kmovdl(k1, result);
kmovdl(k3, result);
evmovdquw(tmp1Reg, k1, Address(src, 0), Assembler::AVX_512bit);
evpcmpuw(k2, k1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
ktestd(k2, k1);
jcc(Assembler::carryClear, restore_k1_return_zero);
evmovdquw(tmp1Reg, k3, Address(src, 0), Assembler::AVX_512bit);
evpcmpuw(k2, k3, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
ktestd(k2, k3);
jcc(Assembler::carryClear, return_zero);
evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
evpmovwb(Address(dst, 0), k3, tmp1Reg, Assembler::AVX_512bit);
addptr(src, tmp5);
addptr(src, tmp5);
@@ -9483,7 +9456,7 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
evpcmpuw(k2, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
kortestdl(k2, k2);
jcc(Assembler::carryClear, restore_k1_return_zero);
jcc(Assembler::carryClear, return_zero);
// All elements in current processed chunk are valid candidates for
// compression. Write a truncated byte elements to the memory.
@@ -9494,8 +9467,6 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
bind(copy_loop_tail);
// bail out when there is nothing to be done
testl(tmp5, 0xFFFFFFFF);
// Restore k1
kmovql(k1, k3);
jcc(Assembler::zero, return_length);
movl(len, tmp5);
@@ -9505,25 +9476,16 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
shlxl(result, result, len);
notl(result);
kmovdl(k1, result);
kmovdl(k3, result);
evmovdquw(tmp1Reg, k1, Address(src, 0), Assembler::AVX_512bit);
evpcmpuw(k2, k1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
ktestd(k2, k1);
jcc(Assembler::carryClear, restore_k1_return_zero);
evmovdquw(tmp1Reg, k3, Address(src, 0), Assembler::AVX_512bit);
evpcmpuw(k2, k3, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
ktestd(k2, k3);
jcc(Assembler::carryClear, return_zero);
evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
// Restore k1
kmovql(k1, k3);
evpmovwb(Address(dst, 0), k3, tmp1Reg, Assembler::AVX_512bit);
jmp(return_length);
bind(restore_k1_return_zero);
// Restore k1
kmovql(k1, k3);
jmp(return_zero);
clear_vector_masking(); // closing of the stub context for programming mask registers
bind(below_threshold);
}
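Element for element, the compression loop above performs the following scalar check: every char must fit in a byte (<= 0xFF, the value broadcast into tmp2Reg) or the stub bails out through return_zero; otherwise the truncated bytes are stored and the original length is returned. A sketch of that contract, not the stub itself:

    #include <cstddef>
    #include <cstdint>

    size_t compress_chars(const uint16_t* src, uint8_t* dst, size_t len) {
      for (size_t i = 0; i < len; i++) {
        if (src[i] > 0xFF) return 0;   // mirrors evpcmpuw/ktestd + jcc(return_zero)
        dst[i] = (uint8_t) src[i];     // mirrors the truncating evpmovwb store
      }
      return len;                      // mirrors the return_length path
    }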
@@ -9637,8 +9599,6 @@ void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len
VM_Version::supports_avx512vlbw() &&
VM_Version::supports_bmi2()) {
set_vector_masking(); // opening of the stub context for programming mask registers
Label copy_32_loop, copy_tail;
Register tmp3_aliased = len;
@@ -9670,22 +9630,15 @@ void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len
testl(tmp2, -1); // we don't destroy the contents of tmp2 here
jcc(Assembler::zero, done);
// Save k1
kmovql(k2, k1);
// ~(~0 << length), where length is the # of remaining elements to process
movl(tmp3_aliased, -1);
shlxl(tmp3_aliased, tmp3_aliased, tmp2);
notl(tmp3_aliased);
kmovdl(k1, tmp3_aliased);
evpmovzxbw(tmp1, k1, Address(src, 0), Assembler::AVX_512bit);
evmovdquw(Address(dst, 0), k1, tmp1, Assembler::AVX_512bit);
kmovdl(k2, tmp3_aliased);
evpmovzxbw(tmp1, k2, Address(src, 0), Assembler::AVX_512bit);
evmovdquw(Address(dst, 0), k2, tmp1, Assembler::AVX_512bit);
// Restore k1
kmovql(k1, k2);
jmp(done);
clear_vector_masking(); // closing of the stub context for programming mask registers
}
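The masked tail above is the inverse operation of the compress stub: the last few bytes are zero-extended to chars under mask k2 (evpmovzxbw) and stored with evmovdquw. A scalar sketch of that tail, assuming len < 32:

    #include <cstddef>
    #include <cstdint>

    void inflate_tail(const uint8_t* src, uint16_t* dst, size_t len) {
      for (size_t i = 0; i < len; i++) {
        dst[i] = src[i];               // zero-extend byte -> char
      }
    }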
if (UseSSE42Intrinsics) {
Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;

@@ -153,12 +153,6 @@ class StubGenerator: public StubCodeGenerator {
__ movptr(saved_rsi, rsi);
__ movptr(saved_rbx, rbx);
// provide initial value for required masks
if (UseAVX > 2) {
__ movl(rbx, 0xffff);
__ kmovwl(k1, rbx);
}
// save and initialize %mxcsr
if (sse_save) {
Label skip_ldmx;
@@ -679,12 +673,7 @@ class StubGenerator: public StubCodeGenerator {
void xmm_copy_forward(Register from, Register to_from, Register qword_count) {
assert( UseSSE >= 2, "supported cpu only" );
Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
if (UseAVX > 2) {
__ push(rbx);
__ movl(rbx, 0xffff);
__ kmovwl(k1, rbx);
__ pop(rbx);
}
// Copy 64-byte chunks
__ jmpb(L_copy_64_bytes);
__ align(OptoLoopAlignment);
@@ -2115,14 +2104,6 @@ class StubGenerator: public StubCodeGenerator {
__ enter(); // required for proper stackwalking of RuntimeStub frame
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rdx, 0xffff);
__ kmovdl(k1, rdx);
}
__ movptr(from, from_param);
__ movptr(key, key_param);
@@ -2222,14 +2203,6 @@ class StubGenerator: public StubCodeGenerator {
__ enter(); // required for proper stackwalking of RuntimeStub frame
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rdx, 0xffff);
__ kmovdl(k1, rdx);
}
__ movptr(from, from_param);
__ movptr(key, key_param);
@@ -2356,14 +2329,6 @@ class StubGenerator: public StubCodeGenerator {
__ enter(); // required for proper stackwalking of RuntimeStub frame
handleSOERegisters(true /*saving*/);
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rdx, 0xffff);
__ kmovdl(k1, rdx);
}
// load registers from incoming parameters
const Address from_param(rbp, 8+0);
const Address to_param (rbp, 8+4);
@@ -2532,14 +2497,6 @@ class StubGenerator: public StubCodeGenerator {
__ enter(); // required for proper stackwalking of RuntimeStub frame
handleSOERegisters(true /*saving*/);
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rdx, 0xffff);
__ kmovdl(k1, rdx);
}
// load registers from incoming parameters
const Address from_param(rbp, 8+0);
const Address to_param (rbp, 8+4);
@@ -2693,14 +2650,6 @@ class StubGenerator: public StubCodeGenerator {
__ enter(); // required for proper stackwalking of RuntimeStub frame
handleSOERegisters(true /*saving*/); // save rbx, rsi, rdi
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rdx, 0xffff);
__ kmovdl(k1, rdx);
}
// load registers from incoming parameters
const Address from_param(rbp, 8+0);
const Address to_param (rbp, 8+4);
@@ -3154,14 +3103,6 @@ class StubGenerator: public StubCodeGenerator {
__ enter();
handleSOERegisters(true); // Save registers
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rdx, 0xffff);
__ kmovdl(k1, rdx);
}
__ movptr(state, state_param);
__ movptr(subkeyH, subkeyH_param);
__ movptr(data, data_param);

@@ -254,10 +254,7 @@ class StubGenerator: public StubCodeGenerator {
__ movptr(r13_save, r13);
__ movptr(r14_save, r14);
__ movptr(r15_save, r15);
if (UseAVX > 2) {
__ movl(rbx, 0xffff);
__ kmovwl(k1, rbx);
}
#ifdef _WIN64
int last_reg = 15;
if (UseAVX > 2) {
@@ -1257,10 +1254,6 @@ class StubGenerator: public StubCodeGenerator {
__ align(OptoLoopAlignment);
if (UseUnalignedLoadStores) {
Label L_end;
if (UseAVX > 2) {
__ movl(to, 0xffff);
__ kmovwl(k1, to);
}
// Copy 64-bytes per iteration
__ BIND(L_loop);
if (UseAVX > 2) {
@@ -1341,10 +1334,6 @@ class StubGenerator: public StubCodeGenerator {
__ align(OptoLoopAlignment);
if (UseUnalignedLoadStores) {
Label L_end;
if (UseAVX > 2) {
__ movl(to, 0xffff);
__ kmovwl(k1, to);
}
// Copy 64-bytes per iteration
__ BIND(L_loop);
if (UseAVX > 2) {
@@ -3005,14 +2994,6 @@ class StubGenerator: public StubCodeGenerator {
__ enter(); // required for proper stackwalking of RuntimeStub frame
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rax, 0xffff);
__ kmovql(k1, rax);
}
// keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
__ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
@@ -3107,14 +3088,6 @@ class StubGenerator: public StubCodeGenerator {
__ enter(); // required for proper stackwalking of RuntimeStub frame
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rax, 0xffff);
__ kmovql(k1, rax);
}
// keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
__ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
@@ -3227,14 +3200,6 @@ class StubGenerator: public StubCodeGenerator {
__ enter(); // required for proper stackwalking of RuntimeStub frame
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rax, 0xffff);
__ kmovql(k1, rax);
}
#ifdef _WIN64
// on win64, fill len_reg from stack position
__ movl(len_reg, len_mem);
@@ -3428,14 +3393,6 @@ class StubGenerator: public StubCodeGenerator {
__ enter(); // required for proper stackwalking of RuntimeStub frame
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rax, 0xffff);
__ kmovql(k1, rax);
}
#ifdef _WIN64
// on win64, fill len_reg from stack position
__ movl(len_reg, len_mem);
@@ -3902,14 +3859,6 @@ class StubGenerator: public StubCodeGenerator {
__ enter(); // required for proper stackwalking of RuntimeStub frame
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rax, 0xffff);
__ kmovql(k1, rax);
}
#ifdef _WIN64
// allocate spill slots for r13, r14
enum {
@@ -4484,14 +4433,6 @@ address generate_cipherBlockChaining_decryptVectorAESCrypt() {
__ enter();
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
__ movl(rax, 0xffff);
__ kmovql(k1, rax);
}
__ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
__ movdqu(xmm_temp0, Address(state, 0));
@@ -4761,7 +4702,6 @@ address generate_cipherBlockChaining_decryptVectorAESCrypt() {
__ push(r13);
__ push(r14);
__ push(r15);
__ push(rbx);
// arguments
const Register source = c_rarg0; // Source Array
@@ -4790,8 +4730,6 @@ address generate_cipherBlockChaining_decryptVectorAESCrypt() {
__ cmpl(length, 0);
__ jcc(Assembler::lessEqual, L_exit);
// Save k1 value in rbx
__ kmovql(rbx, k1);
__ lea(r11, ExternalAddress(StubRoutines::x86::base64_charset_addr()));
// check if base64 charset(isURL=0) or base64 url charset(isURL=1) needs to be loaded
__ cmpl(isURL, 0);
@@ -4802,7 +4740,7 @@ address generate_cipherBlockChaining_decryptVectorAESCrypt() {
__ BIND(L_processdata);
__ movdqu(xmm16, ExternalAddress(StubRoutines::x86::base64_gather_mask_addr()));
// Set 64 bits of K register.
__ evpcmpeqb(k1, xmm16, xmm16, Assembler::AVX_512bit);
__ evpcmpeqb(k3, xmm16, xmm16, Assembler::AVX_512bit);
__ evmovdquq(xmm12, ExternalAddress(StubRoutines::x86::base64_bswap_mask_addr()), Assembler::AVX_256bit, r13);
__ evmovdquq(xmm13, ExternalAddress(StubRoutines::x86::base64_right_shift_mask_addr()), Assembler::AVX_512bit, r13);
__ evmovdquq(xmm14, ExternalAddress(StubRoutines::x86::base64_left_shift_mask_addr()), Assembler::AVX_512bit, r13);
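Two details are worth spelling out here: evpcmpeqb comparing xmm16 with itself is true in every byte lane, so it is simply a way to set all 64 bits of a mask register (now k3 rather than k1), and evpgatherdd consumes its mask, clearing it as elements complete, which is why the all-ones value is copied from k3 back into k2 before every gather below. A small model of that consumption, with hypothetical names:

    #include <cassert>
    #include <cstdint>

    struct MaskReg { uint64_t bits; };

    // Model only: a gather uses the working mask and leaves it cleared, so the
    // saved all-ones mask is copied back in before the next gather.
    void gather_step(MaskReg& work, const MaskReg& saved) {
      work = saved;        // kmovql(k2, k3)
      // evpgatherdd(..., k2, ...) gathers the lanes whose mask bit is set ...
      work.bits = 0;       // ... and zeroes the mask on completion
    }

    int main() {
      MaskReg k3{~0ULL}, k2{0};
      gather_step(k2, k3);
      assert(k2.bits == 0 && k3.bits == ~0ULL);
      return 0;
    }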
@@ -4881,17 +4819,17 @@ address generate_cipherBlockChaining_decryptVectorAESCrypt() {
__ vextracti64x4(xmm4, xmm5, 1);
__ vpmovzxwd(xmm7, xmm4, Assembler::AVX_512bit);
__ kmovql(k2, k1);
__ kmovql(k2, k3);
__ evpgatherdd(xmm4, k2, Address(r11, xmm0, Address::times_4, 0), Assembler::AVX_512bit);
__ kmovql(k2, k1);
__ kmovql(k2, k3);
__ evpgatherdd(xmm5, k2, Address(r11, xmm1, Address::times_4, 0), Assembler::AVX_512bit);
__ kmovql(k2, k1);
__ kmovql(k2, k3);
__ evpgatherdd(xmm8, k2, Address(r11, xmm2, Address::times_4, 0), Assembler::AVX_512bit);
__ kmovql(k2, k1);
__ kmovql(k2, k3);
__ evpgatherdd(xmm9, k2, Address(r11, xmm3, Address::times_4, 0), Assembler::AVX_512bit);
__ kmovql(k2, k1);
__ kmovql(k2, k3);
__ evpgatherdd(xmm10, k2, Address(r11, xmm6, Address::times_4, 0), Assembler::AVX_512bit);
__ kmovql(k2, k1);
__ kmovql(k2, k3);
__ evpgatherdd(xmm11, k2, Address(r11, xmm7, Address::times_4, 0), Assembler::AVX_512bit);
//Down convert dword to byte. Final output is 16*6 = 96 bytes long
@@ -4927,9 +4865,9 @@ address generate_cipherBlockChaining_decryptVectorAESCrypt() {
__ vpmovzxwd(xmm6, xmm9, Assembler::AVX_512bit);
__ vextracti64x4(xmm9, xmm1, 1);
__ vpmovzxwd(xmm5, xmm9, Assembler::AVX_512bit);
__ kmovql(k2, k1);
__ kmovql(k2, k3);
__ evpgatherdd(xmm8, k2, Address(r11, xmm6, Address::times_4, 0), Assembler::AVX_512bit);
__ kmovql(k2, k1);
__ kmovql(k2, k3);
__ evpgatherdd(xmm10, k2, Address(r11, xmm5, Address::times_4, 0), Assembler::AVX_512bit);
__ evpmovdb(Address(dest, dp, Address::times_1, 0), xmm8, Assembler::AVX_512bit);
__ evpmovdb(Address(dest, dp, Address::times_1, 16), xmm10, Assembler::AVX_512bit);
@@ -4985,9 +4923,6 @@ address generate_cipherBlockChaining_decryptVectorAESCrypt() {
__ addq(source, 3);
__ jmp(L_process3);
__ BIND(L_exit);
// restore k1 register value
__ kmovql(k1, rbx);
__ pop(rbx);
__ pop(r15);
__ pop(r14);
__ pop(r13);

@@ -401,8 +401,6 @@ class VM_Version_StubGenerator: public StubCodeGenerator {
// load value into all 64 bytes of zmm7 register
__ movl(rcx, VM_Version::ymm_test_value());
__ movdl(xmm0, rcx);
__ movl(rcx, 0xffff);
__ kmovwl(k1, rcx);
__ vpbroadcastd(xmm0, xmm0, Assembler::AVX_512bit);
__ evmovdqul(xmm7, xmm0, Assembler::AVX_512bit);
#ifdef _LP64