jdk-24/src/hotspot/cpu/x86/macroAssembler_x86_aes.cpp
Smita Kamath a6649eb089 8233741: AES Countermode (AES-CTR) optimization using AVX512 + VAES instructions
Co-authored-by: Regev Shemy <regev.shemy@intel.com>
Co-authored-by: Shay Gueron <shay.gueron@intel.com>
Reviewed-by: kvn
2019-11-07 17:47:22 -08:00

1271 lines
55 KiB
C++

/*
* Copyright (c) 2019, Intel Corporation.
*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "runtime/stubRoutines.hpp"
#include "macroAssembler_x86.hpp"
#ifdef _LP64
// Emits one 512-bit vaesenc with round key `key` for each of the registers
// numbered 0 through rnum (inclusive); every register is updated in place.
void MacroAssembler::roundEnc(XMMRegister key, int rnum) {
  int reg_idx = 0;
  while (reg_idx <= rnum) {
    const XMMRegister block = as_XMMRegister(reg_idx);
    vaesenc(block, block, key, Assembler::AVX_512bit);
    reg_idx++;
  }
}
// Emits one 512-bit vaesenclast with round key `key` for each of the registers
// numbered 0 through rnum (inclusive); every register is updated in place.
void MacroAssembler::lastroundEnc(XMMRegister key, int rnum) {
  int reg_idx = 0;
  while (reg_idx <= rnum) {
    const XMMRegister block = as_XMMRegister(reg_idx);
    vaesenclast(block, block, key, Assembler::AVX_512bit);
    reg_idx++;
  }
}
// Emits one 512-bit vaesdec with round key `key` for each of the registers
// numbered 0 through rnum (inclusive); every register is updated in place.
void MacroAssembler::roundDec(XMMRegister key, int rnum) {
  int reg_idx = 0;
  while (reg_idx <= rnum) {
    const XMMRegister block = as_XMMRegister(reg_idx);
    vaesdec(block, block, key, Assembler::AVX_512bit);
    reg_idx++;
  }
}
// Emits one 512-bit vaesdeclast with round key `key` for each of the registers
// numbered 0 through rnum (inclusive); every register is updated in place.
void MacroAssembler::lastroundDec(XMMRegister key, int rnum) {
  int reg_idx = 0;
  while (reg_idx <= rnum) {
    const XMMRegister block = as_XMMRegister(reg_idx);
    vaesdeclast(block, block, key, Assembler::AVX_512bit);
    reg_idx++;
  }
}
// Round-key load helper.
// Loads the 16 bytes at [key + offset] into xmmdst and permutes them with
// pshufb. The permutation mask is xmm_shuf_mask when the caller passes one;
// otherwise it is read from StubRoutines::x86::key_shuffle_mask_addr().
// Finally evshufi64x2 with selector 0 copies the low 128-bit lane of xmmdst
// into all four lanes of the 512-bit register.
void MacroAssembler::ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
  movdqu(xmmdst, Address(key, offset));

  if (xmm_shuf_mask == NULL) {
    // No mask register supplied: shuffle straight from the mask in memory.
    pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
  } else {
    pshufb(xmmdst, xmm_shuf_mask);
  }

  evshufi64x2(xmmdst, xmmdst, xmmdst, 0x0, Assembler::AVX_512bit);
}
// AES-ECB Encrypt Operation
//
// Emits the body of an AES-ECB encryption routine.
//   src_addr  - base address the input is read from
//   dest_addr - base address the output is written to
//   key       - base address of the round keys (16 bytes each); the 32-bit
//               value read at the array-length offset relative to it selects
//               the 10/12/14-round variant (44/52/60)
//   len       - input length in bytes; this register is modified by the
//               emitted code
// Outline of the emitted code:
//   1. r13, r12, len and rbx are pushed.
//   2. 11, 13 or 15 round keys are loaded through ev_load_key.
//   3. LOOP encrypts 512 bytes per iteration in registers 0-7 (512-bit wide).
//   4. LOOP2 encrypts the remaining blocks 16 bytes at a time in xmm1.
//   5. The round-key registers are zeroed, rbx/r12/r13 are restored and the
//      saved len is popped into rax.
// No ret is emitted by this function.
void MacroAssembler::aesecb_encrypt(Register src_addr, Register dest_addr, Register key, Register len) {
// pos is the running byte offset applied to both src_addr and dest_addr
const Register pos = rax;
// rounds holds the 32-bit value read from the key array's length field
const Register rounds = r12;
Label NO_PARTS, LOOP, Loop_start, LOOP2, AES192, END_LOOP, AES256, REMAINDER, LAST2, END, KEY_192, KEY_256, EXIT;
// r13 and r12 are used as scratch below and popped again at EXIT
push(r13);
push(r12);
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
movl(rax, 0xffff);
kmovql(k1, rax);
}
push(len); // Save the incoming length; it is popped into rax at EXIT
push(rbx);
vzeroupper();
// Start at byte offset 0 (pos aliases rax, which was used as scratch above)
xorptr(pos, pos);
// Calculate number of rounds based on key length(128, 192, 256):44 for 10-rounds, 52 for 12-rounds, 60 for 14-rounds
movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
// Load Key shuf mask
const XMMRegister xmm_key_shuf_mask = xmm31; // used temporarily to swap key bytes up front
movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
// Load and shuffle key based on number of rounds
// Round keys 0..10 are always loaded. Register assignment:
//   key 0 -> xmm8, 1 -> xmm9, 2 -> xmm10, 3 -> xmm23, 4..7 -> xmm12..xmm15,
//   8 -> xmm16, 9 -> xmm17, 10 -> xmm24
ev_load_key(xmm8, key, 0 * 16, xmm_key_shuf_mask);
ev_load_key(xmm9, key, 1 * 16, xmm_key_shuf_mask);
ev_load_key(xmm10, key, 2 * 16, xmm_key_shuf_mask);
ev_load_key(xmm23, key, 3 * 16, xmm_key_shuf_mask);
ev_load_key(xmm12, key, 4 * 16, xmm_key_shuf_mask);
ev_load_key(xmm13, key, 5 * 16, xmm_key_shuf_mask);
ev_load_key(xmm14, key, 6 * 16, xmm_key_shuf_mask);
ev_load_key(xmm15, key, 7 * 16, xmm_key_shuf_mask);
ev_load_key(xmm16, key, 8 * 16, xmm_key_shuf_mask);
ev_load_key(xmm17, key, 9 * 16, xmm_key_shuf_mask);
ev_load_key(xmm24, key, 10 * 16, xmm_key_shuf_mask);
// rounds >= 52: round keys 11 and 12 are needed as well
cmpl(rounds, 52);
jcc(Assembler::greaterEqual, KEY_192);
jmp(Loop_start);
bind(KEY_192);
ev_load_key(xmm19, key, 11 * 16, xmm_key_shuf_mask);
ev_load_key(xmm20, key, 12 * 16, xmm_key_shuf_mask);
// rounds == 60: round keys 13 and 14 are needed as well
cmpl(rounds, 60);
jcc(Assembler::equal, KEY_256);
jmp(Loop_start);
bind(KEY_256);
ev_load_key(xmm21, key, 13 * 16, xmm_key_shuf_mask);
ev_load_key(xmm22, key, 14 * 16, xmm_key_shuf_mask);
bind(Loop_start);
movq(rbx, len);
// Divide length by 16 to convert it to number of blocks
shrq(len, 4);
// Shifting left by 60 keeps only the low four bits of the byte length, so the
// zero flag is set exactly when the length is a multiple of 16. Otherwise the
// block count is rounded up by one.
// NOTE(review): in the rounded-up case LOOP2 still moves a full 16 bytes for
// the last block - confirm callers only pass multiples of 16.
shlq(rbx, 60);
jcc(Assembler::equal, NO_PARTS);
addq(len, 1);
// Check if number of blocks is greater than or equal to 32
// If true, 512 bytes are processed at a time (code marked by label LOOP)
// If not, 16 bytes are processed (code marked by REMAINDER label)
bind(NO_PARTS);
// rbx = total number of blocks, len = number of 32-block (512-byte) groups
movq(rbx, len);
shrq(len, 5);
jcc(Assembler::equal, REMAINDER);
movl(r13, len);
// Compute number of blocks that will be processed 512 bytes at a time
// Subtract this from the total number of blocks which will then be processed by REMAINDER loop
shlq(r13, 5);
subq(rbx, r13);
//Begin processing 512 bytes
bind(LOOP);
// Move 64 bytes of PT data into a zmm register, as a result 512 bytes of PT loaded in zmm0-7
evmovdquq(xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
evmovdquq(xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
evmovdquq(xmm2, Address(src_addr, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
evmovdquq(xmm3, Address(src_addr, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
evmovdquq(xmm4, Address(src_addr, pos, Address::times_1, 4 * 64), Assembler::AVX_512bit);
evmovdquq(xmm5, Address(src_addr, pos, Address::times_1, 5 * 64), Assembler::AVX_512bit);
evmovdquq(xmm6, Address(src_addr, pos, Address::times_1, 6 * 64), Assembler::AVX_512bit);
evmovdquq(xmm7, Address(src_addr, pos, Address::times_1, 7 * 64), Assembler::AVX_512bit);
// Xor with the first round key
evpxorq(xmm0, xmm0, xmm8, Assembler::AVX_512bit);
evpxorq(xmm1, xmm1, xmm8, Assembler::AVX_512bit);
evpxorq(xmm2, xmm2, xmm8, Assembler::AVX_512bit);
evpxorq(xmm3, xmm3, xmm8, Assembler::AVX_512bit);
evpxorq(xmm4, xmm4, xmm8, Assembler::AVX_512bit);
evpxorq(xmm5, xmm5, xmm8, Assembler::AVX_512bit);
evpxorq(xmm6, xmm6, xmm8, Assembler::AVX_512bit);
evpxorq(xmm7, xmm7, xmm8, Assembler::AVX_512bit);
// 9 Aes encode round operations
// Each call applies one round key (keys 1..9) to registers 0..7
roundEnc(xmm9, 7);
roundEnc(xmm10, 7);
roundEnc(xmm23, 7);
roundEnc(xmm12, 7);
roundEnc(xmm13, 7);
roundEnc(xmm14, 7);
roundEnc(xmm15, 7);
roundEnc(xmm16, 7);
roundEnc(xmm17, 7);
cmpl(rounds, 52);
jcc(Assembler::aboveEqual, AES192);
// Aesenclast round operation for keysize = 128
lastroundEnc(xmm24, 7);
jmp(END_LOOP);
//Additional 2 rounds of Aesenc operation for keysize = 192
bind(AES192);
roundEnc(xmm24, 7);
roundEnc(xmm19, 7);
cmpl(rounds, 60);
jcc(Assembler::aboveEqual, AES256);
// Aesenclast round for keysize = 192
lastroundEnc(xmm20, 7);
jmp(END_LOOP);
// 2 rounds of Aesenc operation and Aesenclast for keysize = 256
bind(AES256);
roundEnc(xmm20, 7);
roundEnc(xmm21, 7);
lastroundEnc(xmm22, 7);
bind(END_LOOP);
// Move 512 bytes of CT to destination
evmovdquq(Address(dest_addr, pos, Address::times_1, 0 * 64), xmm0, Assembler::AVX_512bit);
evmovdquq(Address(dest_addr, pos, Address::times_1, 1 * 64), xmm1, Assembler::AVX_512bit);
evmovdquq(Address(dest_addr, pos, Address::times_1, 2 * 64), xmm2, Assembler::AVX_512bit);
evmovdquq(Address(dest_addr, pos, Address::times_1, 3 * 64), xmm3, Assembler::AVX_512bit);
evmovdquq(Address(dest_addr, pos, Address::times_1, 4 * 64), xmm4, Assembler::AVX_512bit);
evmovdquq(Address(dest_addr, pos, Address::times_1, 5 * 64), xmm5, Assembler::AVX_512bit);
evmovdquq(Address(dest_addr, pos, Address::times_1, 6 * 64), xmm6, Assembler::AVX_512bit);
evmovdquq(Address(dest_addr, pos, Address::times_1, 7 * 64), xmm7, Assembler::AVX_512bit);
// Advance by 512 bytes; len counts the remaining 512-byte groups
addq(pos, 512);
decq(len);
jcc(Assembler::notEqual, LOOP);
bind(REMAINDER);
vzeroupper();
// rbx holds the number of blocks left for the 16-byte loop
cmpq(rbx, 0);
jcc(Assembler::equal, END);
// Process 16 bytes at a time
bind(LOOP2);
movdqu(xmm1, Address(src_addr, pos, Address::times_1, 0));
vpxor(xmm1, xmm1, xmm8, Assembler::AVX_128bit);
// xmm2 contains shuffled key for Aesenclast operation.
// It starts as round key 10 and is replaced below by key 12 or key 14 when
// the longer key paths are taken.
vmovdqu(xmm2, xmm24);
vaesenc(xmm1, xmm1, xmm9, Assembler::AVX_128bit);
vaesenc(xmm1, xmm1, xmm10, Assembler::AVX_128bit);
vaesenc(xmm1, xmm1, xmm23, Assembler::AVX_128bit);
vaesenc(xmm1, xmm1, xmm12, Assembler::AVX_128bit);
vaesenc(xmm1, xmm1, xmm13, Assembler::AVX_128bit);
vaesenc(xmm1, xmm1, xmm14, Assembler::AVX_128bit);
vaesenc(xmm1, xmm1, xmm15, Assembler::AVX_128bit);
vaesenc(xmm1, xmm1, xmm16, Assembler::AVX_128bit);
vaesenc(xmm1, xmm1, xmm17, Assembler::AVX_128bit);
cmpl(rounds, 52);
jcc(Assembler::below, LAST2);
vmovdqu(xmm2, xmm20);
vaesenc(xmm1, xmm1, xmm24, Assembler::AVX_128bit);
vaesenc(xmm1, xmm1, xmm19, Assembler::AVX_128bit);
cmpl(rounds, 60);
jcc(Assembler::below, LAST2);
vmovdqu(xmm2, xmm22);
vaesenc(xmm1, xmm1, xmm20, Assembler::AVX_128bit);
vaesenc(xmm1, xmm1, xmm21, Assembler::AVX_128bit);
bind(LAST2);
// Aesenclast round
vaesenclast(xmm1, xmm1, xmm2, Assembler::AVX_128bit);
// Write 16 bytes of CT to destination
movdqu(Address(dest_addr, pos, Address::times_1, 0), xmm1);
addq(pos, 16);
decq(rbx);
jcc(Assembler::notEqual, LOOP2);
bind(END);
// Zero out the round keys
// NOTE(review): xmm2 receives a copy of the final round key in LOOP2 and is
// not cleared here - confirm whether that is intended.
evpxorq(xmm8, xmm8, xmm8, Assembler::AVX_512bit);
evpxorq(xmm9, xmm9, xmm9, Assembler::AVX_512bit);
evpxorq(xmm10, xmm10, xmm10, Assembler::AVX_512bit);
evpxorq(xmm23, xmm23, xmm23, Assembler::AVX_512bit);
evpxorq(xmm12, xmm12, xmm12, Assembler::AVX_512bit);
evpxorq(xmm13, xmm13, xmm13, Assembler::AVX_512bit);
evpxorq(xmm14, xmm14, xmm14, Assembler::AVX_512bit);
evpxorq(xmm15, xmm15, xmm15, Assembler::AVX_512bit);
evpxorq(xmm16, xmm16, xmm16, Assembler::AVX_512bit);
evpxorq(xmm17, xmm17, xmm17, Assembler::AVX_512bit);
evpxorq(xmm24, xmm24, xmm24, Assembler::AVX_512bit);
// Keys 11/12 were only loaded when rounds > 44, keys 13/14 when rounds > 52
cmpl(rounds, 44);
jcc(Assembler::belowEqual, EXIT);
evpxorq(xmm19, xmm19, xmm19, Assembler::AVX_512bit);
evpxorq(xmm20, xmm20, xmm20, Assembler::AVX_512bit);
cmpl(rounds, 52);
jcc(Assembler::belowEqual, EXIT);
evpxorq(xmm21, xmm21, xmm21, Assembler::AVX_512bit);
evpxorq(xmm22, xmm22, xmm22, Assembler::AVX_512bit);
bind(EXIT);
// Pops mirror the pushes at the top in reverse order
pop(rbx);
pop(rax); // return length
pop(r12);
pop(r13);
}
// AES-ECB Decrypt Operation
//
// Emits the body of an AES-ECB decryption routine.
//   src_addr  - base address the input is read from
//   dest_addr - base address the output is written to
//   key       - base address of the round keys (16 bytes each); the 32-bit
//               value read at the array-length offset relative to it selects
//               the 10/12/14-round variant (44/52/60)
//   len       - input length in bytes; this register is modified by the
//               emitted code
// Outline of the emitted code:
//   1. r13, r12, len and rbx are pushed.
//   2. 11, 13 or 15 round keys are loaded through ev_load_key; the key at
//      offset 0 is used for the final round.
//   3. LOOP decrypts 512 bytes per iteration in registers 0-7 (512-bit wide).
//   4. LOOP2 decrypts the remaining blocks 16 bytes at a time in xmm1.
//   5. The round-key registers are zeroed, rbx/r12/r13 are restored and the
//      saved len is popped into rax.
// No ret is emitted by this function.
void MacroAssembler::aesecb_decrypt(Register src_addr, Register dest_addr, Register key, Register len) {
Label NO_PARTS, LOOP, Loop_start, LOOP2, AES192, END_LOOP, AES256, REMAINDER, LAST2, END, KEY_192, KEY_256, EXIT;
// pos is the running byte offset applied to both src_addr and dest_addr
const Register pos = rax;
// rounds holds the 32-bit value read from the key array's length field
const Register rounds = r12;
// r13 and r12 are used as scratch below and popped again at EXIT
push(r13);
push(r12);
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
// context for the registers used, where all instructions below are using 128-bit mode
// On EVEX without VL and BW, these instructions will all be AVX.
if (VM_Version::supports_avx512vlbw()) {
movl(rax, 0xffff);
kmovql(k1, rax);
}
push(len); // Save the incoming length; it is popped into rax at EXIT
push(rbx);
vzeroupper();
// Start at byte offset 0 (pos aliases rax, which was used as scratch above)
xorptr(pos, pos);
// Calculate number of rounds i.e. based on key length(128, 192, 256):44 for 10-rounds, 52 for 12-rounds, 60 for 14-rounds
movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
// Load Key shuf mask
const XMMRegister xmm_key_shuf_mask = xmm31; // used temporarily to swap key bytes up front
movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
// Load and shuffle round keys. The java expanded key ordering is rotated one position in decryption.
// So the first round key is loaded from 1*16 here and last round key is loaded from 0*16
// Register assignment: keys 1..10 -> xmm9..xmm18, key 0 -> xmm27
ev_load_key(xmm9, key, 1 * 16, xmm_key_shuf_mask);
ev_load_key(xmm10, key, 2 * 16, xmm_key_shuf_mask);
ev_load_key(xmm11, key, 3 * 16, xmm_key_shuf_mask);
ev_load_key(xmm12, key, 4 * 16, xmm_key_shuf_mask);
ev_load_key(xmm13, key, 5 * 16, xmm_key_shuf_mask);
ev_load_key(xmm14, key, 6 * 16, xmm_key_shuf_mask);
ev_load_key(xmm15, key, 7 * 16, xmm_key_shuf_mask);
ev_load_key(xmm16, key, 8 * 16, xmm_key_shuf_mask);
ev_load_key(xmm17, key, 9 * 16, xmm_key_shuf_mask);
ev_load_key(xmm18, key, 10 * 16, xmm_key_shuf_mask);
ev_load_key(xmm27, key, 0 * 16, xmm_key_shuf_mask);
// rounds >= 52: round keys 11 and 12 are needed as well
cmpl(rounds, 52);
jcc(Assembler::greaterEqual, KEY_192);
jmp(Loop_start);
bind(KEY_192);
ev_load_key(xmm19, key, 11 * 16, xmm_key_shuf_mask);
ev_load_key(xmm20, key, 12 * 16, xmm_key_shuf_mask);
// rounds == 60: round keys 13 and 14 are needed as well
cmpl(rounds, 60);
jcc(Assembler::equal, KEY_256);
jmp(Loop_start);
bind(KEY_256);
ev_load_key(xmm21, key, 13 * 16, xmm_key_shuf_mask);
ev_load_key(xmm22, key, 14 * 16, xmm_key_shuf_mask);
bind(Loop_start);
movq(rbx, len);
// Convert input length to number of blocks
shrq(len, 4);
// Shifting left by 60 keeps only the low four bits of the byte length, so the
// zero flag is set exactly when the length is a multiple of 16. Otherwise the
// block count is rounded up by one.
// NOTE(review): in the rounded-up case LOOP2 still moves a full 16 bytes for
// the last block - confirm callers only pass multiples of 16.
shlq(rbx, 60);
jcc(Assembler::equal, NO_PARTS);
addq(len, 1);
// Check if number of blocks is greater than or equal to 32
// If true, 512 bytes are processed at a time (code marked by label LOOP)
// If not, 16 bytes are processed (code marked by label REMAINDER)
bind(NO_PARTS);
// rbx = total number of blocks, len = number of 32-block (512-byte) groups
movq(rbx, len);
shrq(len, 5);
jcc(Assembler::equal, REMAINDER);
movl(r13, len);
// Compute number of blocks that will be processed as 512 bytes at a time
// Subtract this from the total number of blocks, which will then be processed by REMAINDER loop.
shlq(r13, 5);
subq(rbx, r13);
bind(LOOP);
// Move 64 bytes of CT data into a zmm register, as a result 512 bytes of CT loaded in zmm0-7
evmovdquq(xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
evmovdquq(xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
evmovdquq(xmm2, Address(src_addr, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
evmovdquq(xmm3, Address(src_addr, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
evmovdquq(xmm4, Address(src_addr, pos, Address::times_1, 4 * 64), Assembler::AVX_512bit);
evmovdquq(xmm5, Address(src_addr, pos, Address::times_1, 5 * 64), Assembler::AVX_512bit);
evmovdquq(xmm6, Address(src_addr, pos, Address::times_1, 6 * 64), Assembler::AVX_512bit);
evmovdquq(xmm7, Address(src_addr, pos, Address::times_1, 7 * 64), Assembler::AVX_512bit);
// Xor with the first round key
evpxorq(xmm0, xmm0, xmm9, Assembler::AVX_512bit);
evpxorq(xmm1, xmm1, xmm9, Assembler::AVX_512bit);
evpxorq(xmm2, xmm2, xmm9, Assembler::AVX_512bit);
evpxorq(xmm3, xmm3, xmm9, Assembler::AVX_512bit);
evpxorq(xmm4, xmm4, xmm9, Assembler::AVX_512bit);
evpxorq(xmm5, xmm5, xmm9, Assembler::AVX_512bit);
evpxorq(xmm6, xmm6, xmm9, Assembler::AVX_512bit);
evpxorq(xmm7, xmm7, xmm9, Assembler::AVX_512bit);
// 9 rounds of Aesdec
// Each call applies one round key (keys 2..10) to registers 0..7
roundDec(xmm10, 7);
roundDec(xmm11, 7);
roundDec(xmm12, 7);
roundDec(xmm13, 7);
roundDec(xmm14, 7);
roundDec(xmm15, 7);
roundDec(xmm16, 7);
roundDec(xmm17, 7);
roundDec(xmm18, 7);
cmpl(rounds, 52);
jcc(Assembler::aboveEqual, AES192);
// Aesdeclast round for keysize = 128
lastroundDec(xmm27, 7);
jmp(END_LOOP);
bind(AES192);
// 2 Additional rounds for keysize = 192
roundDec(xmm19, 7);
roundDec(xmm20, 7);
cmpl(rounds, 60);
jcc(Assembler::aboveEqual, AES256);
// Aesdeclast round for keysize = 192
lastroundDec(xmm27, 7);
jmp(END_LOOP);
bind(AES256);
// 2 Additional rounds and Aesdeclast for keysize = 256
roundDec(xmm21, 7);
roundDec(xmm22, 7);
lastroundDec(xmm27, 7);
bind(END_LOOP);
// Write 512 bytes of PT to the destination
evmovdquq(Address(dest_addr, pos, Address::times_1, 0 * 64), xmm0, Assembler::AVX_512bit);
evmovdquq(Address(dest_addr, pos, Address::times_1, 1 * 64), xmm1, Assembler::AVX_512bit);
evmovdquq(Address(dest_addr, pos, Address::times_1, 2 * 64), xmm2, Assembler::AVX_512bit);
evmovdquq(Address(dest_addr, pos, Address::times_1, 3 * 64), xmm3, Assembler::AVX_512bit);
evmovdquq(Address(dest_addr, pos, Address::times_1, 4 * 64), xmm4, Assembler::AVX_512bit);
evmovdquq(Address(dest_addr, pos, Address::times_1, 5 * 64), xmm5, Assembler::AVX_512bit);
evmovdquq(Address(dest_addr, pos, Address::times_1, 6 * 64), xmm6, Assembler::AVX_512bit);
evmovdquq(Address(dest_addr, pos, Address::times_1, 7 * 64), xmm7, Assembler::AVX_512bit);
// Advance by 512 bytes; len counts the remaining 512-byte groups
addq(pos, 512);
decq(len);
jcc(Assembler::notEqual, LOOP);
bind(REMAINDER);
vzeroupper();
// rbx holds the number of blocks left for the 16-byte loop
cmpq(rbx, 0);
jcc(Assembler::equal, END);
// Process 16 bytes at a time
bind(LOOP2);
movdqu(xmm1, Address(src_addr, pos, Address::times_1, 0));
vpxor(xmm1, xmm1, xmm9, Assembler::AVX_128bit);
// xmm2 contains shuffled key for Aesdeclast operation.
// Unlike the encrypt routine, the same key (xmm27) is used for every key size.
vmovdqu(xmm2, xmm27);
vaesdec(xmm1, xmm1, xmm10, Assembler::AVX_128bit);
vaesdec(xmm1, xmm1, xmm11, Assembler::AVX_128bit);
vaesdec(xmm1, xmm1, xmm12, Assembler::AVX_128bit);
vaesdec(xmm1, xmm1, xmm13, Assembler::AVX_128bit);
vaesdec(xmm1, xmm1, xmm14, Assembler::AVX_128bit);
vaesdec(xmm1, xmm1, xmm15, Assembler::AVX_128bit);
vaesdec(xmm1, xmm1, xmm16, Assembler::AVX_128bit);
vaesdec(xmm1, xmm1, xmm17, Assembler::AVX_128bit);
vaesdec(xmm1, xmm1, xmm18, Assembler::AVX_128bit);
cmpl(rounds, 52);
jcc(Assembler::below, LAST2);
vaesdec(xmm1, xmm1, xmm19, Assembler::AVX_128bit);
vaesdec(xmm1, xmm1, xmm20, Assembler::AVX_128bit);
cmpl(rounds, 60);
jcc(Assembler::below, LAST2);
vaesdec(xmm1, xmm1, xmm21, Assembler::AVX_128bit);
vaesdec(xmm1, xmm1, xmm22, Assembler::AVX_128bit);
bind(LAST2);
// Aesdeclast round
vaesdeclast(xmm1, xmm1, xmm2, Assembler::AVX_128bit);
// Write 16 bytes of PT to destination
movdqu(Address(dest_addr, pos, Address::times_1, 0), xmm1);
addq(pos, 16);
decq(rbx);
jcc(Assembler::notEqual, LOOP2);
bind(END);
// Zero out the round keys
// xmm8 is cleared here although this routine never loads a key into it.
// NOTE(review): xmm2 receives a copy of the final round key in LOOP2 and is
// not cleared here - confirm whether that is intended.
evpxorq(xmm8, xmm8, xmm8, Assembler::AVX_512bit);
evpxorq(xmm9, xmm9, xmm9, Assembler::AVX_512bit);
evpxorq(xmm10, xmm10, xmm10, Assembler::AVX_512bit);
evpxorq(xmm11, xmm11, xmm11, Assembler::AVX_512bit);
evpxorq(xmm12, xmm12, xmm12, Assembler::AVX_512bit);
evpxorq(xmm13, xmm13, xmm13, Assembler::AVX_512bit);
evpxorq(xmm14, xmm14, xmm14, Assembler::AVX_512bit);
evpxorq(xmm15, xmm15, xmm15, Assembler::AVX_512bit);
evpxorq(xmm16, xmm16, xmm16, Assembler::AVX_512bit);
evpxorq(xmm17, xmm17, xmm17, Assembler::AVX_512bit);
evpxorq(xmm18, xmm18, xmm18, Assembler::AVX_512bit);
evpxorq(xmm27, xmm27, xmm27, Assembler::AVX_512bit);
// Keys 11/12 were only loaded when rounds > 44, keys 13/14 when rounds > 52
cmpl(rounds, 44);
jcc(Assembler::belowEqual, EXIT);
evpxorq(xmm19, xmm19, xmm19, Assembler::AVX_512bit);
evpxorq(xmm20, xmm20, xmm20, Assembler::AVX_512bit);
cmpl(rounds, 52);
jcc(Assembler::belowEqual, EXIT);
evpxorq(xmm21, xmm21, xmm21, Assembler::AVX_512bit);
evpxorq(xmm22, xmm22, xmm22, Assembler::AVX_512bit);
bind(EXIT);
// Pops mirror the pushes at the top in reverse order
pop(rbx);
pop(rax); // return length
pop(r12);
pop(r13);
}
// Multiply 128 x 128 bits, using 4 pclmulqdq operations
void MacroAssembler::schoolbookAAD(int i, Register htbl, XMMRegister data,
XMMRegister tmp0, XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3) {
movdqu(xmm15, Address(htbl, i * 16));
vpclmulhqlqdq(tmp3, data, xmm15); // 0x01
vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);
vpclmulldq(tmp3, data, xmm15); // 0x00
vpxor(tmp0, tmp0, tmp3, Assembler::AVX_128bit);
vpclmulhdq(tmp3, data, xmm15); // 0x11
vpxor(tmp1, tmp1, tmp3, Assembler::AVX_128bit);
vpclmullqhqdq(tmp3, data, xmm15); // 0x10
vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);
}
// Carry-less multiply of `state` by `tmp0` (128 x 128 -> 256 bits) followed by
// a two-phase shift/XOR reduction; the reduced 128-bit result replaces `state`.
// `tmp0` is only read. xmm4-xmm11 are used as scratch and overwritten.
// The emitted sequence ends with ret(0), so it is meant to be reached through
// a call to a label bound immediately in front of it.
void MacroAssembler::gfmul(XMMRegister tmp0, XMMRegister state) {
  const XMMRegister prod_lo = xmm4;   // low half of the 256-bit product
  const XMMRegister cross   = xmm5;   // sum of the two cross products
  const XMMRegister aux     = xmm6;   // second cross product, then its shifted sum
  const XMMRegister prod_hi = xmm7;   // high half of the 256-bit product
  const XMMRegister fold_a  = xmm8;
  const XMMRegister fold_b  = xmm9;
  const XMMRegister fold_c  = xmm10;
  const XMMRegister fold_d  = xmm11;

  // Four partial products (pclmulqdq selector in the trailing comment).
  vpclmulldq(prod_lo, state, tmp0);       // 0x00 (a0 * b0)
  vpclmulhdq(prod_hi, state, tmp0);       // 0x11 (a1 * b1)
  vpclmullqhqdq(cross, state, tmp0);      // 0x10 (a1 * b0)
  vpclmulhqlqdq(aux, state, tmp0);        // 0x01 (a0 * b1)

  // cross = (a0 * b1) + (a1 * b0); split it across the two product halves.
  vpxor(cross, cross, aux, Assembler::AVX_128bit);
  vpslldq(aux, cross, 8, Assembler::AVX_128bit);
  vpsrldq(cross, cross, 8, Assembler::AVX_128bit);
  vpxor(prod_lo, prod_lo, aux, Assembler::AVX_128bit);
  vpxor(prod_hi, prod_hi, cross, Assembler::AVX_128bit);
  // prod_hi:prod_lo now hold the full carry-less product.

  // Reduction follows the Shift-XOR technique described in
  // Gueron-Kounavis, May 2010.

  // Phase 1: per-dword left shifts by 31, 30 and 25, XORed together.
  vpslld(fold_a, prod_lo, 31, Assembler::AVX_128bit);
  vpslld(fold_b, prod_lo, 30, Assembler::AVX_128bit);
  vpslld(fold_c, prod_lo, 25, Assembler::AVX_128bit);
  vpxor(fold_a, fold_a, fold_b, Assembler::AVX_128bit);
  vpxor(fold_a, fold_a, fold_c, Assembler::AVX_128bit);
  // Byte shifts split the combined value: 12 bytes left goes into prod_lo
  // now, 4 bytes right is kept in fold_a for phase 2.
  vpslldq(fold_b, fold_a, 12, Assembler::AVX_128bit);
  vpsrldq(fold_a, fold_a, 4, Assembler::AVX_128bit);
  vpxor(prod_lo, prod_lo, fold_b, Assembler::AVX_128bit);

  // Phase 2: per-dword right shifts by 1, 2 and 7, XORed together with the
  // value carried over from phase 1.
  vpsrld(fold_b, prod_lo, 1, Assembler::AVX_128bit);
  vpsrld(fold_c, prod_lo, 2, Assembler::AVX_128bit);
  vpsrld(fold_d, prod_lo, 7, Assembler::AVX_128bit);
  vpxor(fold_b, fold_b, fold_c, Assembler::AVX_128bit);
  vpxor(fold_b, fold_b, fold_d, Assembler::AVX_128bit);
  vpxor(fold_b, fold_b, fold_a, Assembler::AVX_128bit);
  vpxor(prod_lo, prod_lo, fold_b, Assembler::AVX_128bit);

  // Combine with the high half; the result is in state.
  vpxor(state, prod_hi, prod_lo, Assembler::AVX_128bit);
  ret(0);
}
// Builds the first table entry used by the one-block GHASH path.
// Reads the 16-byte subkey at htbl + 0, byte-shuffles it with the long swap
// mask, computes the value the code below labels "H * 2", and stores it at
// htbl + 1 * 16.
// Overwrites rax, xmm3, xmm4, xmm5, xmm10 and xmm13. The emitted sequence ends
// with ret(0), so it is meant to be reached through a call.
void MacroAssembler::generateHtbl_one_block(Register htbl) {
  const XMMRegister subkey    = xmm13;
  const XMMRegister swap_mask = xmm10;
  const XMMRegister work_a    = xmm3;
  const XMMRegister work_b    = xmm4;
  const XMMRegister poly      = xmm5;

  // Load the original subkey hash and shuffle it using the long swap mask.
  movdqu(subkey, Address(htbl, 0));
  movdqu(swap_mask, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
  vpshufb(subkey, subkey, swap_mask, Assembler::AVX_128bit);

  // Compute H' = GFMUL(H, 2)
  // First derive a byte mask from the subkey shifted right by 7 and AND it
  // onto the polynomial constant; poly ends up holding the masked constant.
  vpsrld(work_a, subkey, 7, Assembler::AVX_128bit);
  movdqu(work_b, ExternalAddress(StubRoutines::x86::ghash_shufflemask_addr()));
  vpshufb(work_a, work_a, work_b, Assembler::AVX_128bit);
  movl(rax, 0xff00);
  movdl(work_b, rax);
  vpshufb(work_b, work_b, work_a, Assembler::AVX_128bit);
  movdqu(poly, ExternalAddress(StubRoutines::x86::ghash_polynomial_addr()));
  vpand(poly, poly, work_b, Assembler::AVX_128bit);

  // Shift the subkey left by one bit: each dword is shifted left by 1 and the
  // bit shifted out of a dword is carried into the next one.
  vpsrld(work_a, subkey, 31, Assembler::AVX_128bit);
  vpslld(work_b, subkey, 1, Assembler::AVX_128bit);
  vpslldq(work_a, work_a, 4, Assembler::AVX_128bit);
  vpxor(subkey, work_b, work_a, Assembler::AVX_128bit);   // subkey holds p(x) << 1, i.e. H * 2

  // Add the masked reduction polynomial held in poly.
  vpxor(subkey, subkey, poly, Assembler::AVX_128bit);
  movdqu(Address(htbl, 1 * 16), subkey);                  // H * 2

  ret(0);
}
// Fills table entries 2..8 used by the eight-block GHASH path.
// Starts from the entry at htbl + 1 * 16 and repeatedly multiplies the running
// value by that entry through the local GFMUL subroutine, storing each result
// at htbl + k * 16 for k = 2..8 (labelled "H ^ k * 2" in the original code).
// Overwrites xmm1 and xmm13, plus whatever gfmul uses as scratch. The emitted
// sequence ends with ret(0) followed by the out-of-line GFMUL body.
void MacroAssembler::generateHtbl_eight_blocks(Register htbl) {
  const XMMRegister power = xmm13;   // running product, updated in place by GFMUL
  const XMMRegister base  = xmm1;    // copy of entry 1, the fixed multiplier
  Label GFMUL;

  movdqu(power, Address(htbl, 1 * 16));
  movdqu(base, power);

  // One call + store per table slot; emits the same seven call/store pairs
  // as writing them out by hand.
  for (int slot = 2; slot <= 8; slot++) {
    call(GFMUL, relocInfo::none);
    movdqu(Address(htbl, slot * 16), power);
  }
  ret(0);

  // Out-of-line multiply subroutine; gfmul() emits its own ret.
  bind(GFMUL);
  gfmul(base, power);
}
// Multiblock and single block GHASH computation using Shift XOR reduction technique
//
//   input_state - address of the 16-byte hash state; loaded at BEGIN_PROCESS
//                 and written back at SAVE_STATE
//   htbl        - base of the table of 16-byte entries: entry 0 is the input
//                 subkey, entries 1..8 are generated on demand below
//   input_data  - address of the data to hash; advanced by the emitted code
//   blocks      - number of 16-byte blocks; decremented by the emitted code
// Outline of the emitted code:
//   - If blocks is zero, jump straight to EXIT_GHASH.
//   - Table entry 1 is generated when it tests as all-zero; entries 2..8 are
//     generated when at least 8 blocks are present and entry 8 tests as
//     all-zero.
//   - PROCESS_8_BLOCKS consumes 8 blocks per iteration with a single
//     reduction; PROCESS_1_BLOCK consumes the rest one block at a time.
//   - The bodies bound at GFMUL, GENERATE_HTBL_1_BLK and GENERATE_HTBL_8_BLKS
//     are emitted after the jmp to EXIT_GHASH and are only reached via call;
//     each of the helpers emitting them ends with ret(0).
void MacroAssembler::avx_ghash(Register input_state, Register htbl,
Register input_data, Register blocks) {
// temporary variables to hold input data and input state
const XMMRegister data = xmm1;
const XMMRegister state = xmm0;
// temporary variables to hold intermediate results
const XMMRegister tmp0 = xmm3;
const XMMRegister tmp1 = xmm4;
const XMMRegister tmp2 = xmm5;
const XMMRegister tmp3 = xmm6;
// temporary variables to hold byte and long swap masks
const XMMRegister bswap_mask = xmm2;
const XMMRegister lswap_mask = xmm14;
Label GENERATE_HTBL_1_BLK, GENERATE_HTBL_8_BLKS, BEGIN_PROCESS, GFMUL, BLOCK8_REDUCTION,
ONE_BLK_INIT, PROCESS_1_BLOCK, PROCESS_8_BLOCKS, SAVE_STATE, EXIT_GHASH;
// Nothing to hash: skip everything except the register clearing at EXIT_GHASH
testptr(blocks, blocks);
jcc(Assembler::zero, EXIT_GHASH);
// Check if Hashtable (1*16) has been already generated
// For anything less than 8 blocks, we generate only the first power of H.
// An all-zero entry is treated as "not generated yet".
movdqu(tmp2, Address(htbl, 1 * 16));
ptest(tmp2, tmp2);
jcc(Assembler::notZero, BEGIN_PROCESS);
call(GENERATE_HTBL_1_BLK, relocInfo::none);
// Shuffle the input state
bind(BEGIN_PROCESS);
movdqu(lswap_mask, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
movdqu(state, Address(input_state, 0));
vpshufb(state, state, lswap_mask, Assembler::AVX_128bit);
cmpl(blocks, 8);
jcc(Assembler::below, ONE_BLK_INIT);
// If we have 8 blocks or more data, then generate remaining powers of H
movdqu(tmp2, Address(htbl, 8 * 16));
ptest(tmp2, tmp2);
jcc(Assembler::notZero, PROCESS_8_BLOCKS);
call(GENERATE_HTBL_8_BLKS, relocInfo::none);
//Do 8 multiplies followed by a reduction processing 8 blocks of data at a time
//Each block = 16 bytes.
// Blocks are consumed from last to first: block #7 is multiplied by table
// entry 1, block #6 by entry 2, ... and block #0 (XORed with state) by entry 8.
bind(PROCESS_8_BLOCKS);
subl(blocks, 8);
movdqu(bswap_mask, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
movdqu(data, Address(input_data, 16 * 7));
vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
//Loading 1*16 as calculated powers of H required starts at that location.
movdqu(xmm15, Address(htbl, 1 * 16));
//Perform carryless multiplication of (H*2, data block #7)
// This first multiply initializes the accumulators tmp0, tmp1 and tmp2;
// the schoolbookAAD calls below XOR further products into them.
vpclmulhqlqdq(tmp2, data, xmm15);//a0 * b1
vpclmulldq(tmp0, data, xmm15);//a0 * b0
vpclmulhdq(tmp1, data, xmm15);//a1 * b1
vpclmullqhqdq(tmp3, data, xmm15);//a1* b0
vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);// (a0 * b1) + (a1 * b0)
movdqu(data, Address(input_data, 16 * 6));
vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
// Perform carryless multiplication of (H^2 * 2, data block #6)
schoolbookAAD(2, htbl, data, tmp0, tmp1, tmp2, tmp3);
movdqu(data, Address(input_data, 16 * 5));
vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
// Perform carryless multiplication of (H^3 * 2, data block #5)
schoolbookAAD(3, htbl, data, tmp0, tmp1, tmp2, tmp3);
movdqu(data, Address(input_data, 16 * 4));
vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
// Perform carryless multiplication of (H^4 * 2, data block #4)
schoolbookAAD(4, htbl, data, tmp0, tmp1, tmp2, tmp3);
movdqu(data, Address(input_data, 16 * 3));
vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
// Perform carryless multiplication of (H^5 * 2, data block #3)
schoolbookAAD(5, htbl, data, tmp0, tmp1, tmp2, tmp3);
movdqu(data, Address(input_data, 16 * 2));
vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
// Perform carryless multiplication of (H^6 * 2, data block #2)
schoolbookAAD(6, htbl, data, tmp0, tmp1, tmp2, tmp3);
movdqu(data, Address(input_data, 16 * 1));
vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
// Perform carryless multiplication of (H^7 * 2, data block #1)
schoolbookAAD(7, htbl, data, tmp0, tmp1, tmp2, tmp3);
movdqu(data, Address(input_data, 16 * 0));
// xor data block#0 with input state before performing carry-less multiplication
vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
vpxor(data, data, state, Assembler::AVX_128bit);
// Perform carryless multiplication of (H^8 * 2, data block #0)
schoolbookAAD(8, htbl, data, tmp0, tmp1, tmp2, tmp3);
// Split the accumulated cross products (tmp2) across the low and high halves
vpslldq(tmp3, tmp2, 8, Assembler::AVX_128bit);
vpsrldq(tmp2, tmp2, 8, Assembler::AVX_128bit);
vpxor(tmp0, tmp0, tmp3, Assembler::AVX_128bit);// tmp0, tmp1 contains aggregated results of
vpxor(tmp1, tmp1, tmp2, Assembler::AVX_128bit);// the multiplication operation
// we have the 2 128-bit partially accumulated multiplication results in tmp0:tmp1
// with higher 128-bit in tmp1 and lower 128-bit in corresponding tmp0
// Follows the reduction technique mentioned in
// Shift-XOR reduction described in Gueron-Kounavis May 2010
// BLOCK8_REDUCTION is bound here but nothing in this function jumps to it.
bind(BLOCK8_REDUCTION);
// First Phase of the reduction
vpslld(xmm8, tmp0, 31, Assembler::AVX_128bit); // packed dword left shift by 31
vpslld(xmm9, tmp0, 30, Assembler::AVX_128bit); // packed dword left shift by 30
vpslld(xmm10, tmp0, 25, Assembler::AVX_128bit); // packed dword left shift by 25
// xor the shifted versions
vpxor(xmm8, xmm8, xmm10, Assembler::AVX_128bit);
vpxor(xmm8, xmm8, xmm9, Assembler::AVX_128bit);
vpslldq(xmm9, xmm8, 12, Assembler::AVX_128bit);
vpsrldq(xmm8, xmm8, 4, Assembler::AVX_128bit);
vpxor(tmp0, tmp0, xmm9, Assembler::AVX_128bit); // first phase of reduction is complete
// second phase of the reduction
vpsrld(xmm9, tmp0, 1, Assembler::AVX_128bit); // packed dword right shift by 1
vpsrld(xmm10, tmp0, 2, Assembler::AVX_128bit); // packed dword right shift by 2
vpsrld(tmp2, tmp0, 7, Assembler::AVX_128bit); // packed dword right shift by 7
// xor the shifted versions
vpxor(xmm9, xmm9, xmm10, Assembler::AVX_128bit);
vpxor(xmm9, xmm9, tmp2, Assembler::AVX_128bit);
vpxor(xmm9, xmm9, xmm8, Assembler::AVX_128bit);
vpxor(tmp0, xmm9, tmp0, Assembler::AVX_128bit);
// Final result is in state
vpxor(state, tmp0, tmp1, Assembler::AVX_128bit);
// Advance past the 8 blocks just consumed and loop while 8 or more remain
lea(input_data, Address(input_data, 16 * 8));
cmpl(blocks, 8);
jcc(Assembler::below, ONE_BLK_INIT);
jmp(PROCESS_8_BLOCKS);
// Since this is one block operation we will only use H * 2 i.e. the first power of H
bind(ONE_BLK_INIT);
movdqu(tmp0, Address(htbl, 1 * 16));
movdqu(bswap_mask, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
//Do one (128 bit x 128 bit) carry-less multiplication at a time followed by a reduction.
bind(PROCESS_1_BLOCK);
cmpl(blocks, 0);
jcc(Assembler::equal, SAVE_STATE);
subl(blocks, 1);
movdqu(data, Address(input_data, 0));
vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
vpxor(state, state, data, Assembler::AVX_128bit);
// gfmul(H*2, state)
call(GFMUL, relocInfo::none);
addptr(input_data, 16);
jmp(PROCESS_1_BLOCK);
// Undo the long swap applied at BEGIN_PROCESS and write the state back
bind(SAVE_STATE);
vpshufb(state, state, lswap_mask, Assembler::AVX_128bit);
movdqu(Address(input_state, 0), state);
jmp(EXIT_GHASH);
// Out-of-line subroutine bodies, reached only through the call() sites above.
bind(GFMUL);
gfmul(tmp0, state);
bind(GENERATE_HTBL_1_BLK);
generateHtbl_one_block(htbl);
bind(GENERATE_HTBL_8_BLKS);
generateHtbl_eight_blocks(htbl);
bind(EXIT_GHASH);
// zero out xmm registers used for Htbl storage
// Cleared here: xmm0 (state), xmm1 (data), xmm3 (tmp0) and xmm15 (table entries
// loaded in the 8-block path).
vpxor(xmm0, xmm0, xmm0, Assembler::AVX_128bit);
vpxor(xmm1, xmm1, xmm1, Assembler::AVX_128bit);
vpxor(xmm3, xmm3, xmm3, Assembler::AVX_128bit);
vpxor(xmm15, xmm15, xmm15, Assembler::AVX_128bit);
}
// AES Counter Mode using VAES instructions
//
// Emits code that XORs the input with an AES-CTR key stream and writes the
// result to the output.
//
//   src_addr               - base address of the input; read at src_addr + pos
//   dest_addr              - base address of the output; written at dest_addr + pos
//   key                    - address of the round keys (loaded 16 bytes at a time at
//                            offsets 0 .. 14 * 16); the int-array length stored in
//                            front of it is read to select the AES-128/192/256 path
//   counter                - address of the 16 byte counter block; read on entry and
//                            overwritten with the advanced counter on exit
//   len_reg                - number of bytes to process (compared as a 32-bit value);
//                            consumed by the generated code
//   used                   - number of bytes of the saved encrypted counter that have
//                            already been consumed
//   used_addr              - address the updated 'used' value is stored to
//   saved_encCounter_start - address of the 16 byte saved encrypted counter; its
//                            remaining bytes are consumed first, and it is refreshed
//                            when a trailing partial block is produced
//
// Registers written by the generated code in addition to the arguments:
// rax (rounds), rbx, r12 (pos), r15 (scratch for the external mask loads) and the
// vector registers xmm0-xmm16 and xmm18-xmm31.
//
// NOTE(review): the counters are advanced with packed 32-bit (vpaddd) and 64-bit
// (vpaddq) additions, so a carry out of the incremented element is not propagated
// into the rest of the 128-bit counter block - confirm this matches the counter
// width callers rely on.
void MacroAssembler::aesctr_encrypt(Register src_addr, Register dest_addr, Register key, Register counter,
    Register len_reg, Register used, Register used_addr, Register saved_encCounter_start) {

    // rax has encoding 0; name it explicitly rather than relying on the integer constant 0.
    const Register rounds = rax;
    const Register pos = r12;

    Label PRELOOP_START, EXIT_PRELOOP, REMAINDER, REMAINDER_16, LOOP, END, EXIT, END_LOOP,
    AES192, AES256, AES192_REMAINDER16, REMAINDER16_END_LOOP, AES256_REMAINDER16,
    REMAINDER_8, REMAINDER_4, AES192_REMAINDER8, REMAINDER_LOOP,
    AES192_REMAINDER, END_REMAINDER_LOOP, AES256_REMAINDER8, REMAINDER8_END_LOOP,
    AES192_REMAINDER4, AES256_REMAINDER4, AES256_REMAINDER, END_REMAINDER4, EXTRACT_TAILBYTES,
    EXTRACT_TAIL_4BYTES, EXTRACT_TAIL_2BYTES, EXTRACT_TAIL_1BYTE, STORE_CTR;

    // Nothing to do for an empty input
    cmpl(len_reg, 0);
    jcc(Assembler::belowEqual, EXIT);

    movl(pos, 0);
    // if the number of used encrypted counter bytes < 16,
    // XOR PT with saved encrypted counter to obtain CT
    bind(PRELOOP_START);
    cmpl(used, 16);
    jcc(Assembler::aboveEqual, EXIT_PRELOOP);
    // Stop as soon as the input is exhausted, even if bytes of the saved encrypted
    // counter are still left over. Without this check a request shorter than the
    // remaining saved bytes would read and write past the requested range and leave
    // len_reg negative, which the unsigned length comparisons further down would
    // then treat as a very large length.
    cmpl(len_reg, 0);
    jcc(Assembler::lessEqual, EXIT_PRELOOP);
    movb(rbx, Address(saved_encCounter_start, used));
    xorb(rbx, Address(src_addr, pos));
    movb(Address(dest_addr, pos), rbx);
    addptr(pos, 1);
    addptr(used, 1);
    decrement(len_reg);
    jmp(PRELOOP_START);

    bind(EXIT_PRELOOP);
    // Publish how many bytes of the saved encrypted counter have been consumed so far
    movl(Address(used_addr, 0), used);

    // Load the length (in ints) of the key array. It is compared against 52 and 60
    // below to select the additional rounds of the AES-192 and AES-256 paths.
    movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    vpxor(xmm0, xmm0, xmm0, Assembler::AVX_128bit);
    // Move initial counter value in xmm0
    movdqu(xmm0, Address(counter, 0));
    // broadcast counter value to zmm8
    evshufi64x2(xmm8, xmm0, xmm0, 0, Assembler::AVX_512bit);

    // load lbswap mask
    evmovdquq(xmm16, ExternalAddress(StubRoutines::x86::counter_mask_addr()), Assembler::AVX_512bit, r15);

    // shuffle counter using lbswap_mask
    vpshufb(xmm8, xmm8, xmm16, Assembler::AVX_512bit);

    // pre-increment and propagate counter values to zmm9-zmm15 registers.
    // Linc0 is added to zmm8 so that its four lanes hold consecutive counter values,
    // Linc4 then derives each of zmm9-zmm15 from its predecessor by adding 4.
    // The counter is incremented after each block i.e. 16 bytes is processed;
    // each zmm register has 4 counter values as its MSB
    // the counters are incremented in parallel
    vpaddd(xmm8, xmm8, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 64), Assembler::AVX_512bit, r15);//linc0
    vpaddd(xmm9, xmm8, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//linc4(rip)
    vpaddd(xmm10, xmm9, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip)
    vpaddd(xmm11, xmm10, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip)
    vpaddd(xmm12, xmm11, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip)
    vpaddd(xmm13, xmm12, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip)
    vpaddd(xmm14, xmm13, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip)
    vpaddd(xmm15, xmm14, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip)

    // load linc32 mask in zmm register. linc32 increments counter by 32
    evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 256), Assembler::AVX_512bit, r15);//Linc32

    // xmm31 contains the key shuffle mask.
    movdqu(xmm31, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    // Load key function loads 128 bit key and shuffles it. Then we broadcast the shuffled key to convert it into a 512 bit value.
    // For broadcasting the values to ZMM, vshufi64 is used instead of evbroadcasti64x2 as the source in this case is ZMM register
    // that holds shuffled key value.
    // Round keys 0-10 stay resident in zmm20-zmm30 for the whole routine.
    ev_load_key(xmm20, key, 0, xmm31);
    ev_load_key(xmm21, key, 1 * 16, xmm31);
    ev_load_key(xmm22, key, 2 * 16, xmm31);
    ev_load_key(xmm23, key, 3 * 16, xmm31);
    ev_load_key(xmm24, key, 4 * 16, xmm31);
    ev_load_key(xmm25, key, 5 * 16, xmm31);
    ev_load_key(xmm26, key, 6 * 16, xmm31);
    ev_load_key(xmm27, key, 7 * 16, xmm31);
    ev_load_key(xmm28, key, 8 * 16, xmm31);
    ev_load_key(xmm29, key, 9 * 16, xmm31);
    ev_load_key(xmm30, key, 10 * 16, xmm31);

    // Process 32 blocks or 512 bytes of data
    bind(LOOP);
    cmpl(len_reg, 512);
    jcc(Assembler::less, REMAINDER);
    subq(len_reg, 512);
    // Shuffle counter and XOR it with the first round key. Result is stored in zmm0-7
    vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit);
    evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit);
    vpshufb(xmm1, xmm9, xmm16, Assembler::AVX_512bit);
    evpxorq(xmm1, xmm1, xmm20, Assembler::AVX_512bit);
    vpshufb(xmm2, xmm10, xmm16, Assembler::AVX_512bit);
    evpxorq(xmm2, xmm2, xmm20, Assembler::AVX_512bit);
    vpshufb(xmm3, xmm11, xmm16, Assembler::AVX_512bit);
    evpxorq(xmm3, xmm3, xmm20, Assembler::AVX_512bit);
    vpshufb(xmm4, xmm12, xmm16, Assembler::AVX_512bit);
    evpxorq(xmm4, xmm4, xmm20, Assembler::AVX_512bit);
    vpshufb(xmm5, xmm13, xmm16, Assembler::AVX_512bit);
    evpxorq(xmm5, xmm5, xmm20, Assembler::AVX_512bit);
    vpshufb(xmm6, xmm14, xmm16, Assembler::AVX_512bit);
    evpxorq(xmm6, xmm6, xmm20, Assembler::AVX_512bit);
    vpshufb(xmm7, xmm15, xmm16, Assembler::AVX_512bit);
    evpxorq(xmm7, xmm7, xmm20, Assembler::AVX_512bit);
    // Perform AES encode operations and put results in zmm0-zmm7.
    // This is followed by incrementing counter values in zmm8-zmm15.
    // Since we will be processing 32 blocks at a time, the counter is incremented by 32.
    // The counter updates are interleaved with the AES rounds.
    roundEnc(xmm21, 7);
    vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit);
    roundEnc(xmm22, 7);
    vpaddq(xmm9, xmm9, xmm19, Assembler::AVX_512bit);
    roundEnc(xmm23, 7);
    vpaddq(xmm10, xmm10, xmm19, Assembler::AVX_512bit);
    roundEnc(xmm24, 7);
    vpaddq(xmm11, xmm11, xmm19, Assembler::AVX_512bit);
    roundEnc(xmm25, 7);
    vpaddq(xmm12, xmm12, xmm19, Assembler::AVX_512bit);
    roundEnc(xmm26, 7);
    vpaddq(xmm13, xmm13, xmm19, Assembler::AVX_512bit);
    roundEnc(xmm27, 7);
    vpaddq(xmm14, xmm14, xmm19, Assembler::AVX_512bit);
    roundEnc(xmm28, 7);
    vpaddq(xmm15, xmm15, xmm19, Assembler::AVX_512bit);
    roundEnc(xmm29, 7);

    // Key array length below 52 ints: round key 10 is the last one (AES-128)
    cmpl(rounds, 52);
    jcc(Assembler::aboveEqual, AES192);
    lastroundEnc(xmm30, 7);
    jmp(END_LOOP);

    bind(AES192);
    roundEnc(xmm30, 7);
    // Round keys beyond 10 are not kept resident; they are reloaded into zmm18 on demand
    ev_load_key(xmm18, key, 11 * 16, xmm31);
    roundEnc(xmm18, 7);
    cmpl(rounds, 60);
    jcc(Assembler::aboveEqual, AES256);
    ev_load_key(xmm18, key, 12 * 16, xmm31);
    lastroundEnc(xmm18, 7);
    jmp(END_LOOP);

    bind(AES256);
    ev_load_key(xmm18, key, 12 * 16, xmm31);
    roundEnc(xmm18, 7);
    ev_load_key(xmm18, key, 13 * 16, xmm31);
    roundEnc(xmm18, 7);
    ev_load_key(xmm18, key, 14 * 16, xmm31);
    lastroundEnc(xmm18, 7);

    // After AES encode rounds, the encrypted block cipher lies in zmm0-zmm7
    // xor encrypted block cipher and input plaintext and store resultant ciphertext
    bind(END_LOOP);
    evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
    evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
    evpxorq(xmm1, xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
    evmovdquq(Address(dest_addr, pos, Address::times_1, 64), xmm1, Assembler::AVX_512bit);
    evpxorq(xmm2, xmm2, Address(src_addr, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
    evmovdquq(Address(dest_addr, pos, Address::times_1, 2 * 64), xmm2, Assembler::AVX_512bit);
    evpxorq(xmm3, xmm3, Address(src_addr, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
    evmovdquq(Address(dest_addr, pos, Address::times_1, 3 * 64), xmm3, Assembler::AVX_512bit);
    evpxorq(xmm4, xmm4, Address(src_addr, pos, Address::times_1, 4 * 64), Assembler::AVX_512bit);
    evmovdquq(Address(dest_addr, pos, Address::times_1, 4 * 64), xmm4, Assembler::AVX_512bit);
    evpxorq(xmm5, xmm5, Address(src_addr, pos, Address::times_1, 5 * 64), Assembler::AVX_512bit);
    evmovdquq(Address(dest_addr, pos, Address::times_1, 5 * 64), xmm5, Assembler::AVX_512bit);
    evpxorq(xmm6, xmm6, Address(src_addr, pos, Address::times_1, 6 * 64), Assembler::AVX_512bit);
    evmovdquq(Address(dest_addr, pos, Address::times_1, 6 * 64), xmm6, Assembler::AVX_512bit);
    evpxorq(xmm7, xmm7, Address(src_addr, pos, Address::times_1, 7 * 64), Assembler::AVX_512bit);
    evmovdquq(Address(dest_addr, pos, Address::times_1, 7 * 64), xmm7, Assembler::AVX_512bit);
    addq(pos, 512);
    jmp(LOOP);

    // Encode 256, 128, 64 or 16 bytes at a time if length is less than 512 bytes
    bind(REMAINDER);
    cmpl(len_reg, 0);
    jcc(Assembler::equal, END);
    cmpl(len_reg, 256);
    jcc(Assembler::aboveEqual, REMAINDER_16);
    cmpl(len_reg, 128);
    jcc(Assembler::aboveEqual, REMAINDER_8);
    cmpl(len_reg, 64);
    jcc(Assembler::aboveEqual, REMAINDER_4);
    // At this point, we will process 16 bytes of data at a time.
    // So load xmm19 with counter increment value as 1
    evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 80), Assembler::AVX_128bit, r15);
    jmp(REMAINDER_LOOP);

    // Each ZMM register can be used to encode 64 bytes of data, so we have 4 ZMM registers to encode 256 bytes of data
    bind(REMAINDER_16);
    subq(len_reg, 256);
    // As we process 16 blocks at a time, load mask for incrementing the counter value by 16
    evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 320), Assembler::AVX_512bit, r15);//Linc16(rip)
    // shuffle counter and XOR counter with the first round key
    vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit);
    evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit);
    vpshufb(xmm1, xmm9, xmm16, Assembler::AVX_512bit);
    evpxorq(xmm1, xmm1, xmm20, Assembler::AVX_512bit);
    vpshufb(xmm2, xmm10, xmm16, Assembler::AVX_512bit);
    evpxorq(xmm2, xmm2, xmm20, Assembler::AVX_512bit);
    vpshufb(xmm3, xmm11, xmm16, Assembler::AVX_512bit);
    evpxorq(xmm3, xmm3, xmm20, Assembler::AVX_512bit);
    // Increment counter values by 16. Only zmm8 and zmm9 are advanced: they are the
    // only counter registers the smaller remainder paths below still read.
    vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit);
    vpaddq(xmm9, xmm9, xmm19, Assembler::AVX_512bit);
    // AES encode rounds
    roundEnc(xmm21, 3);
    roundEnc(xmm22, 3);
    roundEnc(xmm23, 3);
    roundEnc(xmm24, 3);
    roundEnc(xmm25, 3);
    roundEnc(xmm26, 3);
    roundEnc(xmm27, 3);
    roundEnc(xmm28, 3);
    roundEnc(xmm29, 3);

    cmpl(rounds, 52);
    jcc(Assembler::aboveEqual, AES192_REMAINDER16);
    lastroundEnc(xmm30, 3);
    jmp(REMAINDER16_END_LOOP);

    bind(AES192_REMAINDER16);
    roundEnc(xmm30, 3);
    ev_load_key(xmm18, key, 11 * 16, xmm31);
    roundEnc(xmm18, 3);
    // zmm5-zmm7 hold no data in the remainder paths, so they take round keys 12-14
    ev_load_key(xmm5, key, 12 * 16, xmm31);

    cmpl(rounds, 60);
    jcc(Assembler::aboveEqual, AES256_REMAINDER16);
    lastroundEnc(xmm5, 3);
    jmp(REMAINDER16_END_LOOP);

    bind(AES256_REMAINDER16);
    roundEnc(xmm5, 3);
    ev_load_key(xmm6, key, 13 * 16, xmm31);
    roundEnc(xmm6, 3);
    ev_load_key(xmm7, key, 14 * 16, xmm31);
    lastroundEnc(xmm7, 3);

    // After AES encode rounds, the encrypted block cipher lies in zmm0-zmm3
    // xor 256 bytes of PT with the encrypted counters to produce CT.
    bind(REMAINDER16_END_LOOP);
    evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0), Assembler::AVX_512bit);
    evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
    evpxorq(xmm1, xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
    evmovdquq(Address(dest_addr, pos, Address::times_1, 1 * 64), xmm1, Assembler::AVX_512bit);
    evpxorq(xmm2, xmm2, Address(src_addr, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
    evmovdquq(Address(dest_addr, pos, Address::times_1, 2 * 64), xmm2, Assembler::AVX_512bit);
    evpxorq(xmm3, xmm3, Address(src_addr, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
    evmovdquq(Address(dest_addr, pos, Address::times_1, 3 * 64), xmm3, Assembler::AVX_512bit);
    addq(pos, 256);

    cmpl(len_reg, 128);
    jcc(Assembler::aboveEqual, REMAINDER_8);

    cmpl(len_reg, 64);
    jcc(Assembler::aboveEqual, REMAINDER_4);
    // load mask for incrementing the counter value by 1
    evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 80), Assembler::AVX_128bit, r15);//Linc0 + 16(rip)
    jmp(REMAINDER_LOOP);

    // Each ZMM register can be used to encode 64 bytes of data, so we have 2 ZMM registers to encode 128 bytes of data
    bind(REMAINDER_8);
    subq(len_reg, 128);
    // As we process 8 blocks at a time, load mask for incrementing the counter value by 8
    evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 192), Assembler::AVX_512bit, r15);//Linc8(rip)
    // shuffle counters and xor with the first round key
    vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit);
    evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit);
    vpshufb(xmm1, xmm9, xmm16, Assembler::AVX_512bit);
    evpxorq(xmm1, xmm1, xmm20, Assembler::AVX_512bit);
    // increment counter by 8
    vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit);
    // AES encode
    roundEnc(xmm21, 1);
    roundEnc(xmm22, 1);
    roundEnc(xmm23, 1);
    roundEnc(xmm24, 1);
    roundEnc(xmm25, 1);
    roundEnc(xmm26, 1);
    roundEnc(xmm27, 1);
    roundEnc(xmm28, 1);
    roundEnc(xmm29, 1);

    cmpl(rounds, 52);
    jcc(Assembler::aboveEqual, AES192_REMAINDER8);
    lastroundEnc(xmm30, 1);
    jmp(REMAINDER8_END_LOOP);

    bind(AES192_REMAINDER8);
    roundEnc(xmm30, 1);
    ev_load_key(xmm18, key, 11 * 16, xmm31);
    roundEnc(xmm18, 1);
    ev_load_key(xmm5, key, 12 * 16, xmm31);
    cmpl(rounds, 60);
    jcc(Assembler::aboveEqual, AES256_REMAINDER8);
    lastroundEnc(xmm5, 1);
    jmp(REMAINDER8_END_LOOP);

    bind(AES256_REMAINDER8);
    roundEnc(xmm5, 1);
    ev_load_key(xmm6, key, 13 * 16, xmm31);
    roundEnc(xmm6, 1);
    ev_load_key(xmm7, key, 14 * 16, xmm31);
    lastroundEnc(xmm7, 1);

    bind(REMAINDER8_END_LOOP);
    // After AES encode rounds, the encrypted block cipher lies in zmm0-zmm1
    // XOR PT with the encrypted counter and store as CT
    evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
    evmovdquq(Address(dest_addr, pos, Address::times_1, 0 * 64), xmm0, Assembler::AVX_512bit);
    evpxorq(xmm1, xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
    evmovdquq(Address(dest_addr, pos, Address::times_1, 1 * 64), xmm1, Assembler::AVX_512bit);
    addq(pos, 128);

    cmpl(len_reg, 64);
    jcc(Assembler::aboveEqual, REMAINDER_4);
    // load mask for incrementing the counter value by 1
    evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 80), Assembler::AVX_128bit, r15);//Linc0 + 16(rip)
    jmp(REMAINDER_LOOP);

    // Each ZMM register can be used to encode 64 bytes of data, so we have 1 ZMM register used in this block of code
    bind(REMAINDER_4);
    subq(len_reg, 64);
    // As we process 4 blocks at a time, load mask for incrementing the counter value by 4
    evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip)
    // XOR counter with first roundkey
    vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit);
    evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit);
    // Increment counter
    vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit);
    vaesenc(xmm0, xmm0, xmm21, Assembler::AVX_512bit);
    vaesenc(xmm0, xmm0, xmm22, Assembler::AVX_512bit);
    vaesenc(xmm0, xmm0, xmm23, Assembler::AVX_512bit);
    vaesenc(xmm0, xmm0, xmm24, Assembler::AVX_512bit);
    vaesenc(xmm0, xmm0, xmm25, Assembler::AVX_512bit);
    vaesenc(xmm0, xmm0, xmm26, Assembler::AVX_512bit);
    vaesenc(xmm0, xmm0, xmm27, Assembler::AVX_512bit);
    vaesenc(xmm0, xmm0, xmm28, Assembler::AVX_512bit);
    vaesenc(xmm0, xmm0, xmm29, Assembler::AVX_512bit);
    cmpl(rounds, 52);
    jcc(Assembler::aboveEqual, AES192_REMAINDER4);
    vaesenclast(xmm0, xmm0, xmm30, Assembler::AVX_512bit);
    jmp(END_REMAINDER4);

    bind(AES192_REMAINDER4);
    vaesenc(xmm0, xmm0, xmm30, Assembler::AVX_512bit);
    ev_load_key(xmm18, key, 11 * 16, xmm31);
    vaesenc(xmm0, xmm0, xmm18, Assembler::AVX_512bit);
    ev_load_key(xmm5, key, 12 * 16, xmm31);

    cmpl(rounds, 60);
    jcc(Assembler::aboveEqual, AES256_REMAINDER4);
    vaesenclast(xmm0, xmm0, xmm5, Assembler::AVX_512bit);
    jmp(END_REMAINDER4);

    bind(AES256_REMAINDER4);
    vaesenc(xmm0, xmm0, xmm5, Assembler::AVX_512bit);
    ev_load_key(xmm6, key, 13 * 16, xmm31);
    vaesenc(xmm0, xmm0, xmm6, Assembler::AVX_512bit);
    ev_load_key(xmm7, key, 14 * 16, xmm31);
    vaesenclast(xmm0, xmm0, xmm7, Assembler::AVX_512bit);
    // After AES encode rounds, the encrypted block cipher lies in zmm0.
    // XOR encrypted block cipher with PT and store 64 bytes of ciphertext
    bind(END_REMAINDER4);
    evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
    evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
    addq(pos, 64);
    // load mask for incrementing the counter value by 1
    evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 80), Assembler::AVX_128bit, r15);//Linc0 + 16(rip)

    // For a single block, the AES rounds start here.
    bind(REMAINDER_LOOP);
    cmpl(len_reg, 0);
    jcc(Assembler::belowEqual, END);
    // XOR counter with first roundkey
    vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_128bit);
    evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_128bit);
    vaesenc(xmm0, xmm0, xmm21, Assembler::AVX_128bit);
    // Increment counter by 1
    vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_128bit);
    vaesenc(xmm0, xmm0, xmm22, Assembler::AVX_128bit);
    vaesenc(xmm0, xmm0, xmm23, Assembler::AVX_128bit);
    vaesenc(xmm0, xmm0, xmm24, Assembler::AVX_128bit);
    vaesenc(xmm0, xmm0, xmm25, Assembler::AVX_128bit);
    vaesenc(xmm0, xmm0, xmm26, Assembler::AVX_128bit);
    vaesenc(xmm0, xmm0, xmm27, Assembler::AVX_128bit);
    vaesenc(xmm0, xmm0, xmm28, Assembler::AVX_128bit);
    vaesenc(xmm0, xmm0, xmm29, Assembler::AVX_128bit);

    cmpl(rounds, 52);
    jcc(Assembler::aboveEqual, AES192_REMAINDER);
    vaesenclast(xmm0, xmm0, xmm30, Assembler::AVX_128bit);
    jmp(END_REMAINDER_LOOP);

    bind(AES192_REMAINDER);
    vaesenc(xmm0, xmm0, xmm30, Assembler::AVX_128bit);
    ev_load_key(xmm18, key, 11 * 16, xmm31);
    vaesenc(xmm0, xmm0, xmm18, Assembler::AVX_128bit);
    ev_load_key(xmm5, key, 12 * 16, xmm31);
    cmpl(rounds, 60);
    jcc(Assembler::aboveEqual, AES256_REMAINDER);
    vaesenclast(xmm0, xmm0, xmm5, Assembler::AVX_128bit);
    jmp(END_REMAINDER_LOOP);

    bind(AES256_REMAINDER);
    vaesenc(xmm0, xmm0, xmm5, Assembler::AVX_128bit);
    ev_load_key(xmm6, key, 13 * 16, xmm31);
    vaesenc(xmm0, xmm0, xmm6, Assembler::AVX_128bit);
    ev_load_key(xmm7, key, 14 * 16, xmm31);
    vaesenclast(xmm0, xmm0, xmm7, Assembler::AVX_128bit);

    bind(END_REMAINDER_LOOP);
    // If the length register is less than the blockSize i.e. 16
    // then we store only those bytes of the CT to the destination
    // corresponding to the length register value
    // extracting the exact number of bytes is handled by EXTRACT_TAILBYTES
    cmpl(len_reg, 16);
    jcc(Assembler::less, EXTRACT_TAILBYTES);
    subl(len_reg, 16);
    // After AES encode rounds, the encrypted block cipher lies in xmm0.
    // If the length register is equal to 16 bytes, store CT in dest after XOR operation.
    evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0), Assembler::AVX_128bit);
    evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_128bit);
    addl(pos, 16);

    jmp(REMAINDER_LOOP);

    bind(EXTRACT_TAILBYTES);
    // Save encrypted counter value in xmm0 for next invocation, before XOR operation
    movdqu(Address(saved_encCounter_start, 0), xmm0);
    // XOR encrypted block cipher in xmm0 with PT to produce CT
    evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0), Assembler::AVX_128bit);
    // extract up to 15 bytes of CT from xmm0 as specified by length register:
    // each set bit of len_reg (8, 4, 2, 1) stores that many bytes from the low end
    // of xmm0 and then shifts them out.
    testptr(len_reg, 8);
    jcc(Assembler::zero, EXTRACT_TAIL_4BYTES);
    pextrq(Address(dest_addr, pos), xmm0, 0);
    psrldq(xmm0, 8);
    addl(pos, 8);
    bind(EXTRACT_TAIL_4BYTES);
    testptr(len_reg, 4);
    jcc(Assembler::zero, EXTRACT_TAIL_2BYTES);
    pextrd(Address(dest_addr, pos), xmm0, 0);
    psrldq(xmm0, 4);
    addq(pos, 4);
    bind(EXTRACT_TAIL_2BYTES);
    testptr(len_reg, 2);
    jcc(Assembler::zero, EXTRACT_TAIL_1BYTE);
    pextrw(Address(dest_addr, pos), xmm0, 0);
    psrldq(xmm0, 2);
    addl(pos, 2);
    bind(EXTRACT_TAIL_1BYTE);
    testptr(len_reg, 1);
    jcc(Assembler::zero, END);
    pextrb(Address(dest_addr, pos), xmm0, 0);
    addl(pos, 1);

    bind(END);
    // If there are no tail bytes, store counter value and exit.
    // Otherwise the tail length is the number of bytes consumed from the
    // encrypted counter that was just saved.
    cmpl(len_reg, 0);
    jcc(Assembler::equal, STORE_CTR);
    movl(Address(used_addr, 0), len_reg);

    bind(STORE_CTR);
    // shuffle updated counter and store it
    vpshufb(xmm8, xmm8, xmm16, Assembler::AVX_128bit);
    movdqu(Address(counter, 0), xmm8);
    // Zero out counter and key registers
    evpxorq(xmm8, xmm8, xmm8, Assembler::AVX_512bit);
    evpxorq(xmm20, xmm20, xmm20, Assembler::AVX_512bit);
    evpxorq(xmm21, xmm21, xmm21, Assembler::AVX_512bit);
    evpxorq(xmm22, xmm22, xmm22, Assembler::AVX_512bit);
    evpxorq(xmm23, xmm23, xmm23, Assembler::AVX_512bit);
    evpxorq(xmm24, xmm24, xmm24, Assembler::AVX_512bit);
    evpxorq(xmm25, xmm25, xmm25, Assembler::AVX_512bit);
    evpxorq(xmm26, xmm26, xmm26, Assembler::AVX_512bit);
    evpxorq(xmm27, xmm27, xmm27, Assembler::AVX_512bit);
    evpxorq(xmm28, xmm28, xmm28, Assembler::AVX_512bit);
    evpxorq(xmm29, xmm29, xmm29, Assembler::AVX_512bit);
    evpxorq(xmm30, xmm30, xmm30, Assembler::AVX_512bit);
    // The registers that received round keys 11-14 are only cleared for the key
    // sizes that actually load them.
    cmpl(rounds, 44);
    jcc(Assembler::belowEqual, EXIT);
    evpxorq(xmm18, xmm18, xmm18, Assembler::AVX_512bit);
    evpxorq(xmm5, xmm5, xmm5, Assembler::AVX_512bit);
    cmpl(rounds, 52);
    jcc(Assembler::belowEqual, EXIT);
    evpxorq(xmm6, xmm6, xmm6, Assembler::AVX_512bit);
    evpxorq(xmm7, xmm7, xmm7, Assembler::AVX_512bit);
    bind(EXIT);
}
#endif // _LP64