8205398: AES-CBC decryption algorithm using AVX512 instructions

Co-authored-by: Shay Gueron <shay.gueron@intel.com>
Co-authored-by: Smita Kamath <smita.kamath@intel.com>
Co-authored-by: Shravya Rukmannagari <shravya.rukmannagari@intel.com>
Reviewed-by: kvn
Regev Shemy 2018-06-21 10:54:07 -07:00 committed by Vladimir Kozlov
parent 277b35da28
commit a6ac56a69e
5 changed files with 372 additions and 2 deletions
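The change adds a stub that vectorizes the standard CBC decryption recurrence: each plaintext block is the AES block decryption of its ciphertext block XORed with the previous ciphertext block (the IV for the first block). Because decryption XORs against ciphertext that is already known, there is no serial dependency and many blocks can be decrypted in parallel. The following is a minimal scalar sketch of that recurrence, kept self-contained by using a toy stand-in for the AES block decryption; it illustrates the chaining only and is not the stub's code.

#include <cstdint>
#include <cstring>
#include <cstddef>

// Toy stand-in for AES block decryption, used only to keep the sketch self-contained.
// The real stub applies 10/12/14 AESDEC rounds per 16-byte block instead.
static void toy_block_decrypt(const uint8_t in[16], uint8_t out[16], const uint8_t key[16]) {
  for (int i = 0; i < 16; i++) out[i] = in[i] ^ key[i];
}

// CBC decryption: plain[i] = D(cipher[i]) ^ cipher[i-1], with cipher[-1] = IV.
// Decryption chains on *ciphertext*, so every block can be decrypted independently
// and the XOR applied afterwards; that is the property the AVX512 stub exploits.
static void cbc_decrypt(const uint8_t* cipher, uint8_t* plain, size_t len,
                        const uint8_t key[16], uint8_t iv[16]) {
  uint8_t prev[16];
  std::memcpy(prev, iv, 16);
  for (size_t off = 0; off + 16 <= len; off += 16) {     // len is a multiple of the 16-byte block size
    uint8_t block[16];
    toy_block_decrypt(cipher + off, block, key);
    for (int i = 0; i < 16; i++) plain[off + i] = block[i] ^ prev[i];
    std::memcpy(prev, cipher + off, 16);                  // the next block XORs with this ciphertext block
  }
  std::memcpy(iv, prev, 16);  // hand back the last ciphertext block, as the stub does through rvec
}

In the stub below, eight ZMM registers each hold four 16-byte ciphertext blocks, so one pass of the wide loop decrypts 8 * 4 * 16 = 512 bytes.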


@@ -1303,6 +1303,16 @@ void Assembler::aesdec(XMMRegister dst, XMMRegister src) {
emit_int8(0xC0 | encode);
}
void Assembler::vaesdec(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
assert(VM_Version::supports_vaes(), "");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
attributes.set_is_evex_instruction();
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int8((unsigned char)0xDE);
emit_int8((unsigned char)(0xC0 | encode));
}
void Assembler::aesdeclast(XMMRegister dst, Address src) {
assert(VM_Version::supports_aes(), "");
InstructionMark im(this);
@@ -1320,6 +1330,15 @@ void Assembler::aesdeclast(XMMRegister dst, XMMRegister src) {
emit_int8((unsigned char)(0xC0 | encode));
}
void Assembler::vaesdeclast(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
assert(VM_Version::supports_vaes(), "");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
attributes.set_is_evex_instruction();
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int8((unsigned char)0xDF);
emit_int8((unsigned char)(0xC0 | encode));
}
void Assembler::aesenc(XMMRegister dst, Address src) {
assert(VM_Version::supports_aes(), "");
InstructionMark im(this);
@@ -4391,6 +4410,15 @@ void Assembler::vpalignr(XMMRegister dst, XMMRegister nds, XMMRegister src, int
emit_int8(imm8);
}
void Assembler::evalignq(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
assert(VM_Version::supports_evex(), "");
InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
emit_int8(0x3);
emit_int8((unsigned char)(0xC0 | encode));
emit_int8(imm8);
}
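evalignq emits VALIGNQ: the two 512-bit sources are concatenated (first source on top), the pair is shifted right by imm8 quadwords, and the low 512 bits are kept. The main loop below uses it with imm8 = 0x06, a 48-byte shift, so that each 128-bit lane of the result holds the ciphertext block immediately preceding the block being decrypted in the same lane of the other register. A small model of the quadword alignment on plain arrays, offered as a sketch rather than the instruction's formal definition:

#include <cstdint>

// Model of VALIGNQ on 512-bit operands: concatenate hi:lo as 16 quadwords,
// shift right by imm quadwords, keep the low 8 quadwords.
static void valignq_model(uint64_t dst[8], const uint64_t hi[8],
                          const uint64_t lo[8], int imm /* 0..7 here */) {
  uint64_t cat[16];
  for (int i = 0; i < 8; i++) cat[i]     = lo[i];   // low half of the concatenation
  for (int i = 0; i < 8; i++) cat[i + 8] = hi[i];   // high half
  for (int i = 0; i < 8; i++) dst[i]     = cat[i + imm];
}

With imm = 6 the result is the top two quadwords (one 16-byte block) of lo followed by the first three blocks of hi; that is how evalignq(IV, S0, IV, 0x06) turns the broadcast IV and the blocks C0..C3 held in S0 into [IV, C0, C1, C2], ready to be XORed against the decryptions of [C0, C1, C2, C3].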
void Assembler::pblendw(XMMRegister dst, XMMRegister src, int imm8) {
assert(VM_Version::supports_sse4_1(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
@@ -6708,7 +6736,29 @@ void Assembler::evpbroadcastq(XMMRegister dst, Address src, int vector_len) {
emit_int8(0x59);
emit_operand(dst, src);
}
void Assembler::evbroadcasti64x2(XMMRegister dst, XMMRegister src, int vector_len) {
assert(vector_len != Assembler::AVX_128bit, "");
assert(VM_Version::supports_avx512dq(), "");
InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
attributes.set_rex_vex_w_reverted();
int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int8(0x5A);
emit_int8((unsigned char)(0xC0 | encode));
}
void Assembler::evbroadcasti64x2(XMMRegister dst, Address src, int vector_len) {
assert(vector_len != Assembler::AVX_128bit, "");
assert(VM_Version::supports_avx512dq(), "");
assert(dst != xnoreg, "sanity");
InstructionMark im(this);
InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
attributes.set_rex_vex_w_reverted();
attributes.set_address_attributes(/* tuple_type */ EVEX_T2, /* input_size_in_bits */ EVEX_64bit);
// swap src<->dst for encoding
vex_prefix(src, 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int8(0x5A);
emit_operand(dst, src);
}
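evbroadcasti64x2 emits VBROADCASTI64X2, which replicates a 128-bit (two-quadword) source into every 128-bit lane of the destination; the stub uses the memory form once, to copy the 16-byte IV at rvec into all four lanes of a ZMM register. Modeled on plain arrays for the 512-bit case:

#include <cstdint>

// Model of VBROADCASTI64X2 with a 512-bit destination: the two source quadwords
// are copied into each of the four 128-bit lanes.
static void broadcasti64x2_model(uint64_t dst[8], const uint64_t src[2]) {
  for (int lane = 0; lane < 4; lane++) {
    dst[2 * lane]     = src[0];
    dst[2 * lane + 1] = src[1];
  }
}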
// scalar single/double precision replicate


@@ -926,7 +926,8 @@ private:
void aesenc(XMMRegister dst, XMMRegister src);
void aesenclast(XMMRegister dst, Address src);
void aesenclast(XMMRegister dst, XMMRegister src);
void vaesdec(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vaesdeclast(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void andl(Address dst, int32_t imm32);
void andl(Register dst, int32_t imm32);
@@ -1739,6 +1740,7 @@ private:
void palignr(XMMRegister dst, XMMRegister src, int imm8);
void vpalignr(XMMRegister dst, XMMRegister src1, XMMRegister src2, int imm8, int vector_len);
void evalignq(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
void pblendw(XMMRegister dst, XMMRegister src, int imm8);
@@ -2102,6 +2104,9 @@ private:
void evpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len);
void evpbroadcastq(XMMRegister dst, Address src, int vector_len);
void evbroadcasti64x2(XMMRegister dst, XMMRegister src, int vector_len);
void evbroadcasti64x2(XMMRegister dst, Address src, int vector_len);
// scalar single/double precision replicate
void evpbroadcastss(XMMRegister dst, XMMRegister src, int vector_len);
void evpbroadcastss(XMMRegister dst, Address src, int vector_len);


@@ -4084,6 +4084,312 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
void roundDec(XMMRegister xmm_reg) {
__ vaesdec(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit);
__ vaesdec(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit);
__ vaesdec(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit);
__ vaesdec(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit);
__ vaesdec(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit);
__ vaesdec(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit);
__ vaesdec(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit);
__ vaesdec(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit);
}
void roundDeclast(XMMRegister xmm_reg) {
__ vaesdeclast(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit);
__ vaesdeclast(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit);
__ vaesdeclast(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit);
__ vaesdeclast(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit);
__ vaesdeclast(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit);
__ vaesdeclast(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit);
__ vaesdeclast(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit);
__ vaesdeclast(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit);
}
void ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask = NULL) {
__ movdqu(xmmdst, Address(key, offset));
if (xmm_shuf_mask != NULL) {
__ pshufb(xmmdst, xmm_shuf_mask);
} else {
__ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
}
__ evshufi64x2(xmmdst, xmmdst, xmmdst, 0x0, Assembler::AVX_512bit);
}
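ev_load_key reads one 16-byte round key out of the Java expanded key, byte-swaps it with pshufb against the key shuffle mask, and then uses evshufi64x2 with an immediate of 0 to replicate that 128-bit key into all four lanes of the register. A rough model follows, under the assumption (not shown in this diff) that the key shuffle mask swaps the bytes within each 32-bit word, which is what loading the Java int[] expanded key on little-endian x86 requires:

#include <cstdint>

// Rough model of ev_load_key. Assumes key_shuffle_mask performs a per-32-bit-word
// byte swap; word_offset is the byte offset used in the stub divided by 4.
static void load_round_key_model(uint32_t dst[16], const uint32_t* java_key, int word_offset) {
  uint32_t rk[4];
  for (int i = 0; i < 4; i++)
    rk[i] = __builtin_bswap32(java_key[word_offset + i]);  // the pshufb with the shuffle mask
  for (int lane = 0; lane < 4; lane++)                     // the evshufi64x2(..., 0x0, AVX_512bit) broadcast
    for (int i = 0; i < 4; i++)
      dst[4 * lane + i] = rk[i];
}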
address generate_cipherBlockChaining_decryptVectorAESCrypt() {
assert(VM_Version::supports_vaes(), "need AES instructions and misaligned SSE support");
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
address start = __ pc();
const Register from = c_rarg0; // source array address
const Register to = c_rarg1; // destination array address
const Register key = c_rarg2; // key array address
const Register rvec = c_rarg3; // r byte array initialized from initvector array address
// and left with the results of the last encryption block
#ifndef _WIN64
const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
#else
const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
const Register len_reg = r11; // pick the volatile windows register
#endif
Label Loop, Loop1, L_128, L_256, L_192, KEY_192, KEY_256, Loop2, Lcbc_dec_rem_loop,
Lcbc_dec_rem_last, Lcbc_dec_ret, Lcbc_dec_rem, Lcbc_exit;
__ enter();
#ifdef _WIN64
// on win64, fill len_reg from stack position
__ movl(len_reg, len_mem);
#else
__ push(len_reg); // Save
#endif
__ push(rbx);
__ vzeroupper();
// Temporary variable declaration for swapping key bytes
const XMMRegister xmm_key_shuf_mask = xmm1;
__ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
// Calculate number of rounds from key size: 44 for 10-rounds, 52 for 12-rounds, 60 for 14-rounds
const Register rounds = rbx;
__ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
const XMMRegister IV = xmm0;
// Load IV and broadcast value to 512-bits
__ evbroadcasti64x2(IV, Address(rvec, 0), Assembler::AVX_512bit);
// Temporary variables for storing round keys
const XMMRegister RK0 = xmm30;
const XMMRegister RK1 = xmm9;
const XMMRegister RK2 = xmm18;
const XMMRegister RK3 = xmm19;
const XMMRegister RK4 = xmm20;
const XMMRegister RK5 = xmm21;
const XMMRegister RK6 = xmm22;
const XMMRegister RK7 = xmm23;
const XMMRegister RK8 = xmm24;
const XMMRegister RK9 = xmm25;
const XMMRegister RK10 = xmm26;
// Load and shuffle key
// the java expanded key ordering is rotated one position from what we want
// so we start from 1*16 here and hit 0*16 last
ev_load_key(RK1, key, 1 * 16, xmm_key_shuf_mask);
ev_load_key(RK2, key, 2 * 16, xmm_key_shuf_mask);
ev_load_key(RK3, key, 3 * 16, xmm_key_shuf_mask);
ev_load_key(RK4, key, 4 * 16, xmm_key_shuf_mask);
ev_load_key(RK5, key, 5 * 16, xmm_key_shuf_mask);
ev_load_key(RK6, key, 6 * 16, xmm_key_shuf_mask);
ev_load_key(RK7, key, 7 * 16, xmm_key_shuf_mask);
ev_load_key(RK8, key, 8 * 16, xmm_key_shuf_mask);
ev_load_key(RK9, key, 9 * 16, xmm_key_shuf_mask);
ev_load_key(RK10, key, 10 * 16, xmm_key_shuf_mask);
ev_load_key(RK0, key, 0*16, xmm_key_shuf_mask);
// Variables for storing source cipher text
const XMMRegister S0 = xmm10;
const XMMRegister S1 = xmm11;
const XMMRegister S2 = xmm12;
const XMMRegister S3 = xmm13;
const XMMRegister S4 = xmm14;
const XMMRegister S5 = xmm15;
const XMMRegister S6 = xmm16;
const XMMRegister S7 = xmm17;
// Variables for storing decrypted text
const XMMRegister B0 = xmm1;
const XMMRegister B1 = xmm2;
const XMMRegister B2 = xmm3;
const XMMRegister B3 = xmm4;
const XMMRegister B4 = xmm5;
const XMMRegister B5 = xmm6;
const XMMRegister B6 = xmm7;
const XMMRegister B7 = xmm8;
__ cmpl(rounds, 44);
__ jcc(Assembler::greater, KEY_192);
__ jmp(Loop);
__ BIND(KEY_192);
const XMMRegister RK11 = xmm27;
const XMMRegister RK12 = xmm28;
ev_load_key(RK11, key, 11*16, xmm_key_shuf_mask);
ev_load_key(RK12, key, 12*16, xmm_key_shuf_mask);
__ cmpl(rounds, 52);
__ jcc(Assembler::greater, KEY_256);
__ jmp(Loop);
__ BIND(KEY_256);
const XMMRegister RK13 = xmm29;
const XMMRegister RK14 = xmm31;
ev_load_key(RK13, key, 13*16, xmm_key_shuf_mask);
ev_load_key(RK14, key, 14*16, xmm_key_shuf_mask);
__ BIND(Loop);
__ cmpl(len_reg, 512);
__ jcc(Assembler::below, Lcbc_dec_rem);
__ BIND(Loop1);
__ subl(len_reg, 512);
__ evmovdquq(S0, Address(from, 0 * 64), Assembler::AVX_512bit);
__ evmovdquq(S1, Address(from, 1 * 64), Assembler::AVX_512bit);
__ evmovdquq(S2, Address(from, 2 * 64), Assembler::AVX_512bit);
__ evmovdquq(S3, Address(from, 3 * 64), Assembler::AVX_512bit);
__ evmovdquq(S4, Address(from, 4 * 64), Assembler::AVX_512bit);
__ evmovdquq(S5, Address(from, 5 * 64), Assembler::AVX_512bit);
__ evmovdquq(S6, Address(from, 6 * 64), Assembler::AVX_512bit);
__ evmovdquq(S7, Address(from, 7 * 64), Assembler::AVX_512bit);
__ leaq(from, Address(from, 8 * 64));
__ evpxorq(B0, S0, RK1, Assembler::AVX_512bit);
__ evpxorq(B1, S1, RK1, Assembler::AVX_512bit);
__ evpxorq(B2, S2, RK1, Assembler::AVX_512bit);
__ evpxorq(B3, S3, RK1, Assembler::AVX_512bit);
__ evpxorq(B4, S4, RK1, Assembler::AVX_512bit);
__ evpxorq(B5, S5, RK1, Assembler::AVX_512bit);
__ evpxorq(B6, S6, RK1, Assembler::AVX_512bit);
__ evpxorq(B7, S7, RK1, Assembler::AVX_512bit);
__ evalignq(IV, S0, IV, 0x06);
__ evalignq(S0, S1, S0, 0x06);
__ evalignq(S1, S2, S1, 0x06);
__ evalignq(S2, S3, S2, 0x06);
__ evalignq(S3, S4, S3, 0x06);
__ evalignq(S4, S5, S4, 0x06);
__ evalignq(S5, S6, S5, 0x06);
__ evalignq(S6, S7, S6, 0x06);
roundDec(RK2);
roundDec(RK3);
roundDec(RK4);
roundDec(RK5);
roundDec(RK6);
roundDec(RK7);
roundDec(RK8);
roundDec(RK9);
roundDec(RK10);
__ cmpl(rounds, 44);
__ jcc(Assembler::belowEqual, L_128);
roundDec(RK11);
roundDec(RK12);
__ cmpl(rounds, 52);
__ jcc(Assembler::belowEqual, L_192);
roundDec(RK13);
roundDec(RK14);
__ BIND(L_256);
roundDeclast(RK0);
__ jmp(Loop2);
__ BIND(L_128);
roundDeclast(RK0);
__ jmp(Loop2);
__ BIND(L_192);
roundDeclast(RK0);
__ BIND(Loop2);
__ evpxorq(B0, B0, IV, Assembler::AVX_512bit);
__ evpxorq(B1, B1, S0, Assembler::AVX_512bit);
__ evpxorq(B2, B2, S1, Assembler::AVX_512bit);
__ evpxorq(B3, B3, S2, Assembler::AVX_512bit);
__ evpxorq(B4, B4, S3, Assembler::AVX_512bit);
__ evpxorq(B5, B5, S4, Assembler::AVX_512bit);
__ evpxorq(B6, B6, S5, Assembler::AVX_512bit);
__ evpxorq(B7, B7, S6, Assembler::AVX_512bit);
__ evmovdquq(IV, S7, Assembler::AVX_512bit);
__ evmovdquq(Address(to, 0 * 64), B0, Assembler::AVX_512bit);
__ evmovdquq(Address(to, 1 * 64), B1, Assembler::AVX_512bit);
__ evmovdquq(Address(to, 2 * 64), B2, Assembler::AVX_512bit);
__ evmovdquq(Address(to, 3 * 64), B3, Assembler::AVX_512bit);
__ evmovdquq(Address(to, 4 * 64), B4, Assembler::AVX_512bit);
__ evmovdquq(Address(to, 5 * 64), B5, Assembler::AVX_512bit);
__ evmovdquq(Address(to, 6 * 64), B6, Assembler::AVX_512bit);
__ evmovdquq(Address(to, 7 * 64), B7, Assembler::AVX_512bit);
__ leaq(to, Address(to, 8 * 64));
__ jmp(Loop);
__ BIND(Lcbc_dec_rem);
__ evshufi64x2(IV, IV, IV, 0x03, Assembler::AVX_512bit);
__ BIND(Lcbc_dec_rem_loop);
__ subl(len_reg, 16);
__ jcc(Assembler::carrySet, Lcbc_dec_ret);
__ movdqu(S0, Address(from, 0));
__ evpxorq(B0, S0, RK1, Assembler::AVX_512bit);
__ vaesdec(B0, B0, RK2, Assembler::AVX_512bit);
__ vaesdec(B0, B0, RK3, Assembler::AVX_512bit);
__ vaesdec(B0, B0, RK4, Assembler::AVX_512bit);
__ vaesdec(B0, B0, RK5, Assembler::AVX_512bit);
__ vaesdec(B0, B0, RK6, Assembler::AVX_512bit);
__ vaesdec(B0, B0, RK7, Assembler::AVX_512bit);
__ vaesdec(B0, B0, RK8, Assembler::AVX_512bit);
__ vaesdec(B0, B0, RK9, Assembler::AVX_512bit);
__ vaesdec(B0, B0, RK10, Assembler::AVX_512bit);
__ cmpl(rounds, 44);
__ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);
__ vaesdec(B0, B0, RK11, Assembler::AVX_512bit);
__ vaesdec(B0, B0, RK12, Assembler::AVX_512bit);
__ cmpl(rounds, 52);
__ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);
__ vaesdec(B0, B0, RK13, Assembler::AVX_512bit);
__ vaesdec(B0, B0, RK14, Assembler::AVX_512bit);
__ BIND(Lcbc_dec_rem_last);
__ vaesdeclast(B0, B0, RK0, Assembler::AVX_512bit);
__ evpxorq(B0, B0, IV, Assembler::AVX_512bit);
__ evmovdquq(IV, S0, Assembler::AVX_512bit);
__ movdqu(Address(to, 0), B0);
__ leaq(from, Address(from, 16));
__ leaq(to, Address(to, 16));
__ jmp(Lcbc_dec_rem_loop);
__ BIND(Lcbc_dec_ret);
__ movdqu(Address(rvec, 0), IV);
// Zero out the round keys
__ evpxorq(RK0, RK0, RK0, Assembler::AVX_512bit);
__ evpxorq(RK1, RK1, RK1, Assembler::AVX_512bit);
__ evpxorq(RK2, RK2, RK2, Assembler::AVX_512bit);
__ evpxorq(RK3, RK3, RK3, Assembler::AVX_512bit);
__ evpxorq(RK4, RK4, RK4, Assembler::AVX_512bit);
__ evpxorq(RK5, RK5, RK5, Assembler::AVX_512bit);
__ evpxorq(RK6, RK6, RK6, Assembler::AVX_512bit);
__ evpxorq(RK7, RK7, RK7, Assembler::AVX_512bit);
__ evpxorq(RK8, RK8, RK8, Assembler::AVX_512bit);
__ evpxorq(RK9, RK9, RK9, Assembler::AVX_512bit);
__ evpxorq(RK10, RK10, RK10, Assembler::AVX_512bit);
__ cmpl(rounds, 44);
__ jcc(Assembler::belowEqual, Lcbc_exit);
__ evpxorq(RK11, RK11, RK11, Assembler::AVX_512bit);
__ evpxorq(RK12, RK12, RK12, Assembler::AVX_512bit);
__ cmpl(rounds, 52);
__ jcc(Assembler::belowEqual, Lcbc_exit);
__ evpxorq(RK13, RK13, RK13, Assembler::AVX_512bit);
__ evpxorq(RK14, RK14, RK14, Assembler::AVX_512bit);
__ BIND(Lcbc_exit);
__ pop(rbx);
#ifdef _WIN64
__ movl(rax, len_mem);
#else
__ pop(rax); // return length
#endif
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
return start;
}
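Two pieces of arithmetic the stub relies on: the Java expanded key is an int array of length 4 * (rounds + 1), so comparing its length against 44/52/60 selects the 10/12/14-round paths for AES-128/192/256; and the wide loop consumes eight 512-bit registers of ciphertext per iteration, i.e. 8 registers * 4 blocks * 16 bytes = 512 bytes, which is why it only runs while len_reg is at least 512 and the remainder loop handles the tail one block at a time. A small sanity sketch:

#include <cassert>

// Expanded-key length (in 32-bit words) to AES round count: 44 -> 10, 52 -> 12, 60 -> 14.
static int rounds_from_key_words(int key_words) {
  return key_words / 4 - 1;
}

// Bytes handled by one pass of the wide loop: 8 ZMM registers x 4 blocks x 16 bytes.
static const int kBulkBytes = 8 * 4 * 16;   // 512

int main() {
  assert(rounds_from_key_words(44) == 10);
  assert(rounds_from_key_words(52) == 12);
  assert(rounds_from_key_words(60) == 14);
  assert(kBulkBytes == 512);
  return 0;
}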
// byte swap x86 long
address generate_ghash_long_swap_mask() {
__ align(CodeEntryAlignment);
@@ -5078,7 +5384,11 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
if (VM_Version::supports_vaes() && VM_Version::supports_avx512vl() && VM_Version::supports_avx512dq() ) {
StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptVectorAESCrypt();
} else {
StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
}
}
if (UseAESCTRIntrinsics){
StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();


@@ -666,6 +666,7 @@ void VM_Version::get_processor_features() {
_features &= ~CPU_AVX512VL;
_features &= ~CPU_AVX512_VPOPCNTDQ;
_features &= ~CPU_VPCLMULQDQ;
_features &= ~CPU_VAES;
}
if (UseAVX < 2)


@@ -335,6 +335,7 @@ protected:
#define CPU_VZEROUPPER ((uint64_t)UCONST64(0x1000000000)) // Vzeroupper instruction
#define CPU_AVX512_VPOPCNTDQ ((uint64_t)UCONST64(0x2000000000)) // Vector popcount
#define CPU_VPCLMULQDQ ((uint64_t)UCONST64(0x4000000000)) //Vector carryless multiplication
#define CPU_VAES ((uint64_t)UCONST64(0x8000000000)) // Vector AES instructions
enum Extended_Family {
// AMD
@@ -545,6 +546,8 @@ protected:
result |= CPU_AVX512_VPOPCNTDQ;
if (_cpuid_info.sef_cpuid7_ecx.bits.vpclmulqdq != 0)
result |= CPU_VPCLMULQDQ;
if (_cpuid_info.sef_cpuid7_ecx.bits.vaes != 0)
result |= CPU_VAES;
}
}
if(_cpuid_info.sef_cpuid7_ebx.bits.bmi1 != 0)
@@ -823,6 +826,7 @@ public:
static bool supports_vzeroupper() { return (_features & CPU_VZEROUPPER) != 0; }
static bool supports_vpopcntdq() { return (_features & CPU_AVX512_VPOPCNTDQ) != 0; }
static bool supports_vpclmulqdq() { return (_features & CPU_VPCLMULQDQ) != 0; }
static bool supports_vaes() { return (_features & CPU_VAES) != 0; }
// Intel features
static bool is_intel_family_core() { return is_intel() &&