8341527: AVX-512 intrinsic for SHA3

Reviewed-by: sviswanathan
Ferenc Rakoczi 2024-10-29 15:18:24 +00:00 committed by Weijun Wang
parent 4ce19ca110
commit 9cfb0f7f7a
9 changed files with 426 additions and 5 deletions

src/hotspot/cpu/x86/assembler_x86.cpp

@ -3585,6 +3585,23 @@ void Assembler::evmovdqub(Address dst, KRegister mask, XMMRegister src, bool mer
emit_operand(src, dst, 0);
}
void Assembler::evmovdquw(XMMRegister dst, XMMRegister src, int vector_len) {
// Unmasked instruction
evmovdquw(dst, k0, src, /*merge*/ false, vector_len);
}
void Assembler::evmovdquw(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
assert(vector_len <= AVX_256bit ? VM_Version::supports_avx512vlbw() : VM_Version::supports_avx512bw(), "");
InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
attributes.set_embedded_opmask_register_specifier(mask);
attributes.set_is_evex_instruction();
if (merge) {
attributes.reset_is_clear_context();
}
int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
emit_int16(0x6F, (0xC0 | encode));
}
void Assembler::evmovdquw(XMMRegister dst, Address src, int vector_len) {
// Unmasked instruction
evmovdquw(dst, k0, src, /*merge*/ false, vector_len);
@ -8711,6 +8728,15 @@ void Assembler::vpmuludq(XMMRegister dst, XMMRegister nds, XMMRegister src, int
emit_int16((unsigned char)0xF4, (0xC0 | encode));
}
void Assembler::vpmuldq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
assert(vector_len == AVX_128bit ? VM_Version::supports_avx() :
(vector_len == AVX_256bit ? VM_Version::supports_avx2() : VM_Version::supports_evex()), "");
InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
attributes.set_rex_vex_w_reverted();
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int16(0x28, (0xC0 | encode));
}
void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
assert(UseAVX > 0, "requires some form of AVX");
InstructionMark im(this);
@ -11246,6 +11272,18 @@ void Assembler::evpmullq(XMMRegister dst, KRegister mask, XMMRegister nds, Addre
emit_operand(dst, src, 0);
}
void Assembler::evpmulhw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
assert(VM_Version::supports_avx512bw() && (vector_len == AVX_512bit || VM_Version::supports_avx512vl()), "");
InstructionAttr attributes(vector_len, /* vex_w */ false,/* legacy_mode */ false, /* no_mask_reg */ false,/* uses_vl */ true);
attributes.set_is_evex_instruction();
attributes.set_embedded_opmask_register_specifier(mask);
if (merge) {
attributes.reset_is_clear_context();
}
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int16((unsigned char)0xE5, (0xC0 | encode));
}
void Assembler::evmulps(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
assert(VM_Version::supports_evex(), "");
assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), "");
@ -16914,3 +16952,28 @@ void Assembler::evpermt2b(XMMRegister dst, XMMRegister nds, XMMRegister src, int
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int16(0x7D, (0xC0 | encode));
}
void Assembler::evpermt2w(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
assert(vector_len <= AVX_256bit ? VM_Version::supports_avx512vlbw() : VM_Version::supports_avx512bw(), "");
InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
attributes.set_is_evex_instruction();
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int16(0x7D, (0xC0 | encode));
}
void Assembler::evpermt2d(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
assert(VM_Version::supports_evex() && (vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl()), "");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
attributes.set_is_evex_instruction();
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int16(0x7E, (0xC0 | encode));
}
void Assembler::evpermt2q(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
assert(VM_Version::supports_evex() && (vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl()), "");
InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
attributes.set_is_evex_instruction();
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int16(0x7E, (0xC0 | encode));
}
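For reference while reading the SHA3 stub later in this commit, here is a minimal scalar model of the lane selection VPERMT2Q performs on 512-bit operands, assuming the Intel SDM operand roles (destination = first table, the nds operand supplies the indices, src = second table). The Java helper name is illustrative only and not part of the patch:

    // Scalar model of VPERMT2Q with 8 qword lanes: bit 3 of each index picks
    // the table (0 -> table1/destination, 1 -> table2), bits 2:0 pick the lane.
    static long[] permt2q(long[] table1, long[] idx, long[] table2) {
        long[] result = new long[8];
        for (int i = 0; i < 8; i++) {
            int sel = (int) (idx[i] & 0xF);
            long[] table = (sel & 0x8) == 0 ? table1 : table2;
            result[i] = table[sel & 0x7];
        }
        return result;
    }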

src/hotspot/cpu/x86/assembler_x86.hpp

@ -1757,6 +1757,7 @@ private:
void evmovdqub(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len);
void evmovdqub(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
void evmovdquw(XMMRegister dst, XMMRegister src, int vector_len);
void evmovdquw(XMMRegister dst, Address src, int vector_len);
void evmovdquw(Address dst, XMMRegister src, int vector_len);
void evmovdquw(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
@ -1970,6 +1971,9 @@ private:
void evpermi2ps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void evpermi2pd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void evpermt2b(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void evpermt2w(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void evpermt2d(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void evpermt2q(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void pause();
@ -2741,6 +2745,7 @@ private:
void evsubps(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
void evsubpd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
void evsubpd(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
void evpmulhw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
void evpmullw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
void evpmullw(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
void evpmulld(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
@ -2876,6 +2881,7 @@ private:
void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void evpmullq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpmuludq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpmuldq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
void vpmulld(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
void evpmullq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

src/hotspot/cpu/x86/macroAssembler_x86.hpp

@ -1296,6 +1296,7 @@ public:
void evmovdqub(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len) { Assembler::evmovdqub(dst, mask, src, merge, vector_len); }
void evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
void evmovdquw(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::evmovdquw(dst, src, vector_len); }
void evmovdquw(Address dst, XMMRegister src, int vector_len) { Assembler::evmovdquw(dst, src, vector_len); }
void evmovdquw(XMMRegister dst, Address src, int vector_len) { Assembler::evmovdquw(dst, src, vector_len); }
@ -1506,6 +1507,8 @@ public:
void vpmulld(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vpmulld(dst, nds, src, vector_len); }
void vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
void vpmuldq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vpmuldq(dst, nds, src, vector_len); }
void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
@ -1515,9 +1518,13 @@ public:
void vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
void vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len);
void evpsrad(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
void evpsrad(XMMRegister dst, XMMRegister nds, int shift, int vector_len);
void evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
void evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len);
using Assembler::evpsllw;
void evpsllw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
if (!is_varshift) {
Assembler::evpsllw(dst, mask, nds, src, merge, vector_len);
@ -1562,6 +1569,7 @@ public:
Assembler::evpsrlvq(dst, mask, nds, src, merge, vector_len);
}
}
using Assembler::evpsraw;
void evpsraw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
if (!is_varshift) {
Assembler::evpsraw(dst, mask, nds, src, merge, vector_len);
@ -1569,6 +1577,7 @@ public:
Assembler::evpsravw(dst, mask, nds, src, merge, vector_len);
}
}
using Assembler::evpsrad;
void evpsrad(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
if (!is_varshift) {
Assembler::evpsrad(dst, mask, nds, src, merge, vector_len);

src/hotspot/cpu/x86/stubGenerator_x86_64.cpp

@ -4032,6 +4032,8 @@ void StubGenerator::generate_compiler_stubs() {
generate_chacha_stubs();
generate_sha3_stubs();
#ifdef COMPILER2
if ((UseAVX == 2) && EnableX86ECoreOpts) {
generate_string_indexof(StubRoutines::_string_indexof_array);

src/hotspot/cpu/x86/stubGenerator_x86_64.hpp

@ -497,6 +497,10 @@ class StubGenerator: public StubCodeGenerator {
address generate_intpoly_montgomeryMult_P256();
address generate_intpoly_assign();
// SHA3 stubs
void generate_sha3_stubs();
address generate_sha3_implCompress(bool multiBlock, const char *name);
// BASE64 stubs
address base64_shuffle_addr();

src/hotspot/cpu/x86/stubGenerator_x86_64_sha3.cpp (new file)

@ -0,0 +1,326 @@
/*
* Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "runtime/stubRoutines.hpp"
#include "macroAssembler_x86.hpp"
#include "stubGenerator_x86_64.hpp"
#define __ _masm->
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif // PRODUCT
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
// Constants
ATTRIBUTE_ALIGNED(64) static const uint64_t round_consts_arr[24] = {
0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
};
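The table above holds the 24 round constants of Keccak-f[1600] from FIPS 202. For cross-checking, a minimal Java sketch that regenerates them from the rc(t) LFSR; the method name is illustrative only and not part of the patch:

    // Bit (2^j - 1) of constant i is rc(7*i + j), where rc(t) comes from the
    // FIPS 202 LFSR with primitive polynomial x^8 + x^6 + x^5 + x^4 + 1.
    static long[] keccakRoundConstants() {
        long[] rc = new long[24];
        int lfsr = 0x01;                               // LFSR state, low bit is rc(t)
        for (int i = 0; i < 24; i++) {
            for (int j = 0; j < 7; j++) {
                if ((lfsr & 0x01) != 0) {
                    rc[i] ^= 1L << ((1 << j) - 1);     // set bit 2^j - 1
                }
                lfsr = ((lfsr & 0x80) != 0) ? ((lfsr << 1) ^ 0x71) & 0xFF
                                            : (lfsr << 1) & 0xFF;
            }
        }
        return rc;
    }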
ATTRIBUTE_ALIGNED(64) static const uint64_t permsAndRots[] = {
// permutation in combined rho and pi
9, 2, 11, 0, 1, 2, 3, 4, // step 1 and 3
8, 1, 9, 2, 11, 4, 12, 0, // step 2
9, 2, 10, 3, 11, 4, 12, 0, // step 4
8, 9, 2, 3, 4, 5, 6, 7, // step 5
0, 8, 9, 10, 15, 0, 0, 0, // step 6
4, 5, 8, 9, 6, 7, 10, 11, // step 7 and 8
0, 1, 2, 3, 13, 0, 0, 0, // step 9
2, 3, 0, 1, 11, 0, 0, 0, // step 10
4, 5, 6, 7, 14, 0, 0, 0, // step 11
14, 15, 12, 13, 4, 0, 0, 0, // step 12
// size of rotations (after step 5)
1, 6, 62, 55, 28, 20, 27, 36,
3, 45, 10, 15, 25, 8, 39, 41,
44, 43, 21, 18, 2, 61, 56, 14,
// rotation of row elements
12, 8, 9, 10, 11, 5, 6, 7,
9, 10, 11, 12, 8, 5, 6, 7
};
static address round_constsAddr() {
return (address) round_consts_arr;
}
static address permsAndRotsAddr() {
return (address) permsAndRots;
}
void StubGenerator::generate_sha3_stubs() {
if (UseSHA3Intrinsics) {
StubRoutines::_sha3_implCompress = generate_sha3_implCompress(false, "sha3_implCompress");
StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(true, "sha3_implCompressMB");
}
}
// Arguments:
//
// Inputs:
// c_rarg0 - byte[] source+offset
// c_rarg1 - long[] SHA3.state
// c_rarg2 - int block_size
// c_rarg3 - int offset
// c_rarg4 - int limit
//
address StubGenerator::generate_sha3_implCompress(bool multiBlock, const char *name) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
const Register buf = c_rarg0;
const Register state = c_rarg1;
const Register block_size = c_rarg2;
const Register ofs = c_rarg3;
#ifndef _WIN64
const Register limit = c_rarg4;
#else
const Address limit_mem(rbp, 6 * wordSize);
const Register limit = r12;
#endif
const Register permsAndRots = r10;
const Register round_consts = r11;
const Register constant2use = r13;
const Register roundsLeft = r14;
Label sha3_loop;
Label rounds24_loop, block104, block136, block144, block168;
__ enter();
__ push(r12);
__ push(r13);
__ push(r14);
#ifdef _WIN64
// on win64, fill limit from stack position
__ movptr(limit, limit_mem);
#endif
__ lea(permsAndRots, ExternalAddress(permsAndRotsAddr()));
__ lea(round_consts, ExternalAddress(round_constsAddr()));
// set up the masks
__ movl(rax, 0x1F);
__ kmovwl(k5, rax);
__ kshiftrwl(k4, k5, 1);
__ kshiftrwl(k3, k5, 2);
__ kshiftrwl(k2, k5, 3);
__ kshiftrwl(k1, k5, 4);
// load the state
__ evmovdquq(xmm0, k5, Address(state, 0), false, Assembler::AVX_512bit);
__ evmovdquq(xmm1, k5, Address(state, 40), false, Assembler::AVX_512bit);
__ evmovdquq(xmm2, k5, Address(state, 80), false, Assembler::AVX_512bit);
__ evmovdquq(xmm3, k5, Address(state, 120), false, Assembler::AVX_512bit);
__ evmovdquq(xmm4, k5, Address(state, 160), false, Assembler::AVX_512bit);
// load the permutation and rotation constants
__ evmovdquq(xmm17, Address(permsAndRots, 0), Assembler::AVX_512bit);
__ evmovdquq(xmm18, Address(permsAndRots, 64), Assembler::AVX_512bit);
__ evmovdquq(xmm19, Address(permsAndRots, 128), Assembler::AVX_512bit);
__ evmovdquq(xmm20, Address(permsAndRots, 192), Assembler::AVX_512bit);
__ evmovdquq(xmm21, Address(permsAndRots, 256), Assembler::AVX_512bit);
__ evmovdquq(xmm22, Address(permsAndRots, 320), Assembler::AVX_512bit);
__ evmovdquq(xmm23, Address(permsAndRots, 384), Assembler::AVX_512bit);
__ evmovdquq(xmm24, Address(permsAndRots, 448), Assembler::AVX_512bit);
__ evmovdquq(xmm25, Address(permsAndRots, 512), Assembler::AVX_512bit);
__ evmovdquq(xmm26, Address(permsAndRots, 576), Assembler::AVX_512bit);
__ evmovdquq(xmm27, Address(permsAndRots, 640), Assembler::AVX_512bit);
__ evmovdquq(xmm28, Address(permsAndRots, 704), Assembler::AVX_512bit);
__ evmovdquq(xmm29, Address(permsAndRots, 768), Assembler::AVX_512bit);
__ evmovdquq(xmm30, Address(permsAndRots, 832), Assembler::AVX_512bit);
__ evmovdquq(xmm31, Address(permsAndRots, 896), Assembler::AVX_512bit);
__ BIND(sha3_loop);
// there will be 24 keccak rounds
__ movl(roundsLeft, 24);
// load round_constants base
__ movptr(constant2use, round_consts);
// load input: 72, 104, 136, 144 or 168 bytes
// i.e. 5+4, 2*5+3, 3*5+2, 3*5+3 or 4*5+1 longs
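// (the rate is 200 - 2*n/8 bytes for SHA3-n and SHAKE-n, e.g.
// 200 - 2*64 = 72 for SHA3-512 and 200 - 2*16 = 168 for SHAKE128)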
__ evpxorq(xmm0, k5, xmm0, Address(buf, 0), true, Assembler::AVX_512bit);
// if(blockSize == 72) SHA3-512
__ cmpl(block_size, 72);
__ jcc(Assembler::notEqual, block104);
__ evpxorq(xmm1, k4, xmm1, Address(buf, 40), true, Assembler::AVX_512bit);
__ jmp(rounds24_loop);
// if(blockSize == 104) SHA3-384
__ BIND(block104);
__ cmpl(block_size, 104);
__ jcc(Assembler::notEqual, block136);
__ evpxorq(xmm1, k5, xmm1, Address(buf, 40), true, Assembler::AVX_512bit);
__ evpxorq(xmm2, k3, xmm2, Address(buf, 80), true, Assembler::AVX_512bit);
__ jmp(rounds24_loop);
// if(blockSize == 136) SHA3-256 and SHAKE256
__ BIND(block136);
__ cmpl(block_size, 136);
__ jcc(Assembler::notEqual, block144);
__ evpxorq(xmm1, k5, xmm1, Address(buf, 40), true, Assembler::AVX_512bit);
__ evpxorq(xmm2, k5, xmm2, Address(buf, 80), true, Assembler::AVX_512bit);
__ evpxorq(xmm3, k2, xmm3, Address(buf, 120), true, Assembler::AVX_512bit);
__ jmp(rounds24_loop);
// if(blockSize == 144) SHA3-224
__ BIND(block144);
__ cmpl(block_size, 144);
__ jcc(Assembler::notEqual, block168);
__ evpxorq(xmm1, k5, xmm1, Address(buf, 40), true, Assembler::AVX_512bit);
__ evpxorq(xmm2, k5, xmm2, Address(buf, 80), true, Assembler::AVX_512bit);
__ evpxorq(xmm3, k3, xmm3, Address(buf, 120), true, Assembler::AVX_512bit);
__ jmp(rounds24_loop);
// if(blockSize == 168) SHAKE128
__ BIND(block168);
__ evpxorq(xmm1, k5, xmm1, Address(buf, 40), true, Assembler::AVX_512bit);
__ evpxorq(xmm2, k5, xmm2, Address(buf, 80), true, Assembler::AVX_512bit);
__ evpxorq(xmm3, k5, xmm3, Address(buf, 120), true, Assembler::AVX_512bit);
__ evpxorq(xmm4, k1, xmm4, Address(buf, 160), true, Assembler::AVX_512bit);
// The 24 rounds of the keccak transformation.
// The implementation closely follows the Java version, with the state
// array "rows" in the lowest 5 64-bit slots of zmm0 - zmm4, i.e.
// each row of the SHA3 specification is located in one zmm register.
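For comparison with the vector code that follows, a minimal scalar Java sketch of one Keccak-f[1600] round (theta, rho, pi, chi, iota) over a 25-lane state; names and layout are illustrative and not taken from the JDK sources:

    // One scalar Keccak-f[1600] round over a 25-element long[] state indexed
    // a[x + 5*y]; the vector code below computes the same mapping with each
    // 5-lane row held in the low qwords of one zmm register.
    private static final int[] ROT = {          // rho rotation offsets
         0,  1, 62, 28, 27,
        36, 44,  6, 55, 20,
         3, 10, 43, 25, 39,
        41, 45, 15, 21,  8,
        18,  2, 61, 56, 14
    };

    static void keccakRound(long[] a, long roundConstant) {
        // theta: column parities, then xor a rotated pair into every lane
        long[] c = new long[5];
        for (int x = 0; x < 5; x++) {
            c[x] = a[x] ^ a[x + 5] ^ a[x + 10] ^ a[x + 15] ^ a[x + 20];
        }
        for (int x = 0; x < 5; x++) {
            long d = c[(x + 4) % 5] ^ Long.rotateLeft(c[(x + 1) % 5], 1);
            for (int y = 0; y < 25; y += 5) {
                a[x + y] ^= d;
            }
        }
        // rho + pi: rotate each lane and move it to its new position
        long[] b = new long[25];
        for (int x = 0; x < 5; x++) {
            for (int y = 0; y < 5; y++) {
                b[y + 5 * ((2 * x + 3 * y) % 5)] =
                        Long.rotateLeft(a[x + 5 * y], ROT[x + 5 * y]);
            }
        }
        // chi: a ^= (~a_right1 & a_right2) within each row
        for (int y = 0; y < 25; y += 5) {
            for (int x = 0; x < 5; x++) {
                a[x + y] = b[x + y] ^ (~b[(x + 1) % 5 + y] & b[(x + 2) % 5 + y]);
            }
        }
        // iota: xor the round constant into lane (0,0)
        a[0] ^= roundConstant;
    }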
__ BIND(rounds24_loop);
__ subl(roundsLeft, 1);
__ evmovdquw(xmm5, xmm0, Assembler::AVX_512bit);
// vpternlogq(x, 150, y, z) does x = x ^ y ^ z
__ vpternlogq(xmm5, 150, xmm1, xmm2, Assembler::AVX_512bit);
__ vpternlogq(xmm5, 150, xmm3, xmm4, Assembler::AVX_512bit);
// Now the "c row", i.e. c0-c4 are in zmm5.
// Rotate each element of the c row by one bit to zmm6, call the
// rotated version c'.
__ evprolq(xmm6, xmm5, 1, Assembler::AVX_512bit);
// Rotate elementwise the c row so that c4 becomes c0,
// c0 becomes c1, etc.
__ evpermt2q(xmm5, xmm30, xmm5, Assembler::AVX_512bit);
// rotate elementwise the c' row so that c'0 becomes c'4,
// c'1 becomes c'0, etc.
__ evpermt2q(xmm6, xmm31, xmm6, Assembler::AVX_512bit);
__ vpternlogq(xmm0, 150, xmm5, xmm6, Assembler::AVX_512bit);
__ vpternlogq(xmm1, 150, xmm5, xmm6, Assembler::AVX_512bit);
__ vpternlogq(xmm2, 150, xmm5, xmm6, Assembler::AVX_512bit);
__ vpternlogq(xmm3, 150, xmm5, xmm6, Assembler::AVX_512bit);
__ vpternlogq(xmm4, 150, xmm5, xmm6, Assembler::AVX_512bit);
// Now the theta mapping has been finished.
// Do the cyclical permutation of the 24 moving state elements
// and the required rotations within each element (the combined
// rho and pi steps).
__ evpermt2q(xmm4, xmm17, xmm3, Assembler::AVX_512bit);
__ evpermt2q(xmm3, xmm18, xmm2, Assembler::AVX_512bit);
__ evpermt2q(xmm2, xmm17, xmm1, Assembler::AVX_512bit);
__ evpermt2q(xmm1, xmm19, xmm0, Assembler::AVX_512bit);
__ evpermt2q(xmm4, xmm20, xmm2, Assembler::AVX_512bit);
// The 24 moving elements are now in zmm1, zmm3 and zmm4,
// do the rotations now.
__ evprolvq(xmm1, xmm1, xmm27, Assembler::AVX_512bit);
__ evprolvq(xmm3, xmm3, xmm28, Assembler::AVX_512bit);
__ evprolvq(xmm4, xmm4, xmm29, Assembler::AVX_512bit);
__ evmovdquw(xmm2, xmm1, Assembler::AVX_512bit);
__ evmovdquw(xmm5, xmm3, Assembler::AVX_512bit);
__ evpermt2q(xmm0, xmm21, xmm4, Assembler::AVX_512bit);
__ evpermt2q(xmm1, xmm22, xmm3, Assembler::AVX_512bit);
__ evpermt2q(xmm5, xmm22, xmm2, Assembler::AVX_512bit);
__ evmovdquw(xmm3, xmm1, Assembler::AVX_512bit);
__ evmovdquw(xmm2, xmm5, Assembler::AVX_512bit);
__ evpermt2q(xmm1, xmm23, xmm4, Assembler::AVX_512bit);
__ evpermt2q(xmm2, xmm24, xmm4, Assembler::AVX_512bit);
__ evpermt2q(xmm3, xmm25, xmm4, Assembler::AVX_512bit);
__ evpermt2q(xmm4, xmm26, xmm5, Assembler::AVX_512bit);
// The combined rho and pi steps are done.
// Do the chi step (the same operation on all 5 rows).
// vpternlogq(x, 180, y, z) does x = x ^ (y & ~z).
__ evpermt2q(xmm5, xmm31, xmm0, Assembler::AVX_512bit);
__ evpermt2q(xmm6, xmm31, xmm5, Assembler::AVX_512bit);
__ vpternlogq(xmm0, 180, xmm6, xmm5, Assembler::AVX_512bit);
__ evpermt2q(xmm5, xmm31, xmm1, Assembler::AVX_512bit);
__ evpermt2q(xmm6, xmm31, xmm5, Assembler::AVX_512bit);
__ vpternlogq(xmm1, 180, xmm6, xmm5, Assembler::AVX_512bit);
// xor the round constant into a0 (the lowest 64 bits of zmm0)
__ evpxorq(xmm0, k1, xmm0, Address(constant2use, 0), true, Assembler::AVX_512bit);
__ addptr(constant2use, 8);
__ evpermt2q(xmm5, xmm31, xmm2, Assembler::AVX_512bit);
__ evpermt2q(xmm6, xmm31, xmm5, Assembler::AVX_512bit);
__ vpternlogq(xmm2, 180, xmm6, xmm5, Assembler::AVX_512bit);
__ evpermt2q(xmm5, xmm31, xmm3, Assembler::AVX_512bit);
__ evpermt2q(xmm6, xmm31, xmm5, Assembler::AVX_512bit);
__ vpternlogq(xmm3, 180, xmm6, xmm5, Assembler::AVX_512bit);
__ evpermt2q(xmm5, xmm31, xmm4, Assembler::AVX_512bit);
__ evpermt2q(xmm6, xmm31, xmm5, Assembler::AVX_512bit);
__ vpternlogq(xmm4, 180, xmm6, xmm5, Assembler::AVX_512bit);
__ cmpl(roundsLeft, 0);
__ jcc(Assembler::notEqual, rounds24_loop);
if (multiBlock) {
__ addptr(buf, block_size);
__ addl(ofs, block_size);
__ cmpl(ofs, limit);
__ jcc(Assembler::lessEqual, sha3_loop);
__ movq(rax, ofs); // return ofs
} else {
__ xorq(rax, rax); // return 0
}
// store the state
__ evmovdquq(Address(state, 0), k5, xmm0, true, Assembler::AVX_512bit);
__ evmovdquq(Address(state, 40), k5, xmm1, true, Assembler::AVX_512bit);
__ evmovdquq(Address(state, 80), k5, xmm2, true, Assembler::AVX_512bit);
__ evmovdquq(Address(state, 120), k5, xmm3, true, Assembler::AVX_512bit);
__ evmovdquq(Address(state, 160), k5, xmm4, true, Assembler::AVX_512bit);
__ pop(r14);
__ pop(r13);
__ pop(r12);
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
return start;
}
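A hedged usage sketch: with UseSHA3Intrinsics enabled, the stub above is reached through the ordinary JCE path, so a standard MessageDigest call exercises it and must produce the same output as the pure-Java implementation; the class name below is illustrative only.

    import java.nio.charset.StandardCharsets;
    import java.security.MessageDigest;
    import java.util.HexFormat;

    public class Sha3Smoke {
        public static void main(String[] args) throws Exception {
            MessageDigest md = MessageDigest.getInstance("SHA3-256");
            byte[] digest = md.digest("hello".getBytes(StandardCharsets.UTF_8));
            // prints the same value with or without the intrinsic enabled
            System.out.println(HexFormat.of().formatHex(digest));
        }
    }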

src/hotspot/cpu/x86/vm_version_x86.cpp

@ -1316,9 +1316,16 @@ void VM_Version::get_processor_features() {
FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
}
#ifdef _LP64
if (supports_evex() && supports_avx512bw()) {
if (FLAG_IS_DEFAULT(UseSHA3Intrinsics)) {
UseSHA3Intrinsics = true;
}
} else
#endif
if (UseSHA3Intrinsics) {
warning("Intrinsics for SHA3-224, SHA3-256, SHA3-384 and SHA3-512 crypto hash functions not available on this CPU.");
FLAG_SET_DEFAULT(UseSHA3Intrinsics, false);
}
if (!(UseSHA1Intrinsics || UseSHA256Intrinsics || UseSHA512Intrinsics)) {

src/java.base/share/classes/sun/security/provider/SHA3.java

@ -32,6 +32,7 @@ import java.security.ProviderException;
import java.util.Arrays;
import java.util.Objects;
import jdk.internal.util.Preconditions;
import jdk.internal.vm.annotation.IntrinsicCandidate;
import static java.lang.Math.min;
@ -99,6 +100,7 @@ public abstract class SHA3 extends DigestBase {
private void implCompressCheck(byte[] b, int ofs) {
Objects.requireNonNull(b);
Preconditions.checkIndex(ofs + blockSize - 1, b.length, Preconditions.AIOOBE_FORMATTER);
}
/**

test/hotspot/jtreg/compiler/testlibrary/sha/predicate/IntrinsicPredicates.java

@ -104,8 +104,10 @@ public class IntrinsicPredicates {
new CPUSpecificPredicate("x86_64", new String[] { "avx2", "bmi2" }, null)))))))))); new CPUSpecificPredicate("x86_64", new String[] { "avx2", "bmi2" }, null))))))))));
public static final BooleanSupplier SHA3_INSTRUCTION_AVAILABLE public static final BooleanSupplier SHA3_INSTRUCTION_AVAILABLE
// sha3 is only implemented on aarch64 for now // sha3 is only implemented on aarch64 and avx512 for now
= new CPUSpecificPredicate("aarch64.*", new String[] {"sha3" }, null); = new OrPredicate(new CPUSpecificPredicate("aarch64.*", new String[] {"sha3" }, null),
new OrPredicate(new CPUSpecificPredicate("amd64.*", new String[] {"avx512f", "avx512bw"}, null),
new CPUSpecificPredicate("x86_64", new String[] {"avx512f", "avx512bw"}, null)));
public static final BooleanSupplier ANY_SHA_INSTRUCTION_AVAILABLE
= new OrPredicate(IntrinsicPredicates.SHA1_INSTRUCTION_AVAILABLE,