diff --git a/src/hotspot/cpu/x86/assembler_x86.cpp b/src/hotspot/cpu/x86/assembler_x86.cpp
index 5ba2a91d32b..1155166d0da 100644
--- a/src/hotspot/cpu/x86/assembler_x86.cpp
+++ b/src/hotspot/cpu/x86/assembler_x86.cpp
@@ -3585,6 +3585,23 @@ void Assembler::evmovdqub(Address dst, KRegister mask, XMMRegister src, bool mer
   emit_operand(src, dst, 0);
 }
 
+void Assembler::evmovdquw(XMMRegister dst, XMMRegister src, int vector_len) {
+  // Unmasked instruction
+  evmovdquw(dst, k0, src, /*merge*/ false, vector_len);
+}
+
+void Assembler::evmovdquw(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
+  assert(vector_len <= AVX_256bit ? VM_Version::supports_avx512vlbw() : VM_Version::supports_avx512bw(), "");
+  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_embedded_opmask_register_specifier(mask);
+  attributes.set_is_evex_instruction();
+  if (merge) {
+    attributes.reset_is_clear_context();
+  }
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
+  emit_int16(0x6F, (0xC0 | encode));
+}
+
 void Assembler::evmovdquw(XMMRegister dst, Address src, int vector_len) {
   // Unmasked instruction
   evmovdquw(dst, k0, src, /*merge*/ false, vector_len);
@@ -8711,6 +8728,15 @@ void Assembler::vpmuludq(XMMRegister dst, XMMRegister nds, XMMRegister src, int
   emit_int16((unsigned char)0xF4, (0xC0 | encode));
 }
 
+void Assembler::vpmuldq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(vector_len == AVX_128bit ? VM_Version::supports_avx() :
+         (vector_len == AVX_256bit ? VM_Version::supports_avx2() : VM_Version::supports_evex()), "");
+  InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  attributes.set_rex_vex_w_reverted();
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  emit_int16(0x28, (0xC0 | encode));
+}
+
 void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(UseAVX > 0, "requires some form of AVX");
   InstructionMark im(this);
@@ -11246,6 +11272,18 @@ void Assembler::evpmullq(XMMRegister dst, KRegister mask, XMMRegister nds, Addre
   emit_operand(dst, src, 0);
 }
 
+void Assembler::evpmulhw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
+  assert(VM_Version::supports_avx512bw() && (vector_len == AVX_512bit || VM_Version::supports_avx512vl()), "");
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
+  attributes.set_embedded_opmask_register_specifier(mask);
+  if (merge) {
+    attributes.reset_is_clear_context();
+  }
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int16((unsigned char)0xE5, (0xC0 | encode));
+}
+
 void Assembler::evmulps(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), "");
@@ -16914,3 +16952,28 @@ void Assembler::evpermt2b(XMMRegister dst, XMMRegister nds, XMMRegister src, int
   int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int16(0x7D, (0xC0 | encode));
 }
+
+void Assembler::evpermt2w(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(vector_len <= AVX_256bit ? VM_Version::supports_avx512vlbw() : VM_Version::supports_avx512bw(), "");
+  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  emit_int16(0x7D, (0xC0 | encode));
+}
+
+void Assembler::evpermt2d(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_evex() && (vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl()), "");
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  emit_int16(0x7E, (0xC0 | encode));
+}
+
+void Assembler::evpermt2q(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_evex() && (vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl()), "");
+  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  emit_int16(0x7E, (0xC0 | encode));
+}
+
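For orientation, the two multiply flavors wired up above have these per-lane semantics (a scalar sketch of the architectural PMULDQ and PMULHW behavior; the helper names below are illustrative and do not exist in the patch):

    #include <cstdint>

    // vpmuldq: signed multiply of the even-indexed 32-bit lanes of each
    // operand, producing one full 64-bit product per lane pair.
    static inline int64_t pmuldq_lane(int32_t a_even, int32_t b_even) {
      return (int64_t)a_even * (int64_t)b_even;
    }

    // evpmulhw: signed 16x16-bit multiply that keeps only the high 16 bits
    // of the 32-bit product; the EVEX form additionally merges or zeroes
    // lanes whose opmask bit is clear.
    static inline int16_t pmulhw_lane(int16_t a, int16_t b) {
      return (int16_t)(((int32_t)a * (int32_t)b) >> 16);
    }
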
diff --git a/src/hotspot/cpu/x86/assembler_x86.hpp b/src/hotspot/cpu/x86/assembler_x86.hpp
index 7eecb9302ff..420c28254d5 100644
--- a/src/hotspot/cpu/x86/assembler_x86.hpp
+++ b/src/hotspot/cpu/x86/assembler_x86.hpp
@@ -1757,6 +1757,7 @@ private:
   void evmovdqub(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len);
   void evmovdqub(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
 
+  void evmovdquw(XMMRegister dst, XMMRegister src, int vector_len);
   void evmovdquw(XMMRegister dst, Address src, int vector_len);
   void evmovdquw(Address dst, XMMRegister src, int vector_len);
   void evmovdquw(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
@@ -1970,6 +1971,9 @@ private:
   void evpermi2ps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void evpermi2pd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void evpermt2b(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void evpermt2w(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void evpermt2d(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void evpermt2q(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
 
   void pause();
 
@@ -2741,6 +2745,7 @@ private:
   void evsubps(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
   void evsubpd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
   void evsubpd(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
+  void evpmulhw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
   void evpmullw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
   void evpmullw(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len);
   void evpmulld(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
@@ -2876,6 +2881,7 @@ private:
   void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void evpmullq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void vpmuludq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void vpmuldq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
   void vpmulld(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
   void evpmullq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
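Several of the new declarations above take a KRegister opmask plus a merge flag; per lane, the EVEX masking they expose behaves as in this sketch (an assumed helper, not part of the patch):

    #include <cstdint>

    // Lane-level view of an EVEX-masked operation such as evmovdquw or
    // evpmulhw: a set mask bit writes the result, a clear bit either keeps
    // the previous destination value (merge-masking, merge == true) or
    // zeroes the lane (zero-masking, merge == false).
    static inline uint64_t masked_lane(bool mask_bit, uint64_t result,
                                       uint64_t old_dst, bool merge) {
      if (mask_bit) return result;
      return merge ? old_dst : 0;
    }
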
diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.hpp b/src/hotspot/cpu/x86/macroAssembler_x86.hpp
index cd3ca71b311..d508feed93c 100644
--- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp
@@ -1296,6 +1296,7 @@ public:
   void evmovdqub(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len) { Assembler::evmovdqub(dst, mask, src, merge, vector_len); }
   void evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
 
+  void evmovdquw(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::evmovdquw(dst, src, vector_len); }
   void evmovdquw(Address dst, XMMRegister src, int vector_len) { Assembler::evmovdquw(dst, src, vector_len); }
   void evmovdquw(XMMRegister dst, Address src, int vector_len) { Assembler::evmovdquw(dst, src, vector_len); }
 
@@ -1506,6 +1507,8 @@ public:
   void vpmulld(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vpmulld(dst, nds, src, vector_len); }
   void vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
 
+  void vpmuldq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vpmuldq(dst, nds, src, vector_len); }
+
   void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
 
@@ -1515,9 +1518,13 @@ public:
   void vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
   void vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len);
 
+  void evpsrad(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
+  void evpsrad(XMMRegister dst, XMMRegister nds, int shift, int vector_len);
+
   void evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
   void evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len);
 
+  using Assembler::evpsllw;
   void evpsllw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
     if (!is_varshift) {
       Assembler::evpsllw(dst, mask, nds, src, merge, vector_len);
@@ -1562,6 +1569,7 @@ public:
       Assembler::evpsrlvq(dst, mask, nds, src, merge, vector_len);
     }
   }
+  using Assembler::evpsraw;
   void evpsraw(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
     if (!is_varshift) {
       Assembler::evpsraw(dst, mask, nds, src, merge, vector_len);
@@ -1569,6 +1577,7 @@ public:
     }
   }
+  using Assembler::evpsrad;
   void evpsrad(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len, bool is_varshift) {
     if (!is_varshift) {
       Assembler::evpsrad(dst, mask, nds, src, merge, vector_len);
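The `using Assembler::evpsllw;` / `evpsraw` / `evpsrad` lines added above are needed because a MacroAssembler overload with the extra `is_varshift` parameter would otherwise hide every same-named overload inherited from Assembler. A minimal illustration of that C++ name-lookup rule (the names here are invented for the example):

    struct Base {
      void op(int x) {}
    };

    struct Derived : Base {
      using Base::op;              // without this, Base::op(int) is hidden
      void op(int x, bool flag) {}
    };

    int main() {
      Derived d;
      d.op(1);                     // resolves only because of the using-declaration
      d.op(1, true);
      return 0;
    }
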
diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
index e23c83ed197..93b1618024e 100644
--- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
@@ -4032,6 +4032,8 @@ void StubGenerator::generate_compiler_stubs() {
 
   generate_chacha_stubs();
 
+  generate_sha3_stubs();
+
 #ifdef COMPILER2
   if ((UseAVX == 2) && EnableX86ECoreOpts) {
     generate_string_indexof(StubRoutines::_string_indexof_array);
diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp
index 7280e9fbe95..c6fa31c5213 100644
--- a/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp
@@ -497,6 +497,10 @@ class StubGenerator: public StubCodeGenerator {
   address generate_intpoly_montgomeryMult_P256();
   address generate_intpoly_assign();
 
+  // SHA3 stubs
+  void generate_sha3_stubs();
+  address generate_sha3_implCompress(bool multiBlock, const char *name);
+
   // BASE64 stubs
   address base64_shuffle_addr();
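The new header entries declare the SHA3 stub generators added in the file below. Judging from the register comments and the return paths in that generator, the generated stubs follow roughly this C-level contract (a hedged sketch; this declaration does not exist anywhere in the patch):

    #include <cstdint>

    // Hypothetical C view of the generated code: buf points at the current
    // input block, state at the 25-lane Keccak state. The single-block stub
    // absorbs one block and returns 0; the multi-block stub keeps absorbing
    // block_size bytes while ofs <= limit and returns the updated ofs.
    typedef int (*sha3_implCompress_stub_t)(unsigned char* buf, uint64_t* state,
                                            int block_size, int ofs, int limit);
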
diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64_sha3.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64_sha3.cpp
new file mode 100644
index 00000000000..49c39226708
--- /dev/null
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64_sha3.cpp
@@ -0,0 +1,326 @@
+/*
+ * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/assembler.hpp"
+#include "asm/assembler.inline.hpp"
+#include "runtime/stubRoutines.hpp"
+#include "macroAssembler_x86.hpp"
+#include "stubGenerator_x86_64.hpp"
+
+#define __ _masm->
+
+#ifdef PRODUCT
+#define BLOCK_COMMENT(str) /* nothing */
+#else
+#define BLOCK_COMMENT(str) __ block_comment(str)
+#endif // PRODUCT
+
+#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
+
+// Constants
+ATTRIBUTE_ALIGNED(64) static const uint64_t round_consts_arr[24] = {
+    0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
+    0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
+    0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
+    0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
+    0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
+    0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
+    0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
+    0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
+};
+
+ATTRIBUTE_ALIGNED(64) static const uint64_t permsAndRots[] = {
+  // permutation in combined rho and pi
+  9, 2, 11, 0, 1, 2, 3, 4,    // step 1 and 3
+  8, 1, 9, 2, 11, 4, 12, 0,   // step 2
+  9, 2, 10, 3, 11, 4, 12, 0,  // step 4
+  8, 9, 2, 3, 4, 5, 6, 7,     // step 5
+  0, 8, 9, 10, 15, 0, 0, 0,   // step 6
+  4, 5, 8, 9, 6, 7, 10, 11,   // step 7 and 8
+  0, 1, 2, 3, 13, 0, 0, 0,    // step 9
+  2, 3, 0, 1, 11, 0, 0, 0,    // step 10
+  4, 5, 6, 7, 14, 0, 0, 0,    // step 11
+  14, 15, 12, 13, 4, 0, 0, 0, // step 12
+  // size of rotations (after step 5)
+  1, 6, 62, 55, 28, 20, 27, 36,
+  3, 45, 10, 15, 25, 8, 39, 41,
+  44, 43, 21, 18, 2, 61, 56, 14,
+  // rotation of row elements
+  12, 8, 9, 10, 11, 5, 6, 7,
+  9, 10, 11, 12, 8, 5, 6, 7
+};
+
+static address round_constsAddr() {
+  return (address) round_consts_arr;
+}
+
+static address permsAndRotsAddr() {
+  return (address) permsAndRots;
+}
+
+void StubGenerator::generate_sha3_stubs() {
+  if (UseSHA3Intrinsics) {
+    StubRoutines::_sha3_implCompress   = generate_sha3_implCompress(false, "sha3_implCompress");
+    StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(true, "sha3_implCompressMB");
+  }
+}
+
+// Arguments:
+//
+// Inputs:
+//   c_rarg0 - byte[] source+offset
+//   c_rarg1 - long[] SHA3.state
+//   c_rarg2 - int    block_size
+//   c_rarg3 - int    offset
+//   c_rarg4 - int    limit
+//
+address StubGenerator::generate_sha3_implCompress(bool multiBlock, const char *name) {
+  __ align(CodeEntryAlignment);
+  StubCodeMark mark(this, "StubRoutines", name);
+  address start = __ pc();
+
+  const Register buf        = c_rarg0;
+  const Register state      = c_rarg1;
+  const Register block_size = c_rarg2;
+  const Register ofs        = c_rarg3;
+#ifndef _WIN64
+  const Register limit      = c_rarg4;
+#else
+  const Address limit_mem(rbp, 6 * wordSize);
+  const Register limit      = r12;
+#endif
+
+  const Register permsAndRots = r10;
+  const Register round_consts = r11;
+  const Register constant2use = r13;
+  const Register roundsLeft   = r14;
+
+  Label sha3_loop;
+  Label rounds24_loop, block104, block136, block144, block168;
+
+  __ enter();
+
+  __ push(r12);
+  __ push(r13);
+  __ push(r14);
+
+#ifdef _WIN64
+  // on win64, fill limit from stack position
+  __ movptr(limit, limit_mem);
+#endif
+
+  __ lea(permsAndRots, ExternalAddress(permsAndRotsAddr()));
+  __ lea(round_consts, ExternalAddress(round_constsAddr()));
+
+  // set up the masks
+  __ movl(rax, 0x1F);
+  __ kmovwl(k5, rax);
+  __ kshiftrwl(k4, k5, 1);
+  __ kshiftrwl(k3, k5, 2);
+  __ kshiftrwl(k2, k5, 3);
+  __ kshiftrwl(k1, k5, 4);
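+  // k5 = 0b11111 selects the five 64-bit state lanes kept in each zmm row;
+  // the narrower masks k4 = 0b1111 ... k1 = 0b1 are used below to xor in
+  // the partial last row of an input block, and k1 also selects lane 0
+  // when the round constant is added.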
+
+  // load the state
+  __ evmovdquq(xmm0, k5, Address(state, 0), false, Assembler::AVX_512bit);
+  __ evmovdquq(xmm1, k5, Address(state, 40), false, Assembler::AVX_512bit);
+  __ evmovdquq(xmm2, k5, Address(state, 80), false, Assembler::AVX_512bit);
+  __ evmovdquq(xmm3, k5, Address(state, 120), false, Assembler::AVX_512bit);
+  __ evmovdquq(xmm4, k5, Address(state, 160), false, Assembler::AVX_512bit);
+
+  // load the permutation and rotation constants
+  __ evmovdquq(xmm17, Address(permsAndRots, 0), Assembler::AVX_512bit);
+  __ evmovdquq(xmm18, Address(permsAndRots, 64), Assembler::AVX_512bit);
+  __ evmovdquq(xmm19, Address(permsAndRots, 128), Assembler::AVX_512bit);
+  __ evmovdquq(xmm20, Address(permsAndRots, 192), Assembler::AVX_512bit);
+  __ evmovdquq(xmm21, Address(permsAndRots, 256), Assembler::AVX_512bit);
+  __ evmovdquq(xmm22, Address(permsAndRots, 320), Assembler::AVX_512bit);
+  __ evmovdquq(xmm23, Address(permsAndRots, 384), Assembler::AVX_512bit);
+  __ evmovdquq(xmm24, Address(permsAndRots, 448), Assembler::AVX_512bit);
+  __ evmovdquq(xmm25, Address(permsAndRots, 512), Assembler::AVX_512bit);
+  __ evmovdquq(xmm26, Address(permsAndRots, 576), Assembler::AVX_512bit);
+  __ evmovdquq(xmm27, Address(permsAndRots, 640), Assembler::AVX_512bit);
+  __ evmovdquq(xmm28, Address(permsAndRots, 704), Assembler::AVX_512bit);
+  __ evmovdquq(xmm29, Address(permsAndRots, 768), Assembler::AVX_512bit);
+  __ evmovdquq(xmm30, Address(permsAndRots, 832), Assembler::AVX_512bit);
+  __ evmovdquq(xmm31, Address(permsAndRots, 896), Assembler::AVX_512bit);
+
+  __ BIND(sha3_loop);
+
+  // there will be 24 keccak rounds
+  __ movl(roundsLeft, 24);
+  // load round_constants base
+  __ movptr(constant2use, round_consts);
+
+  // load input: 72, 104, 136, 144 or 168 bytes
+  // i.e. 5+4, 2*5+3, 3*5+2, 3*5+3 or 4*5+1 longs
+  __ evpxorq(xmm0, k5, xmm0, Address(buf, 0), true, Assembler::AVX_512bit);
+
+  // if(blockSize == 72) SHA3-512
+  __ cmpl(block_size, 72);
+  __ jcc(Assembler::notEqual, block104);
+  __ evpxorq(xmm1, k4, xmm1, Address(buf, 40), true, Assembler::AVX_512bit);
+  __ jmp(rounds24_loop);
+
+  // if(blockSize == 104) SHA3-384
+  __ BIND(block104);
+  __ cmpl(block_size, 104);
+  __ jcc(Assembler::notEqual, block136);
+  __ evpxorq(xmm1, k5, xmm1, Address(buf, 40), true, Assembler::AVX_512bit);
+  __ evpxorq(xmm2, k3, xmm2, Address(buf, 80), true, Assembler::AVX_512bit);
+  __ jmp(rounds24_loop);
+
+  // if(blockSize == 136) SHA3-256 and SHAKE256
+  __ BIND(block136);
+  __ cmpl(block_size, 136);
+  __ jcc(Assembler::notEqual, block144);
+  __ evpxorq(xmm1, k5, xmm1, Address(buf, 40), true, Assembler::AVX_512bit);
+  __ evpxorq(xmm2, k5, xmm2, Address(buf, 80), true, Assembler::AVX_512bit);
+  __ evpxorq(xmm3, k2, xmm3, Address(buf, 120), true, Assembler::AVX_512bit);
+  __ jmp(rounds24_loop);
+
+  // if(blockSize == 144) SHA3-224
+  __ BIND(block144);
+  __ cmpl(block_size, 144);
+  __ jcc(Assembler::notEqual, block168);
+  __ evpxorq(xmm1, k5, xmm1, Address(buf, 40), true, Assembler::AVX_512bit);
+  __ evpxorq(xmm2, k5, xmm2, Address(buf, 80), true, Assembler::AVX_512bit);
+  __ evpxorq(xmm3, k3, xmm3, Address(buf, 120), true, Assembler::AVX_512bit);
+  __ jmp(rounds24_loop);
+
+  // if(blockSize == 168) SHAKE128
+  __ BIND(block168);
+  __ evpxorq(xmm1, k5, xmm1, Address(buf, 40), true, Assembler::AVX_512bit);
+  __ evpxorq(xmm2, k5, xmm2, Address(buf, 80), true, Assembler::AVX_512bit);
+  __ evpxorq(xmm3, k5, xmm3, Address(buf, 120), true, Assembler::AVX_512bit);
+  __ evpxorq(xmm4, k1, xmm4, Address(buf, 160), true, Assembler::AVX_512bit);
+
+  // The 24 rounds of the keccak transformation.
+  // The implementation closely follows the Java version, with the state
+  // array "rows" in the lowest 5 64-bit slots of zmm0 - zmm4, i.e.
+  // each row of the SHA3 specification is located in one zmm register.
+  __ BIND(rounds24_loop);
+  __ subl(roundsLeft, 1);
+
+  __ evmovdquw(xmm5, xmm0, Assembler::AVX_512bit);
+  // vpternlogq(x, 150, y, z) does x = x ^ y ^ z
+  __ vpternlogq(xmm5, 150, xmm1, xmm2, Assembler::AVX_512bit);
+  __ vpternlogq(xmm5, 150, xmm3, xmm4, Assembler::AVX_512bit);
+  // Now the "c row", i.e. c0-c4, is in zmm5.
+  // Rotate each element of the c row by one bit to zmm6, call the
+  // rotated version c'.
+  __ evprolq(xmm6, xmm5, 1, Assembler::AVX_512bit);
+  // Rotate elementwise the c row so that c4 becomes c0,
+  // c0 becomes c1, etc.
+  __ evpermt2q(xmm5, xmm30, xmm5, Assembler::AVX_512bit);
+  // rotate elementwise the c' row so that c'0 becomes c'4,
+  // c'1 becomes c'0, etc.
+  __ evpermt2q(xmm6, xmm31, xmm6, Assembler::AVX_512bit);
+  __ vpternlogq(xmm0, 150, xmm5, xmm6, Assembler::AVX_512bit);
+  __ vpternlogq(xmm1, 150, xmm5, xmm6, Assembler::AVX_512bit);
+  __ vpternlogq(xmm2, 150, xmm5, xmm6, Assembler::AVX_512bit);
+  __ vpternlogq(xmm3, 150, xmm5, xmm6, Assembler::AVX_512bit);
+  __ vpternlogq(xmm4, 150, xmm5, xmm6, Assembler::AVX_512bit);
+  // Now the theta mapping has been finished.
+
+  // Do the cyclical permutation of the 24 moving state elements
+  // and the required rotations within each element (the combined
+  // rho and pi steps).
+  __ evpermt2q(xmm4, xmm17, xmm3, Assembler::AVX_512bit);
+  __ evpermt2q(xmm3, xmm18, xmm2, Assembler::AVX_512bit);
+  __ evpermt2q(xmm2, xmm17, xmm1, Assembler::AVX_512bit);
+  __ evpermt2q(xmm1, xmm19, xmm0, Assembler::AVX_512bit);
+  __ evpermt2q(xmm4, xmm20, xmm2, Assembler::AVX_512bit);
+  // The 24 moving elements are now in zmm1, zmm3 and zmm4,
+  // do the rotations now.
+  __ evprolvq(xmm1, xmm1, xmm27, Assembler::AVX_512bit);
+  __ evprolvq(xmm3, xmm3, xmm28, Assembler::AVX_512bit);
+  __ evprolvq(xmm4, xmm4, xmm29, Assembler::AVX_512bit);
+  __ evmovdquw(xmm2, xmm1, Assembler::AVX_512bit);
+  __ evmovdquw(xmm5, xmm3, Assembler::AVX_512bit);
+  __ evpermt2q(xmm0, xmm21, xmm4, Assembler::AVX_512bit);
+  __ evpermt2q(xmm1, xmm22, xmm3, Assembler::AVX_512bit);
+  __ evpermt2q(xmm5, xmm22, xmm2, Assembler::AVX_512bit);
+  __ evmovdquw(xmm3, xmm1, Assembler::AVX_512bit);
+  __ evmovdquw(xmm2, xmm5, Assembler::AVX_512bit);
+  __ evpermt2q(xmm1, xmm23, xmm4, Assembler::AVX_512bit);
+  __ evpermt2q(xmm2, xmm24, xmm4, Assembler::AVX_512bit);
+  __ evpermt2q(xmm3, xmm25, xmm4, Assembler::AVX_512bit);
+  __ evpermt2q(xmm4, xmm26, xmm5, Assembler::AVX_512bit);
+  // The combined rho and pi steps are done.
+
+  // Do the chi step (the same operation on all 5 rows).
+  // vpternlogq(x, 180, y, z) does x = x ^ (y & ~z).
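+  // (The vpternlogq imm8 is a 3-input truth table indexed by the bits
+  // (dst, src2, src3): 150 = 0x96 is the three-way xor used for theta
+  // above, while 180 = 0xB4 evaluates to dst ^ (src2 & ~src3). With xmm5
+  // holding, at position x, row element x+1 and xmm6 holding element x+2,
+  // this is exactly the chi mapping a[x] ^ (~a[x+1] & a[x+2]).)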
+  __ evpermt2q(xmm5, xmm31, xmm0, Assembler::AVX_512bit);
+  __ evpermt2q(xmm6, xmm31, xmm5, Assembler::AVX_512bit);
+  __ vpternlogq(xmm0, 180, xmm6, xmm5, Assembler::AVX_512bit);
+
+  __ evpermt2q(xmm5, xmm31, xmm1, Assembler::AVX_512bit);
+  __ evpermt2q(xmm6, xmm31, xmm5, Assembler::AVX_512bit);
+  __ vpternlogq(xmm1, 180, xmm6, xmm5, Assembler::AVX_512bit);
+
+  // xor the round constant into a0 (the lowest 64 bits of zmm0)
+  __ evpxorq(xmm0, k1, xmm0, Address(constant2use, 0), true, Assembler::AVX_512bit);
+  __ addptr(constant2use, 8);
+
+  __ evpermt2q(xmm5, xmm31, xmm2, Assembler::AVX_512bit);
+  __ evpermt2q(xmm6, xmm31, xmm5, Assembler::AVX_512bit);
+  __ vpternlogq(xmm2, 180, xmm6, xmm5, Assembler::AVX_512bit);
+
+  __ evpermt2q(xmm5, xmm31, xmm3, Assembler::AVX_512bit);
+  __ evpermt2q(xmm6, xmm31, xmm5, Assembler::AVX_512bit);
+  __ vpternlogq(xmm3, 180, xmm6, xmm5, Assembler::AVX_512bit);
+
+  __ evpermt2q(xmm5, xmm31, xmm4, Assembler::AVX_512bit);
+  __ evpermt2q(xmm6, xmm31, xmm5, Assembler::AVX_512bit);
+  __ vpternlogq(xmm4, 180, xmm6, xmm5, Assembler::AVX_512bit);
+  __ cmpl(roundsLeft, 0);
+  __ jcc(Assembler::notEqual, rounds24_loop);
+
+  if (multiBlock) {
+    __ addptr(buf, block_size);
+    __ addl(ofs, block_size);
+    __ cmpl(ofs, limit);
+    __ jcc(Assembler::lessEqual, sha3_loop);
+    __ movq(rax, ofs); // return ofs
+  } else {
+    __ xorq(rax, rax); // return 0
+  }
+
+  // store the state
+  __ evmovdquq(Address(state, 0), k5, xmm0, true, Assembler::AVX_512bit);
+  __ evmovdquq(Address(state, 40), k5, xmm1, true, Assembler::AVX_512bit);
+  __ evmovdquq(Address(state, 80), k5, xmm2, true, Assembler::AVX_512bit);
+  __ evmovdquq(Address(state, 120), k5, xmm3, true, Assembler::AVX_512bit);
+  __ evmovdquq(Address(state, 160), k5, xmm4, true, Assembler::AVX_512bit);
+
+  __ pop(r14);
+  __ pop(r13);
+  __ pop(r12);
+
+  __ leave(); // required for proper stackwalking of RuntimeStub frame
+  __ ret(0);
+
+  return start;
+}
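For reference, one iteration of the vectorized loop above computes the same mapping as this scalar Keccak-f[1600] round (a sketch using the spec's A[x][y] lane indexing and the standard rotation offsets; rc stands for the current entry of round_consts_arr, and none of these names appear in the patch):

    #include <cstdint>

    static inline uint64_t rotl64(uint64_t v, int n) {
      return n == 0 ? v : (v << n) | (v >> (64 - n));
    }

    // One Keccak-f[1600] round over the 5x5 lane state A[x][y].
    static void keccak_round(uint64_t A[5][5], uint64_t rc) {
      // theta: xor every lane with the parity of two neighboring columns
      uint64_t C[5], D[5];
      for (int x = 0; x < 5; x++) {
        C[x] = A[x][0] ^ A[x][1] ^ A[x][2] ^ A[x][3] ^ A[x][4];
      }
      for (int x = 0; x < 5; x++) {
        D[x] = C[(x + 4) % 5] ^ rotl64(C[(x + 1) % 5], 1);
        for (int y = 0; y < 5; y++) {
          A[x][y] ^= D[x];
        }
      }
      // rho and pi: rotate each lane and move it to its new position
      static const int R[5][5] = {   // rotation offsets r[x][y]
        { 0, 36,  3, 41, 18},
        { 1, 44, 10, 45,  2},
        {62,  6, 43, 15, 61},
        {28, 55, 25, 21, 56},
        {27, 20, 39,  8, 14}
      };
      uint64_t B[5][5];
      for (int x = 0; x < 5; x++) {
        for (int y = 0; y < 5; y++) {
          B[y][(2 * x + 3 * y) % 5] = rotl64(A[x][y], R[x][y]);
        }
      }
      // chi: combine each lane with the next two lanes of its row
      for (int x = 0; x < 5; x++) {
        for (int y = 0; y < 5; y++) {
          A[x][y] = B[x][y] ^ (~B[(x + 1) % 5][y] & B[(x + 2) % 5][y]);
        }
      }
      // iota: xor the round constant into lane (0,0)
      A[0][0] ^= rc;
    }

The stub performs the same five step mappings, but keeps each row y in the low five qword lanes of zmm0-zmm4 and replaces the explicit permutations with evpermt2q/evprolvq driven by the tables in permsAndRots.
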
diff --git a/src/hotspot/cpu/x86/vm_version_x86.cpp b/src/hotspot/cpu/x86/vm_version_x86.cpp
index 63347c51d60..f8c5de551cd 100644
--- a/src/hotspot/cpu/x86/vm_version_x86.cpp
+++ b/src/hotspot/cpu/x86/vm_version_x86.cpp
@@ -1316,9 +1316,16 @@ void VM_Version::get_processor_features() {
     FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
   }
 
-  if (UseSHA3Intrinsics) {
-    warning("Intrinsics for SHA3-224, SHA3-256, SHA3-384 and SHA3-512 crypto hash functions not available on this CPU.");
-    FLAG_SET_DEFAULT(UseSHA3Intrinsics, false);
+#ifdef _LP64
+  if (supports_evex() && supports_avx512bw()) {
+    if (FLAG_IS_DEFAULT(UseSHA3Intrinsics)) {
+      UseSHA3Intrinsics = true;
+    }
+  } else
+#endif
+  if (UseSHA3Intrinsics) {
+    warning("Intrinsics for SHA3-224, SHA3-256, SHA3-384 and SHA3-512 crypto hash functions not available on this CPU.");
+    FLAG_SET_DEFAULT(UseSHA3Intrinsics, false);
   }
 
   if (!(UseSHA1Intrinsics || UseSHA256Intrinsics || UseSHA512Intrinsics)) {
diff --git a/src/java.base/share/classes/sun/security/provider/SHA3.java b/src/java.base/share/classes/sun/security/provider/SHA3.java
index 75430c63916..5f974bc6ea6 100644
--- a/src/java.base/share/classes/sun/security/provider/SHA3.java
+++ b/src/java.base/share/classes/sun/security/provider/SHA3.java
@@ -32,6 +32,7 @@ import java.security.ProviderException;
 import java.util.Arrays;
 import java.util.Objects;
 
+import jdk.internal.util.Preconditions;
 import jdk.internal.vm.annotation.IntrinsicCandidate;
 
 import static java.lang.Math.min;
@@ -99,6 +100,7 @@ public abstract class SHA3 extends DigestBase {
 
     private void implCompressCheck(byte[] b, int ofs) {
         Objects.requireNonNull(b);
+        Preconditions.checkIndex(ofs + blockSize - 1, b.length, Preconditions.AIOOBE_FORMATTER);
     }
 
     /**
diff --git a/test/hotspot/jtreg/compiler/testlibrary/sha/predicate/IntrinsicPredicates.java b/test/hotspot/jtreg/compiler/testlibrary/sha/predicate/IntrinsicPredicates.java
index 27fe9989247..19257f81614 100644
--- a/test/hotspot/jtreg/compiler/testlibrary/sha/predicate/IntrinsicPredicates.java
+++ b/test/hotspot/jtreg/compiler/testlibrary/sha/predicate/IntrinsicPredicates.java
@@ -104,8 +104,10 @@ public class IntrinsicPredicates {
               new CPUSpecificPredicate("x86_64", new String[] { "avx2", "bmi2" }, null)))))))))); 
 
     public static final BooleanSupplier SHA3_INSTRUCTION_AVAILABLE
-            // sha3 is only implemented on aarch64 for now
-            = new CPUSpecificPredicate("aarch64.*", new String[] {"sha3" }, null);
+            // sha3 is only implemented on aarch64 and avx512 for now
+            = new OrPredicate(new CPUSpecificPredicate("aarch64.*", new String[] {"sha3" }, null),
+              new OrPredicate(new CPUSpecificPredicate("amd64.*", new String[] {"avx512f", "avx512bw"}, null),
+              new CPUSpecificPredicate("x86_64", new String[] {"avx512f", "avx512bw"}, null)));
 
     public static final BooleanSupplier ANY_SHA_INSTRUCTION_AVAILABLE
             = new OrPredicate(IntrinsicPredicates.SHA1_INSTRUCTION_AVAILABLE,