/*
 * Copyright (c) 2016, Intel Corporation.
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "runtime/stubRoutines.hpp"
#include "macroAssembler_x86.hpp"

// ofs and limit are used for multi-block byte array.
// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
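//
// For orientation, a rough sketch of the Java-side loop this stub replaces
// (an illustration, not the actual DigestBase code): the stub consumes one
// 64-byte block per iteration while ofs <= limit and returns the final ofs:
//
//   int implCompressMultiBlock(byte[] b, int ofs, int limit) {
//     for (; ofs <= limit; ofs += 64) { implCompress(b, ofs); }
//     return ofs;
//   }
//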
void MacroAssembler::fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0,
  XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask,
  Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block) {

  Label start, done_hash, loop0;

  address upper_word_mask = StubRoutines::x86::upper_word_mask_addr();
  address shuffle_byte_flip_mask = StubRoutines::x86::shuffle_byte_flip_mask_addr();

  bind(start);
  movdqu(abcd, Address(state, 0));
  pinsrd(e0, Address(state, 16), 3);
  movdqu(shuf_mask, ExternalAddress(upper_word_mask)); // 0xFFFFFFFF000000000000000000000000
  pand(e0, shuf_mask);
  pshufd(abcd, abcd, 0x1B);
  movdqu(shuf_mask, ExternalAddress(shuffle_byte_flip_mask)); // 0x000102030405060708090a0b0c0d0e0f

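  // At this point the working state is laid out as the SHA extensions expect:
  // abcd holds {a,b,c,d} with the words reversed by the pshufd 0x1B above, and
  // e0 carries e alone in its upper dword (upper_word_mask cleared the rest),
  // matching the implicit operand layout of sha1rnds4/sha1nexte.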
  bind(loop0);
  // Save hash values for addition after rounds
  movdqu(Address(rsp, 0), e0);
  movdqu(Address(rsp, 16), abcd);

  // Rounds 0 - 3
  movdqu(msg0, Address(buf, 0));
  pshufb(msg0, shuf_mask);
  paddd(e0, msg0);
  movdqa(e1, abcd);
  sha1rnds4(abcd, e0, 0);

  // Rounds 4 - 7
  movdqu(msg1, Address(buf, 16));
  pshufb(msg1, shuf_mask);
  sha1nexte(e1, msg1);
  movdqa(e0, abcd);
  sha1rnds4(abcd, e1, 0);
  sha1msg1(msg0, msg1);

  // Rounds 8 - 11
  movdqu(msg2, Address(buf, 32));
  pshufb(msg2, shuf_mask);
  sha1nexte(e0, msg2);
  movdqa(e1, abcd);
  sha1rnds4(abcd, e0, 0);
  sha1msg1(msg1, msg2);
  pxor(msg0, msg2);

  // Rounds 12 - 15
  movdqu(msg3, Address(buf, 48));
  pshufb(msg3, shuf_mask);
  sha1nexte(e1, msg3);
  movdqa(e0, abcd);
  sha1msg2(msg0, msg3);
  sha1rnds4(abcd, e1, 0);
  sha1msg1(msg2, msg3);
  pxor(msg1, msg3);

  // Rounds 16 - 19
  sha1nexte(e0, msg0);
  movdqa(e1, abcd);
  sha1msg2(msg1, msg0);
  sha1rnds4(abcd, e0, 0);
  sha1msg1(msg3, msg0);
  pxor(msg2, msg0);

  // Rounds 20 - 23
  sha1nexte(e1, msg1);
  movdqa(e0, abcd);
  sha1msg2(msg2, msg1);
  sha1rnds4(abcd, e1, 1);
  sha1msg1(msg0, msg1);
  pxor(msg3, msg1);

  // Rounds 24 - 27
  sha1nexte(e0, msg2);
  movdqa(e1, abcd);
  sha1msg2(msg3, msg2);
  sha1rnds4(abcd, e0, 1);
  sha1msg1(msg1, msg2);
  pxor(msg0, msg2);

  // Rounds 28 - 31
  sha1nexte(e1, msg3);
  movdqa(e0, abcd);
  sha1msg2(msg0, msg3);
  sha1rnds4(abcd, e1, 1);
  sha1msg1(msg2, msg3);
  pxor(msg1, msg3);

  // Rounds 32 - 35
  sha1nexte(e0, msg0);
  movdqa(e1, abcd);
  sha1msg2(msg1, msg0);
  sha1rnds4(abcd, e0, 1);
  sha1msg1(msg3, msg0);
  pxor(msg2, msg0);

  // Rounds 36 - 39
  sha1nexte(e1, msg1);
  movdqa(e0, abcd);
  sha1msg2(msg2, msg1);
  sha1rnds4(abcd, e1, 1);
  sha1msg1(msg0, msg1);
  pxor(msg3, msg1);

  // Rounds 40 - 43
  sha1nexte(e0, msg2);
  movdqa(e1, abcd);
  sha1msg2(msg3, msg2);
  sha1rnds4(abcd, e0, 2);
  sha1msg1(msg1, msg2);
  pxor(msg0, msg2);

  // Rounds 44 - 47
  sha1nexte(e1, msg3);
  movdqa(e0, abcd);
  sha1msg2(msg0, msg3);
  sha1rnds4(abcd, e1, 2);
  sha1msg1(msg2, msg3);
  pxor(msg1, msg3);

  // Rounds 48 - 51
  sha1nexte(e0, msg0);
  movdqa(e1, abcd);
  sha1msg2(msg1, msg0);
  sha1rnds4(abcd, e0, 2);
  sha1msg1(msg3, msg0);
  pxor(msg2, msg0);

  // Rounds 52 - 55
  sha1nexte(e1, msg1);
  movdqa(e0, abcd);
  sha1msg2(msg2, msg1);
  sha1rnds4(abcd, e1, 2);
  sha1msg1(msg0, msg1);
  pxor(msg3, msg1);

  // Rounds 56 - 59
  sha1nexte(e0, msg2);
  movdqa(e1, abcd);
  sha1msg2(msg3, msg2);
  sha1rnds4(abcd, e0, 2);
  sha1msg1(msg1, msg2);
  pxor(msg0, msg2);

  // Rounds 60 - 63
  sha1nexte(e1, msg3);
  movdqa(e0, abcd);
  sha1msg2(msg0, msg3);
  sha1rnds4(abcd, e1, 3);
  sha1msg1(msg2, msg3);
  pxor(msg1, msg3);

  // Rounds 64 - 67
  sha1nexte(e0, msg0);
  movdqa(e1, abcd);
  sha1msg2(msg1, msg0);
  sha1rnds4(abcd, e0, 3);
  sha1msg1(msg3, msg0);
  pxor(msg2, msg0);

  // Rounds 68 - 71
  sha1nexte(e1, msg1);
  movdqa(e0, abcd);
  sha1msg2(msg2, msg1);
  sha1rnds4(abcd, e1, 3);
  pxor(msg3, msg1);

  // Rounds 72 - 75
  sha1nexte(e0, msg2);
  movdqa(e1, abcd);
  sha1msg2(msg3, msg2);
  sha1rnds4(abcd, e0, 3);

  // Rounds 76 - 79
  sha1nexte(e1, msg3);
  movdqa(e0, abcd);
  sha1rnds4(abcd, e1, 3);

  // add the current hash values to the previously saved state
  movdqu(msg0, Address(rsp, 0));
  sha1nexte(e0, msg0);
  movdqu(msg0, Address(rsp, 16));
  paddd(abcd, msg0);

  if (multi_block) {
    // increment data pointer and loop if more to process
    addptr(buf, 64);
    addptr(ofs, 64);
    cmpptr(ofs, limit);
    jcc(Assembler::belowEqual, loop0);
    movptr(rax, ofs); // return ofs
  }
  // write hash values back in the correct order
  pshufd(abcd, abcd, 0x1b);
  movdqu(Address(state, 0), abcd);
  pextrd(Address(state, 16), e0, 3);

  bind(done_hash);
}

// xmm0 (msg) is used as an implicit argument to sha256rnds2
// and state0 and state1 can never use the xmm0 register.
// ofs and limit are used for multi-block byte array.
// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
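//
// For illustration, the recurring pattern in the rounds below is roughly:
//   paddd(msg, Address(rax, k_offset));  // msg (xmm0) = W[i..i+3] + K[i..i+3]
//   sha256rnds2(state1, state0);         // two rounds, low qword of xmm0
//   pshufd(msg, msg, 0x0E);              // move high qword of xmm0 down
//   sha256rnds2(state0, state1);         // two more rounds
// Since sha256rnds2 reads xmm0 implicitly, msg must be xmm0 and neither
// state register may alias it.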
#ifdef _LP64
void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
  XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
  Register buf, Register state, Register ofs, Register limit, Register rsp,
  bool multi_block, XMMRegister shuf_mask) {
#else
void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
  XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
  Register buf, Register state, Register ofs, Register limit, Register rsp,
  bool multi_block) {
#endif
  Label start, done_hash, loop0;

  address K256 = StubRoutines::x86::k256_addr();
  address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr();

  bind(start);
  movdqu(state0, Address(state, 0));
  movdqu(state1, Address(state, 16));

  pshufd(state0, state0, 0xB1);
  pshufd(state1, state1, 0x1B);
  movdqa(msgtmp4, state0);
  palignr(state0, state1, 8);
  pblendw(state1, msgtmp4, 0xF0);

#ifdef _LP64
  movdqu(shuf_mask, ExternalAddress(pshuffle_byte_flip_mask));
#endif
  lea(rax, ExternalAddress(K256));

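  // Note: the pshufd/palignr/pblendw sequence above repacks the in-memory
  // word order {a,b,c,d}/{e,f,g,h} into the {ABEF}/{CDGH} register layout
  // that sha256rnds2 operates on; the inverse shuffle is applied before the
  // state is stored back at the end.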
  bind(loop0);
  movdqu(Address(rsp, 0), state0);
  movdqu(Address(rsp, 16), state1);

  // Rounds 0-3
  movdqu(msg, Address(buf, 0));
#ifdef _LP64
  pshufb(msg, shuf_mask);
#else
  pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
#endif
  movdqa(msgtmp0, msg);
  paddd(msg, Address(rax, 0));
  sha256rnds2(state1, state0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);

  // Rounds 4-7
  movdqu(msg, Address(buf, 16));
#ifdef _LP64
  pshufb(msg, shuf_mask);
#else
  pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
#endif
  movdqa(msgtmp1, msg);
  paddd(msg, Address(rax, 16));
  sha256rnds2(state1, state0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp0, msgtmp1);

  // Rounds 8-11
  movdqu(msg, Address(buf, 32));
#ifdef _LP64
  pshufb(msg, shuf_mask);
#else
  pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
#endif
  movdqa(msgtmp2, msg);
  paddd(msg, Address(rax, 32));
  sha256rnds2(state1, state0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp1, msgtmp2);

  // Rounds 12-15
  movdqu(msg, Address(buf, 48));
#ifdef _LP64
  pshufb(msg, shuf_mask);
#else
  pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
#endif
  movdqa(msgtmp3, msg);
  paddd(msg, Address(rax, 48));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp3);
  palignr(msgtmp4, msgtmp2, 4);
  paddd(msgtmp0, msgtmp4);
  sha256msg2(msgtmp0, msgtmp3);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp2, msgtmp3);

  // Rounds 16-19
  movdqa(msg, msgtmp0);
  paddd(msg, Address(rax, 64));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp0);
  palignr(msgtmp4, msgtmp3, 4);
  paddd(msgtmp1, msgtmp4);
  sha256msg2(msgtmp1, msgtmp0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp3, msgtmp0);

  // Rounds 20-23
  movdqa(msg, msgtmp1);
  paddd(msg, Address(rax, 80));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp1);
  palignr(msgtmp4, msgtmp0, 4);
  paddd(msgtmp2, msgtmp4);
  sha256msg2(msgtmp2, msgtmp1);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp0, msgtmp1);

  // Rounds 24-27
  movdqa(msg, msgtmp2);
  paddd(msg, Address(rax, 96));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp2);
  palignr(msgtmp4, msgtmp1, 4);
  paddd(msgtmp3, msgtmp4);
  sha256msg2(msgtmp3, msgtmp2);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp1, msgtmp2);

  // Rounds 28-31
  movdqa(msg, msgtmp3);
  paddd(msg, Address(rax, 112));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp3);
  palignr(msgtmp4, msgtmp2, 4);
  paddd(msgtmp0, msgtmp4);
  sha256msg2(msgtmp0, msgtmp3);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp2, msgtmp3);

  // Rounds 32-35
  movdqa(msg, msgtmp0);
  paddd(msg, Address(rax, 128));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp0);
  palignr(msgtmp4, msgtmp3, 4);
  paddd(msgtmp1, msgtmp4);
  sha256msg2(msgtmp1, msgtmp0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp3, msgtmp0);

  // Rounds 36-39
  movdqa(msg, msgtmp1);
  paddd(msg, Address(rax, 144));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp1);
  palignr(msgtmp4, msgtmp0, 4);
  paddd(msgtmp2, msgtmp4);
  sha256msg2(msgtmp2, msgtmp1);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp0, msgtmp1);

  // Rounds 40-43
  movdqa(msg, msgtmp2);
  paddd(msg, Address(rax, 160));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp2);
  palignr(msgtmp4, msgtmp1, 4);
  paddd(msgtmp3, msgtmp4);
  sha256msg2(msgtmp3, msgtmp2);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp1, msgtmp2);

  // Rounds 44-47
  movdqa(msg, msgtmp3);
  paddd(msg, Address(rax, 176));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp3);
  palignr(msgtmp4, msgtmp2, 4);
  paddd(msgtmp0, msgtmp4);
  sha256msg2(msgtmp0, msgtmp3);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp2, msgtmp3);

  // Rounds 48-51
  movdqa(msg, msgtmp0);
  paddd(msg, Address(rax, 192));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp0);
  palignr(msgtmp4, msgtmp3, 4);
  paddd(msgtmp1, msgtmp4);
  sha256msg2(msgtmp1, msgtmp0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp3, msgtmp0);

  // Rounds 52-55
  movdqa(msg, msgtmp1);
  paddd(msg, Address(rax, 208));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp1);
  palignr(msgtmp4, msgtmp0, 4);
  paddd(msgtmp2, msgtmp4);
  sha256msg2(msgtmp2, msgtmp1);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);

  // Rounds 56-59
  movdqa(msg, msgtmp2);
  paddd(msg, Address(rax, 224));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp2);
  palignr(msgtmp4, msgtmp1, 4);
  paddd(msgtmp3, msgtmp4);
  sha256msg2(msgtmp3, msgtmp2);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);

  // Rounds 60-63
  movdqa(msg, msgtmp3);
  paddd(msg, Address(rax, 240));
  sha256rnds2(state1, state0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  movdqu(msg, Address(rsp, 0));
  paddd(state0, msg);
  movdqu(msg, Address(rsp, 16));
  paddd(state1, msg);

  if (multi_block) {
    // increment data pointer and loop if more to process
    addptr(buf, 64);
    addptr(ofs, 64);
    cmpptr(ofs, limit);
    jcc(Assembler::belowEqual, loop0);
    movptr(rax, ofs); // return ofs
  }

  pshufd(state0, state0, 0x1B);
  pshufd(state1, state1, 0xB1);
  movdqa(msgtmp4, state0);
  pblendw(state0, state1, 0xF0);
  palignr(state1, msgtmp4, 8);

  movdqu(Address(state, 0), state0);
  movdqu(Address(state, 16), state1);

  bind(done_hash);
}

#ifdef _LP64
/*
  The algorithm below is based on the Intel publication:
  "Fast SHA-256 Implementations on Intel(R) Architecture Processors" by Jim Guilford, Kirk Yap and Vinodh Gopal.
  The assembly code was originally provided by Sean Gulley and in many places preserves
  the original assembly NAMES and comments to simplify matching the Java assembly with its original.
  The Java version was substantially redesigned to replace 1200 assembly instructions with
  a much shorter run-time generator of the same code in memory.
*/

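/*
  For reference when reading the helpers below, one SHA-256 round in the
  usual FIPS 180-4 notation (a sketch, not code) is:

    S1  = (e ror 6) ^ (e ror 11) ^ (e ror 25)
    ch  = (e & f) ^ (~e & g)            // computed below as ((f^g)&e)^g
    t1  = h + S1 + ch + K[i] + W[i]
    S0  = (a ror 2) ^ (a ror 13) ^ (a ror 22)
    maj = (a & b) ^ (a & c) ^ (b & c)   // computed below as ((a|c)&b)|(a&c)
    h = g; g = f; f = e; e = d + t1; d = c; c = b; b = a; a = t1 + S0 + maj

  The helpers keep the eight working variables in fixed registers and rotate
  the register assignment between rounds instead of moving values around.
*/
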
void MacroAssembler::sha256_AVX2_one_round_compute(
    Register  reg_old_h,
    Register  reg_a,
    Register  reg_b,
    Register  reg_c,
    Register  reg_d,
    Register  reg_e,
    Register  reg_f,
    Register  reg_g,
    Register  reg_h,
    int iter) {
  const Register& reg_y0 = r13;
  const Register& reg_y1 = r14;
  const Register& reg_y2 = r15;
  const Register& reg_y3 = rcx;
  const Register& reg_T1 = r12;
  //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND iter ;;;;;;;;;;;;;;;;;;;;;;;;;;;
  if (iter%4 > 0) {
    addl(reg_old_h, reg_y2);   // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
  }
  movl(reg_y2, reg_f);         // reg_y2 = reg_f ; CH
  rorxd(reg_y0, reg_e, 25);    // reg_y0 = reg_e >> 25 ; S1A
  rorxd(reg_y1, reg_e, 11);    // reg_y1 = reg_e >> 11 ; S1B
  xorl(reg_y2, reg_g);         // reg_y2 = reg_f^reg_g ; CH

  xorl(reg_y0, reg_y1);        // reg_y0 = (reg_e>>25) ^ (reg_e>>11) ; S1
  rorxd(reg_y1, reg_e, 6);     // reg_y1 = (reg_e >> 6) ; S1
  andl(reg_y2, reg_e);         // reg_y2 = (reg_f^reg_g)&reg_e ; CH

  if (iter%4 > 0) {
    addl(reg_old_h, reg_y3);   // reg_h = t1 + S0 + MAJ ; --
  }

  xorl(reg_y0, reg_y1);        // reg_y0 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6) ; S1
  rorxd(reg_T1, reg_a, 13);    // reg_T1 = reg_a >> 13 ; S0B
  xorl(reg_y2, reg_g);         // reg_y2 = CH = ((reg_f^reg_g)&reg_e)^reg_g ; CH
  rorxd(reg_y1, reg_a, 22);    // reg_y1 = reg_a >> 22 ; S0A
  movl(reg_y3, reg_a);         // reg_y3 = reg_a ; MAJA

  xorl(reg_y1, reg_T1);        // reg_y1 = (reg_a>>22) ^ (reg_a>>13) ; S0
  rorxd(reg_T1, reg_a, 2);     // reg_T1 = (reg_a >> 2) ; S0
  addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter));   // reg_h = k + w + reg_h ; --
  orl(reg_y3, reg_c);          // reg_y3 = reg_a|reg_c ; MAJA

  xorl(reg_y1, reg_T1);        // reg_y1 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2) ; S0
  movl(reg_T1, reg_a);         // reg_T1 = reg_a ; MAJB
  andl(reg_y3, reg_b);         // reg_y3 = (reg_a|reg_c)&reg_b ; MAJA
  andl(reg_T1, reg_c);         // reg_T1 = reg_a&reg_c ; MAJB
  addl(reg_y2, reg_y0);        // reg_y2 = S1 + CH ; --

  addl(reg_d, reg_h);          // reg_d = k + w + reg_h + reg_d ; --
  orl(reg_y3, reg_T1);         // reg_y3 = MAJ = ((reg_a|reg_c)&reg_b)|(reg_a&reg_c) ; MAJ
  addl(reg_h, reg_y1);         // reg_h = k + w + reg_h + S0 ; --

  addl(reg_d, reg_y2);         // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1 ; --

  if (iter%4 == 3) {
    addl(reg_h, reg_y2);       // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
    addl(reg_h, reg_y3);       // reg_h = t1 + S0 + MAJ ; --
  }
}

void MacroAssembler::sha256_AVX2_four_rounds_compute_first(int start) {
  sha256_AVX2_one_round_compute(rax, rax, rbx, rdi, rsi, r8, r9, r10, r11, start + 0);
  sha256_AVX2_one_round_compute(r11, r11, rax, rbx, rdi, rsi, r8, r9, r10, start + 1);
  sha256_AVX2_one_round_compute(r10, r10, r11, rax, rbx, rdi, rsi, r8, r9, start + 2);
  sha256_AVX2_one_round_compute(r9, r9, r10, r11, rax, rbx, rdi, rsi, r8, start + 3);
}

void MacroAssembler::sha256_AVX2_four_rounds_compute_last(int start) {
  sha256_AVX2_one_round_compute(r8, r8, r9, r10, r11, rax, rbx, rdi, rsi, start + 0);
  sha256_AVX2_one_round_compute(rsi, rsi, r8, r9, r10, r11, rax, rbx, rdi, start + 1);
  sha256_AVX2_one_round_compute(rdi, rdi, rsi, r8, r9, r10, r11, rax, rbx, start + 2);
  sha256_AVX2_one_round_compute(rbx, rbx, rdi, rsi, r8, r9, r10, r11, rax, start + 3);
}

void MacroAssembler::sha256_AVX2_one_round_and_sched(
    XMMRegister xmm_0, /* == ymm4 on 0, 1, 2, 3 iterations, then rotate 4 registers left on 4, 8, 12 iterations */
    XMMRegister xmm_1, /* ymm5 */ /* full cycle is 16 iterations */
    XMMRegister xmm_2, /* ymm6 */
    XMMRegister xmm_3, /* ymm7 */
    Register reg_a, /* == rax on 0 iteration, then rotate 8 register right on each next iteration */
    Register reg_b, /* rbx */ /* full cycle is 8 iterations */
    Register reg_c, /* rdi */
    Register reg_d, /* rsi */
    Register reg_e, /* r8 */
    Register reg_f, /* r9d */
    Register reg_g, /* r10d */
    Register reg_h, /* r11d */
    int iter)
{
  movl(rcx, reg_a);          // rcx = reg_a ; MAJA
  rorxd(r13, reg_e, 25);     // r13 = reg_e >> 25 ; S1A
  rorxd(r14, reg_e, 11);     // r14 = reg_e >> 11 ; S1B
  addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter));
  orl(rcx, reg_c);           // rcx = reg_a|reg_c ; MAJA

  movl(r15, reg_f);          // r15 = reg_f ; CH
  rorxd(r12, reg_a, 13);     // r12 = reg_a >> 13 ; S0B
  xorl(r13, r14);            // r13 = (reg_e>>25) ^ (reg_e>>11) ; S1
  xorl(r15, reg_g);          // r15 = reg_f^reg_g ; CH

  rorxd(r14, reg_e, 6);      // r14 = (reg_e >> 6) ; S1
  andl(r15, reg_e);          // r15 = (reg_f^reg_g)&reg_e ; CH

  xorl(r13, r14);            // r13 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6) ; S1
  rorxd(r14, reg_a, 22);     // r14 = reg_a >> 22 ; S0A
  addl(reg_d, reg_h);        // reg_d = k + w + reg_h + reg_d ; --

  andl(rcx, reg_b);          // rcx = (reg_a|reg_c)&reg_b ; MAJA
  xorl(r14, r12);            // r14 = (reg_a>>22) ^ (reg_a>>13) ; S0

  rorxd(r12, reg_a, 2);      // r12 = (reg_a >> 2) ; S0
  xorl(r15, reg_g);          // r15 = CH = ((reg_f^reg_g)&reg_e)^reg_g ; CH

  xorl(r14, r12);            // r14 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2) ; S0
  movl(r12, reg_a);          // r12 = reg_a ; MAJB
  andl(r12, reg_c);          // r12 = reg_a&reg_c ; MAJB
  addl(r15, r13);            // r15 = S1 + CH ; --

  orl(rcx, r12);             // rcx = MAJ = ((reg_a|reg_c)&reg_b)|(reg_a&reg_c) ; MAJ
  addl(reg_h, r14);          // reg_h = k + w + reg_h + S0 ; --
  addl(reg_d, r15);          // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1 ; --

  addl(reg_h, r15);          // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
  addl(reg_h, rcx);          // reg_h = t1 + S0 + MAJ ; --

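  // Interleaved message-schedule step: every group of four rounds advances
  // the 16-dword window by four dwords. In FIPS 180-4 terms (a sketch):
  //   s0(w) = (w ror 7) ^ (w ror 18) ^ (w >> 3)
  //   s1(w) = (w ror 17) ^ (w ror 19) ^ (w >> 10)
  //   W[i]  = W[i-16] + s0(W[i-15]) + W[i-7] + s1(W[i-2])
  // The vpsrlq-by-17/19 steps below get the two s1 rotates of 32-bit words
  // done in 64-bit lanes; SHUF_00BA (xmm10) and SHUF_DC00 (xmm12) then merge
  // the two half-results back into one vector.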
  if (iter%4 == 0) {
    vpalignr(xmm0, xmm_3, xmm_2, 4, AVX_256bit);   // ymm0 = W[-7]
    vpaddd(xmm0, xmm0, xmm_0, AVX_256bit);         // ymm0 = W[-7] + W[-16]; y1 = (e >> 6) ; S1
    vpalignr(xmm1, xmm_1, xmm_0, 4, AVX_256bit);   // ymm1 = W[-15]
    vpsrld(xmm2, xmm1, 7, AVX_256bit);
    vpslld(xmm3, xmm1, 32-7, AVX_256bit);
    vpor(xmm3, xmm3, xmm2, AVX_256bit);            // ymm3 = W[-15] ror 7
    vpsrld(xmm2, xmm1, 18, AVX_256bit);
  } else if (iter%4 == 1) {
    vpsrld(xmm8, xmm1, 3, AVX_256bit);             // ymm8 = W[-15] >> 3
    vpslld(xmm1, xmm1, 32-18, AVX_256bit);
    vpxor(xmm3, xmm3, xmm1, AVX_256bit);
    vpxor(xmm3, xmm3, xmm2, AVX_256bit);           // ymm3 = W[-15] ror 7 ^ W[-15] ror 18
    vpxor(xmm1, xmm3, xmm8, AVX_256bit);           // ymm1 = s0
    vpshufd(xmm2, xmm_3, 0xFA, AVX_256bit);        // 11111010b ; ymm2 = W[-2] {BBAA}
    vpaddd(xmm0, xmm0, xmm1, AVX_256bit);          // ymm0 = W[-16] + W[-7] + s0
    vpsrld(xmm8, xmm2, 10, AVX_256bit);            // ymm8 = W[-2] >> 10 {BBAA}
  } else if (iter%4 == 2) {
    vpsrlq(xmm3, xmm2, 19, AVX_256bit);            // ymm3 = W[-2] ror 19 {xBxA}
    vpsrlq(xmm2, xmm2, 17, AVX_256bit);            // ymm2 = W[-2] ror 17 {xBxA}
    vpxor(xmm2, xmm2, xmm3, AVX_256bit);
    vpxor(xmm8, xmm8, xmm2, AVX_256bit);           // ymm8 = s1 {xBxA}
    vpshufb(xmm8, xmm8, xmm10, AVX_256bit);        // ymm8 = s1 {00BA}
    vpaddd(xmm0, xmm0, xmm8, AVX_256bit);          // ymm0 = {..., ..., W[1], W[0]}
    vpshufd(xmm2, xmm0, 0x50, AVX_256bit);         // 01010000b ; ymm2 = W[-2] {DDCC}
  } else if (iter%4 == 3) {
    vpsrld(xmm11, xmm2, 10, AVX_256bit);           // ymm11 = W[-2] >> 10 {DDCC}
    vpsrlq(xmm3, xmm2, 19, AVX_256bit);            // ymm3 = W[-2] ror 19 {xDxC}
    vpsrlq(xmm2, xmm2, 17, AVX_256bit);            // ymm2 = W[-2] ror 17 {xDxC}
    vpxor(xmm2, xmm2, xmm3, AVX_256bit);
    vpxor(xmm11, xmm11, xmm2, AVX_256bit);         // ymm11 = s1 {xDxC}
    vpshufb(xmm11, xmm11, xmm12, AVX_256bit);      // ymm11 = s1 {DC00}
    vpaddd(xmm_0, xmm11, xmm0, AVX_256bit);        // xmm_0 = {W[3], W[2], W[1], W[0]}
  }
}

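// addm/addmq fold a working variable back into the digest in memory:
// r2 += [r1 + disp]; [r1 + disp] = r2 (32-bit and 64-bit variants).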
void MacroAssembler::addm(int disp, Register r1, Register r2) {
  addl(r2, Address(r1, disp));
  movl(Address(r1, disp), r2);
}

void MacroAssembler::addmq(int disp, Register r1, Register r2) {
  addq(r2, Address(r1, disp));
  movq(Address(r1, disp), r2);
}

void MacroAssembler::sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
  XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
  Register buf, Register state, Register ofs, Register limit, Register rsp,
  bool multi_block, XMMRegister shuf_mask) {

  Label loop0, loop1, loop2, loop3,
        last_block_enter, do_last_block, only_one_block, done_hash,
        compute_size, compute_size_end,
        compute_size1, compute_size_end1;

  address K256_W = StubRoutines::x86::k256_W_addr();
  address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr();
  address pshuffle_byte_flip_mask_addr = 0;

  const XMMRegister& SHUF_00BA        = xmm10;   // ymm10: shuffle xBxA -> 00BA
  const XMMRegister& SHUF_DC00        = xmm12;   // ymm12: shuffle xDxC -> DC00
  const XMMRegister& BYTE_FLIP_MASK   = xmm13;   // ymm13

  const XMMRegister& X_BYTE_FLIP_MASK = xmm13;   // XMM version of BYTE_FLIP_MASK

  const Register& NUM_BLKS = r8;    // 3rd arg
  const Register& CTX      = rdx;   // 2nd arg
  const Register& INP      = rcx;   // 1st arg

  const Register& c  = rdi;
  const Register& d  = rsi;
  const Register& e  = r8;    // clobbers NUM_BLKS
  const Register& y3 = rcx;   // clobbers INP

  const Register& TBL  = rbp;
  const Register& SRND = CTX;   // SRND is the same register as CTX

  const Register& a = rax;
  const Register& b = rbx;
  const Register& f = r9;
  const Register& g = r10;
  const Register& h = r11;

  const Register& T1 = r12;
  const Register& y0 = r13;
  const Register& y1 = r14;
  const Register& y2 = r15;

  enum {
    _XFER_SIZE    = 2*64*4,   // 2 blocks, 64 rounds, 4 bytes/round
    _INP_END_SIZE = 8,
    _INP_SIZE     = 8,
    _CTX_SIZE     = 8,
    _RSP_SIZE     = 8,

    _XFER      = 0,
    _INP_END   = _XFER    + _XFER_SIZE,
    _INP       = _INP_END + _INP_END_SIZE,
    _CTX       = _INP     + _INP_SIZE,
    _RSP       = _CTX     + _CTX_SIZE,
    STACK_SIZE = _RSP     + _RSP_SIZE
  };
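  // Summarizing the enum above, the aligned frame looks like:
  //   [rsp + _XFER .. +512)  round key + message words for two blocks
  //   [rsp + _INP_END]       pointer to the last input block
  //   [rsp + _INP]           current input pointer
  //   [rsp + _CTX]           saved state (digest) pointer
  //   [rsp + _RSP]           caller's rsp, restored on exit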
#ifndef _WIN64
  push(rcx);   // linux: this is limit, we need it at the end
  push(rdx);   // linux: this is ofs
#else
  push(r8);    // win64: this is ofs
  push(r9);    // win64: this is limit, we need them again at the very end
#endif

  push(rbx);
#ifdef _WIN64
  push(rsi);
  push(rdi);
#endif
  push(rbp);
  push(r12);
  push(r13);
  push(r14);
  push(r15);

  movq(rax, rsp);
  subq(rsp, STACK_SIZE);
  andq(rsp, -32);
  movq(Address(rsp, _RSP), rax);

#ifndef _WIN64
  // copy linux params into the win64 param registers, so the rest of the code is the same for both
  movq(r9, rcx);
  movq(r8, rdx);
  movq(rdx, rsi);
  movq(rcx, rdi);
#endif

  // setting original assembly ABI
  /** message to encrypt in INP */
  lea(INP, Address(rcx, 0));   // rcx == message (buf) ;; linux: INP = buf = rdi
  /** digest in CTX */
  movq(CTX, rdx);              // rdx = digest (state) ;; linux: CTX = state = rsi

  /** NUM_BLKS is the length of the message; it needs to be set from ofs and limit */
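  // The compute_size loop below leaves in rax the number of input bytes to
  // hash: 64 for every 64-byte step needed to move ofs up to limit (e.g.
  // ofs = 0, limit = 128 gives rax = 128, i.e. two blocks), so NUM_BLKS is
  // derived without disturbing the ofs/limit registers needed again at the end.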
  if (multi_block) {

    // Win64: cannot directly update NUM_BLKS, since NUM_BLKS = ofs = r8
    // on entry r8 = ofs
    // on exit  r8 = NUM_BLKS

    xorq(rax, rax);

    bind(compute_size);
    cmpptr(r8, r9);   // assume the original ofs <= limit ;; linux: cmp rcx, rdx
    jccb(Assembler::aboveEqual, compute_size_end);
    addq(r8, 64);     //;; linux: ofs = rdx
    addq(rax, 64);
    jmpb(compute_size);

    bind(compute_size_end);
    movq(NUM_BLKS, rax);   // NUM_BLKS (r8) ;; linux: NUM_BLKS = rdx

    cmpq(NUM_BLKS, 0);
    jcc(Assembler::equal, done_hash);

  } else {
    xorq(NUM_BLKS, NUM_BLKS);
    addq(NUM_BLKS, 64);
  } // if (!multi_block)

  lea(NUM_BLKS, Address(INP, NUM_BLKS, Address::times_1, -64));   // pointer to the last block
  movq(Address(rsp, _INP_END), NUM_BLKS);

  cmpptr(INP, NUM_BLKS);                   // cmp INP, NUM_BLKS
  jcc(Assembler::equal, only_one_block);   // je only_one_block

  // load initial digest
  movl(a, Address(CTX, 4*0));
  movl(b, Address(CTX, 4*1));
  movl(c, Address(CTX, 4*2));
  movl(d, Address(CTX, 4*3));
  movl(e, Address(CTX, 4*4));
  movl(f, Address(CTX, 4*5));
  // load g - r10 after it is used as scratch
  movl(h, Address(CTX, 4*7));

  pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask;
  vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0));   // [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
  vmovdqu(SHUF_00BA, ExternalAddress(pshuffle_byte_flip_mask_addr + 32));       // [_SHUF_00BA wrt rip]
  vmovdqu(SHUF_DC00, ExternalAddress(pshuffle_byte_flip_mask_addr + 64));       // [_SHUF_DC00 wrt rip]

  movl(g, Address(CTX, 4*6));

  movq(Address(rsp, _CTX), CTX);   // store

  bind(loop0);
  lea(TBL, ExternalAddress(K256_W));

  // assume buffers not aligned

  // Load first 16 dwords from two blocks
  vmovdqu(xmm0, Address(INP, 0*32));
  vmovdqu(xmm1, Address(INP, 1*32));
  vmovdqu(xmm2, Address(INP, 2*32));
  vmovdqu(xmm3, Address(INP, 3*32));

  // byte swap data
  vpshufb(xmm0, xmm0, BYTE_FLIP_MASK, AVX_256bit);
  vpshufb(xmm1, xmm1, BYTE_FLIP_MASK, AVX_256bit);
  vpshufb(xmm2, xmm2, BYTE_FLIP_MASK, AVX_256bit);
  vpshufb(xmm3, xmm3, BYTE_FLIP_MASK, AVX_256bit);

  // transpose data into high/low halves
  vperm2i128(xmm4, xmm0, xmm2, 0x20);
  vperm2i128(xmm5, xmm0, xmm2, 0x31);
  vperm2i128(xmm6, xmm1, xmm3, 0x20);
  vperm2i128(xmm7, xmm1, xmm3, 0x31);

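  // The 0x20/0x31 selectors pair the low halves and the high halves of the
  // two 64-byte blocks loaded above, so each ymm register now holds the
  // corresponding 16 bytes of both blocks and the schedule can be computed
  // for two blocks at once.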
  bind(last_block_enter);
  addq(INP, 64);
  movq(Address(rsp, _INP), INP);

  //;; schedule 48 input dwords, by doing 3 rounds of 12 each
  xorq(SRND, SRND);

  align(16);
  bind(loop1);
  vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit);
  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9);
  sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, rax, rbx, rdi, rsi, r8, r9, r10, r11, 0);
  sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r11, rax, rbx, rdi, rsi, r8, r9, r10, 1);
  sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r10, r11, rax, rbx, rdi, rsi, r8, r9, 2);
  sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r9, r10, r11, rax, rbx, rdi, rsi, r8, 3);

  vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit);
  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9);
  sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, r8, r9, r10, r11, rax, rbx, rdi, rsi, 8+0);
  sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rsi, r8, r9, r10, r11, rax, rbx, rdi, 8+1);
  sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rdi, rsi, r8, r9, r10, r11, rax, rbx, 8+2);
  sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rbx, rdi, rsi, r8, r9, r10, r11, rax, 8+3);

  vpaddd(xmm9, xmm6, Address(TBL, SRND, Address::times_1, 2*32), AVX_256bit);
  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 2*32), xmm9);
  sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, rax, rbx, rdi, rsi, r8, r9, r10, r11, 16+0);
  sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r11, rax, rbx, rdi, rsi, r8, r9, r10, 16+1);
  sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r10, r11, rax, rbx, rdi, rsi, r8, r9, 16+2);
  sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r9, r10, r11, rax, rbx, rdi, rsi, r8, 16+3);

  vpaddd(xmm9, xmm7, Address(TBL, SRND, Address::times_1, 3*32), AVX_256bit);
  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 3*32), xmm9);

  sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, r8, r9, r10, r11, rax, rbx, rdi, rsi, 24+0);
  sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rsi, r8, r9, r10, r11, rax, rbx, rdi, 24+1);
  sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rdi, rsi, r8, r9, r10, r11, rax, rbx, 24+2);
  sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rbx, rdi, rsi, r8, r9, r10, r11, rax, 24+3);

  addq(SRND, 4*32);
  cmpq(SRND, 3 * 4*32);
  jcc(Assembler::below, loop1);

  bind(loop2);
  // Do last 16 rounds with no scheduling
  vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit);
  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9);
  sha256_AVX2_four_rounds_compute_first(0);

  vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit);
  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9);
  sha256_AVX2_four_rounds_compute_last(0 + 8);

  addq(SRND, 2*32);

  vmovdqu(xmm4, xmm6);
  vmovdqu(xmm5, xmm7);

  cmpq(SRND, 4 * 4*32);
  jcc(Assembler::below, loop2);

  movq(CTX, Address(rsp, _CTX));
  movq(INP, Address(rsp, _INP));

  addm(4*0, CTX, a);
  addm(4*1, CTX, b);
  addm(4*2, CTX, c);
  addm(4*3, CTX, d);
  addm(4*4, CTX, e);
  addm(4*5, CTX, f);
  addm(4*6, CTX, g);
  addm(4*7, CTX, h);

  cmpq(INP, Address(rsp, _INP_END));
  jcc(Assembler::above, done_hash);

  // Do second block using previously scheduled results
  xorq(SRND, SRND);
  align(16);
  bind(loop3);
  sha256_AVX2_four_rounds_compute_first(4);
  sha256_AVX2_four_rounds_compute_last(4+8);

  addq(SRND, 2*32);
  cmpq(SRND, 4 * 4*32);
  jcc(Assembler::below, loop3);

  movq(CTX, Address(rsp, _CTX));
  movq(INP, Address(rsp, _INP));
  addq(INP, 64);

  addm(4*0, CTX, a);
  addm(4*1, CTX, b);
  addm(4*2, CTX, c);
  addm(4*3, CTX, d);
  addm(4*4, CTX, e);
  addm(4*5, CTX, f);
  addm(4*6, CTX, g);
  addm(4*7, CTX, h);

  cmpq(INP, Address(rsp, _INP_END));
  jcc(Assembler::below, loop0);
  jccb(Assembler::above, done_hash);

  bind(do_last_block);
  lea(TBL, ExternalAddress(K256_W));

  movdqu(xmm4, Address(INP, 0*16));
  movdqu(xmm5, Address(INP, 1*16));
  movdqu(xmm6, Address(INP, 2*16));
  movdqu(xmm7, Address(INP, 3*16));

  vpshufb(xmm4, xmm4, xmm13, AVX_128bit);
  vpshufb(xmm5, xmm5, xmm13, AVX_128bit);
  vpshufb(xmm6, xmm6, xmm13, AVX_128bit);
  vpshufb(xmm7, xmm7, xmm13, AVX_128bit);

  jmp(last_block_enter);

  bind(only_one_block);

  // load initial digest ;; table should be preloaded with the following values
  movl(a, Address(CTX, 4*0));   // 0x6a09e667
  movl(b, Address(CTX, 4*1));   // 0xbb67ae85
  movl(c, Address(CTX, 4*2));   // 0x3c6ef372
  movl(d, Address(CTX, 4*3));   // 0xa54ff53a
  movl(e, Address(CTX, 4*4));   // 0x510e527f
  movl(f, Address(CTX, 4*5));   // 0x9b05688c
  // load g - r10 after use as scratch
  movl(h, Address(CTX, 4*7));   // 0x5be0cd19

  pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask;
  vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0));   // [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
  vmovdqu(SHUF_00BA, ExternalAddress(pshuffle_byte_flip_mask_addr + 32));       // [_SHUF_00BA wrt rip]
  vmovdqu(SHUF_DC00, ExternalAddress(pshuffle_byte_flip_mask_addr + 64));       // [_SHUF_DC00 wrt rip]

  movl(g, Address(CTX, 4*6));   // 0x1f83d9ab

  movq(Address(rsp, _CTX), CTX);
  jmpb(do_last_block);

  bind(done_hash);

  movq(rsp, Address(rsp, _RSP));

  pop(r15);
  pop(r14);
  pop(r13);
  pop(r12);
  pop(rbp);
#ifdef _WIN64
  pop(rdi);
  pop(rsi);
#endif
  pop(rbx);

#ifdef _WIN64
  pop(r9);
  pop(r8);
#else
  pop(rdx);
  pop(rcx);
#endif

  if (multi_block) {
#ifdef _WIN64
    const Register& limit_end = r9;
    const Register& ofs_end   = r8;
#else
    const Register& limit_end = rcx;
    const Register& ofs_end   = rdx;
#endif
    movq(rax, ofs_end);

    bind(compute_size1);
    cmpptr(rax, limit_end);   // assume the original ofs <= limit
    jccb(Assembler::aboveEqual, compute_size_end1);
    addq(rax, 64);
    jmpb(compute_size1);

    bind(compute_size_end1);
  }
}

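// SHA-512 analogues of the SHA-256 AVX2 round helpers above: the same
// structure with 64-bit arithmetic and the FIPS 180-4 SHA-512 rotations
// (a reference sketch):
//   S1 = (e ror 14) ^ (e ror 18) ^ (e ror 41)
//   S0 = (a ror 28) ^ (a ror 34) ^ (a ror 39)
// ch and maj are formed exactly as in the 32-bit version.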
void MacroAssembler::sha512_AVX2_one_round_compute(Register old_h, Register a, Register b, Register c,
                                                   Register d, Register e, Register f, Register g, Register h,
                                                   int iteration)
{

  const Register& y0 = r13;
  const Register& y1 = r14;
  const Register& y2 = r15;
#ifdef _WIN64
  const Register& y3 = rcx;
#else
  const Register& y3 = rdi;
#endif
  const Register& T1 = r12;

  if (iteration % 4 > 0) {
    addq(old_h, y2);    // h = k + w + h + S0 + S1 + CH = t1 + S0;
  }
  movq(y2, f);          // y2 = f; CH
  rorxq(y0, e, 41);     // y0 = e >> 41; S1A
  rorxq(y1, e, 18);     // y1 = e >> 18; S1B
  xorq(y2, g);          // y2 = f^g; CH

  xorq(y0, y1);         // y0 = (e >> 41) ^ (e >> 18); S1
  rorxq(y1, e, 14);     // y1 = (e >> 14); S1
  andq(y2, e);          // y2 = (f^g)&e; CH

  if (iteration % 4 > 0) {
    addq(old_h, y3);    // h = t1 + S0 + MAJ
  }
  xorq(y0, y1);         // y0 = (e >> 41) ^ (e >> 18) ^ (e >> 14); S1
  rorxq(T1, a, 34);     // T1 = a >> 34; S0B
  xorq(y2, g);          // y2 = CH = ((f^g)&e) ^ g; CH
  rorxq(y1, a, 39);     // y1 = a >> 39; S0A
  movq(y3, a);          // y3 = a; MAJA

  xorq(y1, T1);         // y1 = (a >> 39) ^ (a >> 34); S0
  rorxq(T1, a, 28);     // T1 = (a >> 28); S0
  addq(h, Address(rsp, (8 * iteration)));   // h = k + w + h; --
  orq(y3, c);           // y3 = a | c; MAJA

  xorq(y1, T1);         // y1 = (a >> 39) ^ (a >> 34) ^ (a >> 28); S0
  movq(T1, a);          // T1 = a; MAJB
  andq(y3, b);          // y3 = (a | c)&b; MAJA
  andq(T1, c);          // T1 = a&c; MAJB
  addq(y2, y0);         // y2 = S1 + CH; --

  addq(d, h);           // d = k + w + h + d; --
  orq(y3, T1);          // y3 = MAJ = ((a | c)&b) | (a&c); MAJ
  addq(h, y1);          // h = k + w + h + S0; --

  addq(d, y2);          // d = k + w + h + d + S1 + CH = d + t1; --

  if (iteration % 4 == 3) {
    addq(h, y2);        // h = k + w + h + S0 + S1 + CH = t1 + S0; --
    addq(h, y3);        // h = t1 + S0 + MAJ; --
  }
}

void MacroAssembler::sha512_AVX2_one_round_and_schedule(
    XMMRegister xmm4, // ymm4
    XMMRegister xmm5, // ymm5
    XMMRegister xmm6, // ymm6
    XMMRegister xmm7, // ymm7
    Register a, // rax
    Register b, // rbx
    Register c, // rdi
    Register d, // rsi
    Register e, // r8
    Register f, // r9
    Register g, // r10
    Register h, // r11
    int iteration)
{

  const Register& y0 = r13;
  const Register& y1 = r14;
  const Register& y2 = r15;
#ifdef _WIN64
  const Register& y3 = rcx;
#else
  const Register& y3 = rdi;
#endif
  const Register& T1 = r12;

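  // Schedule step for reference (FIPS 180-4 SHA-512, a sketch):
  //   s0(w) = (w ror 1) ^ (w ror 8) ^ (w >> 7)
  //   s1(w) = (w ror 19) ^ (w ror 61) ^ (w >> 6)
  //   W[i]  = W[i-16] + s0(W[i-15]) + W[i-7] + s1(W[i-2])
  // AVX2 has no 64-bit vector rotate, so each rotate below is built from a
  // vpsrlq/vpsllq pair; each 4-round group advances the window by four qwords.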
  if (iteration % 4 == 0) {
    // Extract w[t - 7]
    // xmm0 = W[-7]
    vperm2f128(xmm0, xmm7, xmm6, 3);
    vpalignr(xmm0, xmm0, xmm6, 8, AVX_256bit);

    // Calculate w[t - 16] + w[t - 7]
    vpaddq(xmm0, xmm0, xmm4, AVX_256bit);   // xmm0 = W[-7] + W[-16]
    // Extract w[t - 15]
    // xmm1 = W[-15]
    vperm2f128(xmm1, xmm5, xmm4, 3);
    vpalignr(xmm1, xmm1, xmm4, 8, AVX_256bit);

    // Calculate sigma0
    // Calculate w[t - 15] ror 1
    vpsrlq(xmm2, xmm1, 1, AVX_256bit);
    vpsllq(xmm3, xmm1, (64 - 1), AVX_256bit);
    vpor(xmm3, xmm3, xmm2, AVX_256bit);     // xmm3 = W[-15] ror 1
    // Calculate w[t - 15] shr 7
    vpsrlq(xmm8, xmm1, 7, AVX_256bit);      // xmm8 = W[-15] >> 7

  } else if (iteration % 4 == 1) {
    // Calculate w[t - 15] ror 8
    vpsrlq(xmm2, xmm1, 8, AVX_256bit);
    vpsllq(xmm1, xmm1, (64 - 8), AVX_256bit);
    vpor(xmm1, xmm1, xmm2, AVX_256bit);     // xmm1 = W[-15] ror 8

    // XOR the three components
    vpxor(xmm3, xmm3, xmm8, AVX_256bit);    // xmm3 = W[-15] ror 1 ^ W[-15] >> 7
    vpxor(xmm1, xmm3, xmm1, AVX_256bit);    // xmm1 = s0

    // Add the three components, w[t - 16], w[t - 7] and sigma0
    vpaddq(xmm0, xmm0, xmm1, AVX_256bit);   // xmm0 = W[-16] + W[-7] + s0

    // Move to appropriate lanes for calculating w[16] and w[17]
    vperm2f128(xmm4, xmm0, xmm0, 0);        // xmm4 = W[-16] + W[-7] + s0 {BABA}

    // Move to appropriate lanes for calculating w[18] and w[19]
    vpand(xmm0, xmm0, xmm10, AVX_256bit);   // xmm0 = W[-16] + W[-7] + s0 {DC00}
    // Calculate w[16] and w[17] in both 128 bit lanes
    // Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
    vperm2f128(xmm2, xmm7, xmm7, 17);       // xmm2 = W[-2] {BABA}
    vpsrlq(xmm8, xmm2, 6, AVX_256bit);      // xmm8 = W[-2] >> 6 {BABA}

  } else if (iteration % 4 == 2) {
    vpsrlq(xmm3, xmm2, 19, AVX_256bit);          // xmm3 = W[-2] >> 19 {BABA}
    vpsllq(xmm1, xmm2, (64 - 19), AVX_256bit);   // xmm1 = W[-2] << 19 {BABA}
    vpor(xmm3, xmm3, xmm1, AVX_256bit);          // xmm3 = W[-2] ror 19 {BABA}
    vpxor(xmm8, xmm8, xmm3, AVX_256bit);         // xmm8 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
    vpsrlq(xmm3, xmm2, 61, AVX_256bit);          // xmm3 = W[-2] >> 61 {BABA}
    vpsllq(xmm1, xmm2, (64 - 61), AVX_256bit);   // xmm1 = W[-2] << 61 {BABA}
    vpor(xmm3, xmm3, xmm1, AVX_256bit);          // xmm3 = W[-2] ror 61 {BABA}
    vpxor(xmm8, xmm8, xmm3, AVX_256bit);         // xmm8 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {BABA}

    // Add sigma1 to the other components to get w[16] and w[17]
    vpaddq(xmm4, xmm4, xmm8, AVX_256bit);        // xmm4 = {W[1], W[0], W[1], W[0]}

    // Calculate sigma1 for w[18] and w[19] for the upper 128 bit lane
    vpsrlq(xmm8, xmm4, 6, AVX_256bit);           // xmm8 = W[-2] >> 6 {DC--}

  } else if (iteration % 4 == 3) {
    vpsrlq(xmm3, xmm4, 19, AVX_256bit);          // xmm3 = W[-2] >> 19 {DC--}
    vpsllq(xmm1, xmm4, (64 - 19), AVX_256bit);   // xmm1 = W[-2] << 19 {DC--}
    vpor(xmm3, xmm3, xmm1, AVX_256bit);          // xmm3 = W[-2] ror 19 {DC--}
    vpxor(xmm8, xmm8, xmm3, AVX_256bit);         // xmm8 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
    vpsrlq(xmm3, xmm4, 61, AVX_256bit);          // xmm3 = W[-2] >> 61 {DC--}
    vpsllq(xmm1, xmm4, (64 - 61), AVX_256bit);   // xmm1 = W[-2] << 61 {DC--}
    vpor(xmm3, xmm3, xmm1, AVX_256bit);          // xmm3 = W[-2] ror 61 {DC--}
    vpxor(xmm8, xmm8, xmm3, AVX_256bit);         // xmm8 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {DC--}

    // Add the sigma0 + w[t - 7] + w[t - 16] for w[18] and w[19] to the newly calculated sigma1 to get w[18] and w[19]
    vpaddq(xmm2, xmm0, xmm8, AVX_256bit);        // xmm2 = {W[3], W[2], --, --}

    // Form w[19], w[18], w[17], w[16]
    vpblendd(xmm4, xmm4, xmm2, 0xF0, AVX_256bit);   // xmm4 = {W[3], W[2], W[1], W[0]}
  }

  movq(y3, a);        // y3 = a; MAJA
  rorxq(y0, e, 41);   // y0 = e >> 41; S1A
  rorxq(y1, e, 18);   // y1 = e >> 18; S1B
  addq(h, Address(rsp, (iteration * 8)));   // h = k + w + h; --
  orq(y3, c);         // y3 = a | c; MAJA
  movq(y2, f);        // y2 = f; CH

  xorq(y2, g);        // y2 = f^g; CH

  rorxq(T1, a, 34);   // T1 = a >> 34; S0B
  xorq(y0, y1);       // y0 = (e >> 41) ^ (e >> 18); S1

  rorxq(y1, e, 14);   // y1 = (e >> 14); S1

  andq(y2, e);        // y2 = (f^g) & e; CH
  addq(d, h);         // d = k + w + h + d; --

  andq(y3, b);        // y3 = (a | c)&b; MAJA
  xorq(y0, y1);       // y0 = (e >> 41) ^ (e >> 18) ^ (e >> 14); S1
  rorxq(y1, a, 39);   // y1 = a >> 39; S0A

  xorq(y1, T1);       // y1 = (a >> 39) ^ (a >> 34); S0
  rorxq(T1, a, 28);   // T1 = (a >> 28); S0
  xorq(y2, g);        // y2 = CH = ((f^g)&e) ^ g; CH

  xorq(y1, T1);       // y1 = (a >> 39) ^ (a >> 34) ^ (a >> 28); S0
  movq(T1, a);        // T1 = a; MAJB

  andq(T1, c);        // T1 = a&c; MAJB
  addq(y2, y0);       // y2 = S1 + CH; --

  orq(y3, T1);        // y3 = MAJ = ((a | c)&b) | (a&c); MAJ
  addq(h, y1);        // h = k + w + h + S0; --

  addq(d, y2);        // d = k + w + h + d + S1 + CH = d + t1; --
  addq(h, y2);        // h = k + w + h + S0 + S1 + CH = t1 + S0; --
  addq(h, y3);        // h = t1 + S0 + MAJ; --
}

void MacroAssembler::sha512_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
  XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
  Register buf, Register state, Register ofs, Register limit, Register rsp,
  bool multi_block, XMMRegister shuf_mask)
{

  Label loop0, loop1, loop2, done_hash,
        compute_block_size, compute_size,
        compute_block_size_end, compute_size_end;

  address K512_W = StubRoutines::x86::k512_W_addr();
  address pshuffle_byte_flip_mask_sha512 = StubRoutines::x86::pshuffle_byte_flip_mask_addr_sha512();
  address pshuffle_byte_flip_mask_addr = 0;

  const XMMRegister& XFER           = xmm0;    // YTMP0
  const XMMRegister& BYTE_FLIP_MASK = xmm9;    // ymm9
  const XMMRegister& YMM_MASK_LO    = xmm10;   // ymm10
#ifdef _WIN64
  const Register& INP         = rcx;   // 1st arg
  const Register& CTX         = rdx;   // 2nd arg
  const Register& NUM_BLKS    = r8;    // 3rd arg
  const Register& c           = rdi;
  const Register& d           = rsi;
  const Register& e           = r8;
  const Register& y3          = rcx;
  const Register& offset      = r8;
  const Register& input_limit = r9;
#else
  const Register& INP         = rdi;   // 1st arg
  const Register& CTX         = rsi;   // 2nd arg
  const Register& NUM_BLKS    = rdx;   // 3rd arg
  const Register& c           = rcx;
  const Register& d           = r8;
  const Register& e           = rdx;
  const Register& y3          = rdi;
  const Register& offset      = rdx;
  const Register& input_limit = rcx;
#endif

  const Register& TBL = rbp;

  const Register& a = rax;
  const Register& b = rbx;

  const Register& f = r9;
  const Register& g = r10;
  const Register& h = r11;

  // Local variables as defined in the assembly file.
  enum
  {
    _XFER_SIZE     = 4 * 8,   // resq 4 => reserve 4 quadwords. Hence 4 * 8
    _SRND_SIZE     = 8,       // resq 1
    _INP_SIZE      = 8,
    _INP_END_SIZE  = 8,
    _RSP_SAVE_SIZE = 8,       // defined as resq 1

#ifdef _WIN64
    _GPR_SAVE_SIZE = 8 * 8,   // defined as resq 8
#else
    _GPR_SAVE_SIZE = 6 * 8    // resq 6
#endif
  };

  enum
  {
    _XFER       = 0,
    _SRND       = _XFER    + _XFER_SIZE,      // 32
    _INP        = _SRND    + _SRND_SIZE,      // 40
    _INP_END    = _INP     + _INP_SIZE,       // 48
    _RSP        = _INP_END + _INP_END_SIZE,   // 56
    _GPR        = _RSP     + _RSP_SAVE_SIZE,  // 64
    _STACK_SIZE = _GPR     + _GPR_SAVE_SIZE   // 128 for windows and 112 for linux.
  };

  // Saving offset and limit, as they are needed for the block-size calculation in multi-block SHA-512.
#ifdef _WIN64
  push(r8);    // win64: this is ofs
  push(r9);    // win64: this is limit, we need them again at the very end.
#else
  push(rdx);   // linux: this is ofs, needed at the end for the multi-block calculation
  push(rcx);   // linux: this is the limit.
#endif

  // Allocate Stack Space
  movq(rax, rsp);
  subq(rsp, _STACK_SIZE);
  andq(rsp, -32);
  movq(Address(rsp, _RSP), rax);

  // Save GPRs
  movq(Address(rsp, _GPR), rbp);
  movq(Address(rsp, (_GPR + 8)), rbx);
  movq(Address(rsp, (_GPR + 16)), r12);
  movq(Address(rsp, (_GPR + 24)), r13);
  movq(Address(rsp, (_GPR + 32)), r14);
  movq(Address(rsp, (_GPR + 40)), r15);

#ifdef _WIN64
  movq(Address(rsp, (_GPR + 48)), rsi);
  movq(Address(rsp, (_GPR + 56)), rdi);
#endif

  vpblendd(xmm0, xmm0, xmm1, 0xF0, AVX_128bit);
  vpblendd(xmm0, xmm0, xmm1, 0xF0, AVX_256bit);

  if (multi_block) {
    xorq(rax, rax);
    bind(compute_block_size);
    cmpptr(offset, input_limit);   // assuming that offset is less than limit
    jccb(Assembler::aboveEqual, compute_block_size_end);
    addq(offset, 128);
    addq(rax, 128);
    jmpb(compute_block_size);

    bind(compute_block_size_end);
    movq(NUM_BLKS, rax);

    cmpq(NUM_BLKS, 0);
    jcc(Assembler::equal, done_hash);
  } else {
    xorq(NUM_BLKS, NUM_BLKS);   // if single block
    addq(NUM_BLKS, 128);
  }

  addq(NUM_BLKS, INP);   // pointer to the end of data
  movq(Address(rsp, _INP_END), NUM_BLKS);

  // load initial digest
  movq(a, Address(CTX, 8 * 0));
  movq(b, Address(CTX, 8 * 1));
  movq(c, Address(CTX, 8 * 2));
  movq(d, Address(CTX, 8 * 3));
  movq(e, Address(CTX, 8 * 4));
  movq(f, Address(CTX, 8 * 5));
  // load g - r10 after it is used as scratch
  movq(h, Address(CTX, 8 * 7));

  pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask_sha512;
  vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0));   // PSHUFFLE_BYTE_FLIP_MASK wrt rip
  vmovdqu(YMM_MASK_LO, ExternalAddress(pshuffle_byte_flip_mask_addr + 32));

  movq(g, Address(CTX, 8 * 6));

  bind(loop0);
  lea(TBL, ExternalAddress(K512_W));

  // byte swap the first 16 qwords
  vmovdqu(xmm4, Address(INP, 32 * 0));
  vpshufb(xmm4, xmm4, BYTE_FLIP_MASK, AVX_256bit);
  vmovdqu(xmm5, Address(INP, 32 * 1));
  vpshufb(xmm5, xmm5, BYTE_FLIP_MASK, AVX_256bit);
  vmovdqu(xmm6, Address(INP, 32 * 2));
  vpshufb(xmm6, xmm6, BYTE_FLIP_MASK, AVX_256bit);
  vmovdqu(xmm7, Address(INP, 32 * 3));
  vpshufb(xmm7, xmm7, BYTE_FLIP_MASK, AVX_256bit);

  movq(Address(rsp, _INP), INP);

  movslq(Address(rsp, _SRND), 4);
  align(16);

  // Schedule 64 input qwords, by calling sha512_AVX2_one_round_and_schedule
  bind(loop1);
  vpaddq(xmm0, xmm4, Address(TBL, 0 * 32), AVX_256bit);
  vmovdqu(Address(rsp, _XFER), xmm0);
  // four rounds and schedule
  sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, a, b, c, d, e, f, g, h, 0);
  sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, h, a, b, c, d, e, f, g, 1);
  sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, g, h, a, b, c, d, e, f, 2);
  sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, f, g, h, a, b, c, d, e, 3);

  vpaddq(xmm0, xmm5, Address(TBL, 1 * 32), AVX_256bit);
  vmovdqu(Address(rsp, _XFER), xmm0);
  // four rounds and schedule
  sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, e, f, g, h, a, b, c, d, 0);
  sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, d, e, f, g, h, a, b, c, 1);
  sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, c, d, e, f, g, h, a, b, 2);
  sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, b, c, d, e, f, g, h, a, 3);

  vpaddq(xmm0, xmm6, Address(TBL, 2 * 32), AVX_256bit);
  vmovdqu(Address(rsp, _XFER), xmm0);
  // four rounds and schedule
  sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, a, b, c, d, e, f, g, h, 0);
  sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, h, a, b, c, d, e, f, g, 1);
  sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, g, h, a, b, c, d, e, f, 2);
  sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, f, g, h, a, b, c, d, e, 3);

  vpaddq(xmm0, xmm7, Address(TBL, 3 * 32), AVX_256bit);
  vmovdqu(Address(rsp, _XFER), xmm0);
  addq(TBL, 4 * 32);
  // four rounds and schedule
  sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, e, f, g, h, a, b, c, d, 0);
  sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, d, e, f, g, h, a, b, c, 1);
  sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, c, d, e, f, g, h, a, b, 2);
  sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, b, c, d, e, f, g, h, a, 3);

  subq(Address(rsp, _SRND), 1);
  jcc(Assembler::notEqual, loop1);

  movslq(Address(rsp, _SRND), 2);

  bind(loop2);
  vpaddq(xmm0, xmm4, Address(TBL, 0 * 32), AVX_256bit);
  vmovdqu(Address(rsp, _XFER), xmm0);
  // four rounds and compute
  sha512_AVX2_one_round_compute(a, a, b, c, d, e, f, g, h, 0);
  sha512_AVX2_one_round_compute(h, h, a, b, c, d, e, f, g, 1);
  sha512_AVX2_one_round_compute(g, g, h, a, b, c, d, e, f, 2);
  sha512_AVX2_one_round_compute(f, f, g, h, a, b, c, d, e, 3);

  vpaddq(xmm0, xmm5, Address(TBL, 1 * 32), AVX_256bit);
  vmovdqu(Address(rsp, _XFER), xmm0);
  addq(TBL, 2 * 32);
  // four rounds and compute
  sha512_AVX2_one_round_compute(e, e, f, g, h, a, b, c, d, 0);
  sha512_AVX2_one_round_compute(d, d, e, f, g, h, a, b, c, 1);
  sha512_AVX2_one_round_compute(c, c, d, e, f, g, h, a, b, 2);
  sha512_AVX2_one_round_compute(b, b, c, d, e, f, g, h, a, 3);

  vmovdqu(xmm4, xmm6);
  vmovdqu(xmm5, xmm7);

  subq(Address(rsp, _SRND), 1);
  jcc(Assembler::notEqual, loop2);

  addmq(8 * 0, CTX, a);
  addmq(8 * 1, CTX, b);
  addmq(8 * 2, CTX, c);
  addmq(8 * 3, CTX, d);
  addmq(8 * 4, CTX, e);
  addmq(8 * 5, CTX, f);
  addmq(8 * 6, CTX, g);
  addmq(8 * 7, CTX, h);

  movq(INP, Address(rsp, _INP));
  addq(INP, 128);
  cmpq(INP, Address(rsp, _INP_END));
  jcc(Assembler::notEqual, loop0);

  bind(done_hash);

  // Restore GPRs
  movq(rbp, Address(rsp, (_GPR + 0)));
  movq(rbx, Address(rsp, (_GPR + 8)));
  movq(r12, Address(rsp, (_GPR + 16)));
  movq(r13, Address(rsp, (_GPR + 24)));
  movq(r14, Address(rsp, (_GPR + 32)));
  movq(r15, Address(rsp, (_GPR + 40)));

#ifdef _WIN64
  movq(rsi, Address(rsp, (_GPR + 48)));
  movq(rdi, Address(rsp, (_GPR + 56)));
#endif

  // Restore Stack Pointer
  movq(rsp, Address(rsp, _RSP));

#ifdef _WIN64
  pop(r9);
  pop(r8);
#else
  pop(rcx);
  pop(rdx);
#endif

  if (multi_block) {
#ifdef _WIN64
    const Register& limit_end = r9;
    const Register& ofs_end   = r8;
#else
    const Register& limit_end = rcx;
    const Register& ofs_end   = rdx;
#endif
    movq(rax, ofs_end);
    bind(compute_size);
    cmpptr(rax, limit_end);
    jccb(Assembler::aboveEqual, compute_size_end);
    addq(rax, 128);
    jmpb(compute_size);
    bind(compute_size_end);
  }
}

#endif // #ifdef _LP64