8214074: Ghash optimization using AVX instructions
Reviewed-by: kvn, ascarpino
This commit is contained in:
parent
0e86ce5715
commit
3623c99b27
@ -4178,6 +4178,17 @@ void Assembler::psrldq(XMMRegister dst, int shift) {
|
||||
emit_int8(shift);
|
||||
}
|
||||
|
||||
// VEX/EVEX-encoded byte shift right (VPSRLDQ): dst receives src shifted right
// by `shift` bytes. Only the low 8 bits of `shift` are emitted as the immediate.
void Assembler::vpsrldq(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
  // Each supported vector width requires its own CPU feature; any other
  // vector_len value makes the condition 0 and trips the assert.
  assert(vector_len == AVX_128bit ? VM_Version::supports_avx() :
         vector_len == AVX_256bit ? VM_Version::supports_avx2() :
         vector_len == AVX_512bit ? VM_Version::supports_avx512bw() : 0, "");
  InstructionAttr attributes(vector_len,
                             /* vex_w */ false,
                             /* legacy_mode */ _legacy_mode_bw,
                             /* no_mask_reg */ true,
                             /* uses_vl */ true);
  // xmm3 is not a data operand: its encoding (3) is the /3 opcode extension
  // that selects PSRLDQ inside the 0F 73 opcode group.
  const int modrm_bits = vex_prefix_and_encode(xmm3->encoding(), dst->encoding(), src->encoding(),
                                               VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
  emit_int8(0x73);
  emit_int8((unsigned char)(0xC0 | modrm_bits));
  emit_int8(shift & 0xFF);
}
|
||||
|
||||
void Assembler::pslldq(XMMRegister dst, int shift) {
|
||||
// Shift left 128 bit value in dst XMMRegister by shift number of bytes.
|
||||
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
|
||||
@ -4189,6 +4200,17 @@ void Assembler::pslldq(XMMRegister dst, int shift) {
|
||||
emit_int8(shift);
|
||||
}
|
||||
|
||||
// VEX/EVEX-encoded byte shift left (VPSLLDQ): dst receives src shifted left
// by `shift` bytes. Only the low 8 bits of `shift` are emitted as the immediate.
void Assembler::vpslldq(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
  // Each supported vector width requires its own CPU feature; any other
  // vector_len value makes the condition 0 and trips the assert.
  assert(vector_len == AVX_128bit ? VM_Version::supports_avx() :
         vector_len == AVX_256bit ? VM_Version::supports_avx2() :
         vector_len == AVX_512bit ? VM_Version::supports_avx512bw() : 0, "");
  InstructionAttr attributes(vector_len,
                             /* vex_w */ false,
                             /* legacy_mode */ _legacy_mode_bw,
                             /* no_mask_reg */ true,
                             /* uses_vl */ true);
  // xmm7 is not a data operand: its encoding (7) is the /7 opcode extension
  // that selects PSLLDQ inside the 0F 73 opcode group.
  const int modrm_bits = vex_prefix_and_encode(xmm7->encoding(), dst->encoding(), src->encoding(),
                                               VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
  emit_int8(0x73);
  emit_int8((unsigned char)(0xC0 | modrm_bits));
  emit_int8(shift & 0xFF);
}
|
||||
|
||||
void Assembler::ptest(XMMRegister dst, Address src) {
|
||||
assert(VM_Version::supports_sse4_1(), "");
|
||||
assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
|
||||
@ -4200,7 +4222,7 @@ void Assembler::ptest(XMMRegister dst, Address src) {
|
||||
}
|
||||
|
||||
void Assembler::ptest(XMMRegister dst, XMMRegister src) {
|
||||
assert(VM_Version::supports_sse4_1(), "");
|
||||
assert(VM_Version::supports_sse4_1() || VM_Version::supports_avx(), "");
|
||||
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
|
||||
int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
|
||||
emit_int8(0x17);
|
||||
|
@ -2055,6 +2055,7 @@ private:
|
||||
void vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
|
||||
void vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
|
||||
void vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
|
||||
// Byte shift left of src by an immediate count, result in dst (VPSLLDQ); unlike the
// overloads above, the shift count is an int immediate rather than an XMM register.
void vpslldq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
|
||||
|
||||
// Logical shift right packed integers
|
||||
void psrlw(XMMRegister dst, int shift);
|
||||
@ -2069,6 +2070,7 @@ private:
|
||||
void vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
|
||||
void vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
|
||||
void vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
|
||||
// Byte shift right of src by an immediate count, result in dst (VPSRLDQ); unlike the
// overloads above, the shift count is an int immediate rather than an XMM register.
void vpsrldq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
|
||||
void evpsrlvw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||
void evpsllvw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||
|
||||
|
@ -943,12 +943,17 @@ class MacroAssembler: public Assembler {
|
||||
int iter);
|
||||
|
||||
void addm(int disp, Register r1, Register r2);
|
||||
|
||||
// GHASH helper: carry-less multiply of tmp0 and t followed by reduction; the result is
// left in t. The emitted code ends with a ret, so it is meant to be reached via call.
void gfmul(XMMRegister tmp0, XMMRegister t);
|
||||
// GHASH helper: 128 x 128 bit carry-less multiply of `data` with the 16-byte table entry
// number i, using 4 pclmulqdq operations; the partial products are xor-accumulated
// into tmp0, tmp1 and tmp2, with tmp3 used as scratch.
void schoolbookAAD(int i, Register subkeyH, XMMRegister data, XMMRegister tmp0,
|
||||
XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3);
|
||||
// GHASH helper: derives the table entry at htbl + 16 (H * 2) from the subkey stored at htbl + 0.
void generateHtbl_one_block(Register htbl);
|
||||
// GHASH helper: fills table entries 2..8 of htbl (16 bytes each) with the remaining powers
// of the subkey, starting from the entry at htbl + 16.
void generateHtbl_eight_blocks(Register htbl);
|
||||
public:
|
||||
void sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
|
||||
XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
|
||||
Register buf, Register state, Register ofs, Register limit, Register rsp,
|
||||
bool multi_block, XMMRegister shuf_mask);
|
||||
// Emits the AVX GHASH routine (single-block and 8-blocks-at-a-time paths). The registers
// hold the hash state address, the subkey table address, the input data address and the
// number of 16-byte blocks to process.
void avx_ghash(Register state, Register htbl, Register data, Register blocks);
|
||||
#endif
|
||||
|
||||
#ifdef _LP64
|
||||
@ -1498,6 +1503,15 @@ public:
|
||||
// 0x11 - multiply upper 64 bits [64:127]
|
||||
Assembler::vpclmulqdq(dst, nds, src, 0x11);
|
||||
}
|
||||
// Carry-less multiply of nds[0:63] by src[64:127] (selector immediate 0x10).
void vpclmullqhqdq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  const int selector = 0x10;
  Assembler::vpclmulqdq(dst, nds, src, selector);
}
|
||||
// Carry-less multiply of nds[64:127] by src[0:63] (selector immediate 0x01).
void vpclmulhqlqdq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  const int selector = 0x01;
  Assembler::vpclmulqdq(dst, nds, src, selector);
}
|
||||
|
||||
void evpclmulldq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
|
||||
// 0x00 - multiply lower 64 bits [0:63]
|
||||
Assembler::evpclmulqdq(dst, nds, src, 0x00, vector_len);
|
||||
|
322
src/hotspot/cpu/x86/macroAssembler_x86_aes.cpp
Normal file
322
src/hotspot/cpu/x86/macroAssembler_x86_aes.cpp
Normal file
@ -0,0 +1,322 @@
|
||||
/*
|
||||
* Copyright (c) 2018, Intel Corporation.
|
||||
*
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "precompiled.hpp"
|
||||
#include "asm/assembler.hpp"
|
||||
#include "asm/assembler.inline.hpp"
|
||||
#include "runtime/stubRoutines.hpp"
|
||||
#include "macroAssembler_x86.hpp"
|
||||
|
||||
// Multiply 128 x 128 bits, using 4 pclmulqdq operations
|
||||
void MacroAssembler::schoolbookAAD(int i, Register htbl, XMMRegister data,
|
||||
XMMRegister tmp0, XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3) {
|
||||
movdqu(xmm15, Address(htbl, i * 16));
|
||||
vpclmulhqlqdq(tmp3, data, xmm15); // 0x01
|
||||
vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);
|
||||
vpclmulldq(tmp3, data, xmm15); // 0x00
|
||||
vpxor(tmp0, tmp0, tmp3, Assembler::AVX_128bit);
|
||||
vpclmulhdq(tmp3, data, xmm15); // 0x11
|
||||
vpxor(tmp1, tmp1, tmp3, Assembler::AVX_128bit);
|
||||
vpclmullqhqdq(tmp3, data, xmm15); // 0x10
|
||||
vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);
|
||||
}
|
||||
|
||||
// Multiply two 128 bit numbers resulting in a 256 bit value
|
||||
// Result of the multiplication followed by reduction stored in state
|
||||
void MacroAssembler::gfmul(XMMRegister tmp0, XMMRegister state) {
|
||||
const XMMRegister tmp1 = xmm4;
|
||||
const XMMRegister tmp2 = xmm5;
|
||||
const XMMRegister tmp3 = xmm6;
|
||||
const XMMRegister tmp4 = xmm7;
|
||||
|
||||
vpclmulldq(tmp1, state, tmp0); //0x00 (a0 * b0)
|
||||
vpclmulhdq(tmp4, state, tmp0);//0x11 (a1 * b1)
|
||||
vpclmullqhqdq(tmp2, state, tmp0);//0x10 (a1 * b0)
|
||||
vpclmulhqlqdq(tmp3, state, tmp0); //0x01 (a0 * b1)
|
||||
|
||||
vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit); // (a0 * b1) + (a1 * b0)
|
||||
|
||||
vpslldq(tmp3, tmp2, 8, Assembler::AVX_128bit);
|
||||
vpsrldq(tmp2, tmp2, 8, Assembler::AVX_128bit);
|
||||
vpxor(tmp1, tmp1, tmp3, Assembler::AVX_128bit); // tmp1 and tmp4 hold the result
|
||||
vpxor(tmp4, tmp4, tmp2, Assembler::AVX_128bit); // of carryless multiplication
|
||||
// Follows the reduction technique mentioned in
|
||||
// Shift-XOR reduction described in Gueron-Kounavis May 2010
|
||||
// First phase of reduction
|
||||
//
|
||||
vpslld(xmm8, tmp1, 31, Assembler::AVX_128bit); // packed right shift shifting << 31
|
||||
vpslld(xmm9, tmp1, 30, Assembler::AVX_128bit); // packed right shift shifting << 30
|
||||
vpslld(xmm10, tmp1, 25, Assembler::AVX_128bit);// packed right shift shifting << 25
|
||||
// xor the shifted versions
|
||||
vpxor(xmm8, xmm8, xmm9, Assembler::AVX_128bit);
|
||||
vpxor(xmm8, xmm8, xmm10, Assembler::AVX_128bit);
|
||||
vpslldq(xmm9, xmm8, 12, Assembler::AVX_128bit);
|
||||
vpsrldq(xmm8, xmm8, 4, Assembler::AVX_128bit);
|
||||
vpxor(tmp1, tmp1, xmm9, Assembler::AVX_128bit);// first phase of the reduction complete
|
||||
//
|
||||
// Second phase of the reduction
|
||||
//
|
||||
vpsrld(xmm9, tmp1, 1, Assembler::AVX_128bit);// packed left shifting >> 1
|
||||
vpsrld(xmm10, tmp1, 2, Assembler::AVX_128bit);// packed left shifting >> 2
|
||||
vpsrld(xmm11, tmp1, 7, Assembler::AVX_128bit);// packed left shifting >> 7
|
||||
vpxor(xmm9, xmm9, xmm10, Assembler::AVX_128bit);// xor the shifted versions
|
||||
vpxor(xmm9, xmm9, xmm11, Assembler::AVX_128bit);
|
||||
vpxor(xmm9, xmm9, xmm8, Assembler::AVX_128bit);
|
||||
vpxor(tmp1, tmp1, xmm9, Assembler::AVX_128bit);
|
||||
vpxor(state, tmp4, tmp1, Assembler::AVX_128bit);// the result is in state
|
||||
ret(0);
|
||||
}
|
||||
|
||||
// This method takes the subkey after expansion as input and generates 1 * 16 power of subkey H.
|
||||
// The power of H is used in reduction process for one block ghash
|
||||
void MacroAssembler::generateHtbl_one_block(Register htbl) {
|
||||
const XMMRegister t = xmm13;
|
||||
|
||||
// load the original subkey hash
|
||||
movdqu(t, Address(htbl, 0));
|
||||
// shuffle using long swap mask
|
||||
movdqu(xmm10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
|
||||
vpshufb(t, t, xmm10, Assembler::AVX_128bit);
|
||||
|
||||
// Compute H' = GFMUL(H, 2)
|
||||
vpsrld(xmm3, t, 7, Assembler::AVX_128bit);
|
||||
movdqu(xmm4, ExternalAddress(StubRoutines::x86::ghash_shufflemask_addr()));
|
||||
vpshufb(xmm3, xmm3, xmm4, Assembler::AVX_128bit);
|
||||
movl(rax, 0xff00);
|
||||
movdl(xmm4, rax);
|
||||
vpshufb(xmm4, xmm4, xmm3, Assembler::AVX_128bit);
|
||||
movdqu(xmm5, ExternalAddress(StubRoutines::x86::ghash_polynomial_addr()));
|
||||
vpand(xmm5, xmm5, xmm4, Assembler::AVX_128bit);
|
||||
vpsrld(xmm3, t, 31, Assembler::AVX_128bit);
|
||||
vpslld(xmm4, t, 1, Assembler::AVX_128bit);
|
||||
vpslldq(xmm3, xmm3, 4, Assembler::AVX_128bit);
|
||||
vpxor(t, xmm4, xmm3, Assembler::AVX_128bit);// t holds p(x) <<1 or H * 2
|
||||
|
||||
//Adding p(x)<<1 to xmm5 which holds the reduction polynomial
|
||||
vpxor(t, t, xmm5, Assembler::AVX_128bit);
|
||||
movdqu(Address(htbl, 1 * 16), t); // H * 2
|
||||
|
||||
ret(0);
|
||||
}
|
||||
|
||||
// This method takes the subkey after expansion as input and generates the remaining powers of subkey H.
|
||||
// The power of H is used in reduction process for eight block ghash
|
||||
void MacroAssembler::generateHtbl_eight_blocks(Register htbl) {
|
||||
const XMMRegister t = xmm13;
|
||||
const XMMRegister tmp0 = xmm1;
|
||||
Label GFMUL;
|
||||
|
||||
movdqu(t, Address(htbl, 1 * 16));
|
||||
movdqu(tmp0, t);
|
||||
|
||||
// tmp0 and t hold H. Now we compute powers of H by using GFMUL(H, H)
|
||||
call(GFMUL, relocInfo::none);
|
||||
movdqu(Address(htbl, 2 * 16), t); //H ^ 2 * 2
|
||||
call(GFMUL, relocInfo::none);
|
||||
movdqu(Address(htbl, 3 * 16), t); //H ^ 3 * 2
|
||||
call(GFMUL, relocInfo::none);
|
||||
movdqu(Address(htbl, 4 * 16), t); //H ^ 4 * 2
|
||||
call(GFMUL, relocInfo::none);
|
||||
movdqu(Address(htbl, 5 * 16), t); //H ^ 5 * 2
|
||||
call(GFMUL, relocInfo::none);
|
||||
movdqu(Address(htbl, 6 * 16), t); //H ^ 6 * 2
|
||||
call(GFMUL, relocInfo::none);
|
||||
movdqu(Address(htbl, 7 * 16), t); //H ^ 7 * 2
|
||||
call(GFMUL, relocInfo::none);
|
||||
movdqu(Address(htbl, 8 * 16), t); //H ^ 8 * 2
|
||||
ret(0);
|
||||
|
||||
bind(GFMUL);
|
||||
gfmul(tmp0, t);
|
||||
}
|
||||
|
||||
// Multiblock and single block GHASH computation using Shift XOR reduction technique
|
||||
void MacroAssembler::avx_ghash(Register input_state, Register htbl,
|
||||
Register input_data, Register blocks) {
|
||||
|
||||
// temporary variables to hold input data and input state
|
||||
const XMMRegister data = xmm1;
|
||||
const XMMRegister state = xmm0;
|
||||
// temporary variables to hold intermediate results
|
||||
const XMMRegister tmp0 = xmm3;
|
||||
const XMMRegister tmp1 = xmm4;
|
||||
const XMMRegister tmp2 = xmm5;
|
||||
const XMMRegister tmp3 = xmm6;
|
||||
// temporary variables to hold byte and long swap masks
|
||||
const XMMRegister bswap_mask = xmm2;
|
||||
const XMMRegister lswap_mask = xmm14;
|
||||
|
||||
Label GENERATE_HTBL_1_BLK, GENERATE_HTBL_8_BLKS, BEGIN_PROCESS, GFMUL, BLOCK8_REDUCTION,
|
||||
ONE_BLK_INIT, PROCESS_1_BLOCK, PROCESS_8_BLOCKS, SAVE_STATE, EXIT_GHASH;
|
||||
|
||||
testptr(blocks, blocks);
|
||||
jcc(Assembler::zero, EXIT_GHASH);
|
||||
|
||||
// Check if Hashtable (1*16) has been already generated
|
||||
// For anything less than 8 blocks, we generate only the first power of H.
|
||||
movdqu(tmp2, Address(htbl, 1 * 16));
|
||||
ptest(tmp2, tmp2);
|
||||
jcc(Assembler::notZero, BEGIN_PROCESS);
|
||||
call(GENERATE_HTBL_1_BLK, relocInfo::none);
|
||||
|
||||
// Shuffle the input state
|
||||
bind(BEGIN_PROCESS);
|
||||
movdqu(lswap_mask, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
|
||||
movdqu(state, Address(input_state, 0));
|
||||
vpshufb(state, state, lswap_mask, Assembler::AVX_128bit);
|
||||
|
||||
cmpl(blocks, 8);
|
||||
jcc(Assembler::below, ONE_BLK_INIT);
|
||||
// If we have 8 blocks or more data, then generate remaining powers of H
|
||||
movdqu(tmp2, Address(htbl, 8 * 16));
|
||||
ptest(tmp2, tmp2);
|
||||
jcc(Assembler::notZero, PROCESS_8_BLOCKS);
|
||||
call(GENERATE_HTBL_8_BLKS, relocInfo::none);
|
||||
|
||||
//Do 8 multiplies followed by a reduction processing 8 blocks of data at a time
|
||||
//Each block = 16 bytes.
|
||||
bind(PROCESS_8_BLOCKS);
|
||||
subl(blocks, 8);
|
||||
movdqu(bswap_mask, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
|
||||
movdqu(data, Address(input_data, 16 * 7));
|
||||
vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
|
||||
//Loading 1*16 as calculated powers of H required starts at that location.
|
||||
movdqu(xmm15, Address(htbl, 1 * 16));
|
||||
//Perform carryless multiplication of (H*2, data block #7)
|
||||
vpclmulhqlqdq(tmp2, data, xmm15);//a0 * b1
|
||||
vpclmulldq(tmp0, data, xmm15);//a0 * b0
|
||||
vpclmulhdq(tmp1, data, xmm15);//a1 * b1
|
||||
vpclmullqhqdq(tmp3, data, xmm15);//a1* b0
|
||||
vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);// (a0 * b1) + (a1 * b0)
|
||||
|
||||
movdqu(data, Address(input_data, 16 * 6));
|
||||
vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
|
||||
// Perform carryless multiplication of (H^2 * 2, data block #6)
|
||||
schoolbookAAD(2, htbl, data, tmp0, tmp1, tmp2, tmp3);
|
||||
|
||||
movdqu(data, Address(input_data, 16 * 5));
|
||||
vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
|
||||
// Perform carryless multiplication of (H^3 * 2, data block #5)
|
||||
schoolbookAAD(3, htbl, data, tmp0, tmp1, tmp2, tmp3);
|
||||
movdqu(data, Address(input_data, 16 * 4));
|
||||
vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
|
||||
// Perform carryless multiplication of (H^4 * 2, data block #4)
|
||||
schoolbookAAD(4, htbl, data, tmp0, tmp1, tmp2, tmp3);
|
||||
movdqu(data, Address(input_data, 16 * 3));
|
||||
vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
|
||||
// Perform carryless multiplication of (H^5 * 2, data block #3)
|
||||
schoolbookAAD(5, htbl, data, tmp0, tmp1, tmp2, tmp3);
|
||||
movdqu(data, Address(input_data, 16 * 2));
|
||||
vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
|
||||
// Perform carryless multiplication of (H^6 * 2, data block #2)
|
||||
schoolbookAAD(6, htbl, data, tmp0, tmp1, tmp2, tmp3);
|
||||
movdqu(data, Address(input_data, 16 * 1));
|
||||
vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
|
||||
// Perform carryless multiplication of (H^7 * 2, data block #1)
|
||||
schoolbookAAD(7, htbl, data, tmp0, tmp1, tmp2, tmp3);
|
||||
movdqu(data, Address(input_data, 16 * 0));
|
||||
// xor data block#0 with input state before perfoming carry-less multiplication
|
||||
vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
|
||||
vpxor(data, data, state, Assembler::AVX_128bit);
|
||||
// Perform carryless multiplication of (H^8 * 2, data block #0)
|
||||
schoolbookAAD(8, htbl, data, tmp0, tmp1, tmp2, tmp3);
|
||||
vpslldq(tmp3, tmp2, 8, Assembler::AVX_128bit);
|
||||
vpsrldq(tmp2, tmp2, 8, Assembler::AVX_128bit);
|
||||
vpxor(tmp0, tmp0, tmp3, Assembler::AVX_128bit);// tmp0, tmp1 contains aggregated results of
|
||||
vpxor(tmp1, tmp1, tmp2, Assembler::AVX_128bit);// the multiplication operation
|
||||
|
||||
// we have the 2 128-bit partially accumulated multiplication results in tmp0:tmp1
|
||||
// with higher 128-bit in tmp1 and lower 128-bit in corresponding tmp0
|
||||
// Follows the reduction technique mentioned in
|
||||
// Shift-XOR reduction described in Gueron-Kounavis May 2010
|
||||
bind(BLOCK8_REDUCTION);
|
||||
// First Phase of the reduction
|
||||
vpslld(xmm8, tmp0, 31, Assembler::AVX_128bit); // packed right shifting << 31
|
||||
vpslld(xmm9, tmp0, 30, Assembler::AVX_128bit); // packed right shifting << 30
|
||||
vpslld(xmm10, tmp0, 25, Assembler::AVX_128bit); // packed right shifting << 25
|
||||
// xor the shifted versions
|
||||
vpxor(xmm8, xmm8, xmm10, Assembler::AVX_128bit);
|
||||
vpxor(xmm8, xmm8, xmm9, Assembler::AVX_128bit);
|
||||
|
||||
vpslldq(xmm9, xmm8, 12, Assembler::AVX_128bit);
|
||||
vpsrldq(xmm8, xmm8, 4, Assembler::AVX_128bit);
|
||||
|
||||
vpxor(tmp0, tmp0, xmm9, Assembler::AVX_128bit); // first phase of reduction is complete
|
||||
// second phase of the reduction
|
||||
vpsrld(xmm9, tmp0, 1, Assembler::AVX_128bit); // packed left shifting >> 1
|
||||
vpsrld(xmm10, tmp0, 2, Assembler::AVX_128bit); // packed left shifting >> 2
|
||||
vpsrld(tmp2, tmp0, 7, Assembler::AVX_128bit); // packed left shifting >> 7
|
||||
// xor the shifted versions
|
||||
vpxor(xmm9, xmm9, xmm10, Assembler::AVX_128bit);
|
||||
vpxor(xmm9, xmm9, tmp2, Assembler::AVX_128bit);
|
||||
vpxor(xmm9, xmm9, xmm8, Assembler::AVX_128bit);
|
||||
vpxor(tmp0, xmm9, tmp0, Assembler::AVX_128bit);
|
||||
// Final result is in state
|
||||
vpxor(state, tmp0, tmp1, Assembler::AVX_128bit);
|
||||
|
||||
lea(input_data, Address(input_data, 16 * 8));
|
||||
cmpl(blocks, 8);
|
||||
jcc(Assembler::below, ONE_BLK_INIT);
|
||||
jmp(PROCESS_8_BLOCKS);
|
||||
|
||||
// Since this is one block operation we will only use H * 2 i.e. the first power of H
|
||||
bind(ONE_BLK_INIT);
|
||||
movdqu(tmp0, Address(htbl, 1 * 16));
|
||||
movdqu(bswap_mask, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
|
||||
|
||||
//Do one (128 bit x 128 bit) carry-less multiplication at a time followed by a reduction.
|
||||
bind(PROCESS_1_BLOCK);
|
||||
cmpl(blocks, 0);
|
||||
jcc(Assembler::equal, SAVE_STATE);
|
||||
subl(blocks, 1);
|
||||
movdqu(data, Address(input_data, 0));
|
||||
vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
|
||||
vpxor(state, state, data, Assembler::AVX_128bit);
|
||||
// gfmul(H*2, state)
|
||||
call(GFMUL, relocInfo::none);
|
||||
addptr(input_data, 16);
|
||||
jmp(PROCESS_1_BLOCK);
|
||||
|
||||
bind(SAVE_STATE);
|
||||
vpshufb(state, state, lswap_mask, Assembler::AVX_128bit);
|
||||
movdqu(Address(input_state, 0), state);
|
||||
jmp(EXIT_GHASH);
|
||||
|
||||
bind(GFMUL);
|
||||
gfmul(tmp0, state);
|
||||
|
||||
bind(GENERATE_HTBL_1_BLK);
|
||||
generateHtbl_one_block(htbl);
|
||||
|
||||
bind(GENERATE_HTBL_8_BLKS);
|
||||
generateHtbl_eight_blocks(htbl);
|
||||
|
||||
bind(EXIT_GHASH);
|
||||
// zero out xmm registers used for Htbl storage
|
||||
vpxor(xmm0, xmm0, xmm0, Assembler::AVX_128bit);
|
||||
vpxor(xmm1, xmm1, xmm1, Assembler::AVX_128bit);
|
||||
vpxor(xmm3, xmm3, xmm3, Assembler::AVX_128bit);
|
||||
vpxor(xmm15, xmm15, xmm15, Assembler::AVX_128bit);
|
||||
}
|
@ -4388,6 +4388,45 @@ address generate_cipherBlockChaining_decryptVectorAESCrypt() {
|
||||
return start;
|
||||
}
|
||||
|
||||
// Polynomial x^128+x^127+x^126+x^121+1
|
||||
address ghash_polynomial_addr() {
|
||||
__ align(CodeEntryAlignment);
|
||||
StubCodeMark mark(this, "StubRoutines", "_ghash_poly_addr");
|
||||
address start = __ pc();
|
||||
__ emit_data64(0x0000000000000001, relocInfo::none);
|
||||
__ emit_data64(0xc200000000000000, relocInfo::none);
|
||||
return start;
|
||||
}
|
||||
|
||||
// Emits the 16-byte GHASH shuffle mask (every byte is 0x0f) and returns its address.
address ghash_shufflemask_addr() {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "_ghash_shuffmask_addr");
  address table = __ pc();

  // Two identical 64-bit words.
  __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none);
  __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none);

  return table;
}
|
||||
|
||||
// Ghash single and multi block operations using AVX instructions
|
||||
address generate_avx_ghash_processBlocks() {
|
||||
__ align(CodeEntryAlignment);
|
||||
|
||||
StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
|
||||
address start = __ pc();
|
||||
|
||||
// arguments
|
||||
const Register state = c_rarg0;
|
||||
const Register htbl = c_rarg1;
|
||||
const Register data = c_rarg2;
|
||||
const Register blocks = c_rarg3;
|
||||
__ enter();
|
||||
// Save state before entering routine
|
||||
__ avx_ghash(state, htbl, data, blocks);
|
||||
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
||||
__ ret(0);
|
||||
return start;
|
||||
}
|
||||
|
||||
// byte swap x86 long
|
||||
address generate_ghash_long_swap_mask() {
|
||||
__ align(CodeEntryAlignment);
|
||||
@ -5886,9 +5925,15 @@ address generate_cipherBlockChaining_decryptVectorAESCrypt() {
|
||||
|
||||
// Generate GHASH intrinsics code
|
||||
if (UseGHASHIntrinsics) {
  // The long and byte swap masks are generated once, before whichever GHASH stub is
  // selected below (the AVX stub loads both of them through StubRoutines::x86).
  StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
  StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
  if (VM_Version::supports_avx()) {
    // The AVX stub additionally needs the shuffle mask and the polynomial constant;
    // they must be published before the stub that references them is generated.
    StubRoutines::x86::_ghash_shuffmask_addr = ghash_shufflemask_addr();
    StubRoutines::x86::_ghash_poly_addr = ghash_polynomial_addr();
    StubRoutines::_ghash_processBlocks = generate_avx_ghash_processBlocks();
  } else {
    StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
  }
}
|
||||
|
||||
if (UseBASE64Intrinsics) {
|
||||
|
@ -38,6 +38,8 @@ address StubRoutines::x86::_key_shuffle_mask_addr = NULL;
|
||||
address StubRoutines::x86::_counter_shuffle_mask_addr = NULL;
|
||||
address StubRoutines::x86::_ghash_long_swap_mask_addr = NULL;
|
||||
address StubRoutines::x86::_ghash_byte_swap_mask_addr = NULL;
|
||||
// Address of the GHASH polynomial constant; NULL until the stub generator assigns it.
address StubRoutines::x86::_ghash_poly_addr = NULL;
|
||||
// Address of the GHASH shuffle mask; NULL until the stub generator assigns it.
address StubRoutines::x86::_ghash_shuffmask_addr = NULL;
|
||||
address StubRoutines::x86::_upper_word_mask_addr = NULL;
|
||||
address StubRoutines::x86::_shuffle_byte_flip_mask_addr = NULL;
|
||||
address StubRoutines::x86::_k256_adr = NULL;
|
||||
|
@ -128,6 +128,8 @@ class x86 {
|
||||
// swap mask for ghash
|
||||
static address _ghash_long_swap_mask_addr;
|
||||
static address _ghash_byte_swap_mask_addr;
|
||||
// polynomial constant for ghash (AVX version)
static address _ghash_poly_addr;
|
||||
// shuffle mask for ghash (AVX version)
static address _ghash_shuffmask_addr;
|
||||
|
||||
// upper word mask for sha1
|
||||
static address _upper_word_mask_addr;
|
||||
@ -205,6 +207,8 @@ class x86 {
|
||||
static address crc_by128_masks_addr() { return (address)_crc_by128_masks; }
|
||||
static address ghash_long_swap_mask_addr() { return _ghash_long_swap_mask_addr; }
|
||||
static address ghash_byte_swap_mask_addr() { return _ghash_byte_swap_mask_addr; }
|
||||
// Address of the GHASH shuffle mask table.
static address ghash_shufflemask_addr() {
  return _ghash_shuffmask_addr;
}
|
||||
// Address of the GHASH polynomial constant.
static address ghash_polynomial_addr() {
  return _ghash_poly_addr;
}
|
||||
static address upper_word_mask_addr() { return _upper_word_mask_addr; }
|
||||
static address shuffle_byte_flip_mask_addr() { return _shuffle_byte_flip_mask_addr; }
|
||||
static address k256_addr() { return _k256_adr; }
|
||||
|
@ -124,10 +124,10 @@ final class GHASH {
|
||||
|
||||
}
|
||||
|
||||
/* subkeyH and state are stored in long[] for GHASH intrinsic use */
|
||||
/* subkeyHtbl and state are stored in long[] for GHASH intrinsic use */
|
||||
|
||||
// hash subkey H; should not change after the object has been constructed
|
||||
private final long[] subkeyH;
|
||||
// hashtable subkeyHtbl; holds 9 entries of 2 longs (128 bits) each: subkeyH itself, followed by entries derived from powers of subkeyH using carry-less multiplication
|
||||
private long[] subkeyHtbl;
|
||||
|
||||
// buffer for storing hash
|
||||
private final long[] state;
|
||||
@ -149,9 +149,9 @@ final class GHASH {
|
||||
throw new ProviderException("Internal error");
|
||||
}
|
||||
state = new long[2];
|
||||
this.subkeyH = new long[2];
|
||||
this.subkeyH[0] = getLong(subkeyH, 0);
|
||||
this.subkeyH[1] = getLong(subkeyH, 8);
|
||||
subkeyHtbl = new long[2*9];
|
||||
subkeyHtbl[0] = getLong(subkeyH, 0);
|
||||
subkeyHtbl[1] = getLong(subkeyH, 8);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -194,8 +194,8 @@ final class GHASH {
|
||||
if (inLen == 0) {
|
||||
return;
|
||||
}
|
||||
ghashRangeCheck(in, inOfs, inLen, state, subkeyH);
|
||||
processBlocks(in, inOfs, inLen/AES_BLOCK_SIZE, state, subkeyH);
|
||||
ghashRangeCheck(in, inOfs, inLen, state, subkeyHtbl);
|
||||
processBlocks(in, inOfs, inLen/AES_BLOCK_SIZE, state, subkeyHtbl);
|
||||
}
|
||||
|
||||
private static void ghashRangeCheck(byte[] in, int inOfs, int inLen, long[] st, long[] subH) {
|
||||
@ -219,8 +219,8 @@ final class GHASH {
|
||||
throw new RuntimeException("internal state has invalid length: " +
|
||||
st.length);
|
||||
}
|
||||
if (subH.length != 2) {
|
||||
throw new RuntimeException("internal subkeyH has invalid length: " +
|
||||
if (subH.length != 18) {
|
||||
throw new RuntimeException("internal subkeyHtbl has invalid length: " +
|
||||
subH.length);
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user