8214074: Ghash optimization using AVX instructions

Reviewed-by: kvn, ascarpino
Smita Kamath 2018-12-12 12:17:33 -08:00 committed by Anthony Scarpino
parent 0e86ce5715
commit 3623c99b27
8 changed files with 426 additions and 15 deletions

View File

@@ -4178,6 +4178,17 @@ void Assembler::psrldq(XMMRegister dst, int shift) {
  emit_int8(shift);
}
void Assembler::vpsrldq(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
  // Shift right each 128-bit lane of src by shift number of bytes; result in dst.
  assert(vector_len == AVX_128bit ? VM_Version::supports_avx() :
         vector_len == AVX_256bit ? VM_Version::supports_avx2() :
         vector_len == AVX_512bit ? VM_Version::supports_avx512bw() : 0, "");
  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
  // xmm3 in the reg field supplies the /3 opcode extension of PSRLDQ
  int encode = vex_prefix_and_encode(xmm3->encoding(), dst->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
  emit_int8(0x73);
  emit_int8((unsigned char)(0xC0 | encode));
  emit_int8(shift & 0xFF);
}
void Assembler::pslldq(XMMRegister dst, int shift) {
  // Shift left 128 bit value in dst XMMRegister by shift number of bytes.
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
@@ -4189,6 +4200,17 @@ void Assembler::pslldq(XMMRegister dst, int shift) {
  emit_int8(shift);
}
void Assembler::vpslldq(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
  // Shift left each 128-bit lane of src by shift number of bytes; result in dst.
  assert(vector_len == AVX_128bit ? VM_Version::supports_avx() :
         vector_len == AVX_256bit ? VM_Version::supports_avx2() :
         vector_len == AVX_512bit ? VM_Version::supports_avx512bw() : 0, "");
  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
  // xmm7 in the reg field supplies the /7 opcode extension of PSLLDQ
  int encode = vex_prefix_and_encode(xmm7->encoding(), dst->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
  emit_int8(0x73);
  emit_int8((unsigned char)(0xC0 | encode));
  emit_int8(shift & 0xFF);
}
void Assembler::ptest(XMMRegister dst, Address src) {
  assert(VM_Version::supports_sse4_1(), "");
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
@@ -4200,7 +4222,7 @@ void Assembler::ptest(XMMRegister dst, Address src) {
}
void Assembler::ptest(XMMRegister dst, XMMRegister src) {
-  assert(VM_Version::supports_sse4_1(), "");
+  assert(VM_Version::supports_sse4_1() || VM_Version::supports_avx(), "");
  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
  emit_int8(0x17);
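A semantic note on the two instructions added above: unlike vpsrld/vpslld, which shift bit-wise within 32-bit elements, vpsrldq/vpslldq shift each 128-bit lane by whole bytes. A minimal Java model of the right-shift case, with v[0] as the least-significant byte of the lane (hypothetical helper, illustration only):

static byte[] psrldq(byte[] v, int shift) {
    // one 128-bit lane; bytes move toward the low end, high bytes become zero
    byte[] r = new byte[16];
    for (int i = 0; i + shift < 16; i++) {
        r[i] = v[i + shift];
    }
    return r;
}

The GHASH code below uses these byte shifts to split and realign the 256-bit carry-less product before reduction.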

View File

@@ -2055,6 +2055,7 @@ private:
  void vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
  void vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
  void vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
  void vpslldq(XMMRegister dst, XMMRegister src, int shift, int vector_len);

  // Logical shift right packed integers
  void psrlw(XMMRegister dst, int shift);
@@ -2069,6 +2070,7 @@ private:
  void vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
  void vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
  void vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
  void vpsrldq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
  void evpsrlvw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void evpsllvw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);

View File

@@ -943,12 +943,17 @@ class MacroAssembler: public Assembler {
                 int iter);
  void addm(int disp, Register r1, Register r2);
  void gfmul(XMMRegister tmp0, XMMRegister t);
  void schoolbookAAD(int i, Register subkeyH, XMMRegister data, XMMRegister tmp0,
                     XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3);
  void generateHtbl_one_block(Register htbl);
  void generateHtbl_eight_blocks(Register htbl);

 public:
  void sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
                   XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
                   Register buf, Register state, Register ofs, Register limit, Register rsp,
                   bool multi_block, XMMRegister shuf_mask);
  void avx_ghash(Register state, Register htbl, Register data, Register blocks);
#endif

#ifdef _LP64
@@ -1498,6 +1503,15 @@ public:
    // 0x11 - multiply upper 64 bits [64:127]
    Assembler::vpclmulqdq(dst, nds, src, 0x11);
  }
  void vpclmullqhqdq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
    // 0x10 - multiply nds[0:63] and src[64:127]
    Assembler::vpclmulqdq(dst, nds, src, 0x10);
  }
  void vpclmulhqlqdq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
    // 0x01 - multiply nds[64:127] and src[0:63]
    Assembler::vpclmulqdq(dst, nds, src, 0x01);
  }
  void evpclmulldq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
    // 0x00 - multiply lower 64 bits [0:63]
    Assembler::evpclmulqdq(dst, nds, src, 0x00, vector_len);
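For orientation: the imm8 of vpclmulqdq selects one 64-bit half of each operand — bit 0 picks the quadword of nds, bit 4 the quadword of src — so the four helpers above cover the four half-by-half products. Each instruction performs a 64 x 64 -> 128-bit carry-less multiply, which the following bit-serial Java sketch models (hypothetical helper, for cross-checking only; returns {high, low}):

static long[] clmul64(long a, long b) {
    long hi = 0, lo = 0;
    for (int i = 0; i < 64; i++) {
        if (((b >>> i) & 1) != 0) {     // XOR in a copy of a shifted by i
            lo ^= a << i;
            if (i != 0) {
                hi ^= a >>> (64 - i);   // bits that spill into the high half
            }
        }
    }
    return new long[] { hi, lo };
}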

View File

@@ -0,0 +1,322 @@
/*
* Copyright (c) 2018, Intel Corporation.
*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "runtime/stubRoutines.hpp"
#include "macroAssembler_x86.hpp"
// Multiply 128 x 128 bits, using 4 pclmulqdq operations
void MacroAssembler::schoolbookAAD(int i, Register htbl, XMMRegister data,
XMMRegister tmp0, XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3) {
movdqu(xmm15, Address(htbl, i * 16));
vpclmulhqlqdq(tmp3, data, xmm15); // 0x01
vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);
vpclmulldq(tmp3, data, xmm15); // 0x00
vpxor(tmp0, tmp0, tmp3, Assembler::AVX_128bit);
vpclmulhdq(tmp3, data, xmm15); // 0x11
vpxor(tmp1, tmp1, tmp3, Assembler::AVX_128bit);
vpclmullqhqdq(tmp3, data, xmm15); // 0x10
vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);
}
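Written out, schoolbookAAD accumulates the three partial products of the schoolbook decomposition A * B = a1*b1*x^128 ^ (a1*b0 ^ a0*b1)*x^64 ^ a0*b0, with tmp1 collecting the high term, tmp0 the low term, and tmp2 the middle term. A scalar Java sketch of the same composition, reusing the clmul64 helper sketched earlier (all names hypothetical):

static long[] clmul128(long a1, long a0, long b1, long b0) {
    long[] lo = clmul64(a0, b0);                 // the 0x00 product
    long[] hi = clmul64(a1, b1);                 // the 0x11 product
    long[] m1 = clmul64(a1, b0);                 // the 0x01 cross product
    long[] m2 = clmul64(a0, b1);                 // the 0x10 cross product
    long midHi = m1[0] ^ m2[0], midLo = m1[1] ^ m2[1];
    // 256-bit result, least-significant 64 bits first; the middle term
    // straddles the two 128-bit halves, like the vpslldq/vpsrldq split below
    return new long[] { lo[1], lo[0] ^ midLo, hi[1] ^ midHi, hi[0] };
}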
// Multiplies two 128-bit numbers into a 256-bit product and reduces it;
// the reduced result is stored in state
void MacroAssembler::gfmul(XMMRegister tmp0, XMMRegister state) {
const XMMRegister tmp1 = xmm4;
const XMMRegister tmp2 = xmm5;
const XMMRegister tmp3 = xmm6;
const XMMRegister tmp4 = xmm7;
vpclmulldq(tmp1, state, tmp0); //0x00 (a0 * b0)
vpclmulhdq(tmp4, state, tmp0);//0x11 (a1 * b1)
vpclmullqhqdq(tmp2, state, tmp0);//0x10 (a1 * b0)
vpclmulhqlqdq(tmp3, state, tmp0); //0x01 (a0 * b1)
vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit); // (a0 * b1) + (a1 * b0)
vpslldq(tmp3, tmp2, 8, Assembler::AVX_128bit);
vpsrldq(tmp2, tmp2, 8, Assembler::AVX_128bit);
vpxor(tmp1, tmp1, tmp3, Assembler::AVX_128bit); // tmp1 and tmp4 hold the result
vpxor(tmp4, tmp4, tmp2, Assembler::AVX_128bit); // of carryless multiplication
// Reduction follows the shift-XOR technique described in Gueron-Kounavis, May 2010
// First phase of the reduction
vpslld(xmm8, tmp1, 31, Assembler::AVX_128bit);  // packed left shift, << 31
vpslld(xmm9, tmp1, 30, Assembler::AVX_128bit);  // packed left shift, << 30
vpslld(xmm10, tmp1, 25, Assembler::AVX_128bit); // packed left shift, << 25
// xor the shifted versions
vpxor(xmm8, xmm8, xmm9, Assembler::AVX_128bit);
vpxor(xmm8, xmm8, xmm10, Assembler::AVX_128bit);
vpslldq(xmm9, xmm8, 12, Assembler::AVX_128bit);
vpsrldq(xmm8, xmm8, 4, Assembler::AVX_128bit);
vpxor(tmp1, tmp1, xmm9, Assembler::AVX_128bit);// first phase of the reduction complete
//
// Second phase of the reduction
//
vpsrld(xmm9, tmp1, 1, Assembler::AVX_128bit);   // packed right shift, >> 1
vpsrld(xmm10, tmp1, 2, Assembler::AVX_128bit);  // packed right shift, >> 2
vpsrld(xmm11, tmp1, 7, Assembler::AVX_128bit);  // packed right shift, >> 7
vpxor(xmm9, xmm9, xmm10, Assembler::AVX_128bit);// xor the shifted versions
vpxor(xmm9, xmm9, xmm11, Assembler::AVX_128bit);
vpxor(xmm9, xmm9, xmm8, Assembler::AVX_128bit);
vpxor(tmp1, tmp1, xmm9, Assembler::AVX_128bit);
vpxor(state, tmp4, tmp1, Assembler::AVX_128bit);// the result is in state
ret(0);
}
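gfmul implements multiplication in GHASH's GF(2^128); the bit-serial reference from NIST SP 800-38D makes a convenient cross-check. A Java sketch (hypothetical helper; GHASH's bit-reflected element representation, where bit 0 is the most significant, is also why the stub shuffles inputs through the long-swap mask before multiplying):

static long[] gmul(long xHi, long xLo, long yHi, long yLo) {
    long zHi = 0, zLo = 0;
    long vHi = yHi, vLo = yLo;
    for (int i = 0; i < 128; i++) {
        // test bit i of X in the spec's MSB-first order
        long bit = (i < 64) ? (xHi >>> (63 - i)) & 1 : (xLo >>> (127 - i)) & 1;
        if (bit != 0) {
            zHi ^= vHi;
            zLo ^= vLo;
        }
        boolean lsbSet = (vLo & 1) != 0;
        vLo = (vLo >>> 1) | (vHi << 63);            // V = V >> 1, 128-bit
        vHi = vHi >>> 1;
        if (lsbSet) {
            vHi ^= 0xE100000000000000L;             // fold in the polynomial
        }
    }
    return new long[] { zHi, zLo };
}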
// This method takes the expanded subkey as input and generates the first power of the
// subkey H, stored at htbl + 1 * 16. This power is used in the reduction for one-block ghash.
void MacroAssembler::generateHtbl_one_block(Register htbl) {
const XMMRegister t = xmm13;
// load the original subkey hash
movdqu(t, Address(htbl, 0));
// shuffle using long swap mask
movdqu(xmm10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
vpshufb(t, t, xmm10, Assembler::AVX_128bit);
// Compute H' = GFMUL(H, 2)
vpsrld(xmm3, t, 7, Assembler::AVX_128bit);
movdqu(xmm4, ExternalAddress(StubRoutines::x86::ghash_shufflemask_addr()));
vpshufb(xmm3, xmm3, xmm4, Assembler::AVX_128bit);
movl(rax, 0xff00);
movdl(xmm4, rax);
vpshufb(xmm4, xmm4, xmm3, Assembler::AVX_128bit);
movdqu(xmm5, ExternalAddress(StubRoutines::x86::ghash_polynomial_addr()));
vpand(xmm5, xmm5, xmm4, Assembler::AVX_128bit);
vpsrld(xmm3, t, 31, Assembler::AVX_128bit);
vpslld(xmm4, t, 1, Assembler::AVX_128bit);
vpslldq(xmm3, xmm3, 4, Assembler::AVX_128bit);
vpxor(t, xmm4, xmm3, Assembler::AVX_128bit); // t holds H << 1, i.e. H * 2
// XOR the masked reduction polynomial (held in xmm5) into H * 2
vpxor(t, t, xmm5, Assembler::AVX_128bit);
movdqu(Address(htbl, 1 * 16), t); // H * 2
ret(0);
}
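In scalar terms the block above computes H * x: shift H left one bit and, if a bit fell off the top, fold in the polynomial constant that the stub generator emits (0xc2000000_00000000 : 0x00000000_00000001). A rough Java rendering under the assumption that {hi, lo} hold the already-shuffled H (hypothetical helper; the vector code reaches the same result with the lane shifts and mask trick above):

static long[] timesX(long hi, long lo) {
    boolean carry = hi < 0;                     // top bit is about to shift out
    long rHi = (hi << 1) | (lo >>> 63);
    long rLo = lo << 1;
    if (carry) {                                // conditional reduction
        rHi ^= 0xc200000000000000L;
        rLo ^= 0x0000000000000001L;
    }
    return new long[] { rHi, rLo };
}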
// This method takes the expanded subkey as input and generates the remaining powers of the
// subkey H (H^2 through H^8). These powers are used in the reduction for eight-block ghash.
void MacroAssembler::generateHtbl_eight_blocks(Register htbl) {
const XMMRegister t = xmm13;
const XMMRegister tmp0 = xmm1;
Label GFMUL;
movdqu(t, Address(htbl, 1 * 16));
movdqu(tmp0, t);
// tmp0 and t hold H. Now we compute powers of H by using GFMUL(H, H)
call(GFMUL, relocInfo::none);
movdqu(Address(htbl, 2 * 16), t); //H ^ 2 * 2
call(GFMUL, relocInfo::none);
movdqu(Address(htbl, 3 * 16), t); //H ^ 3 * 2
call(GFMUL, relocInfo::none);
movdqu(Address(htbl, 4 * 16), t); //H ^ 4 * 2
call(GFMUL, relocInfo::none);
movdqu(Address(htbl, 5 * 16), t); //H ^ 5 * 2
call(GFMUL, relocInfo::none);
movdqu(Address(htbl, 6 * 16), t); //H ^ 6 * 2
call(GFMUL, relocInfo::none);
movdqu(Address(htbl, 7 * 16), t); //H ^ 7 * 2
call(GFMUL, relocInfo::none);
movdqu(Address(htbl, 8 * 16), t); //H ^ 8 * 2
ret(0);
bind(GFMUL);
gfmul(tmp0, t);
}
// Multiblock and single block GHASH computation using Shift XOR reduction technique
void MacroAssembler::avx_ghash(Register input_state, Register htbl,
Register input_data, Register blocks) {
// temporary variables to hold input data and input state
const XMMRegister data = xmm1;
const XMMRegister state = xmm0;
// temporary variables to hold intermediate results
const XMMRegister tmp0 = xmm3;
const XMMRegister tmp1 = xmm4;
const XMMRegister tmp2 = xmm5;
const XMMRegister tmp3 = xmm6;
// temporary variables to hold byte and long swap masks
const XMMRegister bswap_mask = xmm2;
const XMMRegister lswap_mask = xmm14;
Label GENERATE_HTBL_1_BLK, GENERATE_HTBL_8_BLKS, BEGIN_PROCESS, GFMUL, BLOCK8_REDUCTION,
ONE_BLK_INIT, PROCESS_1_BLOCK, PROCESS_8_BLOCKS, SAVE_STATE, EXIT_GHASH;
testptr(blocks, blocks);
jcc(Assembler::zero, EXIT_GHASH);
// Check whether the hashtable entry at 1 * 16 has already been generated.
// For anything less than 8 blocks, only the first power of H is generated.
movdqu(tmp2, Address(htbl, 1 * 16));
ptest(tmp2, tmp2);
jcc(Assembler::notZero, BEGIN_PROCESS);
call(GENERATE_HTBL_1_BLK, relocInfo::none);
// Shuffle the input state
bind(BEGIN_PROCESS);
movdqu(lswap_mask, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
movdqu(state, Address(input_state, 0));
vpshufb(state, state, lswap_mask, Assembler::AVX_128bit);
cmpl(blocks, 8);
jcc(Assembler::below, ONE_BLK_INIT);
// If we have 8 blocks or more data, then generate remaining powers of H
movdqu(tmp2, Address(htbl, 8 * 16));
ptest(tmp2, tmp2);
jcc(Assembler::notZero, PROCESS_8_BLOCKS);
call(GENERATE_HTBL_8_BLKS, relocInfo::none);
// Process 8 blocks (16 bytes each) of data at a time:
// eight carry-less multiplies followed by a single reduction.
bind(PROCESS_8_BLOCKS);
subl(blocks, 8);
movdqu(bswap_mask, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
movdqu(data, Address(input_data, 16 * 7));
vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
// Load H * 2 from htbl + 1 * 16; the calculated powers of H start at that offset.
movdqu(xmm15, Address(htbl, 1 * 16));
//Perform carryless multiplication of (H*2, data block #7)
vpclmulhqlqdq(tmp2, data, xmm15);//a0 * b1
vpclmulldq(tmp0, data, xmm15);//a0 * b0
vpclmulhdq(tmp1, data, xmm15);//a1 * b1
vpclmullqhqdq(tmp3, data, xmm15);//a1* b0
vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);// (a0 * b1) + (a1 * b0)
movdqu(data, Address(input_data, 16 * 6));
vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
// Perform carryless multiplication of (H^2 * 2, data block #6)
schoolbookAAD(2, htbl, data, tmp0, tmp1, tmp2, tmp3);
movdqu(data, Address(input_data, 16 * 5));
vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
// Perform carryless multiplication of (H^3 * 2, data block #5)
schoolbookAAD(3, htbl, data, tmp0, tmp1, tmp2, tmp3);
movdqu(data, Address(input_data, 16 * 4));
vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
// Perform carryless multiplication of (H^4 * 2, data block #4)
schoolbookAAD(4, htbl, data, tmp0, tmp1, tmp2, tmp3);
movdqu(data, Address(input_data, 16 * 3));
vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
// Perform carryless multiplication of (H^5 * 2, data block #3)
schoolbookAAD(5, htbl, data, tmp0, tmp1, tmp2, tmp3);
movdqu(data, Address(input_data, 16 * 2));
vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
// Perform carryless multiplication of (H^6 * 2, data block #2)
schoolbookAAD(6, htbl, data, tmp0, tmp1, tmp2, tmp3);
movdqu(data, Address(input_data, 16 * 1));
vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
// Perform carryless multiplication of (H^7 * 2, data block #1)
schoolbookAAD(7, htbl, data, tmp0, tmp1, tmp2, tmp3);
movdqu(data, Address(input_data, 16 * 0));
// XOR data block #0 with the input state before performing the carry-less multiplication
vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
vpxor(data, data, state, Assembler::AVX_128bit);
// Perform carryless multiplication of (H^8 * 2, data block #0)
schoolbookAAD(8, htbl, data, tmp0, tmp1, tmp2, tmp3);
vpslldq(tmp3, tmp2, 8, Assembler::AVX_128bit);
vpsrldq(tmp2, tmp2, 8, Assembler::AVX_128bit);
vpxor(tmp0, tmp0, tmp3, Assembler::AVX_128bit); // tmp0 and tmp1 now hold the aggregated
vpxor(tmp1, tmp1, tmp2, Assembler::AVX_128bit); // results of the multiplications
// The 256-bit accumulated product sits in tmp1:tmp0, with the upper 128 bits
// in tmp1 and the lower 128 bits in tmp0. Reduction follows the shift-XOR
// technique described in Gueron-Kounavis, May 2010.
bind(BLOCK8_REDUCTION);
// First Phase of the reduction
vpslld(xmm8, tmp0, 31, Assembler::AVX_128bit);  // packed left shift, << 31
vpslld(xmm9, tmp0, 30, Assembler::AVX_128bit);  // packed left shift, << 30
vpslld(xmm10, tmp0, 25, Assembler::AVX_128bit); // packed left shift, << 25
// xor the shifted versions
vpxor(xmm8, xmm8, xmm10, Assembler::AVX_128bit);
vpxor(xmm8, xmm8, xmm9, Assembler::AVX_128bit);
vpslldq(xmm9, xmm8, 12, Assembler::AVX_128bit);
vpsrldq(xmm8, xmm8, 4, Assembler::AVX_128bit);
vpxor(tmp0, tmp0, xmm9, Assembler::AVX_128bit); // first phase of reduction is complete
// second phase of the reduction
vpsrld(xmm9, tmp0, 1, Assembler::AVX_128bit);   // packed right shift, >> 1
vpsrld(xmm10, tmp0, 2, Assembler::AVX_128bit);  // packed right shift, >> 2
vpsrld(tmp2, tmp0, 7, Assembler::AVX_128bit);   // packed right shift, >> 7
// xor the shifted versions
vpxor(xmm9, xmm9, xmm10, Assembler::AVX_128bit);
vpxor(xmm9, xmm9, tmp2, Assembler::AVX_128bit);
vpxor(xmm9, xmm9, xmm8, Assembler::AVX_128bit);
vpxor(tmp0, xmm9, tmp0, Assembler::AVX_128bit);
// Final result is in state
vpxor(state, tmp0, tmp1, Assembler::AVX_128bit);
lea(input_data, Address(input_data, 16 * 8));
cmpl(blocks, 8);
jcc(Assembler::below, ONE_BLK_INIT);
jmp(PROCESS_8_BLOCKS);
// The single-block path uses only H * 2, the first power of H
bind(ONE_BLK_INIT);
movdqu(tmp0, Address(htbl, 1 * 16));
movdqu(bswap_mask, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
//Do one (128 bit x 128 bit) carry-less multiplication at a time followed by a reduction.
bind(PROCESS_1_BLOCK);
cmpl(blocks, 0);
jcc(Assembler::equal, SAVE_STATE);
subl(blocks, 1);
movdqu(data, Address(input_data, 0));
vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
vpxor(state, state, data, Assembler::AVX_128bit);
// gfmul(H*2, state)
call(GFMUL, relocInfo::none);
addptr(input_data, 16);
jmp(PROCESS_1_BLOCK);
bind(SAVE_STATE);
vpshufb(state, state, lswap_mask, Assembler::AVX_128bit);
movdqu(Address(input_state, 0), state);
jmp(EXIT_GHASH);
bind(GFMUL);
gfmul(tmp0, state);
bind(GENERATE_HTBL_1_BLK);
generateHtbl_one_block(htbl);
bind(GENERATE_HTBL_8_BLKS);
generateHtbl_eight_blocks(htbl);
bind(EXIT_GHASH);
// zero out xmm registers used for Htbl storage
vpxor(xmm0, xmm0, xmm0, Assembler::AVX_128bit);
vpxor(xmm1, xmm1, xmm1, Assembler::AVX_128bit);
vpxor(xmm3, xmm3, xmm3, Assembler::AVX_128bit);
vpxor(xmm15, xmm15, xmm15, Assembler::AVX_128bit);
}
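The eight-block loop is the classic aggregated-reduction trick: fold the current state into block 0, multiply block j by H^(8-j), XOR the eight products, and pay for one reduction instead of eight. A scalar Java sketch built on the gmul reference above (all names hypothetical; XORing per-call-reduced products is equivalent because reduction is linear over XOR):

static long[] ghash8(long[] state, long[][] blocks, long[][] hPower) {
    // blocks[j] = {hi, lo} of data block j; hPower[k] = H^(k+1)
    long[] acc = { 0, 0 };
    for (int j = 0; j < 8; j++) {
        long hi = blocks[j][0], lo = blocks[j][1];
        if (j == 0) {                           // state folds into block 0
            hi ^= state[0];
            lo ^= state[1];
        }
        long[] p = gmul(hi, lo, hPower[7 - j][0], hPower[7 - j][1]);
        acc[0] ^= p[0];
        acc[1] ^= p[1];
    }
    return acc;                                 // the new state
}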

View File

@@ -4388,6 +4388,45 @@ address generate_cipherBlockChaining_decryptVectorAESCrypt() {
  return start;
}
// Polynomial x^128+x^127+x^126+x^121+1
address ghash_polynomial_addr() {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "_ghash_poly_addr");
address start = __ pc();
__ emit_data64(0x0000000000000001, relocInfo::none);
__ emit_data64(0xc200000000000000, relocInfo::none);
return start;
}
address ghash_shufflemask_addr() {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "_ghash_shuffmask_addr");
address start = __ pc();
__ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none);
__ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none);
return start;
}
// Ghash single and multi block operations using AVX instructions
address generate_avx_ghash_processBlocks() {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
address start = __ pc();
// arguments
const Register state = c_rarg0;
const Register htbl = c_rarg1;
const Register data = c_rarg2;
const Register blocks = c_rarg3;
__ enter();
// Save state before entering routine
__ avx_ghash(state, htbl, data, blocks);
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
return start;
}
// byte swap x86 long
address generate_ghash_long_swap_mask() {
  __ align(CodeEntryAlignment);
@@ -5886,9 +5925,15 @@ address generate_cipherBlockChaining_decryptVectorAESCrypt() {
  // Generate GHASH intrinsics code
  if (UseGHASHIntrinsics) {
    StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
    StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
-   StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
+   if (VM_Version::supports_avx()) {
+     StubRoutines::x86::_ghash_shuffmask_addr = ghash_shufflemask_addr();
+     StubRoutines::x86::_ghash_poly_addr = ghash_polynomial_addr();
+     StubRoutines::_ghash_processBlocks = generate_avx_ghash_processBlocks();
+   } else {
+     StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
+   }
  }

  if (UseBASE64Intrinsics) {

View File

@@ -38,6 +38,8 @@ address StubRoutines::x86::_key_shuffle_mask_addr = NULL;
address StubRoutines::x86::_counter_shuffle_mask_addr = NULL;
address StubRoutines::x86::_ghash_long_swap_mask_addr = NULL;
address StubRoutines::x86::_ghash_byte_swap_mask_addr = NULL;
address StubRoutines::x86::_ghash_poly_addr = NULL;
address StubRoutines::x86::_ghash_shuffmask_addr = NULL;
address StubRoutines::x86::_upper_word_mask_addr = NULL;
address StubRoutines::x86::_shuffle_byte_flip_mask_addr = NULL;
address StubRoutines::x86::_k256_adr = NULL;

View File

@@ -128,6 +128,8 @@ class x86 {
  // swap mask for ghash
  static address _ghash_long_swap_mask_addr;
  static address _ghash_byte_swap_mask_addr;
  static address _ghash_poly_addr;
  static address _ghash_shuffmask_addr;

  // upper word mask for sha1
  static address _upper_word_mask_addr;
@@ -205,6 +207,8 @@ class x86 {
  static address crc_by128_masks_addr() { return (address)_crc_by128_masks; }
  static address ghash_long_swap_mask_addr() { return _ghash_long_swap_mask_addr; }
  static address ghash_byte_swap_mask_addr() { return _ghash_byte_swap_mask_addr; }
  static address ghash_shufflemask_addr() { return _ghash_shuffmask_addr; }
  static address ghash_polynomial_addr() { return _ghash_poly_addr; }
  static address upper_word_mask_addr() { return _upper_word_mask_addr; }
  static address shuffle_byte_flip_mask_addr() { return _shuffle_byte_flip_mask_addr; }
  static address k256_addr() { return _k256_adr; }

View File

@@ -124,10 +124,10 @@ final class GHASH {
    }

-   /* subkeyH and state are stored in long[] for GHASH intrinsic use */
+   /* subkeyHtbl and state are stored in long[] for GHASH intrinsic use */

-   // hash subkey H; should not change after the object has been constructed
-   private final long[] subkeyH;
+   // hashtable subkeyHtbl; holds 2*9 powers of subkeyH computed using carry-less multiplication
+   private long[] subkeyHtbl;

    // buffer for storing hash
    private final long[] state;
@@ -149,9 +149,9 @@
            throw new ProviderException("Internal error");
        }
        state = new long[2];
-       this.subkeyH = new long[2];
-       this.subkeyH[0] = getLong(subkeyH, 0);
-       this.subkeyH[1] = getLong(subkeyH, 8);
+       subkeyHtbl = new long[2*9];
+       subkeyHtbl[0] = getLong(subkeyH, 0);
+       subkeyHtbl[1] = getLong(subkeyH, 8);
    }

    /**
@@ -194,8 +194,8 @@
        if (inLen == 0) {
            return;
        }
-       ghashRangeCheck(in, inOfs, inLen, state, subkeyH);
-       processBlocks(in, inOfs, inLen/AES_BLOCK_SIZE, state, subkeyH);
+       ghashRangeCheck(in, inOfs, inLen, state, subkeyHtbl);
+       processBlocks(in, inOfs, inLen/AES_BLOCK_SIZE, state, subkeyHtbl);
    }

    private static void ghashRangeCheck(byte[] in, int inOfs, int inLen, long[] st, long[] subH) {
@@ -219,8 +219,8 @@
            throw new RuntimeException("internal state has invalid length: " +
                                       st.length);
        }
-       if (subH.length != 2) {
-           throw new RuntimeException("internal subkeyH has invalid length: " +
+       if (subH.length != 18) {
+           throw new RuntimeException("internal subkeyHtbl has invalid length: " +
                                       subH.length);
        }
    }
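A note on the new length: 2 * 9 longs matches the stub's table layout. Slot 0 (two longs) still holds the original subkey H, filled in here, while slots 1 through 8 are populated lazily by the intrinsic with H * 2 through H^8 * 2 — which is why avx_ghash ptests the entries at offsets 1 * 16 and 8 * 16 for zero before generating them. The pure-Java fallback continues to read only the first two longs.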