Merge
This commit is contained in:
commit
2691776621
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2015, Red Hat Inc. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
@ -190,6 +190,11 @@ void VM_Version::get_processor_features() {
|
||||
}
|
||||
}
|
||||
|
||||
if (UseGHASHIntrinsics) {
|
||||
warning("GHASH intrinsics are not available on this CPU");
|
||||
FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
|
||||
}
|
||||
|
||||
if (FLAG_IS_DEFAULT(UseCRC32Intrinsics)) {
|
||||
UseCRC32Intrinsics = true;
|
||||
}
|
||||
|
@ -176,6 +176,11 @@ void VM_Version::initialize() {
|
||||
FLAG_SET_DEFAULT(UseAESIntrinsics, false);
|
||||
}
|
||||
|
||||
if (UseGHASHIntrinsics) {
|
||||
warning("GHASH intrinsics are not available on this CPU");
|
||||
FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
|
||||
}
|
||||
|
||||
if (UseSHA) {
|
||||
warning("SHA instructions are not available on this CPU");
|
||||
FLAG_SET_DEFAULT(UseSHA, false);
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -129,6 +129,7 @@ class Assembler : public AbstractAssembler {
|
||||
flog3_op3 = 0x36,
|
||||
edge_op3 = 0x36,
|
||||
fsrc_op3 = 0x36,
|
||||
xmulx_op3 = 0x36,
|
||||
impdep2_op3 = 0x37,
|
||||
stpartialf_op3 = 0x37,
|
||||
jmpl_op3 = 0x38,
|
||||
@ -220,6 +221,8 @@ class Assembler : public AbstractAssembler {
|
||||
mdtox_opf = 0x110,
|
||||
mstouw_opf = 0x111,
|
||||
mstosw_opf = 0x113,
|
||||
xmulx_opf = 0x115,
|
||||
xmulxhi_opf = 0x116,
|
||||
mxtod_opf = 0x118,
|
||||
mwtos_opf = 0x119,
|
||||
|
||||
@ -1212,6 +1215,9 @@ public:
|
||||
void movwtos( Register s, FloatRegister d ) { vis3_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::S) | op3(mftoi_op3) | opf(mwtos_opf) | rs2(s)); }
|
||||
void movxtod( Register s, FloatRegister d ) { vis3_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(mftoi_op3) | opf(mxtod_opf) | rs2(s)); }
|
||||
|
||||
// Emits the XMULX instruction (VIS3 only): an arith_op word carrying
// xmulx_op3 / xmulx_opf with source registers s1, s2 and destination d.
// NOTE(review): presumably the low 64 bits of the XOR (carry-less) multiply;
// confirm against the SPARC architecture manual.
void xmulx(Register s1, Register s2, Register d) {
  vis3_only();
  const int insn = op(arith_op) | rd(d) | op3(xmulx_op3) | rs1(s1) | opf(xmulx_opf) | rs2(s2);
  emit_int32(insn);
}
|
||||
// Emits the XMULXHI instruction (VIS3 only): same layout as xmulx but with
// xmulxhi_opf in the opf field.
// NOTE(review): presumably the high 64 bits of the XOR (carry-less) multiply;
// confirm against the SPARC architecture manual.
void xmulxhi(Register s1, Register s2, Register d) {
  vis3_only();
  const int insn = op(arith_op) | rd(d) | op3(xmulx_op3) | rs1(s1) | opf(xmulxhi_opf) | rs2(s2);
  emit_int32(insn);
}
|
||||
|
||||
// Crypto SHA instructions
|
||||
|
||||
void sha1() { sha1_only(); emit_int32( op(arith_op) | op3(sha_op3) | opf(sha1_opf)); }
|
||||
|
@ -4786,6 +4786,130 @@ class StubGenerator: public StubCodeGenerator {
|
||||
return start;
|
||||
}
|
||||
|
||||
/* Single and multi-block ghash operations */
// Generates the "ghash_processBlocks" stub and returns its entry address.
//
// Stub arguments, read from the in-registers after save_frame():
//   I0 - state    (two 64-bit words, read at offsets 0 and 8, written back on exit)
//   I1 - subkeyH  (two 64-bit words, read at offsets 0 and 8)
//   I2 - data     (consumed 16 bytes per loop iteration, any byte alignment)
//   I3 - len      (loop counter, decremented once per 16-byte block)
//
// NOTE(review): len is decremented before it is tested, so len == 0 would not
// leave the loop after zero iterations; presumably callers pass len >= 1 -- confirm.
|
||||
address generate_ghash_processBlocks() {
|
||||
__ align(CodeEntryAlignment);
|
||||
Label L_ghash_loop, L_aligned, L_main;
|
||||
StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
|
||||
address start = __ pc();
|
||||
|
||||
Register state = I0;
|
||||
Register subkeyH = I1;
|
||||
Register data = I2;
|
||||
Register len = I3;
|
||||
|
||||
__ save_frame(0);
|
||||
|
||||
// O0:O1 hold the running state for the whole loop.
__ ldx(state, 0, O0);
|
||||
__ ldx(state, 8, O1);
|
||||
|
||||
// Loop label for multiblock operations
|
||||
__ BIND(L_ghash_loop);
|
||||
|
||||
// Check if 'data' is unaligned
// G1 = data & 7 (byte misalignment); zero means 8-byte aligned.
|
||||
__ andcc(data, 7, G1);
|
||||
__ br(Assembler::zero, false, Assembler::pt, L_aligned);
|
||||
__ delayed()->nop();
|
||||
|
||||
Register left_shift = L1;
|
||||
Register right_shift = L2;
|
||||
Register data_ptr = L3;
|
||||
|
||||
// Get left and right shift values in bits
// left_shift = misalignment << LogBitsPerByte, right_shift = 64 - left_shift.
|
||||
__ sll(G1, LogBitsPerByte, left_shift);
|
||||
__ mov(64, right_shift);
|
||||
__ sub(right_shift, left_shift, right_shift);
|
||||
|
||||
// Align to read 'data'
// data_ptr = data rounded down to the previous 8-byte boundary.
|
||||
__ sub(data, G1, data_ptr);
|
||||
|
||||
// Load first 8 bytes of 'data'
// Assembled into O4 from the aligned words at data_ptr+0 and data_ptr+8.
|
||||
__ ldx(data_ptr, 0, O4);
|
||||
__ sllx(O4, left_shift, O4);
|
||||
__ ldx(data_ptr, 8, O5);
|
||||
__ srlx(O5, right_shift, G4);
|
||||
__ bset(G4, O4);
|
||||
|
||||
// Load second 8 bytes of 'data'
// Assembled into O5 from the aligned words at data_ptr+8 and data_ptr+16;
// the final bset sits in the delay slot of the branch to L_main.
|
||||
__ sllx(O5, left_shift, O5);
|
||||
__ ldx(data_ptr, 16, G4);
|
||||
__ srlx(G4, right_shift, G4);
|
||||
__ ba(L_main);
|
||||
__ delayed()->bset(G4, O5);
|
||||
|
||||
// If 'data' is aligned, load normally
|
||||
__ BIND(L_aligned);
|
||||
__ ldx(data, 0, O4);
|
||||
__ ldx(data, 8, O5);
|
||||
|
||||
// Both paths arrive here with the current 16-byte block in O4:O5.
__ BIND(L_main);
|
||||
__ ldx(subkeyH, 0, O2);
|
||||
__ ldx(subkeyH, 8, O3);
|
||||
|
||||
// state ^= data block
__ xor3(O0, O4, O0);
|
||||
__ xor3(O1, O5, O1);
|
||||
|
||||
// All four 64x64 partial products of (O0:O1) and (O2:O3), low and high halves.
// NOTE(review): xmulx/xmulxhi are presumably the VIS3 XOR-multiply
// (carry-less) low/high results -- confirm against the SPARC manual.
__ xmulxhi(O0, O3, G3);
|
||||
__ xmulx(O0, O2, O5);
|
||||
__ xmulxhi(O1, O2, G4);
|
||||
__ xmulxhi(O1, O3, G5);
|
||||
__ xmulx(O0, O3, G1);
|
||||
__ xmulx(O1, O3, G2);
|
||||
__ xmulx(O1, O2, O3);
|
||||
__ xmulxhi(O0, O2, O4);
|
||||
|
||||
// O0 = 0xE1 << 56.
// NOTE(review): presumably the GHASH reduction constant -- confirm.
__ mov(0xE1, O0);
|
||||
__ sllx(O0, 56, O0);
|
||||
|
||||
// Combine the partial products, then fold them using the constant in O0.
__ xor3(O5, G3, O5);
|
||||
__ xor3(O5, G4, O5);
|
||||
__ xor3(G5, G1, G1);
|
||||
__ xor3(G1, O3, G1);
|
||||
__ srlx(G2, 63, O1);
|
||||
__ srlx(G1, 63, G3);
|
||||
__ sllx(G2, 63, O3);
|
||||
__ sllx(G2, 58, O2);
|
||||
__ xor3(O3, O2, O2);
|
||||
|
||||
__ sllx(G1, 1, G1);
|
||||
__ or3(G1, O1, G1);
|
||||
|
||||
__ xor3(G1, O2, G1);
|
||||
|
||||
__ sllx(G2, 1, G2);
|
||||
|
||||
__ xmulxhi(G1, O0, O1);
|
||||
__ xmulx(G1, O0, O2);
|
||||
__ xmulxhi(G2, O0, O3);
|
||||
__ xmulx(G2, O0, G1);
|
||||
|
||||
__ xor3(O4, O1, O4);
|
||||
__ xor3(O5, O2, O5);
|
||||
__ xor3(O5, O3, O5);
|
||||
|
||||
__ sllx(O4, 1, O2);
|
||||
__ srlx(O5, 63, O3);
|
||||
|
||||
// New state word 0.
__ or3(O2, O3, O0);
|
||||
|
||||
// New state word 1.
__ sllx(O5, 1, O1);
|
||||
__ srlx(G1, 63, O2);
|
||||
__ or3(O1, O2, O1);
|
||||
__ xor3(O1, G3, O1);
|
||||
|
||||
// Next block: data advances by 16 in the delay slot of the loop branch.
__ deccc(len);
|
||||
__ br(Assembler::notZero, true, Assembler::pt, L_ghash_loop);
|
||||
__ delayed()->add(data, 16, data);
|
||||
|
||||
// Store the final state back through I0 (the 'state' argument).
__ stx(O0, I0, 0);
|
||||
__ stx(O1, I0, 8);
|
||||
|
||||
__ ret();
|
||||
__ delayed()->restore();
|
||||
|
||||
return start;
|
||||
}
|
||||
|
||||
void generate_initial() {
|
||||
// Generates all stubs and initializes the entry points
|
||||
|
||||
@ -4859,6 +4983,10 @@ class StubGenerator: public StubCodeGenerator {
|
||||
StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
|
||||
StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
|
||||
}
|
||||
// generate GHASH intrinsics code
|
||||
if (UseGHASHIntrinsics) {
|
||||
StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
|
||||
}
|
||||
|
||||
// generate SHA1/SHA256/SHA512 intrinsics code
|
||||
if (UseSHA1Intrinsics) {
|
||||
|
@ -300,6 +300,17 @@ void VM_Version::initialize() {
|
||||
}
|
||||
}
|
||||
|
||||
// GHASH/GCM intrinsics
|
||||
// Enable the GHASH intrinsic by default when VIS3 is present and UseVIS > 2.
// Otherwise force it off, warning only if the user asked for it explicitly
// on the command line.
if (has_vis3() && (UseVIS > 2)) {
  if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) {
    UseGHASHIntrinsics = true;
  }
} else if (UseGHASHIntrinsics) {
  if (!FLAG_IS_DEFAULT(UseGHASHIntrinsics)) {
    warning("GHASH intrinsics require VIS3 instruction support. Intrinsics will be disabled");
  }
  FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
}
|
||||
|
||||
// SHA1, SHA256, and SHA512 instructions were added to SPARC T-series at different times
|
||||
if (has_sha1() || has_sha256() || has_sha512()) {
|
||||
if (UseVIS > 0) { // SHA intrinsics use VIS1 instructions
|
||||
|
@ -3095,8 +3095,16 @@ void Assembler::pshuflw(XMMRegister dst, Address src, int mode) {
|
||||
// Shift Right by bytes Logical DoubleQuadword Immediate.
//   dst   - XMM register shifted in place
//   shift - byte count, emitted as the trailing immediate byte
void Assembler::psrldq(XMMRegister dst, int shift) {
  // Shift 128 bit value in xmm register by number of bytes.
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  // xmm3 is passed in the first register-operand position.
  // NOTE(review): presumably this supplies the /3 opcode extension for
  // opcode 0x73 -- confirm against the instruction reference.
  // Declared exactly once: a second declaration of 'encode' in this scope
  // would be a redefinition.
  int encode = simd_prefix_and_encode(xmm3, dst, dst, VEX_SIMD_66, true, VEX_OPCODE_0F, false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
  emit_int8(0x73);
  emit_int8((unsigned char)(0xC0 | encode));
  emit_int8(shift);
}
|
||||
|
||||
void Assembler::pslldq(XMMRegister dst, int shift) {
|
||||
// Shift left 128 bit value in xmm register by number of bytes.
|
||||
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
|
||||
int encode = simd_prefix_and_encode(xmm7, dst, dst, VEX_SIMD_66, true, VEX_OPCODE_0F, false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
|
||||
emit_int8(0x73);
|
||||
emit_int8((unsigned char)(0xC0 | encode));
|
||||
emit_int8(shift);
|
||||
|
@ -1666,6 +1666,8 @@ private:
|
||||
|
||||
// Shift Right by bytes Logical DoubleQuadword Immediate
|
||||
void psrldq(XMMRegister dst, int shift);
|
||||
// Shift Left by bytes Logical DoubleQuadword Immediate
|
||||
void pslldq(XMMRegister dst, int shift);
|
||||
|
||||
// Logical Compare 128bit
|
||||
void ptest(XMMRegister dst, XMMRegister src);
|
||||
|
@ -58,4 +58,6 @@ void Compile::pd_compiler2_init() {
|
||||
OptoReg::invalidate(i);
|
||||
}
|
||||
}
|
||||
|
||||
SuperWordLoopUnrollAnalysis = true;
|
||||
}
|
||||
|
@ -2727,6 +2727,167 @@ class StubGenerator: public StubCodeGenerator {
|
||||
return start;
|
||||
}
|
||||
|
||||
// byte swap x86 long
|
||||
address generate_ghash_long_swap_mask() {
|
||||
__ align(CodeEntryAlignment);
|
||||
StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
|
||||
address start = __ pc();
|
||||
__ emit_data(0x0b0a0908, relocInfo::none, 0);
|
||||
__ emit_data(0x0f0e0d0c, relocInfo::none, 0);
|
||||
__ emit_data(0x03020100, relocInfo::none, 0);
|
||||
__ emit_data(0x07060504, relocInfo::none, 0);
|
||||
|
||||
return start;
|
||||
}
|
||||
|
||||
// byte swap x86 byte array
|
||||
address generate_ghash_byte_swap_mask() {
|
||||
__ align(CodeEntryAlignment);
|
||||
StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
|
||||
address start = __ pc();
|
||||
__ emit_data(0x0c0d0e0f, relocInfo::none, 0);
|
||||
__ emit_data(0x08090a0b, relocInfo::none, 0);
|
||||
__ emit_data(0x04050607, relocInfo::none, 0);
|
||||
__ emit_data(0x00010203, relocInfo::none, 0);
|
||||
return start;
|
||||
}
|
||||
|
||||
/* Single and multi-block ghash operations */
|
||||
address generate_ghash_processBlocks() {
|
||||
assert(UseGHASHIntrinsics, "need GHASH intrinsics and CLMUL support");
|
||||
__ align(CodeEntryAlignment);
|
||||
Label L_ghash_loop, L_exit;
|
||||
StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
|
||||
address start = __ pc();
|
||||
|
||||
const Register state = rdi;
|
||||
const Register subkeyH = rsi;
|
||||
const Register data = rdx;
|
||||
const Register blocks = rcx;
|
||||
|
||||
const Address state_param(rbp, 8+0);
|
||||
const Address subkeyH_param(rbp, 8+4);
|
||||
const Address data_param(rbp, 8+8);
|
||||
const Address blocks_param(rbp, 8+12);
|
||||
|
||||
const XMMRegister xmm_temp0 = xmm0;
|
||||
const XMMRegister xmm_temp1 = xmm1;
|
||||
const XMMRegister xmm_temp2 = xmm2;
|
||||
const XMMRegister xmm_temp3 = xmm3;
|
||||
const XMMRegister xmm_temp4 = xmm4;
|
||||
const XMMRegister xmm_temp5 = xmm5;
|
||||
const XMMRegister xmm_temp6 = xmm6;
|
||||
const XMMRegister xmm_temp7 = xmm7;
|
||||
|
||||
__ enter();
|
||||
|
||||
__ movptr(state, state_param);
|
||||
__ movptr(subkeyH, subkeyH_param);
|
||||
__ movptr(data, data_param);
|
||||
__ movptr(blocks, blocks_param);
|
||||
|
||||
__ movdqu(xmm_temp0, Address(state, 0));
|
||||
__ pshufb(xmm_temp0, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
|
||||
|
||||
__ movdqu(xmm_temp1, Address(subkeyH, 0));
|
||||
__ pshufb(xmm_temp1, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
|
||||
|
||||
__ BIND(L_ghash_loop);
|
||||
__ movdqu(xmm_temp2, Address(data, 0));
|
||||
__ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
|
||||
|
||||
__ pxor(xmm_temp0, xmm_temp2);
|
||||
|
||||
//
|
||||
// Multiply with the hash key
|
||||
//
|
||||
__ movdqu(xmm_temp3, xmm_temp0);
|
||||
__ pclmulqdq(xmm_temp3, xmm_temp1, 0); // xmm3 holds a0*b0
|
||||
__ movdqu(xmm_temp4, xmm_temp0);
|
||||
__ pclmulqdq(xmm_temp4, xmm_temp1, 16); // xmm4 holds a0*b1
|
||||
|
||||
__ movdqu(xmm_temp5, xmm_temp0);
|
||||
__ pclmulqdq(xmm_temp5, xmm_temp1, 1); // xmm5 holds a1*b0
|
||||
__ movdqu(xmm_temp6, xmm_temp0);
|
||||
__ pclmulqdq(xmm_temp6, xmm_temp1, 17); // xmm6 holds a1*b1
|
||||
|
||||
__ pxor(xmm_temp4, xmm_temp5); // xmm4 holds a0*b1 + a1*b0
|
||||
|
||||
__ movdqu(xmm_temp5, xmm_temp4); // move the contents of xmm4 to xmm5
|
||||
__ psrldq(xmm_temp4, 8); // shift by xmm4 64 bits to the right
|
||||
__ pslldq(xmm_temp5, 8); // shift by xmm5 64 bits to the left
|
||||
__ pxor(xmm_temp3, xmm_temp5);
|
||||
__ pxor(xmm_temp6, xmm_temp4); // Register pair <xmm6:xmm3> holds the result
|
||||
// of the carry-less multiplication of
|
||||
// xmm0 by xmm1.
|
||||
|
||||
// We shift the result of the multiplication by one bit position
|
||||
// to the left to cope for the fact that the bits are reversed.
|
||||
__ movdqu(xmm_temp7, xmm_temp3);
|
||||
__ movdqu(xmm_temp4, xmm_temp6);
|
||||
__ pslld (xmm_temp3, 1);
|
||||
__ pslld(xmm_temp6, 1);
|
||||
__ psrld(xmm_temp7, 31);
|
||||
__ psrld(xmm_temp4, 31);
|
||||
__ movdqu(xmm_temp5, xmm_temp7);
|
||||
__ pslldq(xmm_temp4, 4);
|
||||
__ pslldq(xmm_temp7, 4);
|
||||
__ psrldq(xmm_temp5, 12);
|
||||
__ por(xmm_temp3, xmm_temp7);
|
||||
__ por(xmm_temp6, xmm_temp4);
|
||||
__ por(xmm_temp6, xmm_temp5);
|
||||
|
||||
//
|
||||
// First phase of the reduction
|
||||
//
|
||||
// Move xmm3 into xmm4, xmm5, xmm7 in order to perform the shifts
|
||||
// independently.
|
||||
__ movdqu(xmm_temp7, xmm_temp3);
|
||||
__ movdqu(xmm_temp4, xmm_temp3);
|
||||
__ movdqu(xmm_temp5, xmm_temp3);
|
||||
__ pslld(xmm_temp7, 31); // packed right shift shifting << 31
|
||||
__ pslld(xmm_temp4, 30); // packed right shift shifting << 30
|
||||
__ pslld(xmm_temp5, 25); // packed right shift shifting << 25
|
||||
__ pxor(xmm_temp7, xmm_temp4); // xor the shifted versions
|
||||
__ pxor(xmm_temp7, xmm_temp5);
|
||||
__ movdqu(xmm_temp4, xmm_temp7);
|
||||
__ pslldq(xmm_temp7, 12);
|
||||
__ psrldq(xmm_temp4, 4);
|
||||
__ pxor(xmm_temp3, xmm_temp7); // first phase of the reduction complete
|
||||
|
||||
//
|
||||
// Second phase of the reduction
|
||||
//
|
||||
// Make 3 copies of xmm3 in xmm2, xmm5, xmm7 for doing these
|
||||
// shift operations.
|
||||
__ movdqu(xmm_temp2, xmm_temp3);
|
||||
__ movdqu(xmm_temp7, xmm_temp3);
|
||||
__ movdqu(xmm_temp5, xmm_temp3);
|
||||
__ psrld(xmm_temp2, 1); // packed left shifting >> 1
|
||||
__ psrld(xmm_temp7, 2); // packed left shifting >> 2
|
||||
__ psrld(xmm_temp5, 7); // packed left shifting >> 7
|
||||
__ pxor(xmm_temp2, xmm_temp7); // xor the shifted versions
|
||||
__ pxor(xmm_temp2, xmm_temp5);
|
||||
__ pxor(xmm_temp2, xmm_temp4);
|
||||
__ pxor(xmm_temp3, xmm_temp2);
|
||||
__ pxor(xmm_temp6, xmm_temp3); // the result is in xmm6
|
||||
|
||||
__ decrement(blocks);
|
||||
__ jcc(Assembler::zero, L_exit);
|
||||
__ movdqu(xmm_temp0, xmm_temp6);
|
||||
__ addptr(data, 16);
|
||||
__ jmp(L_ghash_loop);
|
||||
|
||||
__ BIND(L_exit);
|
||||
// Byte swap 16-byte result
|
||||
__ pshufb(xmm_temp6, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
|
||||
__ movdqu(Address(state, 0), xmm_temp6); // store the result
|
||||
|
||||
__ leave();
|
||||
__ ret(0);
|
||||
return start;
|
||||
}
|
||||
|
||||
/**
|
||||
* Arguments:
|
||||
*
|
||||
@ -3026,6 +3187,13 @@ class StubGenerator: public StubCodeGenerator {
|
||||
StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
|
||||
}
|
||||
|
||||
// Generate GHASH intrinsics code
|
||||
if (UseGHASHIntrinsics) {
|
||||
StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
|
||||
StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
|
||||
StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
|
||||
}
|
||||
|
||||
// Safefetch stubs.
|
||||
generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
|
||||
&StubRoutines::_safefetch32_fault_pc,
|
||||
|
@ -3681,6 +3681,175 @@ class StubGenerator: public StubCodeGenerator {
|
||||
return start;
|
||||
}
|
||||
|
||||
|
||||
// byte swap x86 long
|
||||
address generate_ghash_long_swap_mask() {
|
||||
__ align(CodeEntryAlignment);
|
||||
StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
|
||||
address start = __ pc();
|
||||
__ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none );
|
||||
__ emit_data64(0x0706050403020100, relocInfo::none );
|
||||
return start;
|
||||
}
|
||||
|
||||
// byte swap x86 byte array
|
||||
address generate_ghash_byte_swap_mask() {
|
||||
__ align(CodeEntryAlignment);
|
||||
StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
|
||||
address start = __ pc();
|
||||
__ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none );
|
||||
__ emit_data64(0x0001020304050607, relocInfo::none );
|
||||
return start;
|
||||
}
|
||||
|
||||
/* Single and multi-block ghash operations */
|
||||
address generate_ghash_processBlocks() {
|
||||
__ align(CodeEntryAlignment);
|
||||
Label L_ghash_loop, L_exit;
|
||||
StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
|
||||
address start = __ pc();
|
||||
|
||||
const Register state = c_rarg0;
|
||||
const Register subkeyH = c_rarg1;
|
||||
const Register data = c_rarg2;
|
||||
const Register blocks = c_rarg3;
|
||||
|
||||
#ifdef _WIN64
|
||||
const int XMM_REG_LAST = 10;
|
||||
#endif
|
||||
|
||||
const XMMRegister xmm_temp0 = xmm0;
|
||||
const XMMRegister xmm_temp1 = xmm1;
|
||||
const XMMRegister xmm_temp2 = xmm2;
|
||||
const XMMRegister xmm_temp3 = xmm3;
|
||||
const XMMRegister xmm_temp4 = xmm4;
|
||||
const XMMRegister xmm_temp5 = xmm5;
|
||||
const XMMRegister xmm_temp6 = xmm6;
|
||||
const XMMRegister xmm_temp7 = xmm7;
|
||||
const XMMRegister xmm_temp8 = xmm8;
|
||||
const XMMRegister xmm_temp9 = xmm9;
|
||||
const XMMRegister xmm_temp10 = xmm10;
|
||||
|
||||
__ enter();
|
||||
|
||||
#ifdef _WIN64
|
||||
// save the xmm registers which must be preserved 6-10
|
||||
__ subptr(rsp, -rsp_after_call_off * wordSize);
|
||||
for (int i = 6; i <= XMM_REG_LAST; i++) {
|
||||
__ movdqu(xmm_save(i), as_XMMRegister(i));
|
||||
}
|
||||
#endif
|
||||
|
||||
__ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
|
||||
|
||||
__ movdqu(xmm_temp0, Address(state, 0));
|
||||
__ pshufb(xmm_temp0, xmm_temp10);
|
||||
|
||||
|
||||
__ BIND(L_ghash_loop);
|
||||
__ movdqu(xmm_temp2, Address(data, 0));
|
||||
__ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
|
||||
|
||||
__ movdqu(xmm_temp1, Address(subkeyH, 0));
|
||||
__ pshufb(xmm_temp1, xmm_temp10);
|
||||
|
||||
__ pxor(xmm_temp0, xmm_temp2);
|
||||
|
||||
//
|
||||
// Multiply with the hash key
|
||||
//
|
||||
__ movdqu(xmm_temp3, xmm_temp0);
|
||||
__ pclmulqdq(xmm_temp3, xmm_temp1, 0); // xmm3 holds a0*b0
|
||||
__ movdqu(xmm_temp4, xmm_temp0);
|
||||
__ pclmulqdq(xmm_temp4, xmm_temp1, 16); // xmm4 holds a0*b1
|
||||
|
||||
__ movdqu(xmm_temp5, xmm_temp0);
|
||||
__ pclmulqdq(xmm_temp5, xmm_temp1, 1); // xmm5 holds a1*b0
|
||||
__ movdqu(xmm_temp6, xmm_temp0);
|
||||
__ pclmulqdq(xmm_temp6, xmm_temp1, 17); // xmm6 holds a1*b1
|
||||
|
||||
__ pxor(xmm_temp4, xmm_temp5); // xmm4 holds a0*b1 + a1*b0
|
||||
|
||||
__ movdqu(xmm_temp5, xmm_temp4); // move the contents of xmm4 to xmm5
|
||||
__ psrldq(xmm_temp4, 8); // shift by xmm4 64 bits to the right
|
||||
__ pslldq(xmm_temp5, 8); // shift by xmm5 64 bits to the left
|
||||
__ pxor(xmm_temp3, xmm_temp5);
|
||||
__ pxor(xmm_temp6, xmm_temp4); // Register pair <xmm6:xmm3> holds the result
|
||||
// of the carry-less multiplication of
|
||||
// xmm0 by xmm1.
|
||||
|
||||
// We shift the result of the multiplication by one bit position
|
||||
// to the left to cope for the fact that the bits are reversed.
|
||||
__ movdqu(xmm_temp7, xmm_temp3);
|
||||
__ movdqu(xmm_temp8, xmm_temp6);
|
||||
__ pslld(xmm_temp3, 1);
|
||||
__ pslld(xmm_temp6, 1);
|
||||
__ psrld(xmm_temp7, 31);
|
||||
__ psrld(xmm_temp8, 31);
|
||||
__ movdqu(xmm_temp9, xmm_temp7);
|
||||
__ pslldq(xmm_temp8, 4);
|
||||
__ pslldq(xmm_temp7, 4);
|
||||
__ psrldq(xmm_temp9, 12);
|
||||
__ por(xmm_temp3, xmm_temp7);
|
||||
__ por(xmm_temp6, xmm_temp8);
|
||||
__ por(xmm_temp6, xmm_temp9);
|
||||
|
||||
//
|
||||
// First phase of the reduction
|
||||
//
|
||||
// Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
|
||||
// independently.
|
||||
__ movdqu(xmm_temp7, xmm_temp3);
|
||||
__ movdqu(xmm_temp8, xmm_temp3);
|
||||
__ movdqu(xmm_temp9, xmm_temp3);
|
||||
__ pslld(xmm_temp7, 31); // packed right shift shifting << 31
|
||||
__ pslld(xmm_temp8, 30); // packed right shift shifting << 30
|
||||
__ pslld(xmm_temp9, 25); // packed right shift shifting << 25
|
||||
__ pxor(xmm_temp7, xmm_temp8); // xor the shifted versions
|
||||
__ pxor(xmm_temp7, xmm_temp9);
|
||||
__ movdqu(xmm_temp8, xmm_temp7);
|
||||
__ pslldq(xmm_temp7, 12);
|
||||
__ psrldq(xmm_temp8, 4);
|
||||
__ pxor(xmm_temp3, xmm_temp7); // first phase of the reduction complete
|
||||
|
||||
//
|
||||
// Second phase of the reduction
|
||||
//
|
||||
// Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
|
||||
// shift operations.
|
||||
__ movdqu(xmm_temp2, xmm_temp3);
|
||||
__ movdqu(xmm_temp4, xmm_temp3);
|
||||
__ movdqu(xmm_temp5, xmm_temp3);
|
||||
__ psrld(xmm_temp2, 1); // packed left shifting >> 1
|
||||
__ psrld(xmm_temp4, 2); // packed left shifting >> 2
|
||||
__ psrld(xmm_temp5, 7); // packed left shifting >> 7
|
||||
__ pxor(xmm_temp2, xmm_temp4); // xor the shifted versions
|
||||
__ pxor(xmm_temp2, xmm_temp5);
|
||||
__ pxor(xmm_temp2, xmm_temp8);
|
||||
__ pxor(xmm_temp3, xmm_temp2);
|
||||
__ pxor(xmm_temp6, xmm_temp3); // the result is in xmm6
|
||||
|
||||
__ decrement(blocks);
|
||||
__ jcc(Assembler::zero, L_exit);
|
||||
__ movdqu(xmm_temp0, xmm_temp6);
|
||||
__ addptr(data, 16);
|
||||
__ jmp(L_ghash_loop);
|
||||
|
||||
__ BIND(L_exit);
|
||||
__ pshufb(xmm_temp6, xmm_temp10); // Byte swap 16-byte result
|
||||
__ movdqu(Address(state, 0), xmm_temp6); // store the result
|
||||
|
||||
#ifdef _WIN64
|
||||
// restore xmm regs belonging to calling function
|
||||
for (int i = 6; i <= XMM_REG_LAST; i++) {
|
||||
__ movdqu(as_XMMRegister(i), xmm_save(i));
|
||||
}
|
||||
#endif
|
||||
__ leave();
|
||||
__ ret(0);
|
||||
return start;
|
||||
}
|
||||
|
||||
/**
|
||||
* Arguments:
|
||||
*
|
||||
@ -4120,6 +4289,13 @@ class StubGenerator: public StubCodeGenerator {
|
||||
StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
|
||||
}
|
||||
|
||||
// Generate GHASH intrinsics code
|
||||
if (UseGHASHIntrinsics) {
|
||||
StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
|
||||
StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
|
||||
StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
|
||||
}
|
||||
|
||||
// Safefetch stubs.
|
||||
generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
|
||||
&StubRoutines::_safefetch32_fault_pc,
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -33,6 +33,8 @@
|
||||
|
||||
address StubRoutines::x86::_verify_mxcsr_entry = NULL;
|
||||
address StubRoutines::x86::_key_shuffle_mask_addr = NULL;
|
||||
address StubRoutines::x86::_ghash_long_swap_mask_addr = NULL;
|
||||
address StubRoutines::x86::_ghash_byte_swap_mask_addr = NULL;
|
||||
|
||||
uint64_t StubRoutines::x86::_crc_by128_masks[] =
|
||||
{
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -36,10 +36,15 @@
|
||||
// masks and table for CRC32
|
||||
static uint64_t _crc_by128_masks[];
|
||||
static juint _crc_table[];
|
||||
// swap mask for ghash
|
||||
static address _ghash_long_swap_mask_addr;
|
||||
static address _ghash_byte_swap_mask_addr;
|
||||
|
||||
public:
|
||||
static address verify_mxcsr_entry() { return _verify_mxcsr_entry; }
|
||||
static address key_shuffle_mask_addr() { return _key_shuffle_mask_addr; }
|
||||
static address crc_by128_masks_addr() { return (address)_crc_by128_masks; }
|
||||
static address ghash_long_swap_mask_addr() { return _ghash_long_swap_mask_addr; }
|
||||
static address ghash_byte_swap_mask_addr() { return _ghash_byte_swap_mask_addr; }
|
||||
|
||||
#endif // CPU_X86_VM_STUBROUTINES_X86_32_HPP
|
||||
|
@ -677,6 +677,17 @@ void VM_Version::get_processor_features() {
|
||||
FLAG_SET_DEFAULT(UseAESIntrinsics, false);
|
||||
}
|
||||
|
||||
// GHASH/GCM intrinsics
|
||||
if (UseCLMUL && (UseSSE > 2)) {
|
||||
if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) {
|
||||
UseGHASHIntrinsics = true;
|
||||
}
|
||||
} else if (UseGHASHIntrinsics) {
|
||||
if (!FLAG_IS_DEFAULT(UseGHASHIntrinsics))
|
||||
warning("GHASH intrinsic requires CLMUL and SSE2 instructions on this CPU");
|
||||
FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
|
||||
}
|
||||
|
||||
if (UseSHA) {
|
||||
warning("SHA instructions are not available on this CPU");
|
||||
FLAG_SET_DEFAULT(UseSHA, false);
|
||||
|
@ -846,6 +846,12 @@
|
||||
do_name( implCompressMB_name, "implCompressMultiBlock") \
|
||||
do_signature(implCompressMB_signature, "([BII)I") \
|
||||
\
|
||||
/* support for com.sun.crypto.provider.GHASH */ \
|
||||
do_class(com_sun_crypto_provider_ghash, "com/sun/crypto/provider/GHASH") \
|
||||
do_intrinsic(_ghash_processBlocks, com_sun_crypto_provider_ghash, processBlocks_name, ghash_processBlocks_signature, F_S) \
|
||||
do_name(processBlocks_name, "processBlocks") \
|
||||
do_signature(ghash_processBlocks_signature, "([BII[J[J)V") \
|
||||
\
|
||||
/* support for java.util.zip */ \
|
||||
do_class(java_util_zip_CRC32, "java/util/zip/CRC32") \
|
||||
do_intrinsic(_updateCRC32, java_util_zip_CRC32, update_name, int2_int_signature, F_SN) \
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -26,6 +26,7 @@
|
||||
#include "code/debugInfo.hpp"
|
||||
#include "code/debugInfoRec.hpp"
|
||||
#include "code/nmethod.hpp"
|
||||
#include "oops/oop.inline.hpp"
|
||||
#include "runtime/handles.inline.hpp"
|
||||
|
||||
PRAGMA_FORMAT_MUTE_WARNINGS_FOR_GCC
|
||||
@ -47,6 +48,12 @@ void DebugInfoWriteStream::write_metadata(Metadata* h) {
|
||||
write_int(recorder()->oop_recorder()->find_index(h));
|
||||
}
|
||||
|
||||
// Reads the next int from the stream, uses it as an index into code()'s oop
// table and returns the oop found there. In debug builds the value is
// checked with is_oop_or_null().
oop DebugInfoReadStream::read_oop() {
  const int oop_index = read_int();
  oop result = code()->oop_at(oop_index);
  assert(result->is_oop_or_null(), "oop only");
  return result;
}
|
||||
|
||||
ScopeValue* DebugInfoReadStream::read_object_value() {
|
||||
int id = read_int();
|
||||
#ifdef ASSERT
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -266,11 +266,7 @@ class DebugInfoReadStream : public CompressedReadStream {
|
||||
|
||||
} ;
|
||||
|
||||
oop read_oop() {
|
||||
oop o = code()->oop_at(read_int());
|
||||
assert(o == NULL || o->is_oop(), "oop only");
|
||||
return o;
|
||||
}
|
||||
oop read_oop();
|
||||
Method* read_method() {
|
||||
Method* o = (Method*)(code()->metadata_at(read_int()));
|
||||
// is_metadata() is a faster check than is_metaspace_object()
|
||||
|
@ -191,6 +191,13 @@
|
||||
product(intx, LoopMaxUnroll, 16, \
|
||||
"Maximum number of unrolls for main loop") \
|
||||
\
|
||||
product(bool, SuperWordLoopUnrollAnalysis, false, \
|
||||
"Map number of unrolls for main loop via " \
|
||||
"Superword Level Parallelism analysis") \
|
||||
\
|
||||
notproduct(bool, TraceSuperWordLoopUnrollAnalysis, false, \
|
||||
"Trace what Superword Level Parallelism analysis applies") \
|
||||
\
|
||||
product(intx, LoopUnrollMin, 4, \
|
||||
"Minimum number of unroll loop bodies before checking progress" \
|
||||
"of rounds of unroll,optimize,..") \
|
||||
|
@ -966,6 +966,7 @@ void ConnectionGraph::process_call_arguments(CallNode *call) {
|
||||
strcmp(call->as_CallLeaf()->_name, "aescrypt_decryptBlock") == 0 ||
|
||||
strcmp(call->as_CallLeaf()->_name, "cipherBlockChaining_encryptAESCrypt") == 0 ||
|
||||
strcmp(call->as_CallLeaf()->_name, "cipherBlockChaining_decryptAESCrypt") == 0 ||
|
||||
strcmp(call->as_CallLeaf()->_name, "ghash_processBlocks") == 0 ||
|
||||
strcmp(call->as_CallLeaf()->_name, "sha1_implCompress") == 0 ||
|
||||
strcmp(call->as_CallLeaf()->_name, "sha1_implCompressMB") == 0 ||
|
||||
strcmp(call->as_CallLeaf()->_name, "sha256_implCompress") == 0 ||
|
||||
|
@ -278,6 +278,7 @@ class LibraryCallKit : public GraphKit {
|
||||
Node* inline_cipherBlockChaining_AESCrypt_predicate(bool decrypting);
|
||||
Node* get_key_start_from_aescrypt_object(Node* aescrypt_object);
|
||||
Node* get_original_key_start_from_aescrypt_object(Node* aescrypt_object);
|
||||
bool inline_ghash_processBlocks();
|
||||
bool inline_sha_implCompress(vmIntrinsics::ID id);
|
||||
bool inline_digestBase_implCompressMB(int predicate);
|
||||
bool inline_sha_implCompressMB(Node* digestBaseObj, ciInstanceKlass* instklass_SHA,
|
||||
@ -528,6 +529,10 @@ CallGenerator* Compile::make_vm_intrinsic(ciMethod* m, bool is_virtual) {
|
||||
predicates = 3;
|
||||
break;
|
||||
|
||||
case vmIntrinsics::_ghash_processBlocks:
|
||||
if (!UseGHASHIntrinsics) return NULL;
|
||||
break;
|
||||
|
||||
case vmIntrinsics::_updateCRC32:
|
||||
case vmIntrinsics::_updateBytesCRC32:
|
||||
case vmIntrinsics::_updateByteBufferCRC32:
|
||||
@ -929,6 +934,9 @@ bool LibraryCallKit::try_to_inline(int predicate) {
|
||||
case vmIntrinsics::_mulAdd:
|
||||
return inline_mulAdd();
|
||||
|
||||
case vmIntrinsics::_ghash_processBlocks:
|
||||
return inline_ghash_processBlocks();
|
||||
|
||||
case vmIntrinsics::_encodeISOArray:
|
||||
return inline_encodeISOArray();
|
||||
|
||||
@ -5858,6 +5866,35 @@ Node* LibraryCallKit::inline_cipherBlockChaining_AESCrypt_predicate(bool decrypt
|
||||
return _gvn.transform(region);
|
||||
}
|
||||
|
||||
//------------------------------inline_ghash_processBlocks
|
||||
bool LibraryCallKit::inline_ghash_processBlocks() {
|
||||
address stubAddr;
|
||||
const char *stubName;
|
||||
assert(UseGHASHIntrinsics, "need GHASH intrinsics support");
|
||||
|
||||
stubAddr = StubRoutines::ghash_processBlocks();
|
||||
stubName = "ghash_processBlocks";
|
||||
|
||||
Node* data = argument(0);
|
||||
Node* offset = argument(1);
|
||||
Node* len = argument(2);
|
||||
Node* state = argument(3);
|
||||
Node* subkeyH = argument(4);
|
||||
|
||||
Node* state_start = array_element_address(state, intcon(0), T_LONG);
|
||||
assert(state_start, "state is NULL");
|
||||
Node* subkeyH_start = array_element_address(subkeyH, intcon(0), T_LONG);
|
||||
assert(subkeyH_start, "subkeyH is NULL");
|
||||
Node* data_start = array_element_address(data, offset, T_BYTE);
|
||||
assert(data_start, "data is NULL");
|
||||
|
||||
Node* ghash = make_runtime_call(RC_LEAF|RC_NO_FP,
|
||||
OptoRuntime::ghash_processBlocks_Type(),
|
||||
stubAddr, stubName, TypePtr::BOTTOM,
|
||||
state_start, subkeyH_start, data_start, len);
|
||||
return true;
|
||||
}
|
||||
|
||||
//------------------------------inline_sha_implCompress-----------------------
|
||||
//
|
||||
// Calculate SHA (i.e., SHA-1) for single-block byte[] array.
|
||||
|
@ -38,6 +38,7 @@
|
||||
#include "opto/rootnode.hpp"
|
||||
#include "opto/runtime.hpp"
|
||||
#include "opto/subnode.hpp"
|
||||
#include "opto/superword.hpp"
|
||||
#include "opto/vectornode.hpp"
|
||||
|
||||
//------------------------------is_loop_exit-----------------------------------
|
||||
@ -640,7 +641,7 @@ bool IdealLoopTree::policy_maximally_unroll( PhaseIdealLoop *phase ) const {
|
||||
//------------------------------policy_unroll----------------------------------
|
||||
// Return TRUE or FALSE if the loop should be unrolled or not. Unroll if
|
||||
// the loop is a CountedLoop and the body is small enough.
|
||||
bool IdealLoopTree::policy_unroll( PhaseIdealLoop *phase ) const {
|
||||
bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
|
||||
|
||||
CountedLoopNode *cl = _head->as_CountedLoop();
|
||||
assert(cl->is_normal_loop() || cl->is_main_loop(), "");
|
||||
@ -652,6 +653,8 @@ bool IdealLoopTree::policy_unroll( PhaseIdealLoop *phase ) const {
|
||||
// After split at least one iteration will be executed in pre-loop.
|
||||
if (cl->trip_count() <= (uint)(cl->is_normal_loop() ? 2 : 1)) return false;
|
||||
|
||||
_local_loop_unroll_limit = LoopUnrollLimit;
|
||||
_local_loop_unroll_factor = 4;
|
||||
int future_unroll_ct = cl->unrolled_count() * 2;
|
||||
if (future_unroll_ct > LoopMaxUnroll) return false;
|
||||
|
||||
@ -747,8 +750,24 @@ bool IdealLoopTree::policy_unroll( PhaseIdealLoop *phase ) const {
|
||||
} // switch
|
||||
}
|
||||
|
||||
if (UseSuperWord) {
|
||||
if (!cl->is_reduction_loop()) {
|
||||
phase->mark_reductions(this);
|
||||
}
|
||||
|
||||
// Only attempt slp analysis when user controls do not prohibit it
|
||||
if (LoopMaxUnroll > _local_loop_unroll_factor) {
|
||||
// Once policy_slp_analysis succeeds, mark the loop with the
|
||||
// maximal unroll factor so that we minimize analysis passes
|
||||
if ((future_unroll_ct > _local_loop_unroll_factor) ||
|
||||
(body_size > (uint)_local_loop_unroll_limit)) {
|
||||
policy_unroll_slp_analysis(cl, phase, future_unroll_ct);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check for being too big
|
||||
if (body_size > (uint)LoopUnrollLimit) {
|
||||
if (body_size > (uint)_local_loop_unroll_limit) {
|
||||
if (xors_in_loop >= 4 && body_size < (uint)LoopUnrollLimit*4) return true;
|
||||
// Normal case: loop too big
|
||||
return false;
|
||||
@ -758,6 +777,36 @@ bool IdealLoopTree::policy_unroll( PhaseIdealLoop *phase ) const {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Superword-driven unroll policy helper.
//
// When SuperWordLoopUnrollAnalysis is enabled, runs the SuperWord unrolling
// analysis once per loop (skipped if the loop is already marked as having
// passed slp analysis) and, if the recorded slp unroll factor is larger than
// 4 and at least future_unroll_ct, raises _local_loop_unroll_limit to
// node_count_before_unroll() * slp factor whenever that product exceeds
// LoopUnrollLimit.
void IdealLoopTree::policy_unroll_slp_analysis(CountedLoopNode *cl, PhaseIdealLoop *phase, int future_unroll_ct) {
  // Enable this functionality target by target as needed
  if (!SuperWordLoopUnrollAnalysis) {
    return;
  }

  if (!cl->has_passed_slp()) {
    SuperWord sw(phase);
    // Set up the SuperWord state for this loop without doing the transformation
    sw.transform_loop(this, false);

    // Only a loop that got through the setup above is analyzed
    if (!sw.early_return()) {
      sw.unrolling_analysis(cl, _local_loop_unroll_factor);
    }
  }

  const int slp_factor = cl->slp_max_unroll();
  if (slp_factor <= 4 || slp_factor < future_unroll_ct) {
    return;
  }

  const int widened_limit = cl->node_count_before_unroll() * slp_factor;
  if (widened_limit <= LoopUnrollLimit) {
    return;
  }

#ifndef PRODUCT
  if (TraceSuperWordLoopUnrollAnalysis) {
    tty->print_cr("slp analysis is applying unroll limit  %d, the original limit was %d\n",
                  widened_limit, _local_loop_unroll_limit);
  }
#endif
  _local_loop_unroll_limit = widened_limit;
}
|
||||
|
||||
//------------------------------policy_align-----------------------------------
|
||||
// Return TRUE or FALSE if the loop should be cache-line aligned. Gather the
|
||||
// expression that does the alignment. Note that only one array base can be
|
||||
@ -1611,6 +1660,7 @@ void PhaseIdealLoop::mark_reductions(IdealLoopTree *loop) {
|
||||
// iff the uses conform
|
||||
if (ok) {
|
||||
def_node->add_flag(Node::Flag_is_reduction);
|
||||
loop_head->mark_has_reductions();
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -2517,7 +2567,6 @@ bool IdealLoopTree::iteration_split_impl( PhaseIdealLoop *phase, Node_List &old_
|
||||
// and we'd rather unroll the post-RCE'd loop SO... do not unroll if
|
||||
// peeling.
|
||||
if (should_unroll && !should_peel) {
|
||||
phase->mark_reductions(this);
|
||||
phase->do_unroll(this, old_new, true);
|
||||
}
|
||||
|
||||
|
@ -2408,7 +2408,7 @@ void PhaseIdealLoop::build_and_optimize(bool do_split_ifs, bool skip_loop_opts)
|
||||
for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) {
|
||||
IdealLoopTree* lpt = iter.current();
|
||||
if (lpt->is_counted()) {
|
||||
sw.transform_loop(lpt);
|
||||
sw.transform_loop(lpt, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -62,7 +62,9 @@ protected:
|
||||
HasExactTripCount=8,
|
||||
InnerLoop=16,
|
||||
PartialPeelLoop=32,
|
||||
PartialPeelFailed=64 };
|
||||
PartialPeelFailed=64,
|
||||
HasReductions=128,
|
||||
PassedSlpAnalysis=256 };
|
||||
char _unswitch_count;
|
||||
enum { _unswitch_max=3 };
|
||||
|
||||
@ -77,6 +79,8 @@ public:
|
||||
void set_partial_peel_loop() { _loop_flags |= PartialPeelLoop; }
|
||||
int partial_peel_has_failed() const { return _loop_flags & PartialPeelFailed; }
|
||||
void mark_partial_peel_failed() { _loop_flags |= PartialPeelFailed; }
|
||||
// Set the HasReductions bit in _loop_flags.
void mark_has_reductions() { _loop_flags |= HasReductions; }
|
||||
// Set the PassedSlpAnalysis bit in _loop_flags.
void mark_passed_slp() { _loop_flags |= PassedSlpAnalysis; }
|
||||
|
||||
int unswitch_max() { return _unswitch_max; }
|
||||
int unswitch_count() { return _unswitch_count; }
|
||||
@ -155,11 +159,15 @@ class CountedLoopNode : public LoopNode {
|
||||
// unroll,optimize,unroll,optimize,... is making progress
|
||||
int _node_count_before_unroll;
|
||||
|
||||
// If slp analysis is performed we record the maximum
|
||||
// vector mapped unroll factor here
|
||||
int _slp_maximum_unroll_factor;
|
||||
|
||||
public:
|
||||
CountedLoopNode( Node *entry, Node *backedge )
|
||||
: LoopNode(entry, backedge), _main_idx(0), _trip_count(max_juint),
|
||||
_profile_trip_cnt(COUNT_UNKNOWN), _unrolled_count_log2(0),
|
||||
_node_count_before_unroll(0) {
|
||||
_node_count_before_unroll(0), _slp_maximum_unroll_factor(0) {
|
||||
init_class_id(Class_CountedLoop);
|
||||
// Initialize _trip_count to the largest possible value.
|
||||
// Will be reset (lower) if the loop's trip count is known.
|
||||
@ -199,10 +207,12 @@ public:
|
||||
|
||||
// A 'main' loop that is ONLY unrolled or peeled, never RCE'd or
|
||||
// Aligned, may be missing its pre-loop.
|
||||
int is_normal_loop() const { return (_loop_flags&PreMainPostFlagsMask) == Normal; }
|
||||
int is_pre_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Pre; }
|
||||
int is_main_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Main; }
|
||||
int is_post_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Post; }
|
||||
int is_normal_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Normal; }
|
||||
int is_pre_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Pre; }
|
||||
int is_main_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Main; }
|
||||
int is_post_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Post; }
|
||||
// Nonzero when the HasReductions bit is set in _loop_flags.
int is_reduction_loop() const { return (_loop_flags&HasReductions) == HasReductions; }
|
||||
// Nonzero when the PassedSlpAnalysis bit is set in _loop_flags.
int has_passed_slp () const { return (_loop_flags&PassedSlpAnalysis) == PassedSlpAnalysis; }
|
||||
int is_main_no_pre_loop() const { return _loop_flags & MainHasNoPreLoop; }
|
||||
void set_main_no_pre_loop() { _loop_flags |= MainHasNoPreLoop; }
|
||||
|
||||
@ -232,8 +242,10 @@ public:
|
||||
void double_unrolled_count() { _unrolled_count_log2++; }
|
||||
int unrolled_count() { return 1 << MIN2(_unrolled_count_log2, BitsPerInt-3); }
|
||||
|
||||
void set_node_count_before_unroll(int ct) { _node_count_before_unroll = ct; }
|
||||
int node_count_before_unroll() { return _node_count_before_unroll; }
|
||||
void set_node_count_before_unroll(int ct) { _node_count_before_unroll = ct; }
|
||||
int node_count_before_unroll() { return _node_count_before_unroll; }
|
||||
// Store the unroll factor determined by slp analysis in _slp_maximum_unroll_factor.
void set_slp_max_unroll(int unroll_factor) { _slp_maximum_unroll_factor = unroll_factor; }
|
||||
// Return the stored slp unroll factor (_slp_maximum_unroll_factor).
int slp_max_unroll() const { return _slp_maximum_unroll_factor; }
|
||||
|
||||
#ifndef PRODUCT
|
||||
virtual void dump_spec(outputStream *st) const;
|
||||
@ -336,6 +348,8 @@ public:
|
||||
Node *_tail; // Tail of loop
|
||||
inline Node *tail(); // Handle lazy update of _tail field
|
||||
PhaseIdealLoop* _phase;
|
||||
int _local_loop_unroll_limit;
|
||||
int _local_loop_unroll_factor;
|
||||
|
||||
Node_List _body; // Loop body for inner loops
|
||||
|
||||
@ -356,7 +370,8 @@ public:
|
||||
_safepts(NULL),
|
||||
_required_safept(NULL),
|
||||
_allow_optimizations(true),
|
||||
_nest(0), _irreducible(0), _has_call(0), _has_sfpt(0), _rce_candidate(0)
|
||||
_nest(0), _irreducible(0), _has_call(0), _has_sfpt(0), _rce_candidate(0),
|
||||
_local_loop_unroll_limit(0), _local_loop_unroll_factor(0)
|
||||
{ }
|
||||
|
||||
// Is 'l' a member of 'this'?
|
||||
@ -444,7 +459,10 @@ public:
|
||||
|
||||
// Return TRUE or FALSE if the loop should be unrolled or not. Unroll if
|
||||
// the loop is a CountedLoop and the body is small enough.
|
||||
bool policy_unroll( PhaseIdealLoop *phase ) const;
|
||||
bool policy_unroll(PhaseIdealLoop *phase);
|
||||
|
||||
// Loop analyses to map to a maximal superword unrolling for vectorization.
|
||||
void policy_unroll_slp_analysis(CountedLoopNode *cl, PhaseIdealLoop *phase, int future_unroll_ct);
|
||||
|
||||
// Return TRUE or FALSE if the loop should be range-check-eliminated.
|
||||
// Gather a list of IF tests that are dominated by iteration splitting;
|
||||
|
@ -987,7 +987,25 @@ const TypeFunc* OptoRuntime::mulAdd_Type() {
|
||||
return TypeFunc::make(domain, range);
|
||||
}
|
||||
|
||||
// GHASH block processing
|
||||
// Call signature of the GHASH block-processing stub:
//   (state ptr, subkeyH ptr, data ptr, int blocks) -> void
const TypeFunc* OptoRuntime::ghash_processBlocks_Type() {
  const int num_args = 4;

  // Domain: three non-null pointers followed by an int.
  const Type** domain_fields = TypeTuple::fields(num_args);
  int slot = TypeFunc::Parms;
  domain_fields[slot++] = TypePtr::NOTNULL;    // state
  domain_fields[slot++] = TypePtr::NOTNULL;    // subkeyH
  domain_fields[slot++] = TypePtr::NOTNULL;    // data
  domain_fields[slot++] = TypeInt::INT;        // blocks
  assert(slot == TypeFunc::Parms+num_args, "correct decoding");
  const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+num_args, domain_fields);

  // Range: the stub returns nothing, but a fields array is still needed.
  const Type** range_fields = TypeTuple::fields(1);
  range_fields[TypeFunc::Parms+0] = NULL;      // void
  const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, range_fields);

  return TypeFunc::make(domain, range);
}
|
||||
|
||||
//------------- Interpreter state access for on stack replacement
|
||||
const TypeFunc* OptoRuntime::osr_end_Type() {
|
||||
|
@ -316,6 +316,8 @@ private:
|
||||
|
||||
static const TypeFunc* mulAdd_Type();
|
||||
|
||||
static const TypeFunc* ghash_processBlocks_Type();
|
||||
|
||||
static const TypeFunc* updateBytesCRC32_Type();
|
||||
|
||||
// leaf on stack replacement interpreter accessor types
|
||||
|
@ -68,6 +68,7 @@ SuperWord::SuperWord(PhaseIdealLoop* phase) :
|
||||
_bb(NULL), // basic block
|
||||
_iv(NULL), // induction var
|
||||
_race_possible(false), // cases where SDMU is true
|
||||
_early_return(true), // analysis evaluations routine
|
||||
_num_work_vecs(0), // amount of vector work we have
|
||||
_num_reductions(0), // amount of reduction work we have
|
||||
_do_vector_loop(phase->C->do_vector_loop()), // whether to do vectorization/simd style
|
||||
@ -78,7 +79,7 @@ SuperWord::SuperWord(PhaseIdealLoop* phase) :
|
||||
{}
|
||||
|
||||
//------------------------------transform_loop---------------------------
|
||||
void SuperWord::transform_loop(IdealLoopTree* lpt) {
|
||||
void SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) {
|
||||
assert(UseSuperWord, "should be");
|
||||
// Do vectors exist on this architecture?
|
||||
if (Matcher::vector_width_in_bytes(T_BYTE) < 2) return;
|
||||
@ -113,8 +114,156 @@ void SuperWord::transform_loop(IdealLoopTree* lpt) {
|
||||
// For now, define one block which is the entire loop body
|
||||
set_bb(cl);
|
||||
|
||||
assert(_packset.length() == 0, "packset must be empty");
|
||||
SLP_extract();
|
||||
if (do_optimization) {
|
||||
assert(_packset.length() == 0, "packset must be empty");
|
||||
SLP_extract();
|
||||
}
|
||||
}
|
||||
|
||||
//------------------------------early unrolling analysis------------------------------
|
||||
// Early unrolling analysis for a counted loop.
//
// Pass 1 walks the loop body and records, per body index, the nodes that are
// to be ignored when looking for a vector size (loop bookkeeping nodes,
// reductions, address arithmetic, phis/ifs, non-primitive memory operations
// and the nodes traced while decomposing memory addresses). The analysis is
// abandoned (is_slp = false) when the body contains a memory phi whose
// backedge input is not a memory node, a LoadStore, a MergeMem, or a
// non-CFG projection.
//
// Pass 2 (only if pass 1 was not abandoned) takes the smallest
// Matcher::max_vector_size() over the remaining nodes for which a vector
// form is implemented. If every such size exceeds the incoming
// local_loop_unroll_factor, that minimum is written back through
// local_loop_unroll_factor. Whenever pass 2 runs, the loop is marked as
// having passed slp analysis and the (possibly unchanged) factor is stored
// on 'cl'.
//
// cl                       - the counted loop head being analyzed
// local_loop_unroll_factor - in: current unroll factor; out: vector-mapped
//                            unroll factor when the analysis succeeds
void SuperWord::unrolling_analysis(CountedLoopNode *cl, int &local_loop_unroll_factor) {
  bool is_slp = true;
  ResourceMark rm;
  size_t ignored_size = lpt()->_body.size();
  int *ignored_loop_nodes = NEW_RESOURCE_ARRAY(int, ignored_size);
  Node_Stack nstack((int)ignored_size);
  Node *cl_exit = cl->loopexit();

  // First clear the entries
  for (uint i = 0; i < lpt()->_body.size(); i++) {
    ignored_loop_nodes[i] = -1;
  }

  int max_vector = Matcher::max_vector_size(T_INT);

  // Process the loop, some/all of the stack entries will not be in order, ergo
  // need to preprocess the ignored initial state before we process the loop
  for (uint i = 0; i < lpt()->_body.size(); i++) {
    Node* n = lpt()->_body.at(i);
    if (n == cl->incr() ||
        n->is_reduction() ||
        n->is_AddP() ||
        n->is_Cmp() ||
        n->is_IfTrue() ||
        n->is_CountedLoop() ||
        (n == cl_exit)) {
      ignored_loop_nodes[i] = n->_idx;
      continue;
    }

    if (n->is_If()) {
      IfNode *iff = n->as_If();
      if (iff->_fcnt != COUNT_UNKNOWN && iff->_prob != PROB_UNKNOWN) {
        if (lpt()->is_loop_exit(iff)) {
          ignored_loop_nodes[i] = n->_idx;
          continue;
        }
      }
    }

    if (n->is_Phi() && (n->bottom_type() == Type::MEMORY)) {
      Node* n_tail = n->in(LoopNode::LoopBackControl);
      if (n_tail != n->in(LoopNode::EntryControl)) {
        if (!n_tail->is_Mem()) {
          is_slp = false;
          break;
        }
      }
    }

    // This must happen after check of phi/if
    if (n->is_Phi() || n->is_If()) {
      ignored_loop_nodes[i] = n->_idx;
      continue;
    }

    if (n->is_LoadStore() || n->is_MergeMem() ||
        (n->is_Proj() && !n->as_Proj()->is_CFG())) {
      is_slp = false;
      break;
    }

    if (n->is_Mem()) {
      MemNode* current = n->as_Mem();
      BasicType bt = current->memory_type();
      if (is_java_primitive(bt) == false) {
        ignored_loop_nodes[i] = n->_idx;
        continue;
      }
      Node* adr = n->in(MemNode::Address);
      Node* n_ctrl = _phase->get_ctrl(adr);

      // save a queue of post process nodes
      if (n_ctrl != NULL && lpt()->is_member(_phase->get_loop(n_ctrl))) {
        // Process the memory expression
        int stack_idx = 0;
        bool have_side_effects = true;
        if (adr->is_AddP() == false) {
          nstack.push(adr, stack_idx++);
        } else {
          // Mark the components of the memory operation in nstack
          SWPointer p1(current, this, &nstack, true);
          have_side_effects = p1.node_stack()->is_nonempty();
        }

        // Process the pointer stack: every traced node that lives in the
        // loop body is added to the ignored set, draining nstack as we go.
        while (have_side_effects) {
          Node* pointer_node = nstack.node();
          for (uint j = 0; j < lpt()->_body.size(); j++) {
            Node* cur_node = lpt()->_body.at(j);
            if (cur_node == pointer_node) {
              ignored_loop_nodes[j] = cur_node->_idx;
              break;
            }
          }
          nstack.pop();
          have_side_effects = nstack.is_nonempty();
        }
      }
    }
  }

  if (is_slp) {
    // Now we try to find the maximum supported consistent vector which the machine
    // description can use
    for (uint i = 0; i < lpt()->_body.size(); i++) {
      if (ignored_loop_nodes[i] != -1) continue;

      BasicType bt;
      Node* n = lpt()->_body.at(i);
      if (n->is_Store()) {
        bt = n->as_Mem()->memory_type();
      } else {
        bt = n->bottom_type()->basic_type();
      }

      // Apply the same primitive-type filter the first pass uses for memory
      // nodes: a node whose type is not a Java primitive has no vector
      // element type to size.
      // NOTE(review): presumably Matcher::max_vector_size() is only defined
      // for primitive element types - confirm against the platform Matcher.
      if (is_java_primitive(bt) == false) continue;

      int cur_max_vector = Matcher::max_vector_size(bt);

      // If a max vector exists which is not larger than local_loop_unroll_factor
      // stop looking, we already have the max vector to map to.
      if (cur_max_vector <= local_loop_unroll_factor) {
        is_slp = false;
#ifndef PRODUCT
        if (TraceSuperWordLoopUnrollAnalysis) {
          tty->print_cr("slp analysis fails: unroll limit equals max vector\n");
        }
#endif
        break;
      }

      // Map the maximal common vector
      if (VectorNode::implemented(n->Opcode(), cur_max_vector, bt)) {
        if (cur_max_vector < max_vector) {
          max_vector = cur_max_vector;
        }
      }
    }
    if (is_slp) {
      local_loop_unroll_factor = max_vector;
    }
    // Recorded even when the second pass bailed out, so the analysis is not
    // repeated for this loop (callers test has_passed_slp()).
    cl->mark_passed_slp();
    cl->set_slp_max_unroll(local_loop_unroll_factor);
  }
}
|
||||
|
||||
//------------------------------SLP_extract---------------------------
|
||||
@ -268,12 +417,12 @@ void SuperWord::find_adjacent_refs() {
|
||||
best_iv_adjustment = iv_adjustment;
|
||||
}
|
||||
|
||||
SWPointer align_to_ref_p(mem_ref, this);
|
||||
SWPointer align_to_ref_p(mem_ref, this, NULL, false);
|
||||
// Set alignment relative to "align_to_ref" for all related memory operations.
|
||||
for (int i = memops.size() - 1; i >= 0; i--) {
|
||||
MemNode* s = memops.at(i)->as_Mem();
|
||||
if (isomorphic(s, mem_ref)) {
|
||||
SWPointer p2(s, this);
|
||||
SWPointer p2(s, this, NULL, false);
|
||||
if (p2.comparable(align_to_ref_p)) {
|
||||
int align = memory_alignment(s, iv_adjustment);
|
||||
set_alignment(s, align);
|
||||
@ -294,7 +443,7 @@ void SuperWord::find_adjacent_refs() {
|
||||
// iterations in pre-loop will be not enough to align it.
|
||||
create_pack = false;
|
||||
} else {
|
||||
SWPointer p2(best_align_to_mem_ref, this);
|
||||
SWPointer p2(best_align_to_mem_ref, this, NULL, false);
|
||||
if (align_to_ref_p.invar() != p2.invar()) {
|
||||
// Do not vectorize memory accesses with different invariants
|
||||
// if unaligned memory accesses are not allowed.
|
||||
@ -411,7 +560,7 @@ MemNode* SuperWord::find_align_to_ref(Node_List &memops) {
|
||||
// Count number of comparable memory ops
|
||||
for (uint i = 0; i < memops.size(); i++) {
|
||||
MemNode* s1 = memops.at(i)->as_Mem();
|
||||
SWPointer p1(s1, this);
|
||||
SWPointer p1(s1, this, NULL, false);
|
||||
// Discard if pre loop can't align this reference
|
||||
if (!ref_is_alignable(p1)) {
|
||||
*cmp_ct.adr_at(i) = 0;
|
||||
@ -420,7 +569,7 @@ MemNode* SuperWord::find_align_to_ref(Node_List &memops) {
|
||||
for (uint j = i+1; j < memops.size(); j++) {
|
||||
MemNode* s2 = memops.at(j)->as_Mem();
|
||||
if (isomorphic(s1, s2)) {
|
||||
SWPointer p2(s2, this);
|
||||
SWPointer p2(s2, this, NULL, false);
|
||||
if (p1.comparable(p2)) {
|
||||
(*cmp_ct.adr_at(i))++;
|
||||
(*cmp_ct.adr_at(j))++;
|
||||
@ -441,7 +590,7 @@ MemNode* SuperWord::find_align_to_ref(Node_List &memops) {
|
||||
if (s->is_Store()) {
|
||||
int vw = vector_width_in_bytes(s);
|
||||
assert(vw > 1, "sanity");
|
||||
SWPointer p(s, this);
|
||||
SWPointer p(s, this, NULL, false);
|
||||
if (cmp_ct.at(j) > max_ct ||
|
||||
cmp_ct.at(j) == max_ct &&
|
||||
(vw > max_vw ||
|
||||
@ -464,7 +613,7 @@ MemNode* SuperWord::find_align_to_ref(Node_List &memops) {
|
||||
if (s->is_Load()) {
|
||||
int vw = vector_width_in_bytes(s);
|
||||
assert(vw > 1, "sanity");
|
||||
SWPointer p(s, this);
|
||||
SWPointer p(s, this, NULL, false);
|
||||
if (cmp_ct.at(j) > max_ct ||
|
||||
cmp_ct.at(j) == max_ct &&
|
||||
(vw > max_vw ||
|
||||
@ -575,7 +724,7 @@ bool SuperWord::ref_is_alignable(SWPointer& p) {
|
||||
//---------------------------get_iv_adjustment---------------------------
|
||||
// Calculate loop's iv adjustment for this memory ops.
|
||||
int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
|
||||
SWPointer align_to_ref_p(mem_ref, this);
|
||||
SWPointer align_to_ref_p(mem_ref, this, NULL, false);
|
||||
int offset = align_to_ref_p.offset_in_bytes();
|
||||
int scale = align_to_ref_p.scale_in_bytes();
|
||||
int elt_size = align_to_ref_p.memory_size();
|
||||
@ -649,13 +798,13 @@ void SuperWord::dependence_graph() {
|
||||
if (_dg.dep(s1)->in_cnt() == 0) {
|
||||
_dg.make_edge(slice, s1);
|
||||
}
|
||||
SWPointer p1(s1->as_Mem(), this);
|
||||
SWPointer p1(s1->as_Mem(), this, NULL, false);
|
||||
bool sink_dependent = true;
|
||||
for (int k = j - 1; k >= 0; k--) {
|
||||
Node* s2 = _nlist.at(k);
|
||||
if (s1->is_Load() && s2->is_Load())
|
||||
continue;
|
||||
SWPointer p2(s2->as_Mem(), this);
|
||||
SWPointer p2(s2->as_Mem(), this, NULL, false);
|
||||
|
||||
int cmp = p1.cmp(p2);
|
||||
if (SuperWordRTDepCheck &&
|
||||
@ -795,8 +944,8 @@ bool SuperWord::are_adjacent_refs(Node* s1, Node* s2) {
|
||||
if (_phase->C->get_alias_index(s1->as_Mem()->adr_type()) !=
|
||||
_phase->C->get_alias_index(s2->as_Mem()->adr_type()))
|
||||
return false;
|
||||
SWPointer p1(s1->as_Mem(), this);
|
||||
SWPointer p2(s2->as_Mem(), this);
|
||||
SWPointer p1(s1->as_Mem(), this, NULL, false);
|
||||
SWPointer p2(s2->as_Mem(), this, NULL, false);
|
||||
if (p1.base() != p2.base() || !p1.comparable(p2)) return false;
|
||||
int diff = p2.offset_in_bytes() - p1.offset_in_bytes();
|
||||
return diff == data_size(s1);
|
||||
@ -1615,13 +1764,13 @@ void SuperWord::output() {
|
||||
if (n->is_Load()) {
|
||||
Node* ctl = n->in(MemNode::Control);
|
||||
Node* mem = first->in(MemNode::Memory);
|
||||
SWPointer p1(n->as_Mem(), this);
|
||||
SWPointer p1(n->as_Mem(), this, NULL, false);
|
||||
// Identify the memory dependency for the new loadVector node by
|
||||
// walking up through memory chain.
|
||||
// This is done to give flexibility to the new loadVector node so that
|
||||
// it can move above independent storeVector nodes.
|
||||
while (mem->is_StoreVector()) {
|
||||
SWPointer p2(mem->as_Mem(), this);
|
||||
SWPointer p2(mem->as_Mem(), this, NULL, false);
|
||||
int cmp = p1.cmp(p2);
|
||||
if (SWPointer::not_equal(cmp) || !SWPointer::comparable(cmp)) {
|
||||
mem = mem->in(MemNode::Memory);
|
||||
@ -2138,7 +2287,7 @@ void SuperWord::compute_vector_element_type() {
|
||||
//------------------------------memory_alignment---------------------------
|
||||
// Alignment within a vector memory reference
|
||||
int SuperWord::memory_alignment(MemNode* s, int iv_adjust) {
|
||||
SWPointer p(s, this);
|
||||
SWPointer p(s, this, NULL, false);
|
||||
if (!p.valid()) {
|
||||
return bottom_align;
|
||||
}
|
||||
@ -2315,7 +2464,7 @@ void SuperWord::align_initial_loop_index(MemNode* align_to_ref) {
|
||||
Node *orig_limit = pre_opaq->original_loop_limit();
|
||||
assert(orig_limit != NULL && _igvn.type(orig_limit) != Type::TOP, "");
|
||||
|
||||
SWPointer align_to_ref_p(align_to_ref, this);
|
||||
SWPointer align_to_ref_p(align_to_ref, this, NULL, false);
|
||||
assert(align_to_ref_p.valid(), "sanity");
|
||||
|
||||
// Given:
|
||||
@ -2489,6 +2638,7 @@ void SuperWord::init() {
|
||||
_bb = NULL;
|
||||
_iv = NULL;
|
||||
_race_possible = 0;
|
||||
_early_return = false;
|
||||
_num_work_vecs = 0;
|
||||
_num_reductions = 0;
|
||||
}
|
||||
@ -2559,9 +2709,11 @@ char* SuperWord::blank(uint depth) {
|
||||
//==============================SWPointer===========================
|
||||
|
||||
//----------------------------SWPointer------------------------
|
||||
SWPointer::SWPointer(MemNode* mem, SuperWord* slp) :
|
||||
SWPointer::SWPointer(MemNode* mem, SuperWord* slp, Node_Stack *nstack, bool analyze_only) :
|
||||
_mem(mem), _slp(slp), _base(NULL), _adr(NULL),
|
||||
_scale(0), _offset(0), _invar(NULL), _negate_invar(false) {
|
||||
_scale(0), _offset(0), _invar(NULL), _negate_invar(false),
|
||||
_nstack(nstack), _analyze_only(analyze_only),
|
||||
_stack_idx(0) {
|
||||
|
||||
Node* adr = mem->in(MemNode::Address);
|
||||
if (!adr->is_AddP()) {
|
||||
@ -2599,7 +2751,9 @@ SWPointer::SWPointer(MemNode* mem, SuperWord* slp) :
|
||||
// the pattern match of an address expression.
|
||||
SWPointer::SWPointer(SWPointer* p) :
|
||||
_mem(p->_mem), _slp(p->_slp), _base(NULL), _adr(NULL),
|
||||
_scale(0), _offset(0), _invar(NULL), _negate_invar(false) {}
|
||||
_scale(0), _offset(0), _invar(NULL), _negate_invar(false),
|
||||
_nstack(p->_nstack), _analyze_only(p->_analyze_only),
|
||||
_stack_idx(p->_stack_idx) {}
|
||||
|
||||
//------------------------scaled_iv_plus_offset--------------------
|
||||
// Match: k*iv + offset
|
||||
@ -2642,6 +2796,9 @@ bool SWPointer::scaled_iv(Node* n) {
|
||||
_scale = 1;
|
||||
return true;
|
||||
}
|
||||
if (_analyze_only && (invariant(n) == false)) {
|
||||
_nstack->push(n, _stack_idx++);
|
||||
}
|
||||
int opc = n->Opcode();
|
||||
if (opc == Op_MulI) {
|
||||
if (n->in(1) == iv() && n->in(2)->is_Con()) {
|
||||
@ -2699,6 +2856,9 @@ bool SWPointer::offset_plus_k(Node* n, bool negate) {
|
||||
return false;
|
||||
}
|
||||
if (_invar != NULL) return false; // already have an invariant
|
||||
if (_analyze_only && (invariant(n) == false)) {
|
||||
_nstack->push(n, _stack_idx++);
|
||||
}
|
||||
if (opc == Op_AddI) {
|
||||
if (n->in(2)->is_Con() && invariant(n->in(1))) {
|
||||
_negate_invar = negate;
|
||||
|
@ -239,12 +239,15 @@ class SuperWord : public ResourceObj {
|
||||
public:
|
||||
SuperWord(PhaseIdealLoop* phase);
|
||||
|
||||
void transform_loop(IdealLoopTree* lpt);
|
||||
void transform_loop(IdealLoopTree* lpt, bool do_optimization);
|
||||
|
||||
void unrolling_analysis(CountedLoopNode *cl, int &local_loop_unroll_factor);
|
||||
|
||||
// Accessors for SWPointer
|
||||
PhaseIdealLoop* phase() { return _phase; }
|
||||
IdealLoopTree* lpt() { return _lpt; }
|
||||
PhiNode* iv() { return _iv; }
|
||||
// Return the _early_return flag; the constructor initializes it to true and init() resets it to false.
bool early_return() { return _early_return; }
|
||||
|
||||
private:
|
||||
IdealLoopTree* _lpt; // Current loop tree node
|
||||
@ -252,6 +255,7 @@ class SuperWord : public ResourceObj {
|
||||
Node* _bb; // Current basic block
|
||||
PhiNode* _iv; // Induction var
|
||||
bool _race_possible; // In cases where SDMU is true
|
||||
bool _early_return; // True if we do not initialize
|
||||
bool _do_vector_loop; // whether to do vectorization/simd style
|
||||
bool _vector_loop_debug; // provide more printing in debug mode
|
||||
int _num_work_vecs; // Number of non memory vector operations
|
||||
@ -462,15 +466,18 @@ class SuperWord : public ResourceObj {
|
||||
// Information about an address for dependence checking and vector alignment
|
||||
class SWPointer VALUE_OBJ_CLASS_SPEC {
|
||||
protected:
|
||||
MemNode* _mem; // My memory reference node
|
||||
SuperWord* _slp; // SuperWord class
|
||||
MemNode* _mem; // My memory reference node
|
||||
SuperWord* _slp; // SuperWord class
|
||||
|
||||
Node* _base; // NULL if unsafe nonheap reference
|
||||
Node* _adr; // address pointer
|
||||
jint _scale; // multiplier for iv (in bytes), 0 if no loop iv
|
||||
jint _offset; // constant offset (in bytes)
|
||||
Node* _invar; // invariant offset (in bytes), NULL if none
|
||||
bool _negate_invar; // if true then use: (0 - _invar)
|
||||
Node* _base; // NULL if unsafe nonheap reference
|
||||
Node* _adr; // address pointer
|
||||
jint _scale; // multiplier for iv (in bytes), 0 if no loop iv
|
||||
jint _offset; // constant offset (in bytes)
|
||||
Node* _invar; // invariant offset (in bytes), NULL if none
|
||||
bool _negate_invar; // if true then use: (0 - _invar)
|
||||
Node_Stack* _nstack; // stack used to record a swpointer trace of variants
|
||||
bool _analyze_only; // Used in loop unrolling only for swpointer trace
|
||||
uint _stack_idx; // Used in loop unrolling only for swpointer trace
|
||||
|
||||
PhaseIdealLoop* phase() { return _slp->phase(); }
|
||||
IdealLoopTree* lpt() { return _slp->lpt(); }
|
||||
@ -497,7 +504,7 @@ class SWPointer VALUE_OBJ_CLASS_SPEC {
|
||||
NotComparable = (Less | Greater | Equal)
|
||||
};
|
||||
|
||||
SWPointer(MemNode* mem, SuperWord* slp);
|
||||
SWPointer(MemNode* mem, SuperWord* slp, Node_Stack *nstack, bool analyze_only);
|
||||
// Following is used to create a temporary object during
|
||||
// the pattern match of an address expression.
|
||||
SWPointer(SWPointer* p);
|
||||
@ -505,14 +512,15 @@ class SWPointer VALUE_OBJ_CLASS_SPEC {
|
||||
bool valid() { return _adr != NULL; }
|
||||
bool has_iv() { return _scale != 0; }
|
||||
|
||||
Node* base() { return _base; }
|
||||
Node* adr() { return _adr; }
|
||||
MemNode* mem() { return _mem; }
|
||||
int scale_in_bytes() { return _scale; }
|
||||
Node* invar() { return _invar; }
|
||||
bool negate_invar() { return _negate_invar; }
|
||||
int offset_in_bytes() { return _offset; }
|
||||
int memory_size() { return _mem->memory_size(); }
|
||||
Node* base() { return _base; }
|
||||
Node* adr() { return _adr; }
|
||||
MemNode* mem() { return _mem; }
|
||||
int scale_in_bytes() { return _scale; }
|
||||
Node* invar() { return _invar; }
|
||||
bool negate_invar() { return _negate_invar; }
|
||||
int offset_in_bytes() { return _offset; }
|
||||
int memory_size() { return _mem->memory_size(); }
|
||||
Node_Stack* node_stack() { return _nstack; }
|
||||
|
||||
// Comparable?
|
||||
int cmp(SWPointer& q) {
|
||||
|
@ -674,6 +674,9 @@ class CommandLineFlags {
|
||||
product(bool, UseSHA, false, \
|
||||
"Control whether SHA instructions can be used on SPARC") \
|
||||
\
|
||||
product(bool, UseGHASHIntrinsics, false, \
|
||||
"Use intrinsics for GHASH versions of crypto") \
|
||||
\
|
||||
product(size_t, LargePageSizeInBytes, 0, \
|
||||
"Large page size (0 to let VM choose the page size)") \
|
||||
\
|
||||
|
@ -125,6 +125,7 @@ address StubRoutines::_aescrypt_encryptBlock = NULL;
|
||||
address StubRoutines::_aescrypt_decryptBlock = NULL;
|
||||
address StubRoutines::_cipherBlockChaining_encryptAESCrypt = NULL;
|
||||
address StubRoutines::_cipherBlockChaining_decryptAESCrypt = NULL;
|
||||
address StubRoutines::_ghash_processBlocks = NULL;
|
||||
|
||||
address StubRoutines::_sha1_implCompress = NULL;
|
||||
address StubRoutines::_sha1_implCompressMB = NULL;
|
||||
|
@ -185,6 +185,7 @@ class StubRoutines: AllStatic {
|
||||
static address _aescrypt_decryptBlock;
|
||||
static address _cipherBlockChaining_encryptAESCrypt;
|
||||
static address _cipherBlockChaining_decryptAESCrypt;
|
||||
static address _ghash_processBlocks;
|
||||
|
||||
static address _sha1_implCompress;
|
||||
static address _sha1_implCompressMB;
|
||||
@ -346,6 +347,7 @@ class StubRoutines: AllStatic {
|
||||
static address aescrypt_decryptBlock() { return _aescrypt_decryptBlock; }
|
||||
// Returns the current value of _cipherBlockChaining_encryptAESCrypt. The
// field is defined with an initial value of NULL, so callers may observe NULL.
static address cipherBlockChaining_encryptAESCrypt() {
  return _cipherBlockChaining_encryptAESCrypt;
}
|
||||
// Returns the current value of _cipherBlockChaining_decryptAESCrypt. The
// field is defined with an initial value of NULL, so callers may observe NULL.
static address cipherBlockChaining_decryptAESCrypt() {
  return _cipherBlockChaining_decryptAESCrypt;
}
|
||||
// Returns the current value of _ghash_processBlocks. The field is defined
// with an initial value of NULL, so callers may observe NULL.
static address ghash_processBlocks() {
  return _ghash_processBlocks;
}
|
||||
|
||||
// Returns the current value of _sha1_implCompress. The field is defined
// with an initial value of NULL, so callers may observe NULL.
static address sha1_implCompress() {
  return _sha1_implCompress;
}
|
||||
// Returns the current value of _sha1_implCompressMB. The field is defined
// with an initial value of NULL, so callers may observe NULL.
static address sha1_implCompressMB() {
  return _sha1_implCompressMB;
}
|
||||
|
@ -828,6 +828,7 @@ typedef CompactHashtable<Symbol*, char> SymbolCompactHashTable;
|
||||
static_field(StubRoutines, _aescrypt_decryptBlock, address) \
|
||||
static_field(StubRoutines, _cipherBlockChaining_encryptAESCrypt, address) \
|
||||
static_field(StubRoutines, _cipherBlockChaining_decryptAESCrypt, address) \
|
||||
static_field(StubRoutines, _ghash_processBlocks, address) \
|
||||
static_field(StubRoutines, _updateBytesCRC32, address) \
|
||||
static_field(StubRoutines, _crc_table_adr, address) \
|
||||
static_field(StubRoutines, _multiplyToLen, address) \
|
||||
|
@ -31,6 +31,7 @@ import java.security.AlgorithmParameters;
|
||||
import java.util.Random;
|
||||
import javax.crypto.Cipher;
|
||||
import javax.crypto.SecretKey;
|
||||
import javax.crypto.spec.GCMParameterSpec;
|
||||
import javax.crypto.spec.IvParameterSpec;
|
||||
import javax.crypto.spec.SecretKeySpec;
|
||||
|
||||
@ -62,6 +63,10 @@ abstract public class TestAESBase {
|
||||
Cipher dCipher;
|
||||
AlgorithmParameters algParams;
|
||||
SecretKey key;
|
||||
GCMParameterSpec gcm_spec;
|
||||
byte[] aad;
|
||||
int tlen = 12;
|
||||
byte[] iv;
|
||||
|
||||
static int numThreads = 0;
|
||||
int threadId;
|
||||
@ -100,6 +105,12 @@ abstract public class TestAESBase {
|
||||
int ivLen = (algorithm.equals("AES") ? 16 : algorithm.equals("DES") ? 8 : 0);
|
||||
IvParameterSpec initVector = new IvParameterSpec(new byte[ivLen]);
|
||||
cipher.init(Cipher.ENCRYPT_MODE, key, initVector);
|
||||
} else if (mode.equals("GCM")) {
|
||||
iv = new byte[64];
|
||||
random.nextBytes(iv);
|
||||
aad = new byte[5];
|
||||
random.nextBytes(aad);
|
||||
gcm_init();
|
||||
} else {
|
||||
algParams = cipher.getParameters();
|
||||
cipher.init(Cipher.ENCRYPT_MODE, key, algParams);
|
||||
@ -186,4 +197,12 @@ abstract public class TestAESBase {
|
||||
}
|
||||
|
||||
abstract void childShowCipher();
|
||||
|
||||
void gcm_init() throws Exception {
|
||||
tlen = 12;
|
||||
gcm_spec = new GCMParameterSpec(tlen * 8, iv);
|
||||
cipher = Cipher.getInstance(algorithm + "/" + mode + "/" + paddingStr, "SunJCE");
|
||||
cipher.init(Cipher.ENCRYPT_MODE, key, gcm_spec);
|
||||
cipher.update(aad);
|
||||
}
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2012, 2014, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012, 2015, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -32,7 +32,11 @@ public class TestAESEncode extends TestAESBase {
|
||||
@Override
|
||||
public void run() {
|
||||
try {
|
||||
if (!noReinit) cipher.init(Cipher.ENCRYPT_MODE, key, algParams);
|
||||
if (mode.equals("GCM")) {
|
||||
gcm_init();
|
||||
} else if (!noReinit) {
|
||||
cipher.init(Cipher.ENCRYPT_MODE, key, algParams);
|
||||
}
|
||||
encode = new byte[encodeLength];
|
||||
if (testingMisalignment) {
|
||||
int tempSize = cipher.update(input, encInputOffset, (msgSize - lastChunkSize), encode, encOutputOffset);
|
||||
|
@ -44,6 +44,13 @@
|
||||
* @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DencInputOffset=1 -DencOutputOffset=1 TestAESMain
|
||||
* @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 TestAESMain
|
||||
* @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=640 TestAESMain
|
||||
* @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM TestAESMain
|
||||
* @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 TestAESMain
|
||||
* @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencOutputOffset=1 TestAESMain
|
||||
* @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DdecOutputOffset=1 TestAESMain
|
||||
* @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 TestAESMain
|
||||
* @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 TestAESMain
|
||||
* @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=640 TestAESMain
|
||||
*
|
||||
* @author Tom Deneau
|
||||
*/
|
||||
|
Loading…
x
Reference in New Issue
Block a user