8185979: PPC64: Implement SHA2 intrinsic

Co-authored-by: Bruno Rosa <bruno.rosa@eldorado.org.br>
Co-authored-by: Gustavo Serra Scalet <gustavo.scalet@eldorado.org.br>
Co-authored-by: Igor Nunes <igor.nunes@eldorado.org.br>
Reviewed-by: mdoerr, goetz
This commit is contained in:
Martin Doerr 2017-09-25 17:40:06 +02:00
parent 6213838f11
commit d0dc2dd231
10 changed files with 1271 additions and 12 deletions

@ -2175,7 +2175,8 @@ class Assembler : public AbstractAssembler {
inline void vsbox( VectorRegister d, VectorRegister a);
// SHA (introduced with Power 8)
// Not yet implemented.
inline void vshasigmad(VectorRegister d, VectorRegister a, bool st, int six);
inline void vshasigmaw(VectorRegister d, VectorRegister a, bool st, int six);
// Vector Binary Polynomial Multiplication (introduced with Power 8)
inline void vpmsumb( VectorRegister d, VectorRegister a, VectorRegister b);
@ -2286,6 +2287,10 @@ class Assembler : public AbstractAssembler {
inline void lvsl( VectorRegister d, Register s2);
inline void lvsr( VectorRegister d, Register s2);
// Endianness-specific concatenation of 2 loaded vectors.
inline void load_perm(VectorRegister perm, Register addr);
inline void vec_perm(VectorRegister first_dest, VectorRegister second, VectorRegister perm);
// RegisterOrConstant versions.
// These emitters choose between the versions using two registers and
// those with register and immediate, depending on the content of roc.

@ -926,7 +926,8 @@ inline void Assembler::vncipherlast(VectorRegister d, VectorRegister a, VectorRe
// Emit a vsbox instruction: the opcode combined with the target-register
// field for d and the source-register field for a.
inline void Assembler::vsbox(VectorRegister d, VectorRegister a) {
  emit_int32(VSBOX_OPCODE | vrt(d) | vra(a));
}
// SHA (introduced with Power 8)
// Not yet implemented.
// Emit a vshasigmad instruction.
//   d   - target vector register (VRT field)
//   a   - source vector register (VRA field)
//   st  - encoded into the ST field via vst()
//   six - encoded into the SIX field via vsix()
// NOTE(review): per the Power ISA, ST presumably selects between the two
// families of SHA sigma functions and SIX is a per-element selector mask;
// confirm against the ISA before relying on this description.
inline void Assembler::vshasigmad(VectorRegister d, VectorRegister a, bool st, int six) {
  emit_int32(VSHASIGMAD_OPCODE | vrt(d) | vra(a) | vst(st) | vsix(six));
}
// Emit a vshasigmaw instruction.
//   d   - target vector register (VRT field)
//   a   - source vector register (VRA field)
//   st  - encoded into the ST field via vst()
//   six - encoded into the SIX field via vsix()
// NOTE(review): per the Power ISA, ST presumably selects between the two
// families of SHA sigma functions and SIX is a per-element selector mask;
// confirm against the ISA before relying on this description.
inline void Assembler::vshasigmaw(VectorRegister d, VectorRegister a, bool st, int six) {
  emit_int32(VSHASIGMAW_OPCODE | vrt(d) | vra(a) | vst(st) | vsix(six));
}
// Vector Binary Polynomial Multiplication (introduced with Power 8)
// Emit a vpmsumb instruction with target register d and source registers a and b.
inline void Assembler::vpmsumb(VectorRegister d, VectorRegister a, VectorRegister b) {
  emit_int32(VPMSUMB_OPCODE | vrt(d) | vra(a) | vrb(b));
}
@ -1035,6 +1036,22 @@ inline void Assembler::stvxl( VectorRegister d, Register s2) { emit_int32( STVXL
// Emit an lvsl instruction: d is the target vector register, s2 is placed in
// the RB field. No RA operand is encoded by this overload.
inline void Assembler::lvsl(VectorRegister d, Register s2) {
  emit_int32(LVSL_OPCODE | vrt(d) | rb(s2));
}
// Emit an lvsr instruction: d is the target vector register, s2 is placed in
// the RB field. No RA operand is encoded by this overload.
inline void Assembler::lvsr(VectorRegister d, Register s2) {
  emit_int32(LVSR_OPCODE | vrt(d) | rb(s2));
}
// Load a permute control vector for the address held in addr into perm.
// The instruction depends on the build's byte order: lvsr when
// VM_LITTLE_ENDIAN is defined, lvsl otherwise. The result is meant to be
// consumed by vec_perm().
inline void Assembler::load_perm(VectorRegister perm, Register addr) {
#ifdef VM_LITTLE_ENDIAN
  lvsr(perm, addr);
#else
  lvsl(perm, addr);
#endif
}
// Combine two vectors with vperm under control of perm and store the result
// in first_dest (which is both an input and the destination).
// The order of the two vperm source operands depends on the build's byte
// order: (second, first_dest) when VM_LITTLE_ENDIAN is defined,
// (first_dest, second) otherwise.
inline void Assembler::vec_perm(VectorRegister first_dest, VectorRegister second, VectorRegister perm) {
#ifdef VM_LITTLE_ENDIAN
  vperm(first_dest, second, first_dest, perm);
#else
  vperm(first_dest, first_dest, second, perm);
#endif
}
// Pointer convenience overload: converts the pointer x to its integral value
// and forwards to the load_const overload taking a long.
inline void Assembler::load_const(Register d, void* x, Register tmp) {
  long value = (long)x;
  load_const(d, value, tmp);
}

@ -866,6 +866,40 @@ class MacroAssembler: public Assembler {
void kernel_crc32_singleByteReg(Register crc, Register val, Register table,
bool invertCRC);
// SHA-2 auxiliary functions and public interfaces
private:
void sha256_deque(const VectorRegister src,
const VectorRegister dst1, const VectorRegister dst2, const VectorRegister dst3);
void sha256_load_h_vec(const VectorRegister a, const VectorRegister e, const Register hptr);
void sha256_round(const VectorRegister* hs, const int total_hs, int& h_cnt, const VectorRegister kpw);
void sha256_load_w_plus_k_vec(const Register buf_in, const VectorRegister* ws,
const int total_ws, const Register k, const VectorRegister* kpws,
const int total_kpws);
void sha256_calc_4w(const VectorRegister w0, const VectorRegister w1,
const VectorRegister w2, const VectorRegister w3, const VectorRegister kpw0,
const VectorRegister kpw1, const VectorRegister kpw2, const VectorRegister kpw3,
const Register j, const Register k);
void sha256_update_sha_state(const VectorRegister a, const VectorRegister b,
const VectorRegister c, const VectorRegister d, const VectorRegister e,
const VectorRegister f, const VectorRegister g, const VectorRegister h,
const Register hptr);
void sha512_load_w_vec(const Register buf_in, const VectorRegister* ws, const int total_ws);
void sha512_update_sha_state(const Register state, const VectorRegister* hs, const int total_hs);
void sha512_round(const VectorRegister* hs, const int total_hs, int& h_cnt, const VectorRegister kpw);
void sha512_load_h_vec(const Register state, const VectorRegister* hs, const int total_hs);
void sha512_calc_2w(const VectorRegister w0, const VectorRegister w1,
const VectorRegister w2, const VectorRegister w3,
const VectorRegister w4, const VectorRegister w5,
const VectorRegister w6, const VectorRegister w7,
const VectorRegister kpw0, const VectorRegister kpw1, const Register j,
const VectorRegister vRb, const Register k);
public:
void sha256(bool multi_block);
void sha512(bool multi_block);
//
// Debugging
//

File diff suppressed because it is too large Load Diff

@ -3095,6 +3095,28 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
// Generate the SHA-256 compression stub.
//   multi_block - forwarded to MacroAssembler::sha256() to select the
//                 multi-block variant of the generated code
//   name        - stub name registered with the StubCodeMark
// Returns the entry address of the generated stub.
address generate_sha256_implCompress(bool multi_block, const char *name) {
  assert(UseSHA, "need SHA instructions");

  StubCodeMark stub_mark(this, "StubRoutines", name);
  address stub_entry = __ function_entry();

  // Stub body followed by the return to the caller.
  __ sha256(multi_block);
  __ blr();

  return stub_entry;
}
// Generate the SHA-512 compression stub.
//   multi_block - forwarded to MacroAssembler::sha512() to select the
//                 multi-block variant of the generated code
//   name        - stub name registered with the StubCodeMark
// Returns the entry address of the generated stub.
address generate_sha512_implCompress(bool multi_block, const char *name) {
  assert(UseSHA, "need SHA instructions");

  StubCodeMark stub_mark(this, "StubRoutines", name);
  address stub_entry = __ function_entry();

  // Stub body followed by the return to the caller.
  __ sha512(multi_block);
  __ blr();

  return stub_entry;
}
void generate_arraycopy_stubs() {
// Note: the disjoint stubs must be generated first, some of
// the conjoint stubs use them.
@ -3781,6 +3803,14 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
}
if (UseSHA256Intrinsics) {
StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
}
if (UseSHA512Intrinsics) {
StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress");
StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
}
}
public:

@ -34,7 +34,7 @@ static bool returns_to_call_stub(address return_pc) { return return_pc == _call_
enum platform_dependent_constants {
code_size1 = 20000, // simply increase if too small (assembler will crash if too small)
code_size2 = 20000 // simply increase if too small (assembler will crash if too small)
code_size2 = 22000 // simply increase if too small (assembler will crash if too small)
};
// CRC32 Intrinsics.

@ -113,7 +113,7 @@ void VM_Version::initialize() {
// Create and print feature-string.
char buf[(num_features+1) * 16]; // Max 16 chars per feature.
jio_snprintf(buf, sizeof(buf),
"ppc64%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
"ppc64%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
(has_fsqrt() ? " fsqrt" : ""),
(has_isel() ? " isel" : ""),
(has_lxarxeh() ? " lxarxeh" : ""),
@ -130,7 +130,8 @@ void VM_Version::initialize() {
(has_mfdscr() ? " mfdscr" : ""),
(has_vsx() ? " vsx" : ""),
(has_ldbrx() ? " ldbrx" : ""),
(has_stdbrx() ? " stdbrx" : "")
(has_stdbrx() ? " stdbrx" : ""),
(has_vshasig() ? " sha" : "")
// Make sure number of %s matches num_features!
);
_features_string = os::strdup(buf);
@ -247,17 +248,43 @@ void VM_Version::initialize() {
FLAG_SET_DEFAULT(UseFMA, true);
}
if (UseSHA) {
warning("SHA instructions are not available on this CPU");
if (has_vshasig()) {
if (FLAG_IS_DEFAULT(UseSHA)) {
UseSHA = true;
}
} else if (UseSHA) {
if (!FLAG_IS_DEFAULT(UseSHA))
warning("SHA instructions are not available on this CPU");
FLAG_SET_DEFAULT(UseSHA, false);
}
if (UseSHA1Intrinsics || UseSHA256Intrinsics || UseSHA512Intrinsics) {
warning("SHA intrinsics are not available on this CPU");
if (UseSHA1Intrinsics) {
warning("Intrinsics for SHA-1 crypto hash functions not available on this CPU.");
FLAG_SET_DEFAULT(UseSHA1Intrinsics, false);
}
if (UseSHA && has_vshasig()) {
if (FLAG_IS_DEFAULT(UseSHA256Intrinsics)) {
FLAG_SET_DEFAULT(UseSHA256Intrinsics, true);
}
} else if (UseSHA256Intrinsics) {
warning("Intrinsics for SHA-224 and SHA-256 crypto hash functions not available on this CPU.");
FLAG_SET_DEFAULT(UseSHA256Intrinsics, false);
}
if (UseSHA && has_vshasig()) {
if (FLAG_IS_DEFAULT(UseSHA512Intrinsics)) {
FLAG_SET_DEFAULT(UseSHA512Intrinsics, true);
}
} else if (UseSHA512Intrinsics) {
warning("Intrinsics for SHA-384 and SHA-512 crypto hash functions not available on this CPU.");
FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
}
if (!(UseSHA1Intrinsics || UseSHA256Intrinsics || UseSHA512Intrinsics)) {
FLAG_SET_DEFAULT(UseSHA, false);
}
if (FLAG_IS_DEFAULT(UseSquareToLenIntrinsic)) {
UseSquareToLenIntrinsic = true;
}
@ -663,6 +690,7 @@ void VM_Version::determine_features() {
a->lxvd2x(VSR0, R3_ARG1); // code[14] -> vsx
a->ldbrx(R7, R3_ARG1, R4_ARG2); // code[15] -> ldbrx
a->stdbrx(R7, R3_ARG1, R4_ARG2); // code[16] -> stdbrx
a->vshasigmaw(VR0, VR1, 1, 0xF); // code[17] -> vshasig
a->blr();
// Emit function to set one cache line to zero. Emit function descriptor and get pointer to it.
@ -714,6 +742,7 @@ void VM_Version::determine_features() {
if (code[feature_cntr++]) features |= vsx_m;
if (code[feature_cntr++]) features |= ldbrx_m;
if (code[feature_cntr++]) features |= stdbrx_m;
if (code[feature_cntr++]) features |= vshasig_m;
// Print the detection code.
if (PrintAssembly) {

@ -49,6 +49,7 @@ protected:
vsx,
ldbrx,
stdbrx,
vshasig,
num_features // last entry to count features
};
enum Feature_Flag_Set {
@ -64,6 +65,7 @@ protected:
vand_m = (1 << vand ),
lqarx_m = (1 << lqarx ),
vcipher_m = (1 << vcipher),
vshasig_m = (1 << vshasig),
vpmsumb_m = (1 << vpmsumb),
tcheck_m = (1 << tcheck ),
mfdscr_m = (1 << mfdscr ),
@ -106,6 +108,7 @@ public:
static bool has_vsx() { return (_features & vsx_m) != 0; }
static bool has_ldbrx() { return (_features & ldbrx_m) != 0; }
static bool has_stdbrx() { return (_features & stdbrx_m) != 0; }
static bool has_vshasig() { return (_features & vshasig_m) != 0; }
static bool has_mtfprd() { return has_vpmsumb(); } // alias for P8
// Assembler testing

@ -42,7 +42,8 @@ public class GenericTestCaseForOtherCPU extends
new OrPredicate(Platform::isAArch64,
new OrPredicate(Platform::isS390x,
new OrPredicate(Platform::isSparc,
new OrPredicate(Platform::isX64, Platform::isX86))))));
new OrPredicate(Platform::isPPC,
new OrPredicate(Platform::isX64, Platform::isX86)))))));
}
@Override

@ -71,23 +71,27 @@ public class IntrinsicPredicates {
= new OrPredicate(new CPUSpecificPredicate("aarch64.*", new String[] { "sha256" }, null),
new OrPredicate(new CPUSpecificPredicate("s390.*", new String[] { "sha256" }, null),
new OrPredicate(new CPUSpecificPredicate("sparc.*", new String[] { "sha256" }, null),
new OrPredicate(new CPUSpecificPredicate("ppc64.*", new String[] { "sha" }, null),
new OrPredicate(new CPUSpecificPredicate("ppc64le.*", new String[] { "sha" }, null),
// x86 variants
new OrPredicate(new CPUSpecificPredicate("amd64.*", new String[] { "sha" }, null),
new OrPredicate(new CPUSpecificPredicate("i386.*", new String[] { "sha" }, null),
new OrPredicate(new CPUSpecificPredicate("x86.*", new String[] { "sha" }, null),
new OrPredicate(new CPUSpecificPredicate("amd64.*", new String[] { "avx2", "bmi2" }, null),
new CPUSpecificPredicate("x86_64", new String[] { "avx2", "bmi2" }, null))))))));
new CPUSpecificPredicate("x86_64", new String[] { "avx2", "bmi2" }, null))))))))));
public static final BooleanSupplier SHA512_INSTRUCTION_AVAILABLE
= new OrPredicate(new CPUSpecificPredicate("aarch64.*", new String[] { "sha512" }, null),
new OrPredicate(new CPUSpecificPredicate("s390.*", new String[] { "sha512" }, null),
new OrPredicate(new CPUSpecificPredicate("sparc.*", new String[] { "sha512" }, null),
new OrPredicate(new CPUSpecificPredicate("ppc64.*", new String[] { "sha" }, null),
new OrPredicate(new CPUSpecificPredicate("ppc64le.*", new String[] { "sha" }, null),
// x86 variants
new OrPredicate(new CPUSpecificPredicate("amd64.*", new String[] { "sha" }, null),
new OrPredicate(new CPUSpecificPredicate("i386.*", new String[] { "sha" }, null),
new OrPredicate(new CPUSpecificPredicate("x86.*", new String[] { "sha" }, null),
new OrPredicate(new CPUSpecificPredicate("amd64.*", new String[] { "avx2", "bmi2" }, null),
new CPUSpecificPredicate("x86_64", new String[] { "avx2", "bmi2" }, null))))))));
new CPUSpecificPredicate("x86_64", new String[] { "avx2", "bmi2" }, null))))))))));
public static final BooleanSupplier ANY_SHA_INSTRUCTION_AVAILABLE
= new OrPredicate(IntrinsicPredicates.SHA1_INSTRUCTION_AVAILABLE,