8143355: Update for addition of vectorizedMismatch intrinsic for x86

Co-authored-by: Liqi Yi <liqi.yi@intel.com>
Reviewed-by: kvn
Vivek R Deshpande 2015-12-07 16:35:07 -08:00 committed by Vladimir Kozlov
parent 43d48c16d2
commit 2d9a6cfd3f
18 changed files with 351 additions and 4 deletions
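At its core, the patch teaches C2 to recognize java.util.ArraysSupport.vectorizedMismatch() and dispatch it to a new x86_64 assembler stub that compares two memory ranges with the widest vector width available. The contract the stub implements, sketched here as scalar C++ (mismatch_ref and the raw byte pointers are illustrative only; the real entry point receives oop-plus-offset pairs):

#include <cstdint>

// Return the element index of the first difference between two ranges of
// 'length' elements of (1 << log2scale) bytes each, or -1 when they match.
// -1 is ~0, i.e. "zero pairs remaining" in ArraysSupport's return encoding.
static int mismatch_ref(const uint8_t* obja, const uint8_t* objb,
                        int length, int log2scale) {
  long nbytes = (long)length << log2scale;   // same as the stub's shlq(length)
  for (long i = 0; i < nbytes; i++) {
    if (obja[i] != objb[i]) {
      return (int)(i >> log2scale);          // byte index -> element index
    }
  }
  return -1;                                 // same till end
}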

src/cpu/aarch64/vm/vm_version_aarch64.cpp

@@ -182,6 +182,11 @@ void VM_Version::get_processor_features() {
FLAG_SET_DEFAULT(UseAdler32Intrinsics, true);
}
if (UseVectorizedMismatchIntrinsic) {
warning("UseVectorizedMismatchIntrinsic specified, but not available on this CPU.");
FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false);
}
if (auxv & HWCAP_AES) {
UseAES = UseAES || FLAG_IS_DEFAULT(UseAES);
UseAESIntrinsics =

src/cpu/ppc/vm/vm_version_ppc.cpp

@@ -223,6 +223,11 @@ void VM_Version::initialize() {
UseMultiplyToLenIntrinsic = true;
}
if (UseVectorizedMismatchIntrinsic) {
warning("UseVectorizedMismatchIntrinsic specified, but not available on this CPU.");
FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false);
}
// Adjust RTM (Restricted Transactional Memory) flags.
if (!has_tcheck() && UseRTMLocking) {
// Can't continue because UseRTMLocking affects UseBiasedLocking flag

src/cpu/sparc/vm/vm_version_sparc.cpp

@@ -356,6 +356,11 @@ void VM_Version::initialize() {
FLAG_SET_DEFAULT(UseCRC32Intrinsics, false);
}
if (UseVectorizedMismatchIntrinsic) {
warning("UseVectorizedMismatchIntrinsic specified, but not available on this CPU.");
FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false);
}
if (FLAG_IS_DEFAULT(ContendedPaddingWidth) &&
(cache_line_size > ContendedPaddingWidth))
ContendedPaddingWidth = cache_line_size;
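The three hunks above (aarch64, ppc and sparc) are the same guard: those ports ship no vectorizedMismatch stub, so an explicit -XX:+UseVectorizedMismatchIntrinsic request is answered with a warning and the flag is reset to its default of false.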

src/cpu/x86/vm/macroAssembler_x86.cpp

@@ -9439,13 +9439,184 @@ void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Regi
  pop(tmp1);
}

void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
                                         Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2) {
  assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
  Label VECTOR32_LOOP, VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
  Label VECTOR16_TAIL, VECTOR8_TAIL, VECTOR4_TAIL;
  Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
  Label SAME_TILL_END, DONE;
  Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;
  // The scale is in rcx on both Win64 and Unix, so the implicit-cl shifts below are valid.
  ShortBranchVerifier sbv(this);

  shlq(length);          // byte count = element count << scale (shift amount in cl)
  xorq(result, result);
  cmpq(length, 8);
  jcc(Assembler::equal, VECTOR8_LOOP);
  jcc(Assembler::less, VECTOR4_TAIL);

  if (UseAVX >= 2) {
    cmpq(length, 16);
    jcc(Assembler::equal, VECTOR16_LOOP);
    jcc(Assembler::less, VECTOR8_LOOP);
    cmpq(length, 32);
    jccb(Assembler::less, VECTOR16_TAIL);

    subq(length, 32);
    bind(VECTOR32_LOOP);
    vmovdqu(rymm0, Address(obja, result));
    vmovdqu(rymm1, Address(objb, result));
    vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
    vptest(rymm2, rymm2);
    jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);  // mismatch found
    addq(result, 32);
    subq(length, 32);
    jccb(Assembler::greaterEqual, VECTOR32_LOOP);
    addq(length, 32);
    jcc(Assembler::equal, SAME_TILL_END);
    // fall through if fewer than 32 bytes are left

    bind(VECTOR16_TAIL);
    cmpq(length, 16);
    jccb(Assembler::less, VECTOR8_TAIL);
    bind(VECTOR16_LOOP);
    movdqu(rymm0, Address(obja, result));
    movdqu(rymm1, Address(objb, result));
    vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
    ptest(rymm2, rymm2);
    jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);  // mismatch found
    addq(result, 16);
    subq(length, 16);
    jcc(Assembler::equal, SAME_TILL_END);
    // fall through if fewer than 16 bytes are left
  } else {  // SSE4.2 only
    cmpq(length, 16);
    jccb(Assembler::less, VECTOR8_TAIL);

    subq(length, 16);
    bind(VECTOR16_LOOP);
    movdqu(rymm0, Address(obja, result));
    movdqu(rymm1, Address(objb, result));
    pxor(rymm0, rymm1);
    ptest(rymm0, rymm0);
    jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);  // mismatch found
    addq(result, 16);
    subq(length, 16);
    jccb(Assembler::greaterEqual, VECTOR16_LOOP);
    addq(length, 16);
    jcc(Assembler::equal, SAME_TILL_END);
    // fall through if fewer than 16 bytes are left
  }

  bind(VECTOR8_TAIL);
  cmpq(length, 8);
  jccb(Assembler::less, VECTOR4_TAIL);
  bind(VECTOR8_LOOP);
  movq(tmp1, Address(obja, result));
  movq(tmp2, Address(objb, result));
  xorq(tmp1, tmp2);
  testq(tmp1, tmp1);
  jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);  // mismatch found
  addq(result, 8);
  subq(length, 8);
  jcc(Assembler::equal, SAME_TILL_END);
  // fall through if fewer than 8 bytes are left

  bind(VECTOR4_TAIL);
  cmpq(length, 4);
  jccb(Assembler::less, BYTES_TAIL);
  bind(VECTOR4_LOOP);
  movl(tmp1, Address(obja, result));
  xorl(tmp1, Address(objb, result));
  testl(tmp1, tmp1);
  jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);  // mismatch found
  addq(result, 4);
  subq(length, 4);
  jcc(Assembler::equal, SAME_TILL_END);
  // fall through if fewer than 4 bytes are left

  // Unrolled byte compare for the final bytes.
  bind(BYTES_TAIL);
  bind(BYTES_LOOP);
  load_unsigned_byte(tmp1, Address(obja, result));
  load_unsigned_byte(tmp2, Address(objb, result));
  xorl(tmp1, tmp2);
  testl(tmp1, tmp1);
  jccb(Assembler::notZero, BYTES_NOT_EQUAL);  // mismatch found
  decq(length);
  jccb(Assembler::zero, SAME_TILL_END);
  incq(result);
  load_unsigned_byte(tmp1, Address(obja, result));
  load_unsigned_byte(tmp2, Address(objb, result));
  xorl(tmp1, tmp2);
  testl(tmp1, tmp1);
  jccb(Assembler::notZero, BYTES_NOT_EQUAL);  // mismatch found
  decq(length);
  jccb(Assembler::zero, SAME_TILL_END);
  incq(result);
  load_unsigned_byte(tmp1, Address(obja, result));
  load_unsigned_byte(tmp2, Address(objb, result));
  xorl(tmp1, tmp2);
  testl(tmp1, tmp1);
  jccb(Assembler::notZero, BYTES_NOT_EQUAL);  // mismatch found
  jmpb(SAME_TILL_END);

  if (UseAVX >= 2) {
    bind(VECTOR32_NOT_EQUAL);
    vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);  // all-ones
    vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);  // 0xFF where bytes are equal
    vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);     // invert: 0xFF where bytes differ
    vpmovmskb(tmp1, rymm0);
    bsfq(tmp1, tmp1);  // offset of the first differing byte within the chunk
    addq(result, tmp1);
    shrq(result);      // byte index -> element index (shift amount in cl)
    jmpb(DONE);
  }

  bind(VECTOR16_NOT_EQUAL);
  if (UseAVX >= 2) {
    vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
    vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
    pxor(rymm0, rymm2);
  } else {
    pcmpeqb(rymm2, rymm2);  // all-ones
    pxor(rymm0, rymm1);     // undo the loop's xor, restoring the obja chunk
    pcmpeqb(rymm0, rymm1);  // 0xFF where bytes are equal
    pxor(rymm0, rymm2);     // invert: 0xFF where bytes differ
  }
  pmovmskb(tmp1, rymm0);
  bsfq(tmp1, tmp1);
  addq(result, tmp1);
  shrq(result);             // byte index -> element index
  jmpb(DONE);

  bind(VECTOR8_NOT_EQUAL);
  bind(VECTOR4_NOT_EQUAL);
  bsfq(tmp1, tmp1);  // bit position of the first difference
  shrq(tmp1, 3);     // bit position -> byte offset
  addq(result, tmp1);
  bind(BYTES_NOT_EQUAL);
  shrq(result);      // byte index -> element index
  jmpb(DONE);

  bind(SAME_TILL_END);
  mov64(result, -1);

  bind(DONE);
}
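The routine first turns the element count into a byte count (shlq by the scale held in cl), then walks down progressively narrower tiers: a 32-byte AVX2 loop when UseAVX >= 2, a 16-byte SSE loop, and 8-, 4- and 1-byte scalar steps for the tail. On a vector hit it rebuilds a byte-equality mask with pcmpeqb, inverts it, extracts it with pmovmskb and finds the first set bit with bsf; the final shrq converts the byte index back into an element index. A freestanding C++/AVX2 sketch of that recovery idiom, assuming plain byte buffers and GCC/Clang with -mavx2 (mismatch_avx2 is an illustrative name, not part of the patch):

#include <immintrin.h>
#include <cstdint>

static long mismatch_avx2(const uint8_t* a, const uint8_t* b, long nbytes) {
  long i = 0;
  for (; i + 32 <= nbytes; i += 32) {
    __m256i va = _mm256_loadu_si256((const __m256i*)(a + i));
    __m256i vb = _mm256_loadu_si256((const __m256i*)(b + i));
    __m256i vx = _mm256_xor_si256(va, vb);
    if (!_mm256_testz_si256(vx, vx)) {                    // like vptest + jcc(notZero, ...)
      __m256i eq = _mm256_cmpeq_epi8(va, vb);             // 0xFF where bytes are equal
      uint32_t ne = ~(uint32_t)_mm256_movemask_epi8(eq);  // set bits where bytes differ
      return i + __builtin_ctz(ne);                       // like vpmovmskb + bsf
    }
  }
  for (; i < nbytes; i++) {   // scalar tail; the stub uses 8/4/1-byte tiers instead
    if (a[i] != b[i]) return i;
  }
  return -1;
}

The 8- and 4-byte scalar tiers need no mask at all: the xor of the two chunks is nonzero exactly at the differing bits, so bsfq plus shrq(tmp1, 3) yields the first differing byte directly.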
// Helper functions for square_to_len()

/**
 * Store the squares of x[], right shifted one bit (divided by 2), into z[].
 * Preserves x and z and modifies the rest of the registers.
 */
void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
  // Perform square and right shift by 1.
  // Handle the odd xlen case first, then for even xlen do the following:

src/cpu/x86/vm/macroAssembler_x86.hpp

@@ -1346,7 +1346,6 @@ public:
Register carry2);
void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5);
void square_rshift(Register x, Register len, Register z, Register tmp1, Register tmp3,
Register tmp4, Register tmp5, Register rdxReg, Register raxReg);
void multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry,
@@ -1365,6 +1364,9 @@ public:
void mul_add(Register out, Register in, Register offset, Register len, Register k, Register tmp1,
Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
Register raxReg);
void vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
Register result, Register tmp1, Register tmp2,
XMMRegister vec1, XMMRegister vec2, XMMRegister vec3);
#endif
// CRC32 code for java.util.zip.CRC32::updateBytes() intrinsic.

src/cpu/x86/vm/stubGenerator_x86_64.cpp

@@ -4054,6 +4054,54 @@ class StubGenerator: public StubCodeGenerator {
    return start;
  }

  /**
   * Arguments:
   *
   * Input:
   *   c_rarg0 - obja    address of the first array
   *   c_rarg1 - objb    address of the second array
   *   c_rarg2 - length  length of the arrays, in elements
   *   c_rarg3 - scale   log2_array_indxscale
   */
  address generate_vectorizedMismatch() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "vectorizedMismatch");
    address start = __ pc();

    BLOCK_COMMENT("Entry:");
    __ enter();

#ifdef _WIN64  // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
    const Register scale = c_rarg0;   // rcx; will be exchanged with r9
    const Register objb = c_rarg1;    // rdx
    const Register length = c_rarg2;  // r8
    const Register obja = c_rarg3;    // r9
    __ xchgq(obja, scale);            // now obja and scale hold the correct contents
    const Register tmp1 = r10;
    const Register tmp2 = r11;
#endif
#ifndef _WIN64  // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
    const Register obja = c_rarg0;    // rdi
    const Register objb = c_rarg1;    // rsi
    const Register length = c_rarg2;  // rdx
    const Register scale = c_rarg3;   // rcx
    const Register tmp1 = r8;
    const Register tmp2 = r9;
#endif
    const Register result = rax;      // return value
    const XMMRegister vec0 = xmm0;
    const XMMRegister vec1 = xmm1;
    const XMMRegister vec2 = xmm2;

    __ vectorized_mismatch(obja, objb, length, scale, result, tmp1, tmp2, vec0, vec1, vec2);

    __ leave();
    __ ret(0);

    return start;
  }
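The Win64 xchgq above exists because vectorized_mismatch shifts through the implicit cl register (its shlq(length) and shrq(result) carry no immediate), so the scale must end up in rcx under both ABIs. On Unix it already arrives there as c_rarg3; on Win64 the fourth argument lands in r9 while rcx carries obja, and a single exchange puts both values where they belong.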
/**
* Arguments:
*
@@ -4505,7 +4553,9 @@ class StubGenerator: public StubCodeGenerator {
if (UseMulAddIntrinsic) {
StubRoutines::_mulAdd = generate_mulAdd();
}
if (UseVectorizedMismatchIntrinsic) {
StubRoutines::_vectorizedMismatch = generate_vectorizedMismatch();
}
#ifndef _WINDOWS
if (UseMontgomeryMultiplyIntrinsic) {
StubRoutines::_montgomeryMultiply

src/cpu/x86/vm/vm_version_x86.cpp

@@ -1041,6 +1041,25 @@ void VM_Version::get_processor_features() {
}
}
#ifdef _LP64
if (UseSSE42Intrinsics) {
if (FLAG_IS_DEFAULT(UseVectorizedMismatchIntrinsic)) {
UseVectorizedMismatchIntrinsic = true;
}
} else if (UseVectorizedMismatchIntrinsic) {
if (!FLAG_IS_DEFAULT(UseVectorizedMismatchIntrinsic))
warning("vectorizedMismatch intrinsics are not available on this CPU");
FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false);
}
#else
if (UseVectorizedMismatchIntrinsic) {
if (!FLAG_IS_DEFAULT(UseVectorizedMismatchIntrinsic)) {
warning("vectorizedMismatch intrinsic is not available in 32-bit VM");
}
FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false);
}
#endif // _LP64
// Use count leading zeros count instruction if available.
if (supports_lzcnt()) {
if (FLAG_IS_DEFAULT(UseCountLeadingZerosInstruction)) {
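Net effect of this block: on a 64-bit x86 VM with SSE4.2 the intrinsic is enabled by default; on 32-bit x86, or when SSE4.2 is missing, it is forced off, warning only if the user requested it explicitly (e.g. -XX:+UseVectorizedMismatchIntrinsic on the command line).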

src/share/vm/classfile/vmSymbols.cpp

@@ -681,6 +681,9 @@ bool vmIntrinsics::is_disabled_by_flags(const methodHandle& method) {
case vmIntrinsics::_montgomerySquare:
if (!UseMontgomerySquareIntrinsic) return true;
break;
case vmIntrinsics::_vectorizedMismatch:
if (!UseVectorizedMismatchIntrinsic) return true;
break;
case vmIntrinsics::_addExactI:
case vmIntrinsics::_addExactL:
case vmIntrinsics::_decrementExactI:

src/share/vm/classfile/vmSymbols.hpp

@@ -957,6 +957,11 @@
do_name( montgomerySquare_name, "implMontgomerySquare") \
do_signature(montgomerySquare_signature, "([I[IIJ[I)[I") \
\
do_class(java_util_ArraysSupport, "java/util/ArraysSupport") \
do_intrinsic(_vectorizedMismatch, java_util_ArraysSupport, vectorizedMismatch_name, vectorizedMismatch_signature, F_S)\
do_name(vectorizedMismatch_name, "vectorizedMismatch") \
do_signature(vectorizedMismatch_signature, "(Ljava/lang/Object;JLjava/lang/Object;JII)I") \
\
/* java/lang/ref/Reference */ \
do_intrinsic(_Reference_get, java_lang_ref_Reference, get_name, void_object_signature, F_R) \
\
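The descriptor (Ljava/lang/Object;JLjava/lang/Object;JII)I decodes to a static method (hence F_S) taking (Object a, long aOffset, Object b, long bOffset, int length, int log2ArrayIndexScale) and returning int, which is exactly the slot layout library_call.cpp picks apart below.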

src/share/vm/opto/c2compiler.cpp

@@ -441,6 +441,7 @@ bool C2Compiler::is_intrinsic_supported(const methodHandle& method, bool is_virt
case vmIntrinsics::_mulAdd:
case vmIntrinsics::_montgomeryMultiply:
case vmIntrinsics::_montgomerySquare:
case vmIntrinsics::_vectorizedMismatch:
case vmIntrinsics::_ghash_processBlocks:
case vmIntrinsics::_updateCRC32:
case vmIntrinsics::_updateBytesCRC32:

src/share/vm/opto/escape.cpp

@@ -987,7 +987,8 @@ void ConnectionGraph::process_call_arguments(CallNode *call) {
strcmp(call->as_CallLeaf()->_name, "squareToLen") == 0 ||
strcmp(call->as_CallLeaf()->_name, "mulAdd") == 0 ||
strcmp(call->as_CallLeaf()->_name, "montgomery_multiply") == 0 ||
strcmp(call->as_CallLeaf()->_name, "montgomery_square") == 0)
strcmp(call->as_CallLeaf()->_name, "montgomery_square") == 0 ||
strcmp(call->as_CallLeaf()->_name, "vectorizedMismatch") == 0)
))) {
call->dump();
fatal("EA unexpected CallLeaf %s", call->as_CallLeaf()->_name);

src/share/vm/opto/library_call.cpp

@@ -312,6 +312,7 @@ class LibraryCallKit : public GraphKit {
bool inline_mulAdd();
bool inline_montgomeryMultiply();
bool inline_montgomerySquare();
bool inline_vectorizedMismatch();
bool inline_profileBoolean();
bool inline_isCompileConstant();
@@ -720,6 +721,9 @@ bool LibraryCallKit::try_to_inline(int predicate) {
case vmIntrinsics::_montgomerySquare:
return inline_montgomerySquare();
case vmIntrinsics::_vectorizedMismatch:
return inline_vectorizedMismatch();
case vmIntrinsics::_ghash_processBlocks:
return inline_ghash_processBlocks();
@@ -5581,6 +5585,50 @@ bool LibraryCallKit::inline_montgomerySquare() {
return true;
}
//-------------inline_vectorizedMismatch------------------------------
bool LibraryCallKit::inline_vectorizedMismatch() {
assert(UseVectorizedMismatchIntrinsic, "not implemented on this platform");
address stubAddr = StubRoutines::vectorizedMismatch();
if (stubAddr == NULL) {
return false; // Intrinsic's stub is not implemented on this platform
}
const char* stubName = "vectorizedMismatch";
// Six parameters, but the two long offsets occupy two slots each, so size() == 8.
assert(callee()->signature()->size() == 8, "vectorizedMismatch has 6 parameters");
Node* obja = argument(0);
Node* aoffset = argument(1);
Node* objb = argument(3);
Node* boffset = argument(4);
Node* length = argument(6);
Node* scale = argument(7);
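// argument(i) counts JVM stack slots, and each long takes two slots, so the six
// parameters sit in slots 0, 1-2, 3, 4-5, 6 and 7; that is why the indices
// above skip 2 and 5 (the high halves of the two long offsets).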
const Type* a_type = obja->Value(&_gvn);
const Type* b_type = objb->Value(&_gvn);
const TypeAryPtr* top_a = a_type->isa_aryptr();
const TypeAryPtr* top_b = b_type->isa_aryptr();
if (top_a == NULL || top_a->klass() == NULL ||
top_b == NULL || top_b->klass() == NULL) {
// failed array check
return false;
}
Node* call;
jvms()->set_should_reexecute(true);
Node* obja_adr = make_unsafe_address(obja, aoffset);
Node* objb_adr = make_unsafe_address(objb, boffset);
call = make_runtime_call(RC_LEAF,
OptoRuntime::vectorizedMismatch_Type(),
stubAddr, stubName, TypePtr::BOTTOM,
obja_adr, objb_adr, length, scale);
Node* result = _gvn.transform(new ProjNode(call, TypeFunc::Parms));
set_result(result);
return true;
}
/**
* Calculate CRC32 for byte.

src/share/vm/opto/runtime.cpp

@@ -1103,6 +1103,26 @@ const TypeFunc* OptoRuntime::montgomerySquare_Type() {
return TypeFunc::make(domain, range);
}
const TypeFunc* OptoRuntime::vectorizedMismatch_Type() {
// create input type (domain)
int num_args = 4;
int argcnt = num_args;
const Type** fields = TypeTuple::fields(argcnt);
int argp = TypeFunc::Parms;
fields[argp++] = TypePtr::NOTNULL; // obja
fields[argp++] = TypePtr::NOTNULL; // objb
fields[argp++] = TypeInt::INT; // length, number of elements
fields[argp++] = TypeInt::INT; // log2scale, element size
assert(argp == TypeFunc::Parms + argcnt, "correct decoding");
const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms + argcnt, fields);
// return mismatch index (int)
fields = TypeTuple::fields(1);
fields[TypeFunc::Parms + 0] = TypeInt::INT;
const TypeTuple* range = TypeTuple::make(TypeFunc::Parms + 1, fields);
return TypeFunc::make(domain, range);
}
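In C terms, the domain and range describe a leaf call of roughly the shape int vectorizedMismatch(void* obja, void* objb, int length, int log2scale) (a prototype spelled out here for illustration only). The pointers are already base-plus-offset values, since inline_vectorizedMismatch folds the two long offsets in via make_unsafe_address before emitting the call.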
// GHASH block processing
const TypeFunc* OptoRuntime::ghash_processBlocks_Type() {
int argcnt = 4;

src/share/vm/opto/runtime.hpp

@@ -299,6 +299,8 @@ private:
static const TypeFunc* mulAdd_Type();
static const TypeFunc* vectorizedMismatch_Type();
static const TypeFunc* ghash_processBlocks_Type();
static const TypeFunc* updateBytesCRC32_Type();

src/share/vm/runtime/globals.hpp

@@ -855,6 +855,9 @@ public:
product(bool, UseAdler32Intrinsics, false, \
"use intrinsics for java.util.zip.Adler32") \
\
product(bool, UseVectorizedMismatchIntrinsic, false, \
"Enables intrinsification of ArraysSupport.vectorizedMismatch()") \
\
diagnostic(ccstrlist, DisableIntrinsic, "", \
"do not expand intrinsics whose (internal) names appear here") \
\

src/share/vm/runtime/stubRoutines.cpp

@@ -148,6 +148,8 @@ address StubRoutines::_mulAdd = NULL;
address StubRoutines::_montgomeryMultiply = NULL;
address StubRoutines::_montgomerySquare = NULL;
address StubRoutines::_vectorizedMismatch = NULL;
address StubRoutines::_dexp = NULL;
address StubRoutines::_dlog = NULL;

src/share/vm/runtime/stubRoutines.hpp

@@ -207,6 +207,8 @@ class StubRoutines: AllStatic {
static address _montgomeryMultiply;
static address _montgomerySquare;
static address _vectorizedMismatch;
static address _dexp;
static address _dlog;
@@ -376,6 +378,8 @@ class StubRoutines: AllStatic {
static address montgomeryMultiply() { return _montgomeryMultiply; }
static address montgomerySquare() { return _montgomerySquare; }
static address vectorizedMismatch() { return _vectorizedMismatch; }
static address dexp() { return _dexp; }
static address dlog() { return _dlog; }

src/share/vm/runtime/vmStructs.cpp

@@ -860,6 +860,7 @@ typedef CompactHashtable<Symbol*, char> SymbolCompactHashTable;
static_field(StubRoutines, _mulAdd, address) \
static_field(StubRoutines, _dexp, address) \
static_field(StubRoutines, _dlog, address) \
static_field(StubRoutines, _vectorizedMismatch, address) \
static_field(StubRoutines, _jbyte_arraycopy, address) \
static_field(StubRoutines, _jshort_arraycopy, address) \
static_field(StubRoutines, _jint_arraycopy, address) \
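Finally, the vmStructs.cpp entry exposes the new stub field alongside the neighboring StubRoutines pointers so that external tooling that reads VM internals (notably the Serviceability Agent) stays in sync.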