From 2d9a6cfd3f4ebe81428fb759be0498e7e2bff129 Mon Sep 17 00:00:00 2001 From: Vivek R Deshpande Date: Mon, 7 Dec 2015 16:35:07 -0800 Subject: [PATCH 1/8] 8143355: Update for addition of vectorizedMismatch intrinsic for x86 Co-authored-by: Liqi Yi Reviewed-by: kvn --- .../src/cpu/aarch64/vm/vm_version_aarch64.cpp | 5 + hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp | 5 + hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp | 5 + hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp | 173 +++++++++++++++++- hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp | 4 +- .../src/cpu/x86/vm/stubGenerator_x86_64.cpp | 52 +++++- hotspot/src/cpu/x86/vm/vm_version_x86.cpp | 19 ++ hotspot/src/share/vm/classfile/vmSymbols.cpp | 3 + hotspot/src/share/vm/classfile/vmSymbols.hpp | 5 + hotspot/src/share/vm/opto/c2compiler.cpp | 1 + hotspot/src/share/vm/opto/escape.cpp | 3 +- hotspot/src/share/vm/opto/library_call.cpp | 48 +++++ hotspot/src/share/vm/opto/runtime.cpp | 20 ++ hotspot/src/share/vm/opto/runtime.hpp | 2 + hotspot/src/share/vm/runtime/globals.hpp | 3 + hotspot/src/share/vm/runtime/stubRoutines.cpp | 2 + hotspot/src/share/vm/runtime/stubRoutines.hpp | 4 + hotspot/src/share/vm/runtime/vmStructs.cpp | 1 + 18 files changed, 351 insertions(+), 4 deletions(-) diff --git a/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp b/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp index 37c0e5bbb63..72cd0375f52 100644 --- a/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp +++ b/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp @@ -182,6 +182,11 @@ void VM_Version::get_processor_features() { FLAG_SET_DEFAULT(UseAdler32Intrinsics, true); } + if (UseVectorizedMismatchIntrinsic) { + warning("UseVectorizedMismatchIntrinsic specified, but not available on this CPU."); + FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false); + } + if (auxv & HWCAP_AES) { UseAES = UseAES || FLAG_IS_DEFAULT(UseAES); UseAESIntrinsics = diff --git a/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp b/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp index 774d90b5cf1..5228c2749e3 100644 --- a/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp +++ b/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp @@ -223,6 +223,11 @@ void VM_Version::initialize() { UseMultiplyToLenIntrinsic = true; } + if (UseVectorizedMismatchIntrinsic) { + warning("UseVectorizedMismatchIntrinsic specified, but not available on this CPU."); + FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false); + } + // Adjust RTM (Restricted Transactional Memory) flags. if (!has_tcheck() && UseRTMLocking) { // Can't continue because UseRTMLocking affects UseBiasedLocking flag diff --git a/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp b/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp index e6a79435447..a8b7964e3f7 100644 --- a/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp +++ b/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp @@ -356,6 +356,11 @@ void VM_Version::initialize() { FLAG_SET_DEFAULT(UseCRC32Intrinsics, false); } + if (UseVectorizedMismatchIntrinsic) { + warning("UseVectorizedMismatchIntrinsic specified, but not available on this CPU."); + FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false); + } + if (FLAG_IS_DEFAULT(ContendedPaddingWidth) && (cache_line_size > ContendedPaddingWidth)) ContendedPaddingWidth = cache_line_size; diff --git a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp index 69a0d562df1..2ccee91a6cd 100644 --- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp +++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp @@ -9439,13 +9439,184 @@ void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Regi pop(tmp1); } +void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale, + Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){ + assert(UseSSE42Intrinsics, "SSE4.2 must be enabled."); + Label VECTOR32_LOOP, VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP; + Label VECTOR16_TAIL, VECTOR8_TAIL, VECTOR4_TAIL; + Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL; + Label SAME_TILL_END, DONE; + Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL; + + //scale is in rcx in both Win64 and Unix + ShortBranchVerifier sbv(this); + + shlq(length); + xorq(result, result); + + cmpq(length, 8); + jcc(Assembler::equal, VECTOR8_LOOP); + jcc(Assembler::less, VECTOR4_TAIL); + + if (UseAVX >= 2){ + + cmpq(length, 16); + jcc(Assembler::equal, VECTOR16_LOOP); + jcc(Assembler::less, VECTOR8_LOOP); + + cmpq(length, 32); + jccb(Assembler::less, VECTOR16_TAIL); + + subq(length, 32); + bind(VECTOR32_LOOP); + vmovdqu(rymm0, Address(obja, result)); + vmovdqu(rymm1, Address(objb, result)); + vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit); + vptest(rymm2, rymm2); + jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found + addq(result, 32); + subq(length, 32); + jccb(Assembler::greaterEqual, VECTOR32_LOOP); + addq(length, 32); + jcc(Assembler::equal, SAME_TILL_END); + //falling through if less than 32 bytes left //close the branch here. + + bind(VECTOR16_TAIL); + cmpq(length, 16); + jccb(Assembler::less, VECTOR8_TAIL); + bind(VECTOR16_LOOP); + movdqu(rymm0, Address(obja, result)); + movdqu(rymm1, Address(objb, result)); + vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit); + ptest(rymm2, rymm2); + jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found + addq(result, 16); + subq(length, 16); + jcc(Assembler::equal, SAME_TILL_END); + //falling through if less than 16 bytes left + } else {//regular intrinsics + + cmpq(length, 16); + jccb(Assembler::less, VECTOR8_TAIL); + + subq(length, 16); + bind(VECTOR16_LOOP); + movdqu(rymm0, Address(obja, result)); + movdqu(rymm1, Address(objb, result)); + pxor(rymm0, rymm1); + ptest(rymm0, rymm0); + jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found + addq(result, 16); + subq(length, 16); + jccb(Assembler::greaterEqual, VECTOR16_LOOP); + addq(length, 16); + jcc(Assembler::equal, SAME_TILL_END); + //falling through if less than 16 bytes left + } + + bind(VECTOR8_TAIL); + cmpq(length, 8); + jccb(Assembler::less, VECTOR4_TAIL); + bind(VECTOR8_LOOP); + movq(tmp1, Address(obja, result)); + movq(tmp2, Address(objb, result)); + xorq(tmp1, tmp2); + testq(tmp1, tmp1); + jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found + addq(result, 8); + subq(length, 8); + jcc(Assembler::equal, SAME_TILL_END); + //falling through if less than 8 bytes left + + bind(VECTOR4_TAIL); + cmpq(length, 4); + jccb(Assembler::less, BYTES_TAIL); + bind(VECTOR4_LOOP); + movl(tmp1, Address(obja, result)); + xorl(tmp1, Address(objb, result)); + testl(tmp1, tmp1); + jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found + addq(result, 4); + subq(length, 4); + jcc(Assembler::equal, SAME_TILL_END); + //falling through if less than 4 bytes left + + bind(BYTES_TAIL); + bind(BYTES_LOOP); + load_unsigned_byte(tmp1, Address(obja, result)); + load_unsigned_byte(tmp2, Address(objb, result)); + xorl(tmp1, tmp2); + testl(tmp1, tmp1); + jccb(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found + decq(length); + jccb(Assembler::zero, SAME_TILL_END); + incq(result); + load_unsigned_byte(tmp1, Address(obja, result)); + load_unsigned_byte(tmp2, Address(objb, result)); + xorl(tmp1, tmp2); + testl(tmp1, tmp1); + jccb(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found + decq(length); + jccb(Assembler::zero, SAME_TILL_END); + incq(result); + load_unsigned_byte(tmp1, Address(obja, result)); + load_unsigned_byte(tmp2, Address(objb, result)); + xorl(tmp1, tmp2); + testl(tmp1, tmp1); + jccb(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found + jmpb(SAME_TILL_END); + + if (UseAVX >= 2){ + bind(VECTOR32_NOT_EQUAL); + vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit); + vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit); + vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit); + vpmovmskb(tmp1, rymm0); + bsfq(tmp1, tmp1); + addq(result, tmp1); + shrq(result); + jmpb(DONE); + } + + bind(VECTOR16_NOT_EQUAL); + if (UseAVX >= 2){ + vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit); + vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit); + pxor(rymm0, rymm2); + } else { + pcmpeqb(rymm2, rymm2); + pxor(rymm0, rymm1); + pcmpeqb(rymm0, rymm1); + pxor(rymm0, rymm2); + } + pmovmskb(tmp1, rymm0); + bsfq(tmp1, tmp1); + addq(result, tmp1); + shrq(result); + jmpb(DONE); + + bind(VECTOR8_NOT_EQUAL); + bind(VECTOR4_NOT_EQUAL); + bsfq(tmp1, tmp1); + shrq(tmp1, 3); + addq(result, tmp1); + bind(BYTES_NOT_EQUAL); + shrq(result); + jmpb(DONE); + + bind(SAME_TILL_END); + mov64(result, -1); + + bind(DONE); +} + + //Helper functions for square_to_len() /** * Store the squares of x[], right shifted one bit (divided by 2) into z[] * Preserves x and z and modifies rest of the registers. */ - void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { // Perform square and right shift by 1 // Handle odd xlen case first, then for even xlen do the following diff --git a/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp b/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp index e29d80b12bb..719d745b3ed 100644 --- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp +++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp @@ -1346,7 +1346,6 @@ public: Register carry2); void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5); - void square_rshift(Register x, Register len, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg); void multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, @@ -1365,6 +1364,9 @@ public: void mul_add(Register out, Register in, Register offset, Register len, Register k, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg); + void vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale, + Register result, Register tmp1, Register tmp2, + XMMRegister vec1, XMMRegister vec2, XMMRegister vec3); #endif // CRC32 code for java.util.zip.CRC32::updateBytes() intrinsic. diff --git a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp index b68bedf1ad6..c704975870f 100644 --- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp +++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp @@ -4054,6 +4054,54 @@ class StubGenerator: public StubCodeGenerator { return start; } + /** + * Arguments: + * + * Input: + * c_rarg0 - obja address + * c_rarg1 - objb address + * c_rarg3 - length length + * c_rarg4 - scale log2_array_indxscale + */ + address generate_vectorizedMismatch() { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "vectorizedMismatch"); + address start = __ pc(); + + BLOCK_COMMENT("Entry:"); + __ enter(); + +#ifdef _WIN64 // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...) + const Register scale = c_rarg0; //rcx, will exchange with r9 + const Register objb = c_rarg1; //rdx + const Register length = c_rarg2; //r8 + const Register obja = c_rarg3; //r9 + __ xchgq(obja, scale); //now obja and scale contains the correct contents + + const Register tmp1 = r10; + const Register tmp2 = r11; +#endif +#ifndef _WIN64 // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...) + const Register obja = c_rarg0; //U:rdi + const Register objb = c_rarg1; //U:rsi + const Register length = c_rarg2; //U:rdx + const Register scale = c_rarg3; //U:rcx + const Register tmp1 = r8; + const Register tmp2 = r9; +#endif + const Register result = rax; //return value + const XMMRegister vec0 = xmm0; + const XMMRegister vec1 = xmm1; + const XMMRegister vec2 = xmm2; + + __ vectorized_mismatch(obja, objb, length, scale, result, tmp1, tmp2, vec0, vec1, vec2); + + __ leave(); + __ ret(0); + + return start; + } + /** * Arguments: * @@ -4505,7 +4553,9 @@ class StubGenerator: public StubCodeGenerator { if (UseMulAddIntrinsic) { StubRoutines::_mulAdd = generate_mulAdd(); } - + if (UseVectorizedMismatchIntrinsic) { + StubRoutines::_vectorizedMismatch = generate_vectorizedMismatch(); + } #ifndef _WINDOWS if (UseMontgomeryMultiplyIntrinsic) { StubRoutines::_montgomeryMultiply diff --git a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp index eed88e81342..b458f99fac0 100644 --- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp +++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp @@ -1041,6 +1041,25 @@ void VM_Version::get_processor_features() { } } +#ifdef _LP64 + if (UseSSE42Intrinsics) { + if (FLAG_IS_DEFAULT(UseVectorizedMismatchIntrinsic)) { + UseVectorizedMismatchIntrinsic = true; + } + } else if (UseVectorizedMismatchIntrinsic) { + if (!FLAG_IS_DEFAULT(UseVectorizedMismatchIntrinsic)) + warning("vectorizedMismatch intrinsics are not available on this CPU"); + FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false); + } +#else + if (UseVectorizedMismatchIntrinsic) { + if (!FLAG_IS_DEFAULT(UseVectorizedMismatchIntrinsic)) { + warning("vectorizedMismatch intrinsic is not available in 32-bit VM"); + } + FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false); + } +#endif // _LP64 + // Use count leading zeros count instruction if available. if (supports_lzcnt()) { if (FLAG_IS_DEFAULT(UseCountLeadingZerosInstruction)) { diff --git a/hotspot/src/share/vm/classfile/vmSymbols.cpp b/hotspot/src/share/vm/classfile/vmSymbols.cpp index bfb99c67630..6f0089a77f5 100644 --- a/hotspot/src/share/vm/classfile/vmSymbols.cpp +++ b/hotspot/src/share/vm/classfile/vmSymbols.cpp @@ -681,6 +681,9 @@ bool vmIntrinsics::is_disabled_by_flags(const methodHandle& method) { case vmIntrinsics::_montgomerySquare: if (!UseMontgomerySquareIntrinsic) return true; break; + case vmIntrinsics::_vectorizedMismatch: + if (!UseVectorizedMismatchIntrinsic) return true; + break; case vmIntrinsics::_addExactI: case vmIntrinsics::_addExactL: case vmIntrinsics::_decrementExactI: diff --git a/hotspot/src/share/vm/classfile/vmSymbols.hpp b/hotspot/src/share/vm/classfile/vmSymbols.hpp index e651ad00ed9..0146b37a4ec 100644 --- a/hotspot/src/share/vm/classfile/vmSymbols.hpp +++ b/hotspot/src/share/vm/classfile/vmSymbols.hpp @@ -957,6 +957,11 @@ do_name( montgomerySquare_name, "implMontgomerySquare") \ do_signature(montgomerySquare_signature, "([I[IIJ[I)[I") \ \ + do_class(java_util_ArraysSupport, "java/util/ArraysSupport") \ + do_intrinsic(_vectorizedMismatch, java_util_ArraysSupport, vectorizedMismatch_name, vectorizedMismatch_signature, F_S)\ + do_name(vectorizedMismatch_name, "vectorizedMismatch") \ + do_signature(vectorizedMismatch_signature, "(Ljava/lang/Object;JLjava/lang/Object;JII)I") \ + \ /* java/lang/ref/Reference */ \ do_intrinsic(_Reference_get, java_lang_ref_Reference, get_name, void_object_signature, F_R) \ \ diff --git a/hotspot/src/share/vm/opto/c2compiler.cpp b/hotspot/src/share/vm/opto/c2compiler.cpp index c8ff9804039..689147d9f97 100644 --- a/hotspot/src/share/vm/opto/c2compiler.cpp +++ b/hotspot/src/share/vm/opto/c2compiler.cpp @@ -441,6 +441,7 @@ bool C2Compiler::is_intrinsic_supported(const methodHandle& method, bool is_virt case vmIntrinsics::_mulAdd: case vmIntrinsics::_montgomeryMultiply: case vmIntrinsics::_montgomerySquare: + case vmIntrinsics::_vectorizedMismatch: case vmIntrinsics::_ghash_processBlocks: case vmIntrinsics::_updateCRC32: case vmIntrinsics::_updateBytesCRC32: diff --git a/hotspot/src/share/vm/opto/escape.cpp b/hotspot/src/share/vm/opto/escape.cpp index 92ed15ca3da..169cfc0113f 100644 --- a/hotspot/src/share/vm/opto/escape.cpp +++ b/hotspot/src/share/vm/opto/escape.cpp @@ -987,7 +987,8 @@ void ConnectionGraph::process_call_arguments(CallNode *call) { strcmp(call->as_CallLeaf()->_name, "squareToLen") == 0 || strcmp(call->as_CallLeaf()->_name, "mulAdd") == 0 || strcmp(call->as_CallLeaf()->_name, "montgomery_multiply") == 0 || - strcmp(call->as_CallLeaf()->_name, "montgomery_square") == 0) + strcmp(call->as_CallLeaf()->_name, "montgomery_square") == 0 || + strcmp(call->as_CallLeaf()->_name, "vectorizedMismatch") == 0) ))) { call->dump(); fatal("EA unexpected CallLeaf %s", call->as_CallLeaf()->_name); diff --git a/hotspot/src/share/vm/opto/library_call.cpp b/hotspot/src/share/vm/opto/library_call.cpp index 694fb6f6834..6620e57be72 100644 --- a/hotspot/src/share/vm/opto/library_call.cpp +++ b/hotspot/src/share/vm/opto/library_call.cpp @@ -312,6 +312,7 @@ class LibraryCallKit : public GraphKit { bool inline_mulAdd(); bool inline_montgomeryMultiply(); bool inline_montgomerySquare(); + bool inline_vectorizedMismatch(); bool inline_profileBoolean(); bool inline_isCompileConstant(); @@ -720,6 +721,9 @@ bool LibraryCallKit::try_to_inline(int predicate) { case vmIntrinsics::_montgomerySquare: return inline_montgomerySquare(); + case vmIntrinsics::_vectorizedMismatch: + return inline_vectorizedMismatch(); + case vmIntrinsics::_ghash_processBlocks: return inline_ghash_processBlocks(); @@ -5581,6 +5585,50 @@ bool LibraryCallKit::inline_montgomerySquare() { return true; } +//-------------inline_vectorizedMismatch------------------------------ +bool LibraryCallKit::inline_vectorizedMismatch() { + assert(UseVectorizedMismatchIntrinsic, "not implementated on this platform"); + + address stubAddr = StubRoutines::vectorizedMismatch(); + if (stubAddr == NULL) { + return false; // Intrinsic's stub is not implemented on this platform + } + const char* stubName = "vectorizedMismatch"; + int size_l = callee()->signature()->size(); + assert(callee()->signature()->size() == 8, "vectorizedMismatch has 6 parameters"); + + Node* obja = argument(0); + Node* aoffset = argument(1); + Node* objb = argument(3); + Node* boffset = argument(4); + Node* length = argument(6); + Node* scale = argument(7); + + const Type* a_type = obja->Value(&_gvn); + const Type* b_type = objb->Value(&_gvn); + const TypeAryPtr* top_a = a_type->isa_aryptr(); + const TypeAryPtr* top_b = b_type->isa_aryptr(); + if (top_a == NULL || top_a->klass() == NULL || + top_b == NULL || top_b->klass() == NULL) { + // failed array check + return false; + } + + Node* call; + jvms()->set_should_reexecute(true); + + Node* obja_adr = make_unsafe_address(obja, aoffset); + Node* objb_adr = make_unsafe_address(objb, boffset); + + call = make_runtime_call(RC_LEAF, + OptoRuntime::vectorizedMismatch_Type(), + stubAddr, stubName, TypePtr::BOTTOM, + obja_adr, objb_adr, length, scale); + + Node* result = _gvn.transform(new ProjNode(call, TypeFunc::Parms)); + set_result(result); + return true; +} /** * Calculate CRC32 for byte. diff --git a/hotspot/src/share/vm/opto/runtime.cpp b/hotspot/src/share/vm/opto/runtime.cpp index 1ae727c105a..a42c750d465 100644 --- a/hotspot/src/share/vm/opto/runtime.cpp +++ b/hotspot/src/share/vm/opto/runtime.cpp @@ -1103,6 +1103,26 @@ const TypeFunc* OptoRuntime::montgomerySquare_Type() { return TypeFunc::make(domain, range); } +const TypeFunc* OptoRuntime::vectorizedMismatch_Type() { + // create input type (domain) + int num_args = 4; + int argcnt = num_args; + const Type** fields = TypeTuple::fields(argcnt); + int argp = TypeFunc::Parms; + fields[argp++] = TypePtr::NOTNULL; // obja + fields[argp++] = TypePtr::NOTNULL; // objb + fields[argp++] = TypeInt::INT; // length, number of elements + fields[argp++] = TypeInt::INT; // log2scale, element size + assert(argp == TypeFunc::Parms + argcnt, "correct decoding"); + const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms + argcnt, fields); + + //return mismatch index (int) + fields = TypeTuple::fields(1); + fields[TypeFunc::Parms + 0] = TypeInt::INT; + const TypeTuple* range = TypeTuple::make(TypeFunc::Parms + 1, fields); + return TypeFunc::make(domain, range); +} + // GHASH block processing const TypeFunc* OptoRuntime::ghash_processBlocks_Type() { int argcnt = 4; diff --git a/hotspot/src/share/vm/opto/runtime.hpp b/hotspot/src/share/vm/opto/runtime.hpp index 8e38f3f3a15..69e78aa8ff7 100644 --- a/hotspot/src/share/vm/opto/runtime.hpp +++ b/hotspot/src/share/vm/opto/runtime.hpp @@ -299,6 +299,8 @@ private: static const TypeFunc* mulAdd_Type(); + static const TypeFunc* vectorizedMismatch_Type(); + static const TypeFunc* ghash_processBlocks_Type(); static const TypeFunc* updateBytesCRC32_Type(); diff --git a/hotspot/src/share/vm/runtime/globals.hpp b/hotspot/src/share/vm/runtime/globals.hpp index 9b8b1311a76..acfcd6735f4 100644 --- a/hotspot/src/share/vm/runtime/globals.hpp +++ b/hotspot/src/share/vm/runtime/globals.hpp @@ -855,6 +855,9 @@ public: product(bool, UseAdler32Intrinsics, false, \ "use intrinsics for java.util.zip.Adler32") \ \ + product(bool, UseVectorizedMismatchIntrinsic, false, \ + "Enables intrinsification of ArraysSupport.vectorizedMismatch()") \ + \ diagnostic(ccstrlist, DisableIntrinsic, "", \ "do not expand intrinsics whose (internal) names appear here") \ \ diff --git a/hotspot/src/share/vm/runtime/stubRoutines.cpp b/hotspot/src/share/vm/runtime/stubRoutines.cpp index fef7c0b3b03..330ca9d0a55 100644 --- a/hotspot/src/share/vm/runtime/stubRoutines.cpp +++ b/hotspot/src/share/vm/runtime/stubRoutines.cpp @@ -148,6 +148,8 @@ address StubRoutines::_mulAdd = NULL; address StubRoutines::_montgomeryMultiply = NULL; address StubRoutines::_montgomerySquare = NULL; +address StubRoutines::_vectorizedMismatch = NULL; + address StubRoutines::_dexp = NULL; address StubRoutines::_dlog = NULL; diff --git a/hotspot/src/share/vm/runtime/stubRoutines.hpp b/hotspot/src/share/vm/runtime/stubRoutines.hpp index ac198d0746c..2a9c7084786 100644 --- a/hotspot/src/share/vm/runtime/stubRoutines.hpp +++ b/hotspot/src/share/vm/runtime/stubRoutines.hpp @@ -207,6 +207,8 @@ class StubRoutines: AllStatic { static address _montgomeryMultiply; static address _montgomerySquare; + static address _vectorizedMismatch; + static address _dexp; static address _dlog; @@ -376,6 +378,8 @@ class StubRoutines: AllStatic { static address montgomeryMultiply() { return _montgomeryMultiply; } static address montgomerySquare() { return _montgomerySquare; } + static address vectorizedMismatch() { return _vectorizedMismatch; } + static address dexp() { return _dexp; } static address dlog() { return _dlog; } diff --git a/hotspot/src/share/vm/runtime/vmStructs.cpp b/hotspot/src/share/vm/runtime/vmStructs.cpp index bda425a96df..d3e1b44de02 100644 --- a/hotspot/src/share/vm/runtime/vmStructs.cpp +++ b/hotspot/src/share/vm/runtime/vmStructs.cpp @@ -860,6 +860,7 @@ typedef CompactHashtable SymbolCompactHashTable; static_field(StubRoutines, _mulAdd, address) \ static_field(StubRoutines, _dexp, address) \ static_field(StubRoutines, _dlog, address) \ + static_field(StubRoutines, _vectorizedMismatch, address) \ static_field(StubRoutines, _jbyte_arraycopy, address) \ static_field(StubRoutines, _jshort_arraycopy, address) \ static_field(StubRoutines, _jint_arraycopy, address) \ From 682da7441854744f3be7ec3c64b3108d69ad8d49 Mon Sep 17 00:00:00 2001 From: Martin Doerr Date: Tue, 8 Dec 2015 14:44:00 +0100 Subject: [PATCH 2/8] 8143817: C1: Platform dependent stack space not preserved for all runtime calls Reviewed-by: roland --- hotspot/src/share/vm/c1/c1_LIRGenerator.cpp | 23 ++++++++++++--------- hotspot/src/share/vm/c1/c1_LIRGenerator.hpp | 2 +- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/hotspot/src/share/vm/c1/c1_LIRGenerator.cpp b/hotspot/src/share/vm/c1/c1_LIRGenerator.cpp index 98b519b58eb..8fa61f1e975 100644 --- a/hotspot/src/share/vm/c1/c1_LIRGenerator.cpp +++ b/hotspot/src/share/vm/c1/c1_LIRGenerator.cpp @@ -3055,13 +3055,16 @@ void LIRGenerator::do_IfOp(IfOp* x) { __ cmove(lir_cond(x->cond()), t_val.result(), f_val.result(), reg, as_BasicType(x->x()->type())); } -void LIRGenerator::do_RuntimeCall(address routine, int expected_arguments, Intrinsic* x) { - assert(x->number_of_arguments() == expected_arguments, "wrong type"); - LIR_Opr reg = result_register_for(x->type()); - __ call_runtime_leaf(routine, getThreadTemp(), - reg, new LIR_OprList()); - LIR_Opr result = rlock_result(x); - __ move(reg, result); +void LIRGenerator::do_RuntimeCall(address routine, Intrinsic* x) { + assert(x->number_of_arguments() == 0, "wrong type"); + // Enforce computation of _reserved_argument_area_size which is required on some platforms. + BasicTypeList signature; + CallingConvention* cc = frame_map()->c_calling_convention(&signature); + LIR_Opr reg = result_register_for(x->type()); + __ call_runtime_leaf(routine, getThreadTemp(), + reg, new LIR_OprList()); + LIR_Opr result = rlock_result(x); + __ move(reg, result); } #ifdef TRACE_HAVE_INTRINSICS @@ -3115,16 +3118,16 @@ void LIRGenerator::do_Intrinsic(Intrinsic* x) { case vmIntrinsics::_threadID: do_ThreadIDIntrinsic(x); break; case vmIntrinsics::_classID: do_ClassIDIntrinsic(x); break; case vmIntrinsics::_counterTime: - do_RuntimeCall(CAST_FROM_FN_PTR(address, TRACE_TIME_METHOD), 0, x); + do_RuntimeCall(CAST_FROM_FN_PTR(address, TRACE_TIME_METHOD), x); break; #endif case vmIntrinsics::_currentTimeMillis: - do_RuntimeCall(CAST_FROM_FN_PTR(address, os::javaTimeMillis), 0, x); + do_RuntimeCall(CAST_FROM_FN_PTR(address, os::javaTimeMillis), x); break; case vmIntrinsics::_nanoTime: - do_RuntimeCall(CAST_FROM_FN_PTR(address, os::javaTimeNanos), 0, x); + do_RuntimeCall(CAST_FROM_FN_PTR(address, os::javaTimeNanos), x); break; case vmIntrinsics::_Object_init: do_RegisterFinalizer(x); break; diff --git a/hotspot/src/share/vm/c1/c1_LIRGenerator.hpp b/hotspot/src/share/vm/c1/c1_LIRGenerator.hpp index 89f5960182a..47dd33df54e 100644 --- a/hotspot/src/share/vm/c1/c1_LIRGenerator.hpp +++ b/hotspot/src/share/vm/c1/c1_LIRGenerator.hpp @@ -439,7 +439,7 @@ class LIRGenerator: public InstructionVisitor, public BlockClosure { SwitchRangeArray* create_lookup_ranges(LookupSwitch* x); void do_SwitchRanges(SwitchRangeArray* x, LIR_Opr value, BlockBegin* default_sux); - void do_RuntimeCall(address routine, int expected_arguments, Intrinsic* x); + void do_RuntimeCall(address routine, Intrinsic* x); #ifdef TRACE_HAVE_INTRINSICS void do_ThreadIDIntrinsic(Intrinsic* x); void do_ClassIDIntrinsic(Intrinsic* x); From 6f27a97d77d7eab89bbb79a447168056a882c1f7 Mon Sep 17 00:00:00 2001 From: Goetz Lindenmaier Date: Mon, 7 Dec 2015 15:42:47 +0100 Subject: [PATCH 3/8] 8144466: ppc64: fix argument passing through opto stubs Reviewed-by: kvn --- hotspot/make/test/JtregNative.gmk | 1 + .../aarch64/vm/globalDefinitions_aarch64.hpp | 4 ++ .../src/cpu/ppc/vm/globalDefinitions_ppc.hpp | 4 ++ .../cpu/sparc/vm/globalDefinitions_sparc.hpp | 4 ++ .../src/cpu/x86/vm/globalDefinitions_x86.hpp | 4 ++ .../cpu/zero/vm/globalDefinitions_zero.hpp | 4 ++ .../src/share/vm/opto/generateOptoStub.cpp | 59 ++++++++++------ .../TestArrayCopyOverflowArguments.java | 69 +++++++++++++++++++ .../floatingpoint/Test15FloatJNIArgs.java | 61 ++++++++++++++++ .../floatingpoint/libTest15FloatJNIArgs.c | 41 +++++++++++ 10 files changed, 229 insertions(+), 22 deletions(-) create mode 100644 hotspot/test/compiler/arraycopy/TestArrayCopyOverflowArguments.java create mode 100644 hotspot/test/compiler/floatingpoint/Test15FloatJNIArgs.java create mode 100644 hotspot/test/compiler/floatingpoint/libTest15FloatJNIArgs.c diff --git a/hotspot/make/test/JtregNative.gmk b/hotspot/make/test/JtregNative.gmk index 09c48d77aba..d49a03757e7 100644 --- a/hotspot/make/test/JtregNative.gmk +++ b/hotspot/make/test/JtregNative.gmk @@ -46,6 +46,7 @@ BUILD_HOTSPOT_JTREG_NATIVE_SRC := \ $(HOTSPOT_TOPDIR)/test/runtime/jni/8033445 \ $(HOTSPOT_TOPDIR)/test/runtime/jni/ToStringInInterfaceTest \ $(HOTSPOT_TOPDIR)/test/runtime/SameObject \ + $(HOTSPOT_TOPDIR)/test/compiler/floatingpoint/ \ # # Add conditional directories here when needed. diff --git a/hotspot/src/cpu/aarch64/vm/globalDefinitions_aarch64.hpp b/hotspot/src/cpu/aarch64/vm/globalDefinitions_aarch64.hpp index d392ccf5e48..4c9d377ff09 100644 --- a/hotspot/src/cpu/aarch64/vm/globalDefinitions_aarch64.hpp +++ b/hotspot/src/cpu/aarch64/vm/globalDefinitions_aarch64.hpp @@ -28,6 +28,10 @@ const int StackAlignmentInBytes = 16; +// Indicates whether the C calling conventions require that +// 32-bit integer argument values are extended to 64 bits. +const bool CCallingConventionRequiresIntsAsLongs = false; + #define SUPPORTS_NATIVE_CX8 // The maximum B/BL offset range on AArch64 is 128MB. diff --git a/hotspot/src/cpu/ppc/vm/globalDefinitions_ppc.hpp b/hotspot/src/cpu/ppc/vm/globalDefinitions_ppc.hpp index 542abd1bdb6..86fddbc1751 100644 --- a/hotspot/src/cpu/ppc/vm/globalDefinitions_ppc.hpp +++ b/hotspot/src/cpu/ppc/vm/globalDefinitions_ppc.hpp @@ -31,6 +31,10 @@ const int BytesPerInstWord = 4; const int StackAlignmentInBytes = 16; +// Indicates whether the C calling conventions require that +// 32-bit integer argument values are extended to 64 bits. +const bool CCallingConventionRequiresIntsAsLongs = true; + #define SUPPORTS_NATIVE_CX8 // The PPC CPUs are NOT multiple-copy-atomic. diff --git a/hotspot/src/cpu/sparc/vm/globalDefinitions_sparc.hpp b/hotspot/src/cpu/sparc/vm/globalDefinitions_sparc.hpp index 93432223e8d..427b7ab5f14 100644 --- a/hotspot/src/cpu/sparc/vm/globalDefinitions_sparc.hpp +++ b/hotspot/src/cpu/sparc/vm/globalDefinitions_sparc.hpp @@ -30,6 +30,10 @@ const int BytesPerInstWord = 4; const int StackAlignmentInBytes = (2*wordSize); +// Indicates whether the C calling conventions require that +// 32-bit integer argument values are extended to 64 bits. +const bool CCallingConventionRequiresIntsAsLongs = false; + #define SUPPORTS_NATIVE_CX8 // The expected size in bytes of a cache line, used to pad data structures. diff --git a/hotspot/src/cpu/x86/vm/globalDefinitions_x86.hpp b/hotspot/src/cpu/x86/vm/globalDefinitions_x86.hpp index 8ddbdf82ca4..b7abbacd052 100644 --- a/hotspot/src/cpu/x86/vm/globalDefinitions_x86.hpp +++ b/hotspot/src/cpu/x86/vm/globalDefinitions_x86.hpp @@ -27,6 +27,10 @@ const int StackAlignmentInBytes = 16; +// Indicates whether the C calling conventions require that +// 32-bit integer argument values are extended to 64 bits. +const bool CCallingConventionRequiresIntsAsLongs = false; + #define SUPPORTS_NATIVE_CX8 // The expected size in bytes of a cache line, used to pad data structures. diff --git a/hotspot/src/cpu/zero/vm/globalDefinitions_zero.hpp b/hotspot/src/cpu/zero/vm/globalDefinitions_zero.hpp index 5956fe1cfcc..144c47e3cb9 100644 --- a/hotspot/src/cpu/zero/vm/globalDefinitions_zero.hpp +++ b/hotspot/src/cpu/zero/vm/globalDefinitions_zero.hpp @@ -28,4 +28,8 @@ #include +// Indicates whether the C calling conventions require that +// 32-bit integer argument values are extended to 64 bits. +const bool CCallingConventionRequiresIntsAsLongs = false; + #endif // CPU_ZERO_VM_GLOBALDEFINITIONS_ZERO_HPP diff --git a/hotspot/src/share/vm/opto/generateOptoStub.cpp b/hotspot/src/share/vm/opto/generateOptoStub.cpp index b3bc8999daa..48166b491fc 100644 --- a/hotspot/src/share/vm/opto/generateOptoStub.cpp +++ b/hotspot/src/share/vm/opto/generateOptoStub.cpp @@ -72,16 +72,18 @@ void GraphKit::gen_stub(address C_function, // Make up the parameters uint i; - for( i = 0; i < parm_cnt; i++ ) + for (i = 0; i < parm_cnt; i++) { map()->init_req(i, _gvn.transform(new ParmNode(start, i))); - for( ; ireq(); i++ ) + } + for ( ; ireq(); i++) { map()->init_req(i, top()); // For nicer debugging + } // GraphKit requires memory to be a MergeMemNode: set_all_memory(map()->memory()); // Get base of thread-local storage area - Node* thread = _gvn.transform( new ThreadLocalNode() ); + Node* thread = _gvn.transform(new ThreadLocalNode()); const int NoAlias = Compile::AliasIdxBot; @@ -113,21 +115,27 @@ void GraphKit::gen_stub(address C_function, //----------------------------- // Compute signature for C call. Varies from the Java signature! + const Type **fields = TypeTuple::fields(2*parm_cnt+2); uint cnt = TypeFunc::Parms; // The C routines gets the base of thread-local storage passed in as an - // extra argument. Not all calls need it, but its cheap to add here. + // extra argument. Not all calls need it, but it is cheap to add here. for (uint pcnt = cnt; pcnt < parm_cnt; pcnt++, cnt++) { - fields[cnt] = jdomain->field_at(pcnt); + const Type *f = jdomain->field_at(pcnt); + if (CCallingConventionRequiresIntsAsLongs && f->isa_int()) { + fields[cnt++] = TypeLong::LONG; + fields[cnt] = Type::HALF; // Must add an additional half for a long. + } else { + fields[cnt] = f; + } } - fields[cnt++] = TypeRawPtr::BOTTOM; // Thread-local storage // Also pass in the caller's PC, if asked for. if (return_pc) { fields[cnt++] = TypeRawPtr::BOTTOM; // Return PC } + const TypeTuple* domain = TypeTuple::make(cnt, fields); - const TypeTuple* domain = TypeTuple::make(cnt,fields); // The C routine we are about to call cannot return an oop; it can block on // exit and a GC will trash the oop while it sits in C-land. Instead, we // return the oop through TLS for runtime calls. @@ -155,37 +163,44 @@ void GraphKit::gen_stub(address C_function, rfields[TypeFunc::Parms+1] = jrange->field_at(TypeFunc::Parms+1); } } - const TypeTuple* range = TypeTuple::make(jrange->cnt(),rfields); + const TypeTuple* range = TypeTuple::make(jrange->cnt(), rfields); // Final C signature - const TypeFunc *c_sig = TypeFunc::make(domain,range); + const TypeFunc *c_sig = TypeFunc::make(domain, range); //----------------------------- - // Make the call node + // Make the call node. CallRuntimeNode *call = new CallRuntimeNode(c_sig, C_function, name, TypePtr::BOTTOM); //----------------------------- - // Fix-up the debug info for the call - call->set_jvms( new (C) JVMState(0) ); + // Fix-up the debug info for the call. + call->set_jvms(new (C) JVMState(0)); call->jvms()->set_bci(0); call->jvms()->set_offsets(cnt); - // Set fixed predefined input arguments + // Set fixed predefined input arguments. cnt = 0; - for (i = 0; i < TypeFunc::Parms; i++) - call->init_req(cnt++, map()->in(i)); - // A little too aggressive on the parm copy; return address is not an input - call->set_req(TypeFunc::ReturnAdr, top()); - for (; i < parm_cnt; i++) { // Regular input arguments + for (i = 0; i < TypeFunc::Parms; i++) { call->init_req(cnt++, map()->in(i)); } + // A little too aggressive on the parm copy; return address is not an input. + call->set_req(TypeFunc::ReturnAdr, top()); + for (; i < parm_cnt; i++) { // Regular input arguments. + const Type *f = jdomain->field_at(i); + if (CCallingConventionRequiresIntsAsLongs && f->isa_int()) { + call->init_req(cnt++, _gvn.transform(new ConvI2LNode(map()->in(i)))); + call->init_req(cnt++, top()); + } else { + call->init_req(cnt++, map()->in(i)); + } + } + call->init_req(cnt++, thread); + if (return_pc) { // Return PC, if asked for. + call->init_req(cnt++, returnadr()); + } - call->init_req( cnt++, thread ); - if( return_pc ) // Return PC, if asked for - call->init_req( cnt++, returnadr() ); _gvn.transform_no_reclaim(call); - //----------------------------- // Now set up the return results set_control( _gvn.transform( new ProjNode(call,TypeFunc::Control)) ); diff --git a/hotspot/test/compiler/arraycopy/TestArrayCopyOverflowArguments.java b/hotspot/test/compiler/arraycopy/TestArrayCopyOverflowArguments.java new file mode 100644 index 00000000000..1bfaa35e578 --- /dev/null +++ b/hotspot/test/compiler/arraycopy/TestArrayCopyOverflowArguments.java @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2015 SAP SE. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + * @test + * @summary Test that overflowed integers passed to arraycopy don't do any harm. This might + * be the case on platforms where C-code expects that ints passed to a call + * are properly sign extended to 64 bit (e.g., PPC64, s390x). This can fail + * if slow_arraycopy_C() is commpiled by the C compiler without any imlicit + * casts (as spill stores to the stack that are done with 4-byte instruction). + * @run main/othervm -XX:-BackgroundCompilation -XX:-UseOnStackReplacement TestArrayCopyOverflowArguments + * + */ + +public class TestArrayCopyOverflowArguments { + + // Without volatile the overflowing computation was moved up and then + // spilled to the stack. The 32-bit spill store caused proper rounding. + static volatile int mod = Integer.MAX_VALUE; + + public static int[] m1(Object src) { + if (src == null) return null; + int[] dest = new int[10]; + try { + // PPC C calling conventions require that ints are properly expanded + // to longs when passed to a function. + int pos = 8 + mod + mod; // = 0x1_0000_0006. + int start = 2 + mod + mod; // = 0x1_0000_0000. + int len = 12 + mod + mod; // = 0x1_0000_0010. + // This is supposed to call SharedRuntime::slow_arraycopy_C(). + System.arraycopy(src, pos, dest, 0, 10); + } catch (ArrayStoreException npe) { + } + return dest; + } + + static public void main(String[] args) throws Exception { + int[] src = new int[20]; + + for (int i = 0; i < 20; ++i) { + src[i] = i * (i-1); + } + + for (int i = 0; i < 20000; i++) { + m1(src); + } + } +} + diff --git a/hotspot/test/compiler/floatingpoint/Test15FloatJNIArgs.java b/hotspot/test/compiler/floatingpoint/Test15FloatJNIArgs.java new file mode 100644 index 00000000000..1425d3da016 --- /dev/null +++ b/hotspot/test/compiler/floatingpoint/Test15FloatJNIArgs.java @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2015 SAP SE. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* @test + * @bug 8139258 + * @summary Regression test for 8139258 which failed to properly pass float args + * to a jni function on ppc64le. + * @run main/othervm -Xint Test15FloatJNIArgs + * @run main/othervm -XX:+TieredCompilation -Xcomp Test15FloatJNIArgs + * @run main/othervm -XX:-TieredCompilation -Xcomp Test15FloatJNIArgs + */ + +public class Test15FloatJNIArgs { + static { + try { + System.loadLibrary("Test15FloatJNIArgs"); + } catch (UnsatisfiedLinkError e) { + System.out.println("could not load native lib: " + e); + } + } + + public static native float add15floats( + float f1, float f2, float f3, float f4, + float f5, float f6, float f7, float f8, + float f9, float f10, float f11, float f12, + float f13, float f14, float f15); + + static void test() throws Exception { + float sum = Test15FloatJNIArgs.add15floats(1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f); + if (sum != 15.0f) { + throw new Error("Passed 15 times 1.0f to jni function which didn't add them properly: " + sum); + } + } + + public static void main(String[] args) throws Exception { + for (int i = 0; i < 200; ++i) { + test(); + } + } +} diff --git a/hotspot/test/compiler/floatingpoint/libTest15FloatJNIArgs.c b/hotspot/test/compiler/floatingpoint/libTest15FloatJNIArgs.c new file mode 100644 index 00000000000..e31627955ca --- /dev/null +++ b/hotspot/test/compiler/floatingpoint/libTest15FloatJNIArgs.c @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2015. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +JNIEXPORT jfloat JNICALL Java_Test15FloatJNIArgs_add15floats + (JNIEnv *env, jclass cls, + jfloat f1, jfloat f2, jfloat f3, jfloat f4, + jfloat f5, jfloat f6, jfloat f7, jfloat f8, + jfloat f9, jfloat f10, jfloat f11, jfloat f12, + jfloat f13, jfloat f14, jfloat f15) { + return f1 + f2 + f3 + f4 + f5 + f6 + f7 + f8 + f9 + f10 + f11 + f12 + f13 + f14 + f15; +} + +#ifdef __cplusplus +} +#endif From 08a2e337c77be8952093613528751a9b905a898a Mon Sep 17 00:00:00 2001 From: Doug Simon Date: Sun, 13 Dec 2015 22:51:13 +0100 Subject: [PATCH 4/8] 8145270: Need to eagerly initialize JVMCI compiler under -Xcomp Reviewed-by: twisti --- hotspot/src/share/vm/compiler/compileBroker.cpp | 14 +++++++++++++- hotspot/src/share/vm/compiler/compileBroker.hpp | 2 +- hotspot/src/share/vm/runtime/thread.cpp | 2 +- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/hotspot/src/share/vm/compiler/compileBroker.cpp b/hotspot/src/share/vm/compiler/compileBroker.cpp index e5eac46910c..9881a31435e 100644 --- a/hotspot/src/share/vm/compiler/compileBroker.cpp +++ b/hotspot/src/share/vm/compiler/compileBroker.cpp @@ -56,6 +56,7 @@ #if INCLUDE_JVMCI #include "jvmci/jvmciCompiler.hpp" #include "jvmci/jvmciRuntime.hpp" +#include "jvmci/jvmciJavaClasses.hpp" #include "runtime/vframe.hpp" #endif #ifdef COMPILER2 @@ -498,7 +499,7 @@ CompilerCounters::CompilerCounters() { // CompileBroker::compilation_init // // Initialize the Compilation object -void CompileBroker::compilation_init() { +void CompileBroker::compilation_init(TRAPS) { _last_method_compiled[0] = '\0'; // No need to initialize compilation system if we do not use it. @@ -529,6 +530,17 @@ void CompileBroker::compilation_init() { } else { c1_count = JVMCIHostThreads; } + + if (!UseInterpreter) { + // Force initialization of JVMCI compiler otherwise JVMCI + // compilations will not block until JVMCI is initialized + ResourceMark rm; + TempNewSymbol getCompiler = SymbolTable::new_symbol("getCompiler", CHECK); + TempNewSymbol sig = SymbolTable::new_symbol("()Ljdk/vm/ci/runtime/JVMCICompiler;", CHECK); + Handle jvmciRuntime = JVMCIRuntime::get_HotSpotJVMCIRuntime(CHECK); + JavaValue result(T_OBJECT); + JavaCalls::call_virtual(&result, jvmciRuntime, HotSpotJVMCIRuntime::klass(), getCompiler, sig, CHECK); + } } } #endif // INCLUDE_JVMCI diff --git a/hotspot/src/share/vm/compiler/compileBroker.hpp b/hotspot/src/share/vm/compiler/compileBroker.hpp index 9c0709aa28f..448f1f8897d 100644 --- a/hotspot/src/share/vm/compiler/compileBroker.hpp +++ b/hotspot/src/share/vm/compiler/compileBroker.hpp @@ -276,7 +276,7 @@ public: CompileQueue *q = compile_queue(comp_level); return q != NULL ? q->size() : 0; } - static void compilation_init(); + static void compilation_init(TRAPS); static void init_compiler_thread_log(); static nmethod* compile_method(const methodHandle& method, int osr_bci, diff --git a/hotspot/src/share/vm/runtime/thread.cpp b/hotspot/src/share/vm/runtime/thread.cpp index db0f5d9812c..f6f00384245 100644 --- a/hotspot/src/share/vm/runtime/thread.cpp +++ b/hotspot/src/share/vm/runtime/thread.cpp @@ -3628,7 +3628,7 @@ jint Threads::create_vm(JavaVMInitArgs* args, bool* canTryAgain) { // initialize compiler(s) #if defined(COMPILER1) || defined(COMPILER2) || defined(SHARK) || INCLUDE_JVMCI - CompileBroker::compilation_init(); + CompileBroker::compilation_init(CHECK_JNI_ERR); #endif // Pre-initialize some JSR292 core classes to avoid deadlock during class loading. From e4d937a557cfc0d7395be218994751d28bcd64ff Mon Sep 17 00:00:00 2001 From: Tom Rodriguez Date: Mon, 14 Dec 2015 13:06:39 -0800 Subject: [PATCH 5/8] 8145338: compiler/jsr292/CallSiteDepContextTest.java fails: assert(dep_implicit_context_arg(dept) == 0) failed: sanity Reviewed-by: twisti --- hotspot/src/share/vm/code/dependencies.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/hotspot/src/share/vm/code/dependencies.cpp b/hotspot/src/share/vm/code/dependencies.cpp index c941f404276..bdb1292a4aa 100644 --- a/hotspot/src/share/vm/code/dependencies.cpp +++ b/hotspot/src/share/vm/code/dependencies.cpp @@ -346,7 +346,6 @@ void Dependencies::assert_common_2(DepType dept, } } } else { - assert(dep_implicit_context_arg(dept) == 0, "sanity"); if (note_dep_seen(dept, x0) && note_dep_seen(dept, x1)) { // look in this bucket for redundant assertions const int stride = 2; From a08d3805f07742bcf22b20bab32134b45cf01b9d Mon Sep 17 00:00:00 2001 From: Jan Civlin Date: Mon, 14 Dec 2015 14:48:30 -0800 Subject: [PATCH 6/8] 8144771: Use AVX3 instructions for string compare Co-authored-by: Michael C Berg Reviewed-by: kvn, thartmann --- hotspot/src/cpu/x86/vm/assembler_x86.cpp | 226 +++++++++++------- hotspot/src/cpu/x86/vm/assembler_x86.hpp | 16 +- hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp | 79 +++++- .../src/cpu/x86/vm/sharedRuntime_x86_64.cpp | 6 +- .../src/cpu/x86/vm/stubGenerator_x86_32.cpp | 4 +- .../src/cpu/x86/vm/stubGenerator_x86_64.cpp | 6 +- 6 files changed, 234 insertions(+), 103 deletions(-) diff --git a/hotspot/src/cpu/x86/vm/assembler_x86.cpp b/hotspot/src/cpu/x86/vm/assembler_x86.cpp index 015a66b7900..e55fd56e78d 100644 --- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp +++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp @@ -2152,33 +2152,64 @@ void Assembler::movddup(XMMRegister dst, XMMRegister src) { emit_int8(0xC0 | encode); } -void Assembler::kmovwl(KRegister dst, Register src) { - NOT_LP64(assert(VM_Version::supports_evex(), "")); +void Assembler::kmovbl(KRegister dst, Register src) { + assert(VM_Version::supports_avx512dq(), ""); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); - int encode = kreg_prefix_and_encode(dst, knoreg, src, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int8((unsigned char)0x92); emit_int8((unsigned char)(0xC0 | encode)); } +void Assembler::kmovbl(Register dst, KRegister src) { + assert(VM_Version::supports_avx512dq(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int8((unsigned char)0x93); + emit_int8((unsigned char)(0xC0 | encode)); +} + +void Assembler::kmovwl(KRegister dst, Register src) { + assert(VM_Version::supports_evex(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); + emit_int8((unsigned char)0x92); + emit_int8((unsigned char)(0xC0 | encode)); +} + +void Assembler::kmovwl(Register dst, KRegister src) { + assert(VM_Version::supports_evex(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); + emit_int8((unsigned char)0x93); + emit_int8((unsigned char)(0xC0 | encode)); +} + void Assembler::kmovdl(KRegister dst, Register src) { - NOT_LP64(assert(VM_Version::supports_evex(), "")); - VexSimdPrefix pre = !_legacy_mode_bw ? VEX_SIMD_F2 : VEX_SIMD_NONE; + assert(VM_Version::supports_avx512bw(), ""); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); - int encode = kreg_prefix_and_encode(dst, knoreg, src, pre, VEX_OPCODE_0F, &attributes); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes); emit_int8((unsigned char)0x92); emit_int8((unsigned char)(0xC0 | encode)); } +void Assembler::kmovdl(Register dst, KRegister src) { + assert(VM_Version::supports_avx512bw(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes); + emit_int8((unsigned char)0x93); + emit_int8((unsigned char)(0xC0 | encode)); +} + void Assembler::kmovql(KRegister dst, KRegister src) { - NOT_LP64(assert(VM_Version::supports_evex(), "")); + assert(VM_Version::supports_avx512bw(), ""); InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); - int encode = kreg_prefix_and_encode(dst, knoreg, src, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); emit_int8((unsigned char)0x90); emit_int8((unsigned char)(0xC0 | encode)); } void Assembler::kmovql(KRegister dst, Address src) { - NOT_LP64(assert(VM_Version::supports_evex(), "")); + assert(VM_Version::supports_avx512bw(), ""); InstructionMark im(this); InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); vex_prefix(src, 0, dst->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); @@ -2187,7 +2218,7 @@ void Assembler::kmovql(KRegister dst, Address src) { } void Assembler::kmovql(Address dst, KRegister src) { - NOT_LP64(assert(VM_Version::supports_evex(), "")); + assert(VM_Version::supports_avx512bw(), ""); InstructionMark im(this); InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); vex_prefix(dst, 0, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); @@ -2196,46 +2227,53 @@ void Assembler::kmovql(Address dst, KRegister src) { } void Assembler::kmovql(KRegister dst, Register src) { - NOT_LP64(assert(VM_Version::supports_evex(), "")); - VexSimdPrefix pre = !_legacy_mode_bw ? VEX_SIMD_F2 : VEX_SIMD_NONE; - InstructionAttr attributes(AVX_128bit, /* rex_w */ !_legacy_mode_bw, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); - int encode = kreg_prefix_and_encode(dst, knoreg, src, pre, VEX_OPCODE_0F, &attributes); + assert(VM_Version::supports_avx512bw(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes); emit_int8((unsigned char)0x92); emit_int8((unsigned char)(0xC0 | encode)); } +void Assembler::kmovql(Register dst, KRegister src) { + assert(VM_Version::supports_avx512bw(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes); + emit_int8((unsigned char)0x93); + emit_int8((unsigned char)(0xC0 | encode)); +} + // This instruction produces ZF or CF flags void Assembler::kortestbl(KRegister src1, KRegister src2) { - NOT_LP64(assert(VM_Version::supports_avx512dq(), "")); + assert(VM_Version::supports_avx512dq(), ""); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); - int encode = kreg_prefix_and_encode(src1, knoreg, src2, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + int encode = vex_prefix_and_encode(src1->encoding(), 0, src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int8((unsigned char)0x98); emit_int8((unsigned char)(0xC0 | encode)); } // This instruction produces ZF or CF flags void Assembler::kortestwl(KRegister src1, KRegister src2) { - NOT_LP64(assert(VM_Version::supports_evex(), "")); + assert(VM_Version::supports_evex(), ""); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); - int encode = kreg_prefix_and_encode(src1, knoreg, src2, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); + int encode = vex_prefix_and_encode(src1->encoding(), 0, src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); emit_int8((unsigned char)0x98); emit_int8((unsigned char)(0xC0 | encode)); } // This instruction produces ZF or CF flags void Assembler::kortestdl(KRegister src1, KRegister src2) { - NOT_LP64(assert(VM_Version::supports_avx512bw(), "")); + assert(VM_Version::supports_avx512bw(), ""); InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); - int encode = kreg_prefix_and_encode(src1, knoreg, src2, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + int encode = vex_prefix_and_encode(src1->encoding(), 0, src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int8((unsigned char)0x98); emit_int8((unsigned char)(0xC0 | encode)); } // This instruction produces ZF or CF flags void Assembler::kortestql(KRegister src1, KRegister src2) { - NOT_LP64(assert(VM_Version::supports_avx512bw(), "")); + assert(VM_Version::supports_avx512bw(), ""); InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); - int encode = kreg_prefix_and_encode(src1, knoreg, src2, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); + int encode = vex_prefix_and_encode(src1->encoding(), 0, src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); emit_int8((unsigned char)0x98); emit_int8((unsigned char)(0xC0 | encode)); } @@ -2375,7 +2413,7 @@ void Assembler::vmovdqu(Address dst, XMMRegister src) { // Move Unaligned EVEX enabled Vector (programmable : 8,16,32,64) void Assembler::evmovdqub(XMMRegister dst, XMMRegister src, int vector_len) { assert(VM_Version::supports_evex(), ""); - InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes); emit_int8(0x6F); emit_int8((unsigned char)(0xC0 | encode)); @@ -2395,7 +2433,7 @@ void Assembler::evmovdqub(Address dst, XMMRegister src, int vector_len) { assert(VM_Version::supports_evex(), ""); assert(src != xnoreg, "sanity"); InstructionMark im(this); - InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); vex_prefix(dst, 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes); emit_int8(0x7F); @@ -2404,7 +2442,7 @@ void Assembler::evmovdqub(Address dst, XMMRegister src, int vector_len) { void Assembler::evmovdquw(XMMRegister dst, XMMRegister src, int vector_len) { assert(VM_Version::supports_evex(), ""); - InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes); emit_int8(0x6F); emit_int8((unsigned char)(0xC0 | encode)); @@ -2424,7 +2462,7 @@ void Assembler::evmovdquw(Address dst, XMMRegister src, int vector_len) { assert(VM_Version::supports_evex(), ""); assert(src != xnoreg, "sanity"); InstructionMark im(this); - InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); vex_prefix(dst, 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes); emit_int8(0x7F); @@ -3069,7 +3107,7 @@ void Assembler::packuswb(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes"); InstructionMark im(this); - InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit); simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int8(0x67); @@ -3078,7 +3116,7 @@ void Assembler::packuswb(XMMRegister dst, Address src) { void Assembler::packuswb(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int8(0x67); emit_int8((unsigned char)(0xC0 | encode)); @@ -3086,7 +3124,7 @@ void Assembler::packuswb(XMMRegister dst, XMMRegister src) { void Assembler::vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(UseAVX > 0, "some form of AVX must be enabled"); - InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); int nds_enc = nds->is_valid() ? nds->encoding() : 0; int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int8(0x67); @@ -3128,7 +3166,7 @@ void Assembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) { // In this context, the dst vector contains the components that are equal, non equal components are zeroed in dst void Assembler::pcmpeqb(XMMRegister dst, XMMRegister src) { - NOT_LP64(assert(VM_Version::supports_sse2(), "")); + assert(VM_Version::supports_sse2(), ""); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false); int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int8(0x74); @@ -3148,16 +3186,28 @@ void Assembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int // In this context, kdst is written the mask used to process the equal components void Assembler::evpcmpeqb(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len) { assert(VM_Version::supports_avx512bw(), ""); - InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); int nds_enc = nds->is_valid() ? nds->encoding() : 0; int encode = vex_prefix_and_encode(kdst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int8(0x74); emit_int8((unsigned char)(0xC0 | encode)); } +void Assembler::evpcmpeqb(KRegister kdst, XMMRegister nds, Address src, int vector_len) { + assert(VM_Version::supports_avx512bw(), ""); + InstructionMark im(this); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); + int nds_enc = nds->is_valid() ? nds->encoding() : 0; + int dst_enc = kdst->encoding(); + vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int8(0x74); + emit_operand(as_Register(dst_enc), src); +} + // In this context, the dst vector contains the components that are equal, non equal components are zeroed in dst void Assembler::pcmpeqw(XMMRegister dst, XMMRegister src) { - NOT_LP64(assert(VM_Version::supports_sse2(), "")); + assert(VM_Version::supports_sse2(), ""); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false); int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int8(0x75); @@ -3177,16 +3227,28 @@ void Assembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int // In this context, kdst is written the mask used to process the equal components void Assembler::evpcmpeqw(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len) { assert(VM_Version::supports_avx512bw(), ""); - InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); int nds_enc = nds->is_valid() ? nds->encoding() : 0; int encode = vex_prefix_and_encode(kdst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int8(0x75); emit_int8((unsigned char)(0xC0 | encode)); } +void Assembler::evpcmpeqw(KRegister kdst, XMMRegister nds, Address src, int vector_len) { + assert(VM_Version::supports_avx512bw(), ""); + InstructionMark im(this); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); + int nds_enc = nds->is_valid() ? nds->encoding() : 0; + int dst_enc = kdst->encoding(); + vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int8(0x75); + emit_operand(as_Register(dst_enc), src); +} + // In this context, the dst vector contains the components that are equal, non equal components are zeroed in dst void Assembler::pcmpeqd(XMMRegister dst, XMMRegister src) { - NOT_LP64(assert(VM_Version::supports_sse2(), "")); + assert(VM_Version::supports_sse2(), ""); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false); int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int8(0x76); @@ -3213,9 +3275,21 @@ void Assembler::evpcmpeqd(KRegister kdst, XMMRegister nds, XMMRegister src, int emit_int8((unsigned char)(0xC0 | encode)); } +void Assembler::evpcmpeqd(KRegister kdst, XMMRegister nds, Address src, int vector_len) { + assert(VM_Version::supports_evex(), ""); + InstructionMark im(this); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit); + int nds_enc = nds->is_valid() ? nds->encoding() : 0; + int dst_enc = kdst->encoding(); + vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int8(0x76); + emit_operand(as_Register(dst_enc), src); +} + // In this context, the dst vector contains the components that are equal, non equal components are zeroed in dst void Assembler::pcmpeqq(XMMRegister dst, XMMRegister src) { - NOT_LP64(assert(VM_Version::supports_sse4_1(), "")); + assert(VM_Version::supports_sse4_1(), ""); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false); int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); emit_int8(0x29); @@ -3328,7 +3402,7 @@ void Assembler::pinsrw(XMMRegister dst, Register src, int imm8) { void Assembler::pmovzxbw(XMMRegister dst, Address src) { assert(VM_Version::supports_sse4_1(), ""); InstructionMark im(this); - InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ false); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false); attributes.set_address_attributes(/* tuple_type */ EVEX_HVM, /* input_size_in_bits */ EVEX_NObit); simd_prefix(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); emit_int8(0x30); @@ -3337,7 +3411,7 @@ void Assembler::pmovzxbw(XMMRegister dst, Address src) { void Assembler::pmovzxbw(XMMRegister dst, XMMRegister src) { assert(VM_Version::supports_sse4_1(), ""); - InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ false); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false); int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); emit_int8(0x30); emit_int8((unsigned char)(0xC0 | encode)); @@ -3347,7 +3421,7 @@ void Assembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) { assert(VM_Version::supports_avx(), ""); InstructionMark im(this); assert(dst != xnoreg, "sanity"); - InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ false); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false); attributes.set_address_attributes(/* tuple_type */ EVEX_HVM, /* input_size_in_bits */ EVEX_NObit); vex_prefix(src, 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); emit_int8(0x30); @@ -3452,7 +3526,7 @@ void Assembler::prefix(Prefix p) { void Assembler::pshufb(XMMRegister dst, XMMRegister src) { assert(VM_Version::supports_ssse3(), ""); - InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ false); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false); int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); emit_int8(0x00); emit_int8((unsigned char)(0xC0 | encode)); @@ -3461,7 +3535,7 @@ void Assembler::pshufb(XMMRegister dst, XMMRegister src) { void Assembler::pshufb(XMMRegister dst, Address src) { assert(VM_Version::supports_ssse3(), ""); InstructionMark im(this); - InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ false); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false); attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); emit_int8(0x00); @@ -3495,7 +3569,7 @@ void Assembler::pshufd(XMMRegister dst, Address src, int mode) { void Assembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) { assert(isByte(mode), "invalid value"); NOT_LP64(assert(VM_Version::supports_sse2(), "")); - InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ false); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false); int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes); emit_int8(0x70); emit_int8((unsigned char)(0xC0 | encode)); @@ -3507,7 +3581,7 @@ void Assembler::pshuflw(XMMRegister dst, Address src, int mode) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes"); InstructionMark im(this); - InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ false); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false); attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); simd_prefix(dst, xnoreg, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes); emit_int8(0x70); @@ -4723,7 +4797,7 @@ void Assembler::vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int v void Assembler::paddb(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int8((unsigned char)0xFC); emit_int8((unsigned char)(0xC0 | encode)); @@ -4731,7 +4805,7 @@ void Assembler::paddb(XMMRegister dst, XMMRegister src) { void Assembler::paddw(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int8((unsigned char)0xFD); emit_int8((unsigned char)(0xC0 | encode)); @@ -4771,7 +4845,7 @@ void Assembler::phaddd(XMMRegister dst, XMMRegister src) { void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(UseAVX > 0, "requires some form of AVX"); - InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); int nds_enc = nds->is_valid() ? nds->encoding() : 0; int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int8((unsigned char)0xFC); @@ -4780,7 +4854,7 @@ void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int ve void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(UseAVX > 0, "requires some form of AVX"); - InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); int nds_enc = nds->is_valid() ? nds->encoding() : 0; int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int8((unsigned char)0xFD); @@ -4808,7 +4882,7 @@ void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, int ve void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { assert(UseAVX > 0, "requires some form of AVX"); InstructionMark im(this); - InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); int nds_enc = nds->is_valid() ? nds->encoding() : 0; vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); @@ -4819,7 +4893,7 @@ void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { assert(UseAVX > 0, "requires some form of AVX"); InstructionMark im(this); - InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); int nds_enc = nds->is_valid() ? nds->encoding() : 0; vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); @@ -4851,7 +4925,7 @@ void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, Address src, int vector void Assembler::psubb(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int8((unsigned char)0xF8); emit_int8((unsigned char)(0xC0 | encode)); @@ -4859,7 +4933,7 @@ void Assembler::psubb(XMMRegister dst, XMMRegister src) { void Assembler::psubw(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int8((unsigned char)0xF9); emit_int8((unsigned char)(0xC0 | encode)); @@ -4882,7 +4956,7 @@ void Assembler::psubq(XMMRegister dst, XMMRegister src) { void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(UseAVX > 0, "requires some form of AVX"); - InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); int nds_enc = nds->is_valid() ? nds->encoding() : 0; int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int8((unsigned char)0xF8); @@ -4891,7 +4965,7 @@ void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int ve void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(UseAVX > 0, "requires some form of AVX"); - InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); int nds_enc = nds->is_valid() ? nds->encoding() : 0; int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int8((unsigned char)0xF9); @@ -4919,7 +4993,7 @@ void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, int ve void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { assert(UseAVX > 0, "requires some form of AVX"); InstructionMark im(this); - InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); int nds_enc = nds->is_valid() ? nds->encoding() : 0; vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); @@ -4930,7 +5004,7 @@ void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { assert(UseAVX > 0, "requires some form of AVX"); InstructionMark im(this); - InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); int nds_enc = nds->is_valid() ? nds->encoding() : 0; vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); @@ -4962,7 +5036,7 @@ void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, Address src, int vector void Assembler::pmullw(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int8((unsigned char)0xD5); emit_int8((unsigned char)(0xC0 | encode)); @@ -4978,7 +5052,7 @@ void Assembler::pmulld(XMMRegister dst, XMMRegister src) { void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(UseAVX > 0, "requires some form of AVX"); - InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); int nds_enc = nds->is_valid() ? nds->encoding() : 0; int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int8((unsigned char)0xD5); @@ -5006,7 +5080,7 @@ void Assembler::vpmullq(XMMRegister dst, XMMRegister nds, XMMRegister src, int v void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { assert(UseAVX > 0, "requires some form of AVX"); InstructionMark im(this); - InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); int nds_enc = nds->is_valid() ? nds->encoding() : 0; vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); @@ -5039,7 +5113,7 @@ void Assembler::vpmullq(XMMRegister dst, XMMRegister nds, Address src, int vecto // Shift packed integers left by specified number of bits. void Assembler::psllw(XMMRegister dst, int shift) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); // XMM6 is for /6 encoding: 66 0F 71 /6 ib int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int8(0x71); @@ -5069,7 +5143,7 @@ void Assembler::psllq(XMMRegister dst, int shift) { void Assembler::psllw(XMMRegister dst, XMMRegister shift) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); int encode = simd_prefix_and_encode(dst, dst, shift, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int8((unsigned char)0xF1); emit_int8((unsigned char)(0xC0 | encode)); @@ -5093,7 +5167,7 @@ void Assembler::psllq(XMMRegister dst, XMMRegister shift) { void Assembler::vpsllw(XMMRegister dst, XMMRegister src, int shift, int vector_len) { assert(UseAVX > 0, "requires some form of AVX"); - InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); // XMM6 is for /6 encoding: 66 0F 71 /6 ib int encode = vex_prefix_and_encode(xmm6->encoding(), dst->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int8(0x71); @@ -5124,7 +5198,7 @@ void Assembler::vpsllq(XMMRegister dst, XMMRegister src, int shift, int vector_l void Assembler::vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) { assert(UseAVX > 0, "requires some form of AVX"); - InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); int encode = vex_prefix_and_encode(dst->encoding(), src->encoding(), shift->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int8((unsigned char)0xF1); emit_int8((unsigned char)(0xC0 | encode)); @@ -5149,7 +5223,7 @@ void Assembler::vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, int // Shift packed integers logically right by specified number of bits. void Assembler::psrlw(XMMRegister dst, int shift) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); // XMM2 is for /2 encoding: 66 0F 71 /2 ib int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int8(0x71); @@ -5181,7 +5255,7 @@ void Assembler::psrlq(XMMRegister dst, int shift) { void Assembler::psrlw(XMMRegister dst, XMMRegister shift) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); int encode = simd_prefix_and_encode(dst, dst, shift, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int8((unsigned char)0xD1); emit_int8((unsigned char)(0xC0 | encode)); @@ -5205,7 +5279,7 @@ void Assembler::psrlq(XMMRegister dst, XMMRegister shift) { void Assembler::vpsrlw(XMMRegister dst, XMMRegister src, int shift, int vector_len) { assert(UseAVX > 0, "requires some form of AVX"); - InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); // XMM2 is for /2 encoding: 66 0F 71 /2 ib int encode = vex_prefix_and_encode(xmm2->encoding(), dst->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int8(0x71); @@ -5235,7 +5309,7 @@ void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, int shift, int vector_l void Assembler::vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) { assert(UseAVX > 0, "requires some form of AVX"); - InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); int encode = vex_prefix_and_encode(dst->encoding(), src->encoding(), shift->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int8((unsigned char)0xD1); emit_int8((unsigned char)(0xC0 | encode)); @@ -5260,7 +5334,7 @@ void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, int // Shift packed integers arithmetically right by specified number of bits. void Assembler::psraw(XMMRegister dst, int shift) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); // XMM4 is for /4 encoding: 66 0F 71 /4 ib int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int8(0x71); @@ -5280,7 +5354,7 @@ void Assembler::psrad(XMMRegister dst, int shift) { void Assembler::psraw(XMMRegister dst, XMMRegister shift) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); - InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); int encode = simd_prefix_and_encode(dst, dst, shift, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int8((unsigned char)0xE1); emit_int8((unsigned char)(0xC0 | encode)); @@ -5296,7 +5370,7 @@ void Assembler::psrad(XMMRegister dst, XMMRegister shift) { void Assembler::vpsraw(XMMRegister dst, XMMRegister src, int shift, int vector_len) { assert(UseAVX > 0, "requires some form of AVX"); - InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); // XMM4 is for /4 encoding: 66 0F 71 /4 ib int encode = vex_prefix_and_encode(xmm4->encoding(), dst->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int8(0x71); @@ -5316,7 +5390,7 @@ void Assembler::vpsrad(XMMRegister dst, XMMRegister src, int shift, int vector_l void Assembler::vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) { assert(UseAVX > 0, "requires some form of AVX"); - InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); int encode = vex_prefix_and_encode(dst->encoding(), src->encoding(), shift->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); emit_int8((unsigned char)0xE1); emit_int8((unsigned char)(0xC0 | encode)); @@ -5706,7 +5780,7 @@ void Assembler::vpbroadcastd(XMMRegister dst, XMMRegister src) { // duplicate 2-bytes integer data from src into 16 locations in dest void Assembler::vpbroadcastw(XMMRegister dst, XMMRegister src) { assert(VM_Version::supports_avx2(), ""); - InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); emit_int8(0x79); emit_int8((unsigned char)(0xC0 | encode)); @@ -6573,18 +6647,6 @@ int Assembler::simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegis } } -int Assembler::kreg_prefix_and_encode(KRegister dst, KRegister nds, KRegister src, VexSimdPrefix pre, - VexOpcode opc, InstructionAttr *attributes) { - int nds_enc = nds->is_valid() ? nds->encoding() : 0; - return vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), pre, opc, attributes); -} - -int Assembler::kreg_prefix_and_encode(KRegister dst, KRegister nds, Register src, VexSimdPrefix pre, - VexOpcode opc, InstructionAttr *attributes) { - int nds_enc = nds->is_valid() ? nds->encoding() : 0; - return vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), pre, opc, attributes); -} - void Assembler::cmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len) { assert(VM_Version::supports_avx(), ""); assert(!VM_Version::supports_evex(), ""); diff --git a/hotspot/src/cpu/x86/vm/assembler_x86.hpp b/hotspot/src/cpu/x86/vm/assembler_x86.hpp index a5f493905fc..4b6dcf4a028 100644 --- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp +++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp @@ -655,12 +655,6 @@ private: int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src, VexSimdPrefix pre, VexOpcode opc, InstructionAttr *attributes); - int kreg_prefix_and_encode(KRegister dst, KRegister nds, KRegister src, VexSimdPrefix pre, - VexOpcode opc, InstructionAttr *attributes); - - int kreg_prefix_and_encode(KRegister dst, KRegister nds, Register src, VexSimdPrefix pre, - VexOpcode opc, InstructionAttr *attributes); - // Helper functions for groups of instructions void emit_arith_b(int op1, int op2, Register dst, int imm8); @@ -1331,12 +1325,17 @@ private: void movddup(XMMRegister dst, XMMRegister src); + void kmovbl(KRegister dst, Register src); + void kmovbl(Register dst, KRegister src); void kmovwl(KRegister dst, Register src); + void kmovwl(Register dst, KRegister src); void kmovdl(KRegister dst, Register src); + void kmovdl(Register dst, KRegister src); void kmovql(KRegister dst, KRegister src); - void kmovql(KRegister dst, Register src); void kmovql(Address dst, KRegister src); void kmovql(KRegister dst, Address src); + void kmovql(KRegister dst, Register src); + void kmovql(Register dst, KRegister src); void kortestbl(KRegister dst, KRegister src); void kortestwl(KRegister dst, KRegister src); @@ -1521,14 +1520,17 @@ private: void pcmpeqb(XMMRegister dst, XMMRegister src); void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); void evpcmpeqb(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len); + void evpcmpeqb(KRegister kdst, XMMRegister nds, Address src, int vector_len); void pcmpeqw(XMMRegister dst, XMMRegister src); void vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); void evpcmpeqw(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len); + void evpcmpeqw(KRegister kdst, XMMRegister nds, Address src, int vector_len); void pcmpeqd(XMMRegister dst, XMMRegister src); void vpcmpeqd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); void evpcmpeqd(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len); + void evpcmpeqd(KRegister kdst, XMMRegister nds, Address src, int vector_len); void pcmpeqq(XMMRegister dst, XMMRegister src); void vpcmpeqq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); diff --git a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp index 2ccee91a6cd..a916379ea36 100644 --- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp +++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp @@ -7999,9 +7999,15 @@ void MacroAssembler::string_compare(Register str1, Register str2, XMMRegister vec1, int ae) { ShortBranchVerifier sbv(this); Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; + Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 int stride, stride2, adr_stride, adr_stride1, adr_stride2; + int stride2x2 = 0x40; Address::ScaleFactor scale, scale1, scale2; + if (ae != StrIntrinsicNode::LL) { + stride2x2 = 0x20; + } + if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { shrl(cnt2, 1); } @@ -8011,15 +8017,15 @@ void MacroAssembler::string_compare(Register str1, Register str2, movl(result, cnt1); subl(cnt1, cnt2); push(cnt1); - cmov32(Assembler::lessEqual, cnt2, result); + cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) // Is the minimum length zero? testl(cnt2, cnt2); jcc(Assembler::zero, LENGTH_DIFF_LABEL); if (ae == StrIntrinsicNode::LL) { // Load first bytes - load_unsigned_byte(result, Address(str1, 0)); - load_unsigned_byte(cnt1, Address(str2, 0)); + load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] + load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] } else if (ae == StrIntrinsicNode::UU) { // Load first characters load_unsigned_short(result, Address(str1, 0)); @@ -8060,7 +8066,10 @@ void MacroAssembler::string_compare(Register str1, Register str2, assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available"); Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; + Label COMPARE_WIDE_VECTORS_LOOP_AVX2; Label COMPARE_TAIL_LONG; + Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 + int pcmpmask = 0x19; if (ae == StrIntrinsicNode::LL) { pcmpmask &= ~0x01; @@ -8123,11 +8132,40 @@ void MacroAssembler::string_compare(Register str1, Register str2, } subl(result, stride2); subl(cnt2, stride2); - jccb(Assembler::zero, COMPARE_WIDE_TAIL); + jcc(Assembler::zero, COMPARE_WIDE_TAIL); negptr(result); // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) bind(COMPARE_WIDE_VECTORS_LOOP); + +#ifdef _LP64 + if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop + cmpl(cnt2, stride2x2); + jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); + testl(cnt2, stride2x2-1); // cnt2 holds the vector count + jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 + + bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop + if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { + evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); + evpcmpeqb(k7, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 + } else { + vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); + evpcmpeqb(k7, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 + } + kortestql(k7, k7); + jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare + addptr(result, stride2x2); // update since we already compared at this addr + subl(cnt2, stride2x2); // and sub the size too + jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); + + vpxor(vec1, vec1); + jmpb(COMPARE_WIDE_TAIL); + }//if (VM_Version::supports_avx512vlbw()) +#endif // _LP64 + + + bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { vmovdqu(vec1, Address(str1, result, scale)); vpxor(vec1, Address(str2, result, scale)); @@ -8136,7 +8174,7 @@ void MacroAssembler::string_compare(Register str1, Register str2, vpxor(vec1, Address(str2, result, scale2)); } vptest(vec1, vec1); - jccb(Assembler::notZero, VECTOR_NOT_EQUAL); + jcc(Assembler::notZero, VECTOR_NOT_EQUAL); addptr(result, stride2); subl(cnt2, stride2); jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); @@ -8151,7 +8189,7 @@ void MacroAssembler::string_compare(Register str1, Register str2, movl(result, stride2); movl(cnt2, result); negptr(result); - jmpb(COMPARE_WIDE_VECTORS_LOOP); + jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. bind(VECTOR_NOT_EQUAL); @@ -8295,6 +8333,34 @@ void MacroAssembler::string_compare(Register str1, Register str2, } jmpb(DONE_LABEL); +#ifdef _LP64 + if (VM_Version::supports_avx512vlbw()) { + + bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); + + kmovql(cnt1, k7); + notq(cnt1); + bsfq(cnt2, cnt1); + if (ae != StrIntrinsicNode::LL) { + // Divide diff by 2 to get number of chars + sarl(cnt2, 1); + } + addq(result, cnt2); + if (ae == StrIntrinsicNode::LL) { + load_unsigned_byte(cnt1, Address(str2, result)); + load_unsigned_byte(result, Address(str1, result)); + } else if (ae == StrIntrinsicNode::UU) { + load_unsigned_short(cnt1, Address(str2, result, scale)); + load_unsigned_short(result, Address(str1, result, scale)); + } else { + load_unsigned_short(cnt1, Address(str2, result, scale2)); + load_unsigned_byte(result, Address(str1, result, scale1)); + } + subl(result, cnt1); + jmpb(POP_LABEL); + }//if (VM_Version::supports_avx512vlbw()) +#endif // _LP64 + // Discard the stored length difference bind(POP_LABEL); pop(cnt1); @@ -8304,6 +8370,7 @@ void MacroAssembler::string_compare(Register str1, Register str2, if(ae == StrIntrinsicNode::UL) { negl(result); } + } // Search for Non-ASCII character (Negative byte value) in a byte array, diff --git a/hotspot/src/cpu/x86/vm/sharedRuntime_x86_64.cpp b/hotspot/src/cpu/x86/vm/sharedRuntime_x86_64.cpp index 79b2a0f1db6..2ea8f1a49f0 100644 --- a/hotspot/src/cpu/x86/vm/sharedRuntime_x86_64.cpp +++ b/hotspot/src/cpu/x86/vm/sharedRuntime_x86_64.cpp @@ -189,7 +189,7 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_ } // Save full ZMM registes(16..num_xmm_regs) base_addr = XSAVE_AREA_UPPERBANK; - int off = 0; + off = 0; int vector_len = Assembler::AVX_512bit; for (int n = 16; n < num_xmm_regs; n++) { __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len); @@ -199,7 +199,7 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_ if (VM_Version::supports_evex()) { // Save upper bank of ZMM registers(16..31) for double/float usage int base_addr = XSAVE_AREA_UPPERBANK; - int off = 0; + off = 0; for (int n = 16; n < num_xmm_regs; n++) { __ movsd(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n)); } @@ -325,7 +325,7 @@ void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_ve assert(MaxVectorSize == 64, "only 512bit vectors are supported now"); } #else - assert(!save_vectors, "vectors are generated only by C2"); + assert(!restore_vectors, "vectors are generated only by C2"); #endif // On EVEX enabled targets everything is handled in pop fpu state diff --git a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp index 6298be79e33..b55d1bd2e9d 100644 --- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp +++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp @@ -170,7 +170,7 @@ class StubGenerator: public StubCodeGenerator { // provide initial value for required masks if (UseAVX > 2) { __ movl(rbx, 0xffff); - __ kmovdl(k1, rbx); + __ kmovwl(k1, rbx); } // save and initialize %mxcsr @@ -798,7 +798,7 @@ class StubGenerator: public StubCodeGenerator { if (UseAVX > 2) { __ push(rbx); __ movl(rbx, 0xffff); - __ kmovdl(k1, rbx); + __ kmovwl(k1, rbx); __ pop(rbx); } // Copy 64-byte chunks diff --git a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp index c704975870f..f86c6b88786 100644 --- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp +++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp @@ -266,7 +266,7 @@ class StubGenerator: public StubCodeGenerator { __ movptr(r15_save, r15); if (UseAVX > 2) { __ movl(rbx, 0xffff); - __ kmovql(k1, rbx); + __ kmovwl(k1, rbx); } #ifdef _WIN64 int last_reg = 15; @@ -1350,7 +1350,7 @@ class StubGenerator: public StubCodeGenerator { Label L_end; if (UseAVX > 2) { __ movl(to, 0xffff); - __ kmovql(k1, to); + __ kmovwl(k1, to); } // Copy 64-bytes per iteration __ BIND(L_loop); @@ -1434,7 +1434,7 @@ class StubGenerator: public StubCodeGenerator { Label L_end; if (UseAVX > 2) { __ movl(to, 0xffff); - __ kmovql(k1, to); + __ kmovwl(k1, to); } // Copy 64-bytes per iteration __ BIND(L_loop); From ef1c3dc4ed614bcf7c8e9d6bb1d57015fc392ff7 Mon Sep 17 00:00:00 2001 From: Martin Doerr Date: Mon, 7 Dec 2015 18:24:24 +0100 Subject: [PATCH 7/8] 8144850: C1: operator delete needs an implementation Reviewed-by: kvn --- hotspot/src/share/vm/c1/c1_LIRGenerator.hpp | 4 ++-- hotspot/src/share/vm/c1/c1_RangeCheckElimination.hpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/hotspot/src/share/vm/c1/c1_LIRGenerator.hpp b/hotspot/src/share/vm/c1/c1_LIRGenerator.hpp index 47dd33df54e..9438d77288c 100644 --- a/hotspot/src/share/vm/c1/c1_LIRGenerator.hpp +++ b/hotspot/src/share/vm/c1/c1_LIRGenerator.hpp @@ -157,8 +157,8 @@ class LIRGenerator: public InstructionVisitor, public BlockClosure { private: void* operator new(size_t size) throw(); void* operator new[](size_t size) throw(); - void operator delete(void* p); - void operator delete[](void* p); + void operator delete(void* p) { ShouldNotReachHere(); } + void operator delete[](void* p) { ShouldNotReachHere(); } Compilation* _compilation; ciMethod* _method; // method that we are compiling diff --git a/hotspot/src/share/vm/c1/c1_RangeCheckElimination.hpp b/hotspot/src/share/vm/c1/c1_RangeCheckElimination.hpp index 608f39a9196..0552125b8dc 100644 --- a/hotspot/src/share/vm/c1/c1_RangeCheckElimination.hpp +++ b/hotspot/src/share/vm/c1/c1_RangeCheckElimination.hpp @@ -50,8 +50,8 @@ private: private: void* operator new(size_t size) throw(); void* operator new[](size_t size) throw(); - void operator delete(void* p); - void operator delete[](void* p); + void operator delete(void* p) { ShouldNotReachHere(); } + void operator delete[](void* p) { ShouldNotReachHere(); } IR *_ir; boolArray _used; From 2b4403dc8837df2edb09a294c6cd0f902a6dba7c Mon Sep 17 00:00:00 2001 From: Goetz Lindenmaier Date: Mon, 14 Dec 2015 10:22:19 +0100 Subject: [PATCH 8/8] 8145300: ppc64: fix port of "8072008: Emit direct call instead of linkTo* for recursive indy/MH.invoke* calls" Reviewed-by: simonis --- hotspot/src/cpu/ppc/vm/ppc.ad | 1 + 1 file changed, 1 insertion(+) diff --git a/hotspot/src/cpu/ppc/vm/ppc.ad b/hotspot/src/cpu/ppc/vm/ppc.ad index 781b4038229..682474c05a3 100644 --- a/hotspot/src/cpu/ppc/vm/ppc.ad +++ b/hotspot/src/cpu/ppc/vm/ppc.ad @@ -3486,6 +3486,7 @@ encode %{ call->_jvmadj = _jvmadj; call->_in_rms = _in_rms; call->_nesting = _nesting; + call->_override_symbolic_info = _override_symbolic_info; // New call needs all inputs of old call. // Req...