From 0a123cad8fdbc7817c89251ac7c9fb3157288bf9 Mon Sep 17 00:00:00 2001 From: Tomasz Wojtowicz Date: Thu, 5 May 2016 17:16:08 -0700 Subject: [PATCH] 8154974: AVX-512 equipped inflate, has_negatives & compress intrinsics Reviewed-by: kvn --- hotspot/src/cpu/x86/vm/assembler_x86.cpp | 165 +++++- hotspot/src/cpu/x86/vm/assembler_x86.hpp | 39 +- hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp | 500 +++++++++++++++--- 3 files changed, 601 insertions(+), 103 deletions(-) diff --git a/hotspot/src/cpu/x86/vm/assembler_x86.cpp b/hotspot/src/cpu/x86/vm/assembler_x86.cpp index dccf9c8cfb0..d65577ffca5 100644 --- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp +++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp @@ -2332,6 +2332,22 @@ void Assembler::ktestql(KRegister src1, KRegister src2) { emit_int8((unsigned char)(0xC0 | encode)); } +void Assembler::ktestq(KRegister src1, KRegister src2) { + assert(VM_Version::supports_avx512bw(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = vex_prefix_and_encode(src1->encoding(), 0, src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); + emit_int8((unsigned char)0x99); + emit_int8((unsigned char)(0xC0 | encode)); +} + +void Assembler::ktestd(KRegister src1, KRegister src2) { + assert(VM_Version::supports_avx512bw(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = vex_prefix_and_encode(src1->encoding(), 0, src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int8((unsigned char)0x99); + emit_int8((unsigned char)(0xC0 | encode)); +} + void Assembler::movb(Address dst, int imm8) { InstructionMark im(this); prefix(dst); @@ -2500,7 +2516,7 @@ void Assembler::evmovdqub(Address dst, XMMRegister src, int vector_len) { emit_operand(src, dst); } -void Assembler::evmovdqub(KRegister mask, XMMRegister dst, Address src, int vector_len) { +void 
Assembler::evmovdqub(XMMRegister dst, KRegister mask, Address src, int vector_len) { assert(VM_Version::supports_avx512vlbw(), ""); assert(is_vector_masking(), ""); // For stub code use only InstructionMark im(this); @@ -2513,16 +2529,6 @@ void Assembler::evmovdqub(KRegister mask, XMMRegister dst, Address src, int vect emit_operand(dst, src); } -void Assembler::evmovdquw(XMMRegister dst, XMMRegister src, int vector_len) { - assert(VM_Version::supports_evex(), ""); - InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); - attributes.set_is_evex_instruction(); - int prefix = (_legacy_mode_bw) ? VEX_SIMD_F2 : VEX_SIMD_F3; - int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), (Assembler::VexSimdPrefix)prefix, VEX_OPCODE_0F, &attributes); - emit_int8(0x6F); - emit_int8((unsigned char)(0xC0 | encode)); -} - void Assembler::evmovdquw(XMMRegister dst, Address src, int vector_len) { assert(VM_Version::supports_evex(), ""); InstructionMark im(this); @@ -2535,6 +2541,19 @@ void Assembler::evmovdquw(XMMRegister dst, Address src, int vector_len) { emit_operand(dst, src); } +void Assembler::evmovdquw(XMMRegister dst, KRegister mask, Address src, int vector_len) { + assert(is_vector_masking(), ""); + assert(VM_Version::supports_avx512vlbw(), ""); + InstructionMark im(this); + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); + attributes.set_embedded_opmask_register_specifier(mask); + attributes.set_is_evex_instruction(); + vex_prefix(src, 0, dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes); + emit_int8(0x6F); + emit_operand(dst, src); +} + void Assembler::evmovdquw(Address dst, XMMRegister src, int vector_len) { assert(VM_Version::supports_evex(), ""); assert(src != xnoreg, 
"sanity"); @@ -2548,6 +2567,19 @@ void Assembler::evmovdquw(Address dst, XMMRegister src, int vector_len) { emit_operand(src, dst); } +void Assembler::evmovdquw(Address dst, KRegister mask, XMMRegister src, int vector_len) { + assert(VM_Version::supports_avx512vlbw(), ""); + assert(src != xnoreg, "sanity"); + InstructionMark im(this); + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); + attributes.set_embedded_opmask_register_specifier(mask); + attributes.set_is_evex_instruction(); + vex_prefix(dst, 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes); + emit_int8(0x7F); + emit_operand(src, dst); +} + void Assembler::evmovdqul(XMMRegister dst, XMMRegister src, int vector_len) { assert(VM_Version::supports_evex(), ""); InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); @@ -3295,10 +3327,71 @@ void Assembler::evpcmpeqb(KRegister kdst, XMMRegister nds, XMMRegister src, int emit_int8((unsigned char)(0xC0 | encode)); } +void Assembler::evpcmpgtb(KRegister kdst, XMMRegister nds, Address src, int vector_len) { + assert(VM_Version::supports_avx512vlbw(), ""); + InstructionMark im(this); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false); + attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); + attributes.set_is_evex_instruction(); + int dst_enc = kdst->encoding(); + vex_prefix(src, nds->encoding(), dst_enc, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int8(0x64); + emit_operand(as_Register(dst_enc), src); +} + +void Assembler::evpcmpgtb(KRegister kdst, KRegister mask, XMMRegister nds, Address src, int vector_len) { + assert(is_vector_masking(), ""); + 
assert(VM_Version::supports_avx512vlbw(), ""); + InstructionMark im(this); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false); + attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); + attributes.set_embedded_opmask_register_specifier(mask); + attributes.set_is_evex_instruction(); + int dst_enc = kdst->encoding(); + vex_prefix(src, nds->encoding(), dst_enc, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int8(0x64); + emit_operand(as_Register(dst_enc), src); +} + +void Assembler::evpcmpuw(KRegister kdst, XMMRegister nds, XMMRegister src, ComparisonPredicate vcc, int vector_len) { + assert(VM_Version::supports_avx512vlbw(), ""); + InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false); + attributes.set_is_evex_instruction(); + int encode = vex_prefix_and_encode(kdst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int8(0x3E); + emit_int8((unsigned char)(0xC0 | encode)); + emit_int8(vcc); +} + +void Assembler::evpcmpuw(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src, ComparisonPredicate vcc, int vector_len) { + assert(is_vector_masking(), ""); + assert(VM_Version::supports_avx512vlbw(), ""); + InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false); + attributes.set_embedded_opmask_register_specifier(mask); + attributes.set_is_evex_instruction(); + int encode = vex_prefix_and_encode(kdst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int8(0x3E); + emit_int8((unsigned char)(0xC0 | encode)); + emit_int8(vcc); +} + +void Assembler::evpcmpuw(KRegister kdst, XMMRegister nds, Address src, ComparisonPredicate vcc, int vector_len) { + assert(VM_Version::supports_avx512vlbw(), ""); + 
InstructionMark im(this); + InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false); + attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); + attributes.set_is_evex_instruction(); + int dst_enc = kdst->encoding(); + vex_prefix(src, nds->encoding(), kdst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int8(0x3E); + emit_operand(as_Register(dst_enc), src); + emit_int8(vcc); +} + void Assembler::evpcmpeqb(KRegister kdst, XMMRegister nds, Address src, int vector_len) { assert(VM_Version::supports_avx512bw(), ""); InstructionMark im(this); - InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false); attributes.set_is_evex_instruction(); attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); int dst_enc = kdst->encoding(); @@ -3307,7 +3400,7 @@ void Assembler::evpcmpeqb(KRegister kdst, XMMRegister nds, Address src, int vect emit_operand(as_Register(dst_enc), src); } -void Assembler::evpcmpeqb(KRegister mask, KRegister kdst, XMMRegister nds, Address src, int vector_len) { +void Assembler::evpcmpeqb(KRegister kdst, KRegister mask, XMMRegister nds, Address src, int vector_len) { assert(VM_Version::supports_avx512vlbw(), ""); assert(is_vector_masking(), ""); // For stub code use only InstructionMark im(this); @@ -3620,6 +3713,46 @@ void Assembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) { emit_operand(dst, src); } +void Assembler::evpmovzxbw(XMMRegister dst, KRegister mask, Address src, int vector_len) { + assert(is_vector_masking(), ""); + assert(VM_Version::supports_avx512vlbw(), ""); + assert(dst != xnoreg, "sanity"); + InstructionMark im(this); + InstructionAttr attributes(vector_len, 
/* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false); + attributes.set_address_attributes(/* tuple_type */ EVEX_HVM, /* input_size_in_bits */ EVEX_NObit); + attributes.set_embedded_opmask_register_specifier(mask); + attributes.set_is_evex_instruction(); + vex_prefix(src, 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8(0x30); + emit_operand(dst, src); +} + +void Assembler::evpmovwb(Address dst, XMMRegister src, int vector_len) { + assert(VM_Version::supports_avx512vlbw(), ""); + assert(src != xnoreg, "sanity"); + InstructionMark im(this); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false); + attributes.set_address_attributes(/* tuple_type */ EVEX_HVM, /* input_size_in_bits */ EVEX_NObit); + attributes.set_is_evex_instruction(); + vex_prefix(dst, 0, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F_38, &attributes); + emit_int8(0x30); + emit_operand(src, dst); +} + +void Assembler::evpmovwb(Address dst, KRegister mask, XMMRegister src, int vector_len) { + assert(is_vector_masking(), ""); + assert(VM_Version::supports_avx512vlbw(), ""); + assert(src != xnoreg, "sanity"); + InstructionMark im(this); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false); + attributes.set_address_attributes(/* tuple_type */ EVEX_HVM, /* input_size_in_bits */ EVEX_NObit); + attributes.set_embedded_opmask_register_specifier(mask); + attributes.set_is_evex_instruction(); + vex_prefix(dst, 0, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F_38, &attributes); + emit_int8(0x30); + emit_operand(src, dst); +} + // generic void Assembler::pop(Register dst) { int encode = prefix_and_encode(dst->encoding()); @@ -6406,7 +6539,6 @@ void Assembler::vzeroupper() { emit_int8(0x77); } - #ifndef _LP64 // 32bit only pieces of the assembler @@ -6973,7 +7105,10 @@ void Assembler::evex_prefix(bool 
vex_r, bool vex_b, bool vex_x, bool evex_r, boo emit_int8(byte3); // P2: byte 4 as zL'Lbv'aaa - int byte4 = (_attributes->is_no_reg_mask()) ? 0 : _attributes->get_embedded_opmask_register_specifier(); // kregs are implemented in the low 3 bits as aaa (hard code k1, it will be initialized for now) + // kregs are implemented in the low 3 bits as aaa (hard code k1, it will be initialized for now) + int byte4 = (_attributes->is_no_reg_mask()) ? + 0 : + _attributes->get_embedded_opmask_register_specifier(); // EVEX.v` for extending EVEX.vvvv or VIDX byte4 |= (evex_v ? 0: EVEX_V); // third EXEC.b for broadcast actions diff --git a/hotspot/src/cpu/x86/vm/assembler_x86.hpp b/hotspot/src/cpu/x86/vm/assembler_x86.hpp index 9e40bbb90dd..56165382936 100644 --- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp +++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp @@ -587,6 +587,16 @@ class Assembler : public AbstractAssembler { #endif }; + enum ComparisonPredicate { + eq = 0, + lt = 1, + le = 2, + _false = 3, + neq = 4, + nlt = 5, + nle = 6, + _true = 7 + }; // NOTE: The general philopsophy of the declarations here is that 64bit versions @@ -830,7 +840,6 @@ private: void clear_vector_masking(void) { _vector_masking = false; } bool is_vector_masking(void) { return _vector_masking; } - void lea(Register dst, Address src); void mov(Register dst, Register src); @@ -1362,6 +1371,9 @@ private: void kortestdl(KRegister dst, KRegister src); void kortestql(KRegister dst, KRegister src); + void ktestq(KRegister src1, KRegister src2); + void ktestd(KRegister src1, KRegister src2); + void ktestql(KRegister dst, KRegister src); void movdl(XMMRegister dst, Register src); @@ -1391,10 +1403,11 @@ private: void evmovdqub(Address dst, XMMRegister src, int vector_len); void evmovdqub(XMMRegister dst, Address src, int vector_len); void evmovdqub(XMMRegister dst, XMMRegister src, int vector_len); - void evmovdqub(KRegister mask, XMMRegister dst, Address src, int vector_len); + void evmovdqub(XMMRegister dst, 
KRegister mask, Address src, int vector_len); void evmovdquw(Address dst, XMMRegister src, int vector_len); + void evmovdquw(Address dst, KRegister mask, XMMRegister src, int vector_len); void evmovdquw(XMMRegister dst, Address src, int vector_len); - void evmovdquw(XMMRegister dst, XMMRegister src, int vector_len); + void evmovdquw(XMMRegister dst, KRegister mask, Address src, int vector_len); void evmovdqul(Address dst, XMMRegister src, int vector_len); void evmovdqul(XMMRegister dst, Address src, int vector_len); void evmovdqul(XMMRegister dst, XMMRegister src, int vector_len); @@ -1545,7 +1558,14 @@ private: void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); void evpcmpeqb(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len); void evpcmpeqb(KRegister kdst, XMMRegister nds, Address src, int vector_len); - void evpcmpeqb(KRegister mask, KRegister kdst, XMMRegister nds, Address src, int vector_len); + void evpcmpeqb(KRegister kdst, KRegister mask, XMMRegister nds, Address src, int vector_len); + + void evpcmpgtb(KRegister kdst, XMMRegister nds, Address src, int vector_len); + void evpcmpgtb(KRegister kdst, KRegister mask, XMMRegister nds, Address src, int vector_len); + + void evpcmpuw(KRegister kdst, XMMRegister nds, XMMRegister src, ComparisonPredicate vcc, int vector_len); + void evpcmpuw(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src, ComparisonPredicate of, int vector_len); + void evpcmpuw(KRegister kdst, XMMRegister nds, Address src, ComparisonPredicate vcc, int vector_len); void pcmpeqw(XMMRegister dst, XMMRegister src); void vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); @@ -1589,7 +1609,11 @@ private: void pmovzxbw(XMMRegister dst, XMMRegister src); void pmovzxbw(XMMRegister dst, Address src); - void vpmovzxbw(XMMRegister dst, Address src, int vector_len); + void vpmovzxbw( XMMRegister dst, Address src, int vector_len); + void evpmovzxbw(XMMRegister dst, KRegister 
mask, Address src, int vector_len); + + void evpmovwb(Address dst, XMMRegister src, int vector_len); + void evpmovwb(Address dst, KRegister mask, XMMRegister src, int vector_len); #ifndef _LP64 // no 32bit push/pop on amd64 void popl(Address dst); @@ -1839,6 +1863,8 @@ private: void vsubss(XMMRegister dst, XMMRegister nds, Address src); void vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src); + void shlxl(Register dst, Register src1, Register src2); + void shlxq(Register dst, Register src1, Register src2); //====================VECTOR ARITHMETIC===================================== @@ -2073,9 +2099,6 @@ private: void cmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len); void vpblendd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len); - void shlxl(Register dst, Register src1, Register src2); - void shlxq(Register dst, Register src1, Register src2); - protected: // Next instructions require address alignment 16 bytes SSE mode. // They should be called only from corresponding MacroAssembler instructions. diff --git a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp index 32a33e307f2..fc41ecec4b2 100644 --- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp +++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp @@ -8251,10 +8251,19 @@ void MacroAssembler::string_compare(Register str1, Register str2, // Search for Non-ASCII character (Negative byte value) in a byte array, // return true if it has any and false otherwise. 
+// ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java +// @HotSpotIntrinsicCandidate +// private static boolean hasNegatives(byte[] ba, int off, int len) { +// for (int i = off; i < off + len; i++) { +// if (ba[i] < 0) { +// return true; +// } +// } +// return false; +// } void MacroAssembler::has_negatives(Register ary1, Register len, - Register result, Register tmp1, - XMMRegister vec1, XMMRegister vec2) { - + Register result, Register tmp1, + XMMRegister vec1, XMMRegister vec2) { // rsi: byte array // rcx: len // rax: result @@ -8267,79 +8276,161 @@ void MacroAssembler::has_negatives(Register ary1, Register len, testl(len, len); jcc(Assembler::zero, FALSE_LABEL); - movl(result, len); // copy + if ((UseAVX > 2) && // AVX512 + VM_Version::supports_avx512vlbw() && + VM_Version::supports_bmi2()) { - if (UseAVX >= 2 && UseSSE >= 2) { - // With AVX2, use 32-byte vector compare - Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; + set_vector_masking(); // opening of the stub context for programming mask registers - // Compare 32-byte vectors - andl(result, 0x0000001f); // tail count (in bytes) - andl(len, 0xffffffe0); // vector count (in bytes) - jcc(Assembler::zero, COMPARE_TAIL); + Label test_64_loop, test_tail; + Register tmp3_aliased = len; + + movl(tmp1, len); + vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); + + andl(tmp1, 64 - 1); // tail count (in chars) 0x3F + andl(len, ~(64 - 1)); // vector count (in chars) + jccb(Assembler::zero, test_tail); lea(ary1, Address(ary1, len, Address::times_1)); negptr(len); - movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector - movdl(vec2, tmp1); - vpbroadcastd(vec2, vec2); - - bind(COMPARE_WIDE_VECTORS); - vmovdqu(vec1, Address(ary1, len, Address::times_1)); - vptest(vec1, vec2); + bind(test_64_loop); + // Check whether our 64 elements of size byte contain negatives + evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); + kortestql(k2, k2); jcc(Assembler::notZero, 
TRUE_LABEL); - addptr(len, 32); - jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); - testl(result, result); + addptr(len, 64); + jccb(Assembler::notZero, test_64_loop); + + + bind(test_tail); + // bail out when there is nothing to be done + testl(tmp1, -1); jcc(Assembler::zero, FALSE_LABEL); - vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); - vptest(vec1, vec2); + // Save k1 + kmovql(k3, k1); + + // ~(~0 << len) applied up to two times (for 32-bit scenario) +#ifdef _LP64 + mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF); + shlxq(tmp3_aliased, tmp3_aliased, tmp1); + notq(tmp3_aliased); + kmovql(k1, tmp3_aliased); +#else + Label k_init; + jmp(k_init); + + // We could not read 64-bits from a general purpose register thus we move + // data required to compose 64 1's to the instruction stream + // We emit 64 byte wide series of elements from 0..63 which later on would + // be used as a compare targets with tail count contained in tmp1 register. + // Result would be a k1 register having tmp1 consecutive number or 1 + // counting from least significant bit. 
+ address tmp = pc(); + emit_int64(0x0706050403020100); + emit_int64(0x0F0E0D0C0B0A0908); + emit_int64(0x1716151413121110); + emit_int64(0x1F1E1D1C1B1A1918); + emit_int64(0x2726252423222120); + emit_int64(0x2F2E2D2C2B2A2928); + emit_int64(0x3736353433323130); + emit_int64(0x3F3E3D3C3B3A3938); + + bind(k_init); + lea(len, InternalAddress(tmp)); + // create mask to test for negative byte inside a vector + evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit); + evpcmpgtb(k1, vec1, Address(len, 0), Assembler::AVX_512bit); + +#endif + evpcmpgtb(k2, k1, vec2, Address(ary1, 0), Assembler::AVX_512bit); + ktestq(k2, k1); + // Restore k1 + kmovql(k1, k3); jcc(Assembler::notZero, TRUE_LABEL); + jmp(FALSE_LABEL); - bind(COMPARE_TAIL); // len is zero - movl(len, result); - // Fallthru to tail compare - } else if (UseSSE42Intrinsics) { - assert(UseSSE >= 4, "SSE4 must be for SSE4.2 intrinsics to be available"); - // With SSE4.2, use double quad vector compare - Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; - - // Compare 16-byte vectors - andl(result, 0x0000000f); // tail count (in bytes) - andl(len, 0xfffffff0); // vector count (in bytes) - jccb(Assembler::zero, COMPARE_TAIL); - - lea(ary1, Address(ary1, len, Address::times_1)); - negptr(len); - - movl(tmp1, 0x80808080); - movdl(vec2, tmp1); - pshufd(vec2, vec2, 0); - - bind(COMPARE_WIDE_VECTORS); - movdqu(vec1, Address(ary1, len, Address::times_1)); - ptest(vec1, vec2); - jcc(Assembler::notZero, TRUE_LABEL); - addptr(len, 16); - jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); - - testl(result, result); - jcc(Assembler::zero, FALSE_LABEL); - - movdqu(vec1, Address(ary1, result, Address::times_1, -16)); - ptest(vec1, vec2); - jccb(Assembler::notZero, TRUE_LABEL); - jmpb(FALSE_LABEL); - - bind(COMPARE_TAIL); // len is zero - movl(len, result); - // Fallthru to tail compare + clear_vector_masking(); // closing of the stub context for programming mask registers } + else { + movl(result, len); // copy + if (UseAVX == 2 && UseSSE >= 2) { + // 
With AVX2, use 32-byte vector compare + Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; + + // Compare 32-byte vectors + andl(result, 0x0000001f); // tail count (in bytes) + andl(len, 0xffffffe0); // vector count (in bytes) + jccb(Assembler::zero, COMPARE_TAIL); + + lea(ary1, Address(ary1, len, Address::times_1)); + negptr(len); + + movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector + movdl(vec2, tmp1); + vpbroadcastd(vec2, vec2); + + bind(COMPARE_WIDE_VECTORS); + vmovdqu(vec1, Address(ary1, len, Address::times_1)); + vptest(vec1, vec2); + jccb(Assembler::notZero, TRUE_LABEL); + addptr(len, 32); + jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); + + testl(result, result); + jccb(Assembler::zero, FALSE_LABEL); + + vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); + vptest(vec1, vec2); + jccb(Assembler::notZero, TRUE_LABEL); + jmpb(FALSE_LABEL); + + bind(COMPARE_TAIL); // len is zero + movl(len, result); + // Fallthru to tail compare + } + else if (UseSSE42Intrinsics) { + assert(UseSSE >= 4, "SSE4 must be for SSE4.2 intrinsics to be available"); + // With SSE4.2, use double quad vector compare + Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; + + // Compare 16-byte vectors + andl(result, 0x0000000f); // tail count (in bytes) + andl(len, 0xfffffff0); // vector count (in bytes) + jccb(Assembler::zero, COMPARE_TAIL); + + lea(ary1, Address(ary1, len, Address::times_1)); + negptr(len); + + movl(tmp1, 0x80808080); + movdl(vec2, tmp1); + pshufd(vec2, vec2, 0); + + bind(COMPARE_WIDE_VECTORS); + movdqu(vec1, Address(ary1, len, Address::times_1)); + ptest(vec1, vec2); + jccb(Assembler::notZero, TRUE_LABEL); + addptr(len, 16); + jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); + + testl(result, result); + jccb(Assembler::zero, FALSE_LABEL); + + movdqu(vec1, Address(ary1, result, Address::times_1, -16)); + ptest(vec1, vec2); + jccb(Assembler::notZero, TRUE_LABEL); + jmpb(FALSE_LABEL); + + bind(COMPARE_TAIL); // len is zero + movl(len, result); + // Fallthru to 
tail compare + } + } // Compare 4-byte vectors andl(len, 0xfffffffc); // vector count (in bytes) jccb(Assembler::zero, COMPARE_CHAR); @@ -8387,7 +8478,6 @@ void MacroAssembler::has_negatives(Register ary1, Register len, vpxor(vec2, vec2); } } - // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. void MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, Register limit, Register result, Register chr, @@ -8833,10 +8923,23 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned, } // encode char[] to byte[] in ISO_8859_1 + //@HotSpotIntrinsicCandidate + //private static int implEncodeISOArray(byte[] sa, int sp, + //byte[] da, int dp, int len) { + // int i = 0; + // for (; i < len; i++) { + // char c = StringUTF16.getChar(sa, sp++); + // if (c > '\u00FF') + // break; + // da[dp++] = (byte)c; + // } + // return i; + //} void MacroAssembler::encode_iso_array(Register src, Register dst, Register len, - XMMRegister tmp1Reg, XMMRegister tmp2Reg, - XMMRegister tmp3Reg, XMMRegister tmp4Reg, - Register tmp5, Register result) { + XMMRegister tmp1Reg, XMMRegister tmp2Reg, + XMMRegister tmp3Reg, XMMRegister tmp4Reg, + Register tmp5, Register result) { + // rsi: src // rdi: dst // rdx: len @@ -8851,6 +8954,7 @@ void MacroAssembler::encode_iso_array(Register src, Register dst, Register len, // check for zero length testl(len, len); jcc(Assembler::zero, L_done); + movl(result, len); // Setup pointers @@ -8959,6 +9063,7 @@ void MacroAssembler::encode_iso_array(Register src, Register dst, Register len, bind(L_copy_1_char_exit); addptr(result, len); // len is negative count of not processed elements + bind(L_done); } @@ -9470,8 +9575,8 @@ void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register notq(tmp2); kmovql(k1, tmp2); - evmovdqub(k1, rymm0, Address(obja, result), Assembler::AVX_512bit); - evpcmpeqb(k1, k7, rymm0, Address(objb, result), Assembler::AVX_512bit); + evmovdqub(rymm0, k1, Address(obja, result), 
Assembler::AVX_512bit); + evpcmpeqb(k7, k1, rymm0, Address(objb, result), Assembler::AVX_512bit); ktestql(k7, k1); // Restore k1 @@ -10830,13 +10935,24 @@ void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Registe #undef BIND #undef BLOCK_COMMENT - // Compress char[] array to byte[]. +// ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java +// @HotSpotIntrinsicCandidate +// private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) { +// for (int i = 0; i < len; i++) { +// int c = src[srcOff++]; +// if (c >>> 8 != 0) { +// return 0; +// } +// dst[dstOff++] = (byte)c; +// } +// return len; +// } void MacroAssembler::char_array_compress(Register src, Register dst, Register len, - XMMRegister tmp1Reg, XMMRegister tmp2Reg, - XMMRegister tmp3Reg, XMMRegister tmp4Reg, - Register tmp5, Register result) { - Label copy_chars_loop, return_length, return_zero, done; + XMMRegister tmp1Reg, XMMRegister tmp2Reg, + XMMRegister tmp3Reg, XMMRegister tmp4Reg, + Register tmp5, Register result) { + Label copy_chars_loop, return_length, return_zero, done, below_threshold; // rsi: src // rdi: dst @@ -10853,11 +10969,141 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le // save length for return push(len); + if ((UseAVX > 2) && // AVX512 + VM_Version::supports_avx512vlbw() && + VM_Version::supports_bmi2()) { + + set_vector_masking(); // opening of the stub context for programming mask registers + + Label copy_32_loop, copy_loop_tail, copy_just_portion_of_candidates; + + // alignement + Label post_alignement; + + // if length of the string is less than 16, handle it in an old fashioned + // way + testl(len, -32); + jcc(Assembler::zero, below_threshold); + + // First check whether a character is compressable ( <= 0xFF). 
+ // Create mask to test for Unicode chars inside zmm vector + movl(result, 0x00FF); + evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit); + + testl(len, -64); + jcc(Assembler::zero, post_alignement); + + // Save k1 + kmovql(k3, k1); + + movl(tmp5, dst); + andl(tmp5, (64 - 1)); + negl(tmp5); + andl(tmp5, (64 - 1)); + + // bail out when there is nothing to be done + testl(tmp5, 0xFFFFFFFF); + jcc(Assembler::zero, post_alignement); + + // ~(~0 << len), where len is the # of remaining elements to process + movl(result, 0xFFFFFFFF); + shlxl(result, result, tmp5); + notl(result); + + kmovdl(k1, result); + + evmovdquw(tmp1Reg, k1, Address(src, 0), Assembler::AVX_512bit); + evpcmpuw(k2, k1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit); + ktestd(k2, k1); + jcc(Assembler::carryClear, copy_just_portion_of_candidates); + + evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit); + + addptr(src, tmp5); + addptr(src, tmp5); + addptr(dst, tmp5); + subl(len, tmp5); + + bind(post_alignement); + // end of alignement + + movl(tmp5, len); + andl(tmp5, (32 - 1)); // tail count (in chars) + andl(len, ~(32 - 1)); // vector count (in chars) + jcc(Assembler::zero, copy_loop_tail); + + lea(src, Address(src, len, Address::times_2)); + lea(dst, Address(dst, len, Address::times_1)); + negptr(len); + + bind(copy_32_loop); + evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit); + evpcmpuw(k2, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit); + kortestdl(k2, k2); + jcc(Assembler::carryClear, copy_just_portion_of_candidates); + + // All elements in current processed chunk are valid candidates for + // compression. Write a truncated byte elements to the memory. 
+ evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit); + addptr(len, 32); + jcc(Assembler::notZero, copy_32_loop); + + bind(copy_loop_tail); + // bail out when there is nothing to be done + testl(tmp5, 0xFFFFFFFF); + jcc(Assembler::zero, return_length); + + // Save k1 + kmovql(k3, k1); + + movl(len, tmp5); + + // ~(~0 << len), where len is the # of remaining elements to process + movl(result, 0xFFFFFFFF); + shlxl(result, result, len); + notl(result); + + kmovdl(k1, result); + + evmovdquw(tmp1Reg, k1, Address(src, 0), Assembler::AVX_512bit); + evpcmpuw(k2, k1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit); + ktestd(k2, k1); + jcc(Assembler::carryClear, copy_just_portion_of_candidates); + + evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit); + // Restore k1 + kmovql(k1, k3); + + jmp(return_length); + + bind(copy_just_portion_of_candidates); + kmovdl(tmp5, k2); + tzcntl(tmp5, tmp5); + + // ~(~0 << tmp5), where tmp5 is a number of elements in an array from the + // result to the first element larger than 0xFF + movl(result, 0xFFFFFFFF); + shlxl(result, result, tmp5); + notl(result); + + kmovdl(k1, result); + + evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit); + // Restore k1 + kmovql(k1, k3); + + jmp(return_zero); + + clear_vector_masking(); // closing of the stub context for programming mask registers + } if (UseSSE42Intrinsics) { assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available"); Label copy_32_loop, copy_16, copy_tail; + bind(below_threshold); + movl(result, len); + movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vectors // vectored compression @@ -10939,10 +11185,16 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le } // Inflate byte[] array to char[]. 
+// ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java +// @HotSpotIntrinsicCandidate +// private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) { +// for (int i = 0; i < len; i++) { +// dst[dstOff++] = (char)(src[srcOff++] & 0xff); +// } +// } void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len, - XMMRegister tmp1, Register tmp2) { - Label copy_chars_loop, done; - + XMMRegister tmp1, Register tmp2) { + Label copy_chars_loop, done, below_threshold; // rsi: src // rdi: dst // rdx: len @@ -10953,20 +11205,109 @@ void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len // rdx holds length assert_different_registers(src, dst, len, tmp2); + if ((UseAVX > 2) && // AVX512 + VM_Version::supports_avx512vlbw() && + VM_Version::supports_bmi2()) { + + set_vector_masking(); // opening of the stub context for programming mask registers + + Label copy_32_loop, copy_tail; + Register tmp3_aliased = len; + + // if length of the string is less than 16, handle it in an old fashioned + // way + testl(len, -16); + jcc(Assembler::zero, below_threshold); + + // In order to use only one arithmetic operation for the main loop we use + // this pre-calculation + movl(tmp2, len); + andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop + andl(len, -32); // vector count + jccb(Assembler::zero, copy_tail); + + lea(src, Address(src, len, Address::times_1)); + lea(dst, Address(dst, len, Address::times_2)); + negptr(len); + + + // inflate 32 chars per iter + bind(copy_32_loop); + vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit); + evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit); + addptr(len, 32); + jcc(Assembler::notZero, copy_32_loop); + + bind(copy_tail); + // bail out when there is nothing to be done + testl(tmp2, -1); // we don't destroy the contents of tmp2 here + jcc(Assembler::zero, done); + + // Save k1 + kmovql(k2, k1); + + // 
~(~0 << length), where length is the # of remaining elements to process + movl(tmp3_aliased, -1); + shlxl(tmp3_aliased, tmp3_aliased, tmp2); + notl(tmp3_aliased); + kmovdl(k1, tmp3_aliased); + evpmovzxbw(tmp1, k1, Address(src, 0), Assembler::AVX_512bit); + evmovdquw(Address(dst, 0), k1, tmp1, Assembler::AVX_512bit); + + // Restore k1 + kmovql(k1, k2); + jmp(done); + + clear_vector_masking(); // closing of the stub context for programming mask registers + } if (UseSSE42Intrinsics) { assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available"); - Label copy_8_loop, copy_bytes, copy_tail; + Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail; movl(tmp2, len); - andl(tmp2, 0x00000007); // tail count (in chars) - andl(len, 0xfffffff8); // vector count (in chars) - jccb(Assembler::zero, copy_tail); + + if (UseAVX > 1) { + andl(tmp2, (16 - 1)); + andl(len, -16); + jccb(Assembler::zero, copy_new_tail); + } else { + andl(tmp2, 0x00000007); // tail count (in chars) + andl(len, 0xfffffff8); // vector count (in chars) + jccb(Assembler::zero, copy_tail); + } // vectored inflation lea(src, Address(src, len, Address::times_1)); lea(dst, Address(dst, len, Address::times_2)); negptr(len); + if (UseAVX > 1) { + bind(copy_16_loop); + vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit); + vmovdqu(Address(dst, len, Address::times_2), tmp1); + addptr(len, 16); + jcc(Assembler::notZero, copy_16_loop); + + bind(below_threshold); + bind(copy_new_tail); + if ((UseAVX > 2) && VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2()) { // must be the same guard as the AVX-512 block: only that block jumps to below_threshold, and it does so before tmp2 is set, so the count is still in len + movl(tmp2, len); + } + else { // the 16-wide path arrives here with the tail count already saved in tmp2 and len used up, so restore len from tmp2 + movl(len, tmp2); + } + andl(tmp2, 0x00000007); + andl(len, 0xFFFFFFF8); + jccb(Assembler::zero, copy_tail); + + pmovzxbw(tmp1, Address(src, 0)); + movdqu(Address(dst, 0), tmp1); + addptr(src, 8); + addptr(dst, 2 * 8); + + jmp(copy_tail, true); + } + // inflate 8 chars per iter bind(copy_8_loop); pmovzxbw(tmp1, Address(src, len, Address::times_1)); // unpack to 8 words @@ -11005,7 +11346,6 @@ void 
MacroAssembler::byte_array_inflate(Register src, Register dst, Register len bind(done); } - Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) { switch (cond) { // Note some conditions are synonyms for others