8154974: AVX-512 equipped inflate, has_negatives & compress intrinsics
Reviewed-by: kvn
This commit is contained in:
parent
472a1bc607
commit
0a123cad8f
@ -2332,6 +2332,22 @@ void Assembler::ktestql(KRegister src1, KRegister src2) {
|
|||||||
emit_int8((unsigned char)(0xC0 | encode));
|
emit_int8((unsigned char)(0xC0 | encode));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Emits the 64-bit mask-register test instruction: a VEX prefix in the 0F map
// with no SIMD prefix and W set, opcode byte 0x99, then a register-form ModRM
// byte built from the two mask register encodings.
// Asserts that the CPU supports AVX-512BW.
void Assembler::ktestq(KRegister src1, KRegister src2) {
  assert(VM_Version::supports_avx512bw(), "");
  InstructionAttr attrs(AVX_128bit,
                        /* rex_w */ true,
                        /* legacy_mode */ true,
                        /* no_mask_reg */ true,
                        /* uses_vl */ false);
  const int reg_bits = vex_prefix_and_encode(src1->encoding(), 0, src2->encoding(),
                                             VEX_SIMD_NONE, VEX_OPCODE_0F, &attrs);
  emit_int8((unsigned char)0x99);               // opcode
  emit_int8((unsigned char)(0xC0 | reg_bits));  // ModRM with the two top bits set (register form)
}
|
||||||
|
|
||||||
|
// Emits the 32-bit mask-register test instruction: a VEX prefix in the 0F map
// with the 66 SIMD prefix and W set, opcode byte 0x99, then a register-form
// ModRM byte built from the two mask register encodings.
// Asserts that the CPU supports AVX-512BW.
void Assembler::ktestd(KRegister src1, KRegister src2) {
  assert(VM_Version::supports_avx512bw(), "");
  InstructionAttr attrs(AVX_128bit,
                        /* rex_w */ true,
                        /* legacy_mode */ true,
                        /* no_mask_reg */ true,
                        /* uses_vl */ false);
  const int reg_bits = vex_prefix_and_encode(src1->encoding(), 0, src2->encoding(),
                                             VEX_SIMD_66, VEX_OPCODE_0F, &attrs);
  emit_int8((unsigned char)0x99);               // opcode
  emit_int8((unsigned char)(0xC0 | reg_bits));  // ModRM with the two top bits set (register form)
}
|
||||||
|
|
||||||
void Assembler::movb(Address dst, int imm8) {
|
void Assembler::movb(Address dst, int imm8) {
|
||||||
InstructionMark im(this);
|
InstructionMark im(this);
|
||||||
prefix(dst);
|
prefix(dst);
|
||||||
@ -2500,7 +2516,7 @@ void Assembler::evmovdqub(Address dst, XMMRegister src, int vector_len) {
|
|||||||
emit_operand(src, dst);
|
emit_operand(src, dst);
|
||||||
}
|
}
|
||||||
|
|
||||||
void Assembler::evmovdqub(KRegister mask, XMMRegister dst, Address src, int vector_len) {
|
void Assembler::evmovdqub(XMMRegister dst, KRegister mask, Address src, int vector_len) {
|
||||||
assert(VM_Version::supports_avx512vlbw(), "");
|
assert(VM_Version::supports_avx512vlbw(), "");
|
||||||
assert(is_vector_masking(), ""); // For stub code use only
|
assert(is_vector_masking(), ""); // For stub code use only
|
||||||
InstructionMark im(this);
|
InstructionMark im(this);
|
||||||
@ -2513,16 +2529,6 @@ void Assembler::evmovdqub(KRegister mask, XMMRegister dst, Address src, int vect
|
|||||||
emit_operand(dst, src);
|
emit_operand(dst, src);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Register-to-register form of evmovdquw (opcode 0x6F, 0F map).
// Requires EVEX support. The SIMD prefix is F2 when _legacy_mode_bw is set
// and F3 otherwise; no opmask register is used.
void Assembler::evmovdquw(XMMRegister dst, XMMRegister src, int vector_len) {
  assert(VM_Version::supports_evex(), "");
  InstructionAttr attrs(vector_len,
                        /* vex_w */ true,
                        /* legacy_mode */ _legacy_mode_bw,
                        /* no_mask_reg */ true,
                        /* uses_vl */ true);
  attrs.set_is_evex_instruction();
  int simd_pre;
  if (_legacy_mode_bw) {
    simd_pre = VEX_SIMD_F2;
  } else {
    simd_pre = VEX_SIMD_F3;
  }
  const int reg_bits = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(),
                                             (Assembler::VexSimdPrefix)simd_pre, VEX_OPCODE_0F, &attrs);
  emit_int8(0x6F);                              // opcode
  emit_int8((unsigned char)(0xC0 | reg_bits));  // register-form ModRM
}
|
|
||||||
|
|
||||||
void Assembler::evmovdquw(XMMRegister dst, Address src, int vector_len) {
|
void Assembler::evmovdquw(XMMRegister dst, Address src, int vector_len) {
|
||||||
assert(VM_Version::supports_evex(), "");
|
assert(VM_Version::supports_evex(), "");
|
||||||
InstructionMark im(this);
|
InstructionMark im(this);
|
||||||
@ -2535,6 +2541,19 @@ void Assembler::evmovdquw(XMMRegister dst, Address src, int vector_len) {
|
|||||||
emit_operand(dst, src);
|
emit_operand(dst, src);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Opmask-predicated load form of evmovdquw: EVEX prefix (F2, 0F map, W set),
// opcode 0x6F, memory operand `src`, destination `dst`, with `mask` recorded
// as the embedded opmask register.
// Asserts that vector masking is enabled and that AVX-512VL+BW is available.
void Assembler::evmovdquw(XMMRegister dst, KRegister mask, Address src, int vector_len) {
  assert(is_vector_masking(), "");
  assert(VM_Version::supports_avx512vlbw(), "");
  InstructionMark mark(this);
  InstructionAttr attrs(vector_len,
                        /* vex_w */ true,
                        /* legacy_mode */ _legacy_mode_bw,
                        /* no_mask_reg */ false,
                        /* uses_vl */ true);
  attrs.set_is_evex_instruction();
  attrs.set_embedded_opmask_register_specifier(mask);
  attrs.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
  vex_prefix(src, 0, dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attrs);
  emit_int8(0x6F);         // opcode (load direction)
  emit_operand(dst, src);
}
|
||||||
|
|
||||||
void Assembler::evmovdquw(Address dst, XMMRegister src, int vector_len) {
|
void Assembler::evmovdquw(Address dst, XMMRegister src, int vector_len) {
|
||||||
assert(VM_Version::supports_evex(), "");
|
assert(VM_Version::supports_evex(), "");
|
||||||
assert(src != xnoreg, "sanity");
|
assert(src != xnoreg, "sanity");
|
||||||
@ -2548,6 +2567,19 @@ void Assembler::evmovdquw(Address dst, XMMRegister src, int vector_len) {
|
|||||||
emit_operand(src, dst);
|
emit_operand(src, dst);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Opmask-predicated store form of evmovdquw: EVEX prefix (F2, 0F map, W set),
// opcode 0x7F, source register `src`, memory operand `dst`, with `mask`
// recorded as the embedded opmask register.
// Like the other opmask-taking emitters in this file, it may only be used
// while vector masking is enabled, so that precondition is asserted here too.
// Also asserts AVX-512VL+BW support and that `src` is a real register.
void Assembler::evmovdquw(Address dst, KRegister mask, XMMRegister src, int vector_len) {
  assert(is_vector_masking(), "");
  assert(VM_Version::supports_avx512vlbw(), "");
  assert(src != xnoreg, "sanity");
  InstructionMark im(this);
  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
  attributes.set_embedded_opmask_register_specifier(mask);
  attributes.set_is_evex_instruction();
  vex_prefix(dst, 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
  emit_int8(0x7F);         // opcode (store direction)
  emit_operand(src, dst);
}
|
||||||
|
|
||||||
void Assembler::evmovdqul(XMMRegister dst, XMMRegister src, int vector_len) {
|
void Assembler::evmovdqul(XMMRegister dst, XMMRegister src, int vector_len) {
|
||||||
assert(VM_Version::supports_evex(), "");
|
assert(VM_Version::supports_evex(), "");
|
||||||
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
|
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
|
||||||
@ -3295,10 +3327,71 @@ void Assembler::evpcmpeqb(KRegister kdst, XMMRegister nds, XMMRegister src, int
|
|||||||
emit_int8((unsigned char)(0xC0 | encode));
|
emit_int8((unsigned char)(0xC0 | encode));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Unmasked memory-operand form of evpcmpgtb: EVEX prefix (66, 0F map),
// opcode 0x64. `kdst` is placed in the ModRM reg field, `nds` is passed as the
// extra source register encoding and `src` is the memory operand.
// Asserts AVX-512VL+BW support.
void Assembler::evpcmpgtb(KRegister kdst, XMMRegister nds, Address src, int vector_len) {
  assert(VM_Version::supports_avx512vlbw(), "");
  InstructionMark mark(this);
  InstructionAttr attrs(vector_len,
                        /* rex_w */ false,
                        /* legacy_mode */ false,
                        /* no_mask_reg */ true,
                        /* uses_vl */ false);
  attrs.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
  attrs.set_is_evex_instruction();
  const int kdst_enc = kdst->encoding();
  vex_prefix(src, nds->encoding(), kdst_enc, VEX_SIMD_66, VEX_OPCODE_0F, &attrs);
  emit_int8(0x64);  // opcode
  // The mask register number is passed through as_Register() for emit_operand.
  emit_operand(as_Register(kdst_enc), src);
}
|
||||||
|
|
||||||
|
// Opmask-predicated memory-operand form of evpcmpgtb: same encoding as the
// unmasked overload (EVEX, 66, 0F map, opcode 0x64) but with `mask` recorded
// as the embedded opmask register.
// Asserts that vector masking is enabled and that AVX-512VL+BW is available.
void Assembler::evpcmpgtb(KRegister kdst, KRegister mask, XMMRegister nds, Address src, int vector_len) {
  assert(is_vector_masking(), "");
  assert(VM_Version::supports_avx512vlbw(), "");
  InstructionMark mark(this);
  InstructionAttr attrs(vector_len,
                        /* rex_w */ false,
                        /* legacy_mode */ false,
                        /* no_mask_reg */ false,
                        /* uses_vl */ false);
  attrs.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
  attrs.set_embedded_opmask_register_specifier(mask);
  attrs.set_is_evex_instruction();
  const int kdst_enc = kdst->encoding();
  vex_prefix(src, nds->encoding(), kdst_enc, VEX_SIMD_66, VEX_OPCODE_0F, &attrs);
  emit_int8(0x64);  // opcode
  // The mask register number is passed through as_Register() for emit_operand.
  emit_operand(as_Register(kdst_enc), src);
}
|
||||||
|
|
||||||
|
// Unmasked register form of evpcmpuw: EVEX prefix (66, 0F3A map, W set),
// opcode 0x3E, a register-form ModRM byte, and finally the comparison
// predicate `vcc` emitted as a trailing one-byte immediate.
// Asserts AVX-512VL+BW support.
void Assembler::evpcmpuw(KRegister kdst, XMMRegister nds, XMMRegister src, ComparisonPredicate vcc, int vector_len) {
  assert(VM_Version::supports_avx512vlbw(), "");
  InstructionAttr attrs(vector_len,
                        /* rex_w */ true,
                        /* legacy_mode */ false,
                        /* no_mask_reg */ true,
                        /* uses_vl */ false);
  attrs.set_is_evex_instruction();
  const int reg_bits = vex_prefix_and_encode(kdst->encoding(), nds->encoding(), src->encoding(),
                                             VEX_SIMD_66, VEX_OPCODE_0F_3A, &attrs);
  emit_int8(0x3E);                              // opcode
  emit_int8((unsigned char)(0xC0 | reg_bits));  // register-form ModRM
  emit_int8(vcc);                               // predicate immediate
}
|
||||||
|
|
||||||
|
// Opmask-predicated register form of evpcmpuw: same encoding as the unmasked
// register overload (EVEX, 66, 0F3A map, W set, opcode 0x3E, predicate
// immediate) but with `mask` recorded as the embedded opmask register.
// Asserts that vector masking is enabled and that AVX-512VL+BW is available.
void Assembler::evpcmpuw(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src, ComparisonPredicate vcc, int vector_len) {
  assert(is_vector_masking(), "");
  assert(VM_Version::supports_avx512vlbw(), "");
  InstructionAttr attrs(vector_len,
                        /* rex_w */ true,
                        /* legacy_mode */ false,
                        /* no_mask_reg */ false,
                        /* uses_vl */ false);
  attrs.set_embedded_opmask_register_specifier(mask);
  attrs.set_is_evex_instruction();
  const int reg_bits = vex_prefix_and_encode(kdst->encoding(), nds->encoding(), src->encoding(),
                                             VEX_SIMD_66, VEX_OPCODE_0F_3A, &attrs);
  emit_int8(0x3E);                              // opcode
  emit_int8((unsigned char)(0xC0 | reg_bits));  // register-form ModRM
  emit_int8(vcc);                               // predicate immediate
}
|
||||||
|
|
||||||
|
// Unmasked memory-operand form of evpcmpuw: EVEX prefix (66, 0F3A map, W set),
// opcode 0x3E, memory operand `src` with `kdst` in the ModRM reg field, and
// finally the comparison predicate `vcc` emitted as a trailing one-byte
// immediate.
// Asserts AVX-512VL+BW support.
void Assembler::evpcmpuw(KRegister kdst, XMMRegister nds, Address src, ComparisonPredicate vcc, int vector_len) {
  assert(VM_Version::supports_avx512vlbw(), "");
  InstructionMark mark(this);
  InstructionAttr attrs(vector_len,
                        /* rex_w */ true,
                        /* legacy_mode */ false,
                        /* no_mask_reg */ true,
                        /* uses_vl */ false);
  attrs.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
  attrs.set_is_evex_instruction();
  const int kdst_enc = kdst->encoding();
  vex_prefix(src, nds->encoding(), kdst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attrs);
  emit_int8(0x3E);                           // opcode
  emit_operand(as_Register(kdst_enc), src);  // ModRM/SIB/displacement for the memory operand
  emit_int8(vcc);                            // predicate immediate
}
|
||||||
|
|
||||||
void Assembler::evpcmpeqb(KRegister kdst, XMMRegister nds, Address src, int vector_len) {
|
void Assembler::evpcmpeqb(KRegister kdst, XMMRegister nds, Address src, int vector_len) {
|
||||||
assert(VM_Version::supports_avx512bw(), "");
|
assert(VM_Version::supports_avx512bw(), "");
|
||||||
InstructionMark im(this);
|
InstructionMark im(this);
|
||||||
InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
|
InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
|
||||||
attributes.set_is_evex_instruction();
|
attributes.set_is_evex_instruction();
|
||||||
attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
|
attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
|
||||||
int dst_enc = kdst->encoding();
|
int dst_enc = kdst->encoding();
|
||||||
@ -3307,7 +3400,7 @@ void Assembler::evpcmpeqb(KRegister kdst, XMMRegister nds, Address src, int vect
|
|||||||
emit_operand(as_Register(dst_enc), src);
|
emit_operand(as_Register(dst_enc), src);
|
||||||
}
|
}
|
||||||
|
|
||||||
void Assembler::evpcmpeqb(KRegister mask, KRegister kdst, XMMRegister nds, Address src, int vector_len) {
|
void Assembler::evpcmpeqb(KRegister kdst, KRegister mask, XMMRegister nds, Address src, int vector_len) {
|
||||||
assert(VM_Version::supports_avx512vlbw(), "");
|
assert(VM_Version::supports_avx512vlbw(), "");
|
||||||
assert(is_vector_masking(), ""); // For stub code use only
|
assert(is_vector_masking(), ""); // For stub code use only
|
||||||
InstructionMark im(this);
|
InstructionMark im(this);
|
||||||
@ -3620,6 +3713,46 @@ void Assembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
|
|||||||
emit_operand(dst, src);
|
emit_operand(dst, src);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Opmask-predicated memory-source form of evpmovzxbw: EVEX prefix (66, 0F38
// map), opcode 0x30, destination register `dst`, memory operand `src`, with
// `mask` recorded as the embedded opmask register.
// Asserts that vector masking is enabled, that AVX-512VL+BW is available and
// that `dst` is a real register.
void Assembler::evpmovzxbw(XMMRegister dst, KRegister mask, Address src, int vector_len) {
  assert(is_vector_masking(), "");
  assert(VM_Version::supports_avx512vlbw(), "");
  assert(dst != xnoreg, "sanity");
  InstructionMark mark(this);
  InstructionAttr attrs(vector_len,
                        /* rex_w */ false,
                        /* legacy_mode */ false,
                        /* no_mask_reg */ false,
                        /* uses_vl */ false);
  attrs.set_address_attributes(/* tuple_type */ EVEX_HVM, /* input_size_in_bits */ EVEX_NObit);
  attrs.set_embedded_opmask_register_specifier(mask);
  attrs.set_is_evex_instruction();
  vex_prefix(src, 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attrs);
  emit_int8(0x30);         // opcode
  emit_operand(dst, src);
}
|
||||||
|
|
||||||
|
// Unmasked memory-destination form of evpmovwb: EVEX prefix (F3, 0F38 map),
// opcode 0x30, source register `src`, memory operand `dst`.
// Asserts AVX-512VL+BW support and that `src` is a real register.
void Assembler::evpmovwb(Address dst, XMMRegister src, int vector_len) {
  assert(VM_Version::supports_avx512vlbw(), "");
  assert(src != xnoreg, "sanity");
  InstructionMark mark(this);
  InstructionAttr attrs(vector_len,
                        /* rex_w */ false,
                        /* legacy_mode */ false,
                        /* no_mask_reg */ true,
                        /* uses_vl */ false);
  attrs.set_address_attributes(/* tuple_type */ EVEX_HVM, /* input_size_in_bits */ EVEX_NObit);
  attrs.set_is_evex_instruction();
  vex_prefix(dst, 0, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F_38, &attrs);
  emit_int8(0x30);         // opcode
  emit_operand(src, dst);
}
|
||||||
|
|
||||||
|
// Opmask-predicated memory-destination form of evpmovwb: same encoding as the
// unmasked overload (EVEX, F3, 0F38 map, opcode 0x30) but with `mask`
// recorded as the embedded opmask register.
// Asserts that vector masking is enabled, that AVX-512VL+BW is available and
// that `src` is a real register.
void Assembler::evpmovwb(Address dst, KRegister mask, XMMRegister src, int vector_len) {
  assert(is_vector_masking(), "");
  assert(VM_Version::supports_avx512vlbw(), "");
  assert(src != xnoreg, "sanity");
  InstructionMark mark(this);
  InstructionAttr attrs(vector_len,
                        /* rex_w */ false,
                        /* legacy_mode */ false,
                        /* no_mask_reg */ false,
                        /* uses_vl */ false);
  attrs.set_address_attributes(/* tuple_type */ EVEX_HVM, /* input_size_in_bits */ EVEX_NObit);
  attrs.set_embedded_opmask_register_specifier(mask);
  attrs.set_is_evex_instruction();
  vex_prefix(dst, 0, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F_38, &attrs);
  emit_int8(0x30);         // opcode
  emit_operand(src, dst);
}
|
||||||
|
|
||||||
// generic
|
// generic
|
||||||
void Assembler::pop(Register dst) {
|
void Assembler::pop(Register dst) {
|
||||||
int encode = prefix_and_encode(dst->encoding());
|
int encode = prefix_and_encode(dst->encoding());
|
||||||
@ -6406,7 +6539,6 @@ void Assembler::vzeroupper() {
|
|||||||
emit_int8(0x77);
|
emit_int8(0x77);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#ifndef _LP64
|
#ifndef _LP64
|
||||||
// 32bit only pieces of the assembler
|
// 32bit only pieces of the assembler
|
||||||
|
|
||||||
@ -6973,7 +7105,10 @@ void Assembler::evex_prefix(bool vex_r, bool vex_b, bool vex_x, bool evex_r, boo
|
|||||||
emit_int8(byte3);
|
emit_int8(byte3);
|
||||||
|
|
||||||
// P2: byte 4 as zL'Lbv'aaa
|
// P2: byte 4 as zL'Lbv'aaa
|
||||||
int byte4 = (_attributes->is_no_reg_mask()) ? 0 : _attributes->get_embedded_opmask_register_specifier(); // kregs are implemented in the low 3 bits as aaa (hard code k1, it will be initialized for now)
|
// kregs are implemented in the low 3 bits as aaa (hard code k1, it will be initialized for now)
|
||||||
|
int byte4 = (_attributes->is_no_reg_mask()) ?
|
||||||
|
0 :
|
||||||
|
_attributes->get_embedded_opmask_register_specifier();
|
||||||
// EVEX.v` for extending EVEX.vvvv or VIDX
|
// EVEX.v` for extending EVEX.vvvv or VIDX
|
||||||
byte4 |= (evex_v ? 0: EVEX_V);
|
byte4 |= (evex_v ? 0: EVEX_V);
|
||||||
// third EVEX.b for broadcast actions
|
// third EVEX.b for broadcast actions
|
||||||
|
@ -587,6 +587,16 @@ class Assembler : public AbstractAssembler {
|
|||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Comparison predicate selector taken by the evpcmpuw emitters, which write
// the enumerator's numeric value out as the instruction's trailing immediate
// byte. The values are therefore part of the instruction encoding and must
// not be renumbered.
enum ComparisonPredicate {
  eq     = 0x0,
  lt     = 0x1,
  le     = 0x2,
  _false = 0x3,
  neq    = 0x4,
  nlt    = 0x5,
  nle    = 0x6,
  _true  = 0x7
};
|
||||||
|
|
||||||
|
|
||||||
// NOTE: The general philosophy of the declarations here is that 64bit versions
|
// NOTE: The general philosophy of the declarations here is that 64bit versions
|
||||||
@ -830,7 +840,6 @@ private:
|
|||||||
void clear_vector_masking(void) { _vector_masking = false; }
|
void clear_vector_masking(void) { _vector_masking = false; }
|
||||||
bool is_vector_masking(void) { return _vector_masking; }
|
bool is_vector_masking(void) { return _vector_masking; }
|
||||||
|
|
||||||
|
|
||||||
void lea(Register dst, Address src);
|
void lea(Register dst, Address src);
|
||||||
|
|
||||||
void mov(Register dst, Register src);
|
void mov(Register dst, Register src);
|
||||||
@ -1362,6 +1371,9 @@ private:
|
|||||||
void kortestdl(KRegister dst, KRegister src);
|
void kortestdl(KRegister dst, KRegister src);
|
||||||
void kortestql(KRegister dst, KRegister src);
|
void kortestql(KRegister dst, KRegister src);
|
||||||
|
|
||||||
|
void ktestq(KRegister src1, KRegister src2);
|
||||||
|
void ktestd(KRegister src1, KRegister src2);
|
||||||
|
|
||||||
void ktestql(KRegister dst, KRegister src);
|
void ktestql(KRegister dst, KRegister src);
|
||||||
|
|
||||||
void movdl(XMMRegister dst, Register src);
|
void movdl(XMMRegister dst, Register src);
|
||||||
@ -1391,10 +1403,11 @@ private:
|
|||||||
void evmovdqub(Address dst, XMMRegister src, int vector_len);
|
void evmovdqub(Address dst, XMMRegister src, int vector_len);
|
||||||
void evmovdqub(XMMRegister dst, Address src, int vector_len);
|
void evmovdqub(XMMRegister dst, Address src, int vector_len);
|
||||||
void evmovdqub(XMMRegister dst, XMMRegister src, int vector_len);
|
void evmovdqub(XMMRegister dst, XMMRegister src, int vector_len);
|
||||||
void evmovdqub(KRegister mask, XMMRegister dst, Address src, int vector_len);
|
void evmovdqub(XMMRegister dst, KRegister mask, Address src, int vector_len);
|
||||||
void evmovdquw(Address dst, XMMRegister src, int vector_len);
|
void evmovdquw(Address dst, XMMRegister src, int vector_len);
|
||||||
|
void evmovdquw(Address dst, KRegister mask, XMMRegister src, int vector_len);
|
||||||
void evmovdquw(XMMRegister dst, Address src, int vector_len);
|
void evmovdquw(XMMRegister dst, Address src, int vector_len);
|
||||||
void evmovdquw(XMMRegister dst, XMMRegister src, int vector_len);
|
void evmovdquw(XMMRegister dst, KRegister mask, Address src, int vector_len);
|
||||||
void evmovdqul(Address dst, XMMRegister src, int vector_len);
|
void evmovdqul(Address dst, XMMRegister src, int vector_len);
|
||||||
void evmovdqul(XMMRegister dst, Address src, int vector_len);
|
void evmovdqul(XMMRegister dst, Address src, int vector_len);
|
||||||
void evmovdqul(XMMRegister dst, XMMRegister src, int vector_len);
|
void evmovdqul(XMMRegister dst, XMMRegister src, int vector_len);
|
||||||
@ -1545,7 +1558,14 @@ private:
|
|||||||
void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||||
void evpcmpeqb(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
|
void evpcmpeqb(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||||
void evpcmpeqb(KRegister kdst, XMMRegister nds, Address src, int vector_len);
|
void evpcmpeqb(KRegister kdst, XMMRegister nds, Address src, int vector_len);
|
||||||
void evpcmpeqb(KRegister mask, KRegister kdst, XMMRegister nds, Address src, int vector_len);
|
void evpcmpeqb(KRegister kdst, KRegister mask, XMMRegister nds, Address src, int vector_len);
|
||||||
|
|
||||||
|
void evpcmpgtb(KRegister kdst, XMMRegister nds, Address src, int vector_len);
|
||||||
|
void evpcmpgtb(KRegister kdst, KRegister mask, XMMRegister nds, Address src, int vector_len);
|
||||||
|
|
||||||
|
void evpcmpuw(KRegister kdst, XMMRegister nds, XMMRegister src, ComparisonPredicate vcc, int vector_len);
|
||||||
|
void evpcmpuw(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src, ComparisonPredicate of, int vector_len);
|
||||||
|
void evpcmpuw(KRegister kdst, XMMRegister nds, Address src, ComparisonPredicate vcc, int vector_len);
|
||||||
|
|
||||||
void pcmpeqw(XMMRegister dst, XMMRegister src);
|
void pcmpeqw(XMMRegister dst, XMMRegister src);
|
||||||
void vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
void vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
|
||||||
@ -1589,7 +1609,11 @@ private:
|
|||||||
void pmovzxbw(XMMRegister dst, XMMRegister src);
|
void pmovzxbw(XMMRegister dst, XMMRegister src);
|
||||||
void pmovzxbw(XMMRegister dst, Address src);
|
void pmovzxbw(XMMRegister dst, Address src);
|
||||||
|
|
||||||
void vpmovzxbw(XMMRegister dst, Address src, int vector_len);
|
void vpmovzxbw( XMMRegister dst, Address src, int vector_len);
|
||||||
|
void evpmovzxbw(XMMRegister dst, KRegister mask, Address src, int vector_len);
|
||||||
|
|
||||||
|
void evpmovwb(Address dst, XMMRegister src, int vector_len);
|
||||||
|
void evpmovwb(Address dst, KRegister mask, XMMRegister src, int vector_len);
|
||||||
|
|
||||||
#ifndef _LP64 // no 32bit push/pop on amd64
|
#ifndef _LP64 // no 32bit push/pop on amd64
|
||||||
void popl(Address dst);
|
void popl(Address dst);
|
||||||
@ -1839,6 +1863,8 @@ private:
|
|||||||
void vsubss(XMMRegister dst, XMMRegister nds, Address src);
|
void vsubss(XMMRegister dst, XMMRegister nds, Address src);
|
||||||
void vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src);
|
void vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src);
|
||||||
|
|
||||||
|
void shlxl(Register dst, Register src1, Register src2);
|
||||||
|
void shlxq(Register dst, Register src1, Register src2);
|
||||||
|
|
||||||
//====================VECTOR ARITHMETIC=====================================
|
//====================VECTOR ARITHMETIC=====================================
|
||||||
|
|
||||||
@ -2073,9 +2099,6 @@ private:
|
|||||||
void cmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len);
|
void cmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len);
|
||||||
void vpblendd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len);
|
void vpblendd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len);
|
||||||
|
|
||||||
void shlxl(Register dst, Register src1, Register src2);
|
|
||||||
void shlxq(Register dst, Register src1, Register src2);
|
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
// Next instructions require address alignment 16 bytes SSE mode.
|
// Next instructions require address alignment 16 bytes SSE mode.
|
||||||
// They should be called only from corresponding MacroAssembler instructions.
|
// They should be called only from corresponding MacroAssembler instructions.
|
||||||
|
@ -8251,10 +8251,19 @@ void MacroAssembler::string_compare(Register str1, Register str2,
|
|||||||
|
|
||||||
// Search for Non-ASCII character (Negative byte value) in a byte array,
|
// Search for Non-ASCII character (Negative byte value) in a byte array,
|
||||||
// return true if it has any and false otherwise.
|
// return true if it has any and false otherwise.
|
||||||
|
// ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
|
||||||
|
// @HotSpotIntrinsicCandidate
|
||||||
|
// private static boolean hasNegatives(byte[] ba, int off, int len) {
|
||||||
|
// for (int i = off; i < off + len; i++) {
|
||||||
|
// if (ba[i] < 0) {
|
||||||
|
// return true;
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// return false;
|
||||||
|
// }
|
||||||
void MacroAssembler::has_negatives(Register ary1, Register len,
|
void MacroAssembler::has_negatives(Register ary1, Register len,
|
||||||
Register result, Register tmp1,
|
Register result, Register tmp1,
|
||||||
XMMRegister vec1, XMMRegister vec2) {
|
XMMRegister vec1, XMMRegister vec2) {
|
||||||
|
|
||||||
// rsi: byte array
|
// rsi: byte array
|
||||||
// rcx: len
|
// rcx: len
|
||||||
// rax: result
|
// rax: result
|
||||||
@ -8267,79 +8276,161 @@ void MacroAssembler::has_negatives(Register ary1, Register len,
|
|||||||
testl(len, len);
|
testl(len, len);
|
||||||
jcc(Assembler::zero, FALSE_LABEL);
|
jcc(Assembler::zero, FALSE_LABEL);
|
||||||
|
|
||||||
movl(result, len); // copy
|
if ((UseAVX > 2) && // AVX512
|
||||||
|
VM_Version::supports_avx512vlbw() &&
|
||||||
|
VM_Version::supports_bmi2()) {
|
||||||
|
|
||||||
if (UseAVX >= 2 && UseSSE >= 2) {
|
set_vector_masking(); // opening of the stub context for programming mask registers
|
||||||
// With AVX2, use 32-byte vector compare
|
|
||||||
Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
|
|
||||||
|
|
||||||
// Compare 32-byte vectors
|
Label test_64_loop, test_tail;
|
||||||
andl(result, 0x0000001f); // tail count (in bytes)
|
Register tmp3_aliased = len;
|
||||||
andl(len, 0xffffffe0); // vector count (in bytes)
|
|
||||||
jcc(Assembler::zero, COMPARE_TAIL);
|
movl(tmp1, len);
|
||||||
|
vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
|
||||||
|
|
||||||
|
andl(tmp1, 64 - 1); // tail count (in chars) 0x3F
|
||||||
|
andl(len, ~(64 - 1)); // vector count (in chars)
|
||||||
|
jccb(Assembler::zero, test_tail);
|
||||||
|
|
||||||
lea(ary1, Address(ary1, len, Address::times_1));
|
lea(ary1, Address(ary1, len, Address::times_1));
|
||||||
negptr(len);
|
negptr(len);
|
||||||
|
|
||||||
movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector
|
bind(test_64_loop);
|
||||||
movdl(vec2, tmp1);
|
// Check whether our 64 elements of size byte contain negatives
|
||||||
vpbroadcastd(vec2, vec2);
|
evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
|
||||||
|
kortestql(k2, k2);
|
||||||
bind(COMPARE_WIDE_VECTORS);
|
|
||||||
vmovdqu(vec1, Address(ary1, len, Address::times_1));
|
|
||||||
vptest(vec1, vec2);
|
|
||||||
jcc(Assembler::notZero, TRUE_LABEL);
|
jcc(Assembler::notZero, TRUE_LABEL);
|
||||||
addptr(len, 32);
|
|
||||||
jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
|
|
||||||
|
|
||||||
testl(result, result);
|
addptr(len, 64);
|
||||||
|
jccb(Assembler::notZero, test_64_loop);
|
||||||
|
|
||||||
|
|
||||||
|
bind(test_tail);
|
||||||
|
// bail out when there is nothing to be done
|
||||||
|
testl(tmp1, -1);
|
||||||
jcc(Assembler::zero, FALSE_LABEL);
|
jcc(Assembler::zero, FALSE_LABEL);
|
||||||
|
|
||||||
vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
|
// Save k1
|
||||||
vptest(vec1, vec2);
|
kmovql(k3, k1);
|
||||||
|
|
||||||
|
// ~(~0 << len) applied up to two times (for 32-bit scenario)
|
||||||
|
#ifdef _LP64
|
||||||
|
mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
|
||||||
|
shlxq(tmp3_aliased, tmp3_aliased, tmp1);
|
||||||
|
notq(tmp3_aliased);
|
||||||
|
kmovql(k1, tmp3_aliased);
|
||||||
|
#else
|
||||||
|
Label k_init;
|
||||||
|
jmp(k_init);
|
||||||
|
|
||||||
|
// We could not read 64-bits from a general purpose register thus we move
|
||||||
|
// data required to compose 64 1's to the instruction stream
|
||||||
|
// We emit 64 byte wide series of elements from 0..63 which later on would
|
||||||
|
// be used as a compare targets with tail count contained in tmp1 register.
|
||||||
|
// Result would be a k1 register holding tmp1 consecutive 1 bits,
|
||||||
|
// counting from least significant bit.
|
||||||
|
address tmp = pc();
|
||||||
|
emit_int64(0x0706050403020100);
|
||||||
|
emit_int64(0x0F0E0D0C0B0A0908);
|
||||||
|
emit_int64(0x1716151413121110);
|
||||||
|
emit_int64(0x1F1E1D1C1B1A1918);
|
||||||
|
emit_int64(0x2726252423222120);
|
||||||
|
emit_int64(0x2F2E2D2C2B2A2928);
|
||||||
|
emit_int64(0x3736353433323130);
|
||||||
|
emit_int64(0x3F3E3D3C3B3A3938);
|
||||||
|
|
||||||
|
bind(k_init);
|
||||||
|
lea(len, InternalAddress(tmp));
|
||||||
|
// create mask to test for negative byte inside a vector
|
||||||
|
evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
|
||||||
|
evpcmpgtb(k1, vec1, Address(len, 0), Assembler::AVX_512bit);
|
||||||
|
|
||||||
|
#endif
|
||||||
|
evpcmpgtb(k2, k1, vec2, Address(ary1, 0), Assembler::AVX_512bit);
|
||||||
|
ktestq(k2, k1);
|
||||||
|
// Restore k1
|
||||||
|
kmovql(k1, k3);
|
||||||
jcc(Assembler::notZero, TRUE_LABEL);
|
jcc(Assembler::notZero, TRUE_LABEL);
|
||||||
|
|
||||||
jmp(FALSE_LABEL);
|
jmp(FALSE_LABEL);
|
||||||
|
|
||||||
bind(COMPARE_TAIL); // len is zero
|
clear_vector_masking(); // closing of the stub context for programming mask registers
|
||||||
movl(len, result);
|
|
||||||
// Fallthru to tail compare
|
|
||||||
} else if (UseSSE42Intrinsics) {
|
|
||||||
assert(UseSSE >= 4, "SSE4 must be for SSE4.2 intrinsics to be available");
|
|
||||||
// With SSE4.2, use double quad vector compare
|
|
||||||
Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
|
|
||||||
|
|
||||||
// Compare 16-byte vectors
|
|
||||||
andl(result, 0x0000000f); // tail count (in bytes)
|
|
||||||
andl(len, 0xfffffff0); // vector count (in bytes)
|
|
||||||
jccb(Assembler::zero, COMPARE_TAIL);
|
|
||||||
|
|
||||||
lea(ary1, Address(ary1, len, Address::times_1));
|
|
||||||
negptr(len);
|
|
||||||
|
|
||||||
movl(tmp1, 0x80808080);
|
|
||||||
movdl(vec2, tmp1);
|
|
||||||
pshufd(vec2, vec2, 0);
|
|
||||||
|
|
||||||
bind(COMPARE_WIDE_VECTORS);
|
|
||||||
movdqu(vec1, Address(ary1, len, Address::times_1));
|
|
||||||
ptest(vec1, vec2);
|
|
||||||
jcc(Assembler::notZero, TRUE_LABEL);
|
|
||||||
addptr(len, 16);
|
|
||||||
jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
|
|
||||||
|
|
||||||
testl(result, result);
|
|
||||||
jcc(Assembler::zero, FALSE_LABEL);
|
|
||||||
|
|
||||||
movdqu(vec1, Address(ary1, result, Address::times_1, -16));
|
|
||||||
ptest(vec1, vec2);
|
|
||||||
jccb(Assembler::notZero, TRUE_LABEL);
|
|
||||||
jmpb(FALSE_LABEL);
|
|
||||||
|
|
||||||
bind(COMPARE_TAIL); // len is zero
|
|
||||||
movl(len, result);
|
|
||||||
// Fallthru to tail compare
|
|
||||||
}
|
}
|
||||||
|
else {
|
||||||
|
movl(result, len); // copy
|
||||||
|
|
||||||
|
if (UseAVX == 2 && UseSSE >= 2) {
|
||||||
|
// With AVX2, use 32-byte vector compare
|
||||||
|
Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
|
||||||
|
|
||||||
|
// Compare 32-byte vectors
|
||||||
|
andl(result, 0x0000001f); // tail count (in bytes)
|
||||||
|
andl(len, 0xffffffe0); // vector count (in bytes)
|
||||||
|
jccb(Assembler::zero, COMPARE_TAIL);
|
||||||
|
|
||||||
|
lea(ary1, Address(ary1, len, Address::times_1));
|
||||||
|
negptr(len);
|
||||||
|
|
||||||
|
movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector
|
||||||
|
movdl(vec2, tmp1);
|
||||||
|
vpbroadcastd(vec2, vec2);
|
||||||
|
|
||||||
|
bind(COMPARE_WIDE_VECTORS);
|
||||||
|
vmovdqu(vec1, Address(ary1, len, Address::times_1));
|
||||||
|
vptest(vec1, vec2);
|
||||||
|
jccb(Assembler::notZero, TRUE_LABEL);
|
||||||
|
addptr(len, 32);
|
||||||
|
jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
|
||||||
|
|
||||||
|
testl(result, result);
|
||||||
|
jccb(Assembler::zero, FALSE_LABEL);
|
||||||
|
|
||||||
|
vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
|
||||||
|
vptest(vec1, vec2);
|
||||||
|
jccb(Assembler::notZero, TRUE_LABEL);
|
||||||
|
jmpb(FALSE_LABEL);
|
||||||
|
|
||||||
|
bind(COMPARE_TAIL); // len is zero
|
||||||
|
movl(len, result);
|
||||||
|
// Fallthru to tail compare
|
||||||
|
}
|
||||||
|
else if (UseSSE42Intrinsics) {
|
||||||
|
assert(UseSSE >= 4, "SSE4 must be for SSE4.2 intrinsics to be available");
|
||||||
|
// With SSE4.2, use double quad vector compare
|
||||||
|
Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
|
||||||
|
|
||||||
|
// Compare 16-byte vectors
|
||||||
|
andl(result, 0x0000000f); // tail count (in bytes)
|
||||||
|
andl(len, 0xfffffff0); // vector count (in bytes)
|
||||||
|
jccb(Assembler::zero, COMPARE_TAIL);
|
||||||
|
|
||||||
|
lea(ary1, Address(ary1, len, Address::times_1));
|
||||||
|
negptr(len);
|
||||||
|
|
||||||
|
movl(tmp1, 0x80808080);
|
||||||
|
movdl(vec2, tmp1);
|
||||||
|
pshufd(vec2, vec2, 0);
|
||||||
|
|
||||||
|
bind(COMPARE_WIDE_VECTORS);
|
||||||
|
movdqu(vec1, Address(ary1, len, Address::times_1));
|
||||||
|
ptest(vec1, vec2);
|
||||||
|
jccb(Assembler::notZero, TRUE_LABEL);
|
||||||
|
addptr(len, 16);
|
||||||
|
jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
|
||||||
|
|
||||||
|
testl(result, result);
|
||||||
|
jccb(Assembler::zero, FALSE_LABEL);
|
||||||
|
|
||||||
|
movdqu(vec1, Address(ary1, result, Address::times_1, -16));
|
||||||
|
ptest(vec1, vec2);
|
||||||
|
jccb(Assembler::notZero, TRUE_LABEL);
|
||||||
|
jmpb(FALSE_LABEL);
|
||||||
|
|
||||||
|
bind(COMPARE_TAIL); // len is zero
|
||||||
|
movl(len, result);
|
||||||
|
// Fallthru to tail compare
|
||||||
|
}
|
||||||
|
}
|
||||||
// Compare 4-byte vectors
|
// Compare 4-byte vectors
|
||||||
andl(len, 0xfffffffc); // vector count (in bytes)
|
andl(len, 0xfffffffc); // vector count (in bytes)
|
||||||
jccb(Assembler::zero, COMPARE_CHAR);
|
jccb(Assembler::zero, COMPARE_CHAR);
|
||||||
@ -8387,7 +8478,6 @@ void MacroAssembler::has_negatives(Register ary1, Register len,
|
|||||||
vpxor(vec2, vec2);
|
vpxor(vec2, vec2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
|
// Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
|
||||||
void MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
|
void MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
|
||||||
Register limit, Register result, Register chr,
|
Register limit, Register result, Register chr,
|
||||||
@ -8833,10 +8923,23 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// encode char[] to byte[] in ISO_8859_1
|
// encode char[] to byte[] in ISO_8859_1
|
||||||
|
//@HotSpotIntrinsicCandidate
|
||||||
|
//private static int implEncodeISOArray(byte[] sa, int sp,
|
||||||
|
//byte[] da, int dp, int len) {
|
||||||
|
// int i = 0;
|
||||||
|
// for (; i < len; i++) {
|
||||||
|
// char c = StringUTF16.getChar(sa, sp++);
|
||||||
|
// if (c > '\u00FF')
|
||||||
|
// break;
|
||||||
|
// da[dp++] = (byte)c;
|
||||||
|
// }
|
||||||
|
// return i;
|
||||||
|
//}
|
||||||
void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
|
void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
|
||||||
XMMRegister tmp1Reg, XMMRegister tmp2Reg,
|
XMMRegister tmp1Reg, XMMRegister tmp2Reg,
|
||||||
XMMRegister tmp3Reg, XMMRegister tmp4Reg,
|
XMMRegister tmp3Reg, XMMRegister tmp4Reg,
|
||||||
Register tmp5, Register result) {
|
Register tmp5, Register result) {
|
||||||
|
|
||||||
// rsi: src
|
// rsi: src
|
||||||
// rdi: dst
|
// rdi: dst
|
||||||
// rdx: len
|
// rdx: len
|
||||||
@ -8851,6 +8954,7 @@ void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
|
|||||||
// check for zero length
|
// check for zero length
|
||||||
testl(len, len);
|
testl(len, len);
|
||||||
jcc(Assembler::zero, L_done);
|
jcc(Assembler::zero, L_done);
|
||||||
|
|
||||||
movl(result, len);
|
movl(result, len);
|
||||||
|
|
||||||
// Setup pointers
|
// Setup pointers
|
||||||
@ -8959,6 +9063,7 @@ void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
|
|||||||
|
|
||||||
bind(L_copy_1_char_exit);
|
bind(L_copy_1_char_exit);
|
||||||
addptr(result, len); // len is negative count of not processed elements
|
addptr(result, len); // len is negative count of not processed elements
|
||||||
|
|
||||||
bind(L_done);
|
bind(L_done);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -9470,8 +9575,8 @@ void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register
|
|||||||
notq(tmp2);
|
notq(tmp2);
|
||||||
kmovql(k1, tmp2);
|
kmovql(k1, tmp2);
|
||||||
|
|
||||||
evmovdqub(k1, rymm0, Address(obja, result), Assembler::AVX_512bit);
|
evmovdqub(rymm0, k1, Address(obja, result), Assembler::AVX_512bit);
|
||||||
evpcmpeqb(k1, k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
|
evpcmpeqb(k7, k1, rymm0, Address(objb, result), Assembler::AVX_512bit);
|
||||||
|
|
||||||
ktestql(k7, k1);
|
ktestql(k7, k1);
|
||||||
// Restore k1
|
// Restore k1
|
||||||
@ -10830,13 +10935,24 @@ void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Registe
|
|||||||
#undef BIND
|
#undef BIND
|
||||||
#undef BLOCK_COMMENT
|
#undef BLOCK_COMMENT
|
||||||
|
|
||||||
|
|
||||||
// Compress char[] array to byte[].
|
// Compress char[] array to byte[].
|
||||||
|
// ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java
|
||||||
|
// @HotSpotIntrinsicCandidate
|
||||||
|
// private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
|
||||||
|
// for (int i = 0; i < len; i++) {
|
||||||
|
// int c = src[srcOff++];
|
||||||
|
// if (c >>> 8 != 0) {
|
||||||
|
// return 0;
|
||||||
|
// }
|
||||||
|
// dst[dstOff++] = (byte)c;
|
||||||
|
// }
|
||||||
|
// return len;
|
||||||
|
// }
|
||||||
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
|
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
|
||||||
XMMRegister tmp1Reg, XMMRegister tmp2Reg,
|
XMMRegister tmp1Reg, XMMRegister tmp2Reg,
|
||||||
XMMRegister tmp3Reg, XMMRegister tmp4Reg,
|
XMMRegister tmp3Reg, XMMRegister tmp4Reg,
|
||||||
Register tmp5, Register result) {
|
Register tmp5, Register result) {
|
||||||
Label copy_chars_loop, return_length, return_zero, done;
|
Label copy_chars_loop, return_length, return_zero, done, below_threshold;
|
||||||
|
|
||||||
// rsi: src
|
// rsi: src
|
||||||
// rdi: dst
|
// rdi: dst
|
||||||
@ -10853,11 +10969,141 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
|
|||||||
// save length for return
|
// save length for return
|
||||||
push(len);
|
push(len);
|
||||||
|
|
||||||
|
if ((UseAVX > 2) && // AVX512
|
||||||
|
VM_Version::supports_avx512vlbw() &&
|
||||||
|
VM_Version::supports_bmi2()) {
|
||||||
|
|
||||||
|
set_vector_masking(); // opening of the stub context for programming mask registers
|
||||||
|
|
||||||
|
Label copy_32_loop, copy_loop_tail, copy_just_portion_of_candidates;
|
||||||
|
|
||||||
|
// alignment
|
||||||
|
Label post_alignement;
|
||||||
|
|
||||||
|
// if length of the string is less than 32, handle it in an old fashioned
|
||||||
|
// way
|
||||||
|
testl(len, -32);
|
||||||
|
jcc(Assembler::zero, below_threshold);
|
||||||
|
|
||||||
|
// First check whether a character is compressible (<= 0xFF).
|
||||||
|
// Create mask to test for Unicode chars inside zmm vector
|
||||||
|
movl(result, 0x00FF);
|
||||||
|
evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);
|
||||||
|
|
||||||
|
testl(len, -64);
|
||||||
|
jcc(Assembler::zero, post_alignement);
|
||||||
|
|
||||||
|
// Save k1
|
||||||
|
kmovql(k3, k1);
|
||||||
|
|
||||||
|
movl(tmp5, dst);
|
||||||
|
andl(tmp5, (64 - 1));
|
||||||
|
negl(tmp5);
|
||||||
|
andl(tmp5, (64 - 1));
|
||||||
|
|
||||||
|
// bail out when there is nothing to be done
|
||||||
|
testl(tmp5, 0xFFFFFFFF);
|
||||||
|
jcc(Assembler::zero, post_alignement);
|
||||||
|
|
||||||
|
// ~(~0 << tmp5), where tmp5 is the # of elements needed to align dst to 64 bytes
|
||||||
|
movl(result, 0xFFFFFFFF);
|
||||||
|
shlxl(result, result, tmp5);
|
||||||
|
notl(result);
|
||||||
|
|
||||||
|
kmovdl(k1, result);
|
||||||
|
|
||||||
|
evmovdquw(tmp1Reg, k1, Address(src, 0), Assembler::AVX_512bit);
|
||||||
|
evpcmpuw(k2, k1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
|
||||||
|
ktestd(k2, k1);
|
||||||
|
jcc(Assembler::carryClear, copy_just_portion_of_candidates);
|
||||||
|
|
||||||
|
evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
|
||||||
|
|
||||||
|
addptr(src, tmp5);
|
||||||
|
addptr(src, tmp5);
|
||||||
|
addptr(dst, tmp5);
|
||||||
|
subl(len, tmp5);
|
||||||
|
|
||||||
|
bind(post_alignement);
|
||||||
|
// end of alignment
|
||||||
|
|
||||||
|
movl(tmp5, len);
|
||||||
|
andl(tmp5, (32 - 1)); // tail count (in chars)
|
||||||
|
andl(len, ~(32 - 1)); // vector count (in chars)
|
||||||
|
jcc(Assembler::zero, copy_loop_tail);
|
||||||
|
|
||||||
|
lea(src, Address(src, len, Address::times_2));
|
||||||
|
lea(dst, Address(dst, len, Address::times_1));
|
||||||
|
negptr(len);
|
||||||
|
|
||||||
|
bind(copy_32_loop);
|
||||||
|
evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
|
||||||
|
evpcmpuw(k2, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
|
||||||
|
kortestdl(k2, k2);
|
||||||
|
jcc(Assembler::carryClear, copy_just_portion_of_candidates);
|
||||||
|
|
||||||
|
// All elements in current processed chunk are valid candidates for
|
||||||
|
// compression. Write the truncated byte elements to memory.
|
||||||
|
evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
|
||||||
|
addptr(len, 32);
|
||||||
|
jcc(Assembler::notZero, copy_32_loop);
|
||||||
|
|
||||||
|
bind(copy_loop_tail);
|
||||||
|
// bail out when there is nothing to be done
|
||||||
|
testl(tmp5, 0xFFFFFFFF);
|
||||||
|
jcc(Assembler::zero, return_length);
|
||||||
|
|
||||||
|
// Save k1
|
||||||
|
kmovql(k3, k1);
|
||||||
|
|
||||||
|
movl(len, tmp5);
|
||||||
|
|
||||||
|
// ~(~0 << len), where len is the # of remaining elements to process
|
||||||
|
movl(result, 0xFFFFFFFF);
|
||||||
|
shlxl(result, result, len);
|
||||||
|
notl(result);
|
||||||
|
|
||||||
|
kmovdl(k1, result);
|
||||||
|
|
||||||
|
evmovdquw(tmp1Reg, k1, Address(src, 0), Assembler::AVX_512bit);
|
||||||
|
evpcmpuw(k2, k1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
|
||||||
|
ktestd(k2, k1);
|
||||||
|
jcc(Assembler::carryClear, copy_just_portion_of_candidates);
|
||||||
|
|
||||||
|
evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
|
||||||
|
// Restore k1
|
||||||
|
kmovql(k1, k3);
|
||||||
|
|
||||||
|
jmp(return_length);
|
||||||
|
|
||||||
|
bind(copy_just_portion_of_candidates);
|
||||||
|
kmovdl(tmp5, k2);
|
||||||
|
tzcntl(tmp5, tmp5);
|
||||||
|
|
||||||
|
// ~(~0 << tmp5), where tmp5 is a number of elements in an array from the
|
||||||
|
// result to the first element larger than 0xFF
|
||||||
|
movl(result, 0xFFFFFFFF);
|
||||||
|
shlxl(result, result, tmp5);
|
||||||
|
notl(result);
|
||||||
|
|
||||||
|
kmovdl(k1, result);
|
||||||
|
|
||||||
|
evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
|
||||||
|
// Restore k1
|
||||||
|
kmovql(k1, k3);
|
||||||
|
|
||||||
|
jmp(return_zero);
|
||||||
|
|
||||||
|
clear_vector_masking(); // closing of the stub context for programming mask registers
|
||||||
|
}
|
||||||
if (UseSSE42Intrinsics) {
|
if (UseSSE42Intrinsics) {
|
||||||
assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
|
assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
|
||||||
Label copy_32_loop, copy_16, copy_tail;
|
Label copy_32_loop, copy_16, copy_tail;
|
||||||
|
|
||||||
|
bind(below_threshold);
|
||||||
|
|
||||||
movl(result, len);
|
movl(result, len);
|
||||||
|
|
||||||
movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vectors
|
movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vectors
|
||||||
|
|
||||||
// vectored compression
|
// vectored compression
|
||||||
@ -10939,10 +11185,16 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Inflate byte[] array to char[].
|
// Inflate byte[] array to char[].
|
||||||
|
// ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
|
||||||
|
// @HotSpotIntrinsicCandidate
|
||||||
|
// private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
|
||||||
|
// for (int i = 0; i < len; i++) {
|
||||||
|
// dst[dstOff++] = (char)(src[srcOff++] & 0xff);
|
||||||
|
// }
|
||||||
|
// }
|
||||||
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
|
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
|
||||||
XMMRegister tmp1, Register tmp2) {
|
XMMRegister tmp1, Register tmp2) {
|
||||||
Label copy_chars_loop, done;
|
Label copy_chars_loop, done, below_threshold;
|
||||||
|
|
||||||
// rsi: src
|
// rsi: src
|
||||||
// rdi: dst
|
// rdi: dst
|
||||||
// rdx: len
|
// rdx: len
|
||||||
@ -10953,20 +11205,109 @@ void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len
|
|||||||
// rdx holds length
|
// rdx holds length
|
||||||
assert_different_registers(src, dst, len, tmp2);
|
assert_different_registers(src, dst, len, tmp2);
|
||||||
|
|
||||||
|
if ((UseAVX > 2) && // AVX512
|
||||||
|
VM_Version::supports_avx512vlbw() &&
|
||||||
|
VM_Version::supports_bmi2()) {
|
||||||
|
|
||||||
|
set_vector_masking(); // opening of the stub context for programming mask registers
|
||||||
|
|
||||||
|
Label copy_32_loop, copy_tail;
|
||||||
|
Register tmp3_aliased = len;
|
||||||
|
|
||||||
|
// if length of the string is less than 16, handle it in an old fashioned
|
||||||
|
// way
|
||||||
|
testl(len, -16);
|
||||||
|
jcc(Assembler::zero, below_threshold);
|
||||||
|
|
||||||
|
// In order to use only one arithmetic operation for the main loop we use
|
||||||
|
// this pre-calculation
|
||||||
|
movl(tmp2, len);
|
||||||
|
andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
|
||||||
|
andl(len, -32); // vector count
|
||||||
|
jccb(Assembler::zero, copy_tail);
|
||||||
|
|
||||||
|
lea(src, Address(src, len, Address::times_1));
|
||||||
|
lea(dst, Address(dst, len, Address::times_2));
|
||||||
|
negptr(len);
|
||||||
|
|
||||||
|
|
||||||
|
// inflate 32 chars per iter
|
||||||
|
bind(copy_32_loop);
|
||||||
|
vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
|
||||||
|
evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit);
|
||||||
|
addptr(len, 32);
|
||||||
|
jcc(Assembler::notZero, copy_32_loop);
|
||||||
|
|
||||||
|
bind(copy_tail);
|
||||||
|
// bail out when there is nothing to be done
|
||||||
|
testl(tmp2, -1); // we don't destroy the contents of tmp2 here
|
||||||
|
jcc(Assembler::zero, done);
|
||||||
|
|
||||||
|
// Save k1
|
||||||
|
kmovql(k2, k1);
|
||||||
|
|
||||||
|
// ~(~0 << length), where length is the # of remaining elements to process
|
||||||
|
movl(tmp3_aliased, -1);
|
||||||
|
shlxl(tmp3_aliased, tmp3_aliased, tmp2);
|
||||||
|
notl(tmp3_aliased);
|
||||||
|
kmovdl(k1, tmp3_aliased);
|
||||||
|
evpmovzxbw(tmp1, k1, Address(src, 0), Assembler::AVX_512bit);
|
||||||
|
evmovdquw(Address(dst, 0), k1, tmp1, Assembler::AVX_512bit);
|
||||||
|
|
||||||
|
// Restore k1
|
||||||
|
kmovql(k1, k2);
|
||||||
|
jmp(done);
|
||||||
|
|
||||||
|
clear_vector_masking(); // closing of the stub context for programming mask registers
|
||||||
|
}
|
||||||
if (UseSSE42Intrinsics) {
|
if (UseSSE42Intrinsics) {
|
||||||
assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
|
assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
|
||||||
Label copy_8_loop, copy_bytes, copy_tail;
|
Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
|
||||||
|
|
||||||
movl(tmp2, len);
|
movl(tmp2, len);
|
||||||
andl(tmp2, 0x00000007); // tail count (in chars)
|
|
||||||
andl(len, 0xfffffff8); // vector count (in chars)
|
if (UseAVX > 1) {
|
||||||
jccb(Assembler::zero, copy_tail);
|
andl(tmp2, (16 - 1));
|
||||||
|
andl(len, -16);
|
||||||
|
jccb(Assembler::zero, copy_new_tail);
|
||||||
|
} else {
|
||||||
|
andl(tmp2, 0x00000007); // tail count (in chars)
|
||||||
|
andl(len, 0xfffffff8); // vector count (in chars)
|
||||||
|
jccb(Assembler::zero, copy_tail);
|
||||||
|
}
|
||||||
|
|
||||||
// vectored inflation
|
// vectored inflation
|
||||||
lea(src, Address(src, len, Address::times_1));
|
lea(src, Address(src, len, Address::times_1));
|
||||||
lea(dst, Address(dst, len, Address::times_2));
|
lea(dst, Address(dst, len, Address::times_2));
|
||||||
negptr(len);
|
negptr(len);
|
||||||
|
|
||||||
|
if (UseAVX > 1) {
|
||||||
|
bind(copy_16_loop);
|
||||||
|
vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
|
||||||
|
vmovdqu(Address(dst, len, Address::times_2), tmp1);
|
||||||
|
addptr(len, 16);
|
||||||
|
jcc(Assembler::notZero, copy_16_loop);
|
||||||
|
|
||||||
|
bind(below_threshold);
|
||||||
|
bind(copy_new_tail);
|
||||||
|
if (UseAVX > 2) {
|
||||||
|
movl(tmp2, len);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
movl(len, tmp2);
|
||||||
|
}
|
||||||
|
andl(tmp2, 0x00000007);
|
||||||
|
andl(len, 0xFFFFFFF8);
|
||||||
|
jccb(Assembler::zero, copy_tail);
|
||||||
|
|
||||||
|
pmovzxbw(tmp1, Address(src, 0));
|
||||||
|
movdqu(Address(dst, 0), tmp1);
|
||||||
|
addptr(src, 8);
|
||||||
|
addptr(dst, 2 * 8);
|
||||||
|
|
||||||
|
jmp(copy_tail, true);
|
||||||
|
}
|
||||||
|
|
||||||
// inflate 8 chars per iter
|
// inflate 8 chars per iter
|
||||||
bind(copy_8_loop);
|
bind(copy_8_loop);
|
||||||
pmovzxbw(tmp1, Address(src, len, Address::times_1)); // unpack to 8 words
|
pmovzxbw(tmp1, Address(src, len, Address::times_1)); // unpack to 8 words
|
||||||
@ -11005,7 +11346,6 @@ void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len
|
|||||||
bind(done);
|
bind(done);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
|
Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
|
||||||
switch (cond) {
|
switch (cond) {
|
||||||
// Note some conditions are synonyms for others
|
// Note some conditions are synonyms for others
|
||||||
|
Loading…
Reference in New Issue
Block a user