8279282: [vectorapi] Matcher::supports_vector_comparison_unsigned is not needed on x86

Reviewed-by: kvn, sviswanathan, vlivanov
This commit is contained in:
Quan Anh Mai 2022-01-20 18:24:53 +00:00 committed by Sandhya Viswanathan
parent 3419ff7ba7
commit 02390c79b1
8 changed files with 104 additions and 138 deletions

@ -2473,6 +2473,16 @@ void Assembler::movddup(XMMRegister dst, XMMRegister src) {
emit_int16(0x12, 0xC0 | encode);
}
// Memory-source form of vmovddup. Requires AVX. Emits opcode 0x12 under the
// F2 / 0F prefix selection, followed by the operand encoding for dst and src.
// rex_w is requested only when EVEX is supported, and the attribute is then
// marked as "rex/vex.w reverted".
// NOTE(review): no address attributes are configured on the InstructionAttr
// before simd_prefix(); confirm that is intended for EVEX-encoded uses.
void Assembler::vmovddup(XMMRegister dst, Address src, int vector_len) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
  InstructionAttr attrs(vector_len,
                        /* rex_w */ VM_Version::supports_evex(),
                        /* legacy_mode */ false,
                        /* no_mask_reg */ true,
                        /* uses_vl */ true);
  attrs.set_rex_vex_w_reverted();
  simd_prefix(dst, xnoreg, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attrs);
  emit_int8(0x12);
  emit_operand(dst, src);
}
void Assembler::kmovbl(KRegister dst, KRegister src) {
assert(VM_Version::supports_avx512dq(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);

@ -1467,6 +1467,7 @@ private:
void movb(Register dst, Address src);
void movddup(XMMRegister dst, XMMRegister src);
// Memory-source vmovddup; vector_len is passed through to the instruction attributes.
void vmovddup(XMMRegister dst, Address src, int vector_len);
void kandbl(KRegister dst, KRegister src1, KRegister src2);
void kandwl(KRegister dst, KRegister src1, KRegister src2);

@ -2192,84 +2192,6 @@ void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask
}
}
// Unsigned vector compare implemented by widening: both sources are
// zero-extended to the next wider element size (into vtmp1/vtmp2), compared
// with vpcmpCCW at that wider width, and the result is narrowed back into dst.
// The vector length encoding is derived from twice vlen_in_bytes so that the
// widened elements fit. scratch is forwarded to vpcmpCCW.
// Handles T_BYTE, T_SHORT and T_INT; any other type trips the assert.
void C2_MacroAssembler::vpcmpu(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison,
int vlen_in_bytes, XMMRegister vtmp1, XMMRegister vtmp2, Register scratch) {
// Encoding for the widened (2x) vector.
int vlen_enc = vector_length_encoding(vlen_in_bytes*2);
switch (typ) {
case T_BYTE:
// bytes -> words, compare as words, pack the word results back to bytes.
vpmovzxbw(vtmp1, src1, vlen_enc);
vpmovzxbw(vtmp2, src2, vlen_enc);
vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
vpacksswb(dst, dst, dst, vlen_enc);
break;
case T_SHORT:
// words -> dwords, compare as dwords, pack the dword results back to words.
vpmovzxwd(vtmp1, src1, vlen_enc);
vpmovzxwd(vtmp2, src2, vlen_enc);
vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);
vpackssdw(dst, dst, dst, vlen_enc);
break;
case T_INT:
// dwords -> qwords, compare as qwords; the shuffle with immediate 8 then
// moves dwords 0 and 2 of each 128-bit lane into the two lowest dword slots.
vpmovzxdq(vtmp1, src1, vlen_enc);
vpmovzxdq(vtmp2, src2, vlen_enc);
vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);
vpermilps(dst, dst, 8, vlen_enc);
break;
default:
assert(false, "Should not reach here");
}
// For a 16-byte input the widened result is twice that size; the permute with
// immediate 0x8 brings qwords 0 and 2 together at the bottom of dst.
if (vlen_in_bytes == 16) {
vpermpd(dst, dst, 0x8, vlen_enc);
}
}
// Unsigned vector compare by widening, for inputs that cannot be widened in
// one piece: the vector length encoding is taken from vlen_in_bytes itself,
// so each source is processed in two halves. The low half is zero-extended
// and compared into dst, the upper 128 bits (vextracti128 ..., 1) are
// zero-extended and compared into vtmp3, and the two partial results are then
// narrowed and merged into dst. vtmp1/vtmp2 hold the widened operands;
// scratch is forwarded to vpcmpCCW.
// Handles T_BYTE, T_SHORT and T_INT; any other type trips the assert.
void C2_MacroAssembler::vpcmpu32(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison, int vlen_in_bytes,
XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, Register scratch) {
int vlen_enc = vector_length_encoding(vlen_in_bytes);
switch (typ) {
case T_BYTE:
// Low half: bytes -> words, compare as words into dst.
vpmovzxbw(vtmp1, src1, vlen_enc);
vpmovzxbw(vtmp2, src2, vlen_enc);
vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
// High half: same sequence on the upper 128 bits, result in vtmp3.
vextracti128(vtmp1, src1, 1);
vextracti128(vtmp2, src2, 1);
vpmovzxbw(vtmp1, vtmp1, vlen_enc);
vpmovzxbw(vtmp2, vtmp2, vlen_enc);
vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
// Pack both word results to bytes, then reorder qwords (0xd8 = 0,2,1,3)
// so the packed halves end up in element order.
vpacksswb(dst, dst, vtmp3, vlen_enc);
vpermpd(dst, dst, 0xd8, vlen_enc);
break;
case T_SHORT:
// Low half: words -> dwords, compare as dwords into dst.
vpmovzxwd(vtmp1, src1, vlen_enc);
vpmovzxwd(vtmp2, src2, vlen_enc);
vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);
// High half: same sequence on the upper 128 bits, result in vtmp3.
vextracti128(vtmp1, src1, 1);
vextracti128(vtmp2, src2, 1);
vpmovzxwd(vtmp1, vtmp1, vlen_enc);
vpmovzxwd(vtmp2, vtmp2, vlen_enc);
vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);
// Pack both dword results to words, then reorder qwords (0xd8 = 0,2,1,3).
vpackssdw(dst, dst, vtmp3, vlen_enc);
vpermpd(dst, dst, 0xd8, vlen_enc);
break;
case T_INT:
// Low half: dwords -> qwords, compare as qwords, then compress the
// results into the low 128 bits of dst.
vpmovzxdq(vtmp1, src1, vlen_enc);
vpmovzxdq(vtmp2, src2, vlen_enc);
vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);
vpshufd(dst, dst, 8, vlen_enc);
vpermq(dst, dst, 8, vlen_enc);
// High half: same compare into vtmp3, compressed into its upper 128 bits
// (vpermq immediate 0x80).
vextracti128(vtmp1, src1, 1);
vextracti128(vtmp2, src2, 1);
vpmovzxdq(vtmp1, vtmp1, vlen_enc);
vpmovzxdq(vtmp2, vtmp2, vlen_enc);
vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);
vpshufd(vtmp3, vtmp3, 8, vlen_enc);
vpermq(vtmp3, vtmp3, 0x80, vlen_enc);
// Blend mask 0xf0: upper four dwords come from vtmp3, lower four stay from dst.
vpblendd(dst, dst, vtmp3, 0xf0, vlen_enc);
break;
default:
assert(false, "Should not reach here");
}
}
void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
switch(typ) {
case T_BYTE:

@ -146,12 +146,6 @@ public:
void load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes);
// vector compare
void vpcmpu(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison, int vlen_in_bytes,
XMMRegister vtmp1, XMMRegister vtmp2, Register scratch);
void vpcmpu32(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison, int vlen_in_bytes,
XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, Register scratch);
// Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
// dst = src1 reduce(op, src2) using vtmp as temps

@ -1,5 +1,5 @@
/*
* Copyright (c) 1997, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1997, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -2702,6 +2702,15 @@ void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
}
}
// AddressLiteral flavour of vmovddup. A reachable literal is used directly as
// the memory operand; otherwise its address is first loaded into rscratch and
// the instruction is emitted against [rscratch + 0].
void MacroAssembler::vmovddup(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
  if (!reachable(src)) {
    // Out of reach: go through the scratch register.
    lea(rscratch, src);
    Assembler::vmovddup(dst, Address(rscratch, 0), vector_len);
    return;
  }
  Assembler::vmovddup(dst, as_Address(src), vector_len);
}
void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
if (reachable(src)) {
Assembler::mulsd(dst, as_Address(src));
@ -3151,6 +3160,15 @@ void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_l
Assembler::vpbroadcastw(dst, src, vector_len);
}
// AddressLiteral flavour of vbroadcastsd. A reachable literal is used directly
// as the memory operand; otherwise its address is first loaded into rscratch
// and the instruction is emitted against [rscratch + 0].
void MacroAssembler::vbroadcastsd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
  if (!reachable(src)) {
    // Out of reach: go through the scratch register.
    lea(rscratch, src);
    Assembler::vbroadcastsd(dst, Address(rscratch, 0), vector_len);
    return;
  }
  Assembler::vbroadcastsd(dst, as_Address(src), vector_len);
}
void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
Assembler::vpcmpeqb(dst, nds, src, vector_len);
@ -3219,7 +3237,7 @@ void MacroAssembler::vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src,
}
}
void MacroAssembler::vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, ComparisonPredicate cond, Width width, int vector_len, Register scratch_reg) {
void MacroAssembler::vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister xtmp, ComparisonPredicate cond, Width width, int vector_len) {
int eq_cond_enc = 0x29;
int gt_cond_enc = 0x37;
if (width != Assembler::Q) {
@ -3232,15 +3250,18 @@ void MacroAssembler::vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src,
break;
case neq:
vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
vpxor(dst, dst, ExternalAddress(StubRoutines::x86::vector_all_bits_set()), vector_len, scratch_reg);
vallones(xtmp, vector_len);
vpxor(dst, xtmp, dst, vector_len);
break;
case le:
vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
vpxor(dst, dst, ExternalAddress(StubRoutines::x86::vector_all_bits_set()), vector_len, scratch_reg);
vallones(xtmp, vector_len);
vpxor(dst, xtmp, dst, vector_len);
break;
case nlt:
vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
vpxor(dst, dst, ExternalAddress(StubRoutines::x86::vector_all_bits_set()), vector_len, scratch_reg);
vallones(xtmp, vector_len);
vpxor(dst, xtmp, dst, vector_len);
break;
case lt:
vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);

@ -1176,6 +1176,9 @@ public:
void movsd(XMMRegister dst, Address src) { Assembler::movsd(dst, src); }
void movsd(XMMRegister dst, AddressLiteral src);
// Keep the Assembler::vmovddup overloads callable alongside the AddressLiteral overload declared below.
using Assembler::vmovddup;
// AddressLiteral overload; rscratch receives the address when src is not reachable.
void vmovddup(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = rscratch1);
void mulpd(XMMRegister dst, XMMRegister src) { Assembler::mulpd(dst, src); }
void mulpd(XMMRegister dst, Address src) { Assembler::mulpd(dst, src); }
void mulpd(XMMRegister dst, AddressLiteral src);
@ -1284,6 +1287,9 @@ public:
void vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len);
void vpbroadcastw(XMMRegister dst, Address src, int vector_len) { Assembler::vpbroadcastw(dst, src, vector_len); }
// Keep the Assembler::vbroadcastsd overloads callable alongside the AddressLiteral overload declared below.
using Assembler::vbroadcastsd;
// AddressLiteral overload; rscratch receives the address when src is not reachable.
void vbroadcastsd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch = rscratch1);
void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
@ -1310,7 +1316,7 @@ public:
void evpbroadcast(BasicType type, XMMRegister dst, Register src, int vector_len);
// Emit comparison instruction for the specified comparison predicate.
void vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, ComparisonPredicate cond, Width width, int vector_len, Register scratch_reg);
void vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister xtmp, ComparisonPredicate cond, Width width, int vector_len);
void vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len);
void vpmovzxbw(XMMRegister dst, Address src, int vector_len);

@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2021, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -166,20 +166,7 @@
// Does the CPU supports vector unsigned comparison instructions?
static const bool supports_vector_comparison_unsigned(int vlen, BasicType bt) {
int vlen_in_bytes = vlen * type2aelembytes(bt);
if ((UseAVX > 2) && (VM_Version::supports_avx512vl() || vlen_in_bytes == 64))
return true;
else {
// instruction set supports only signed comparison
// so need to zero extend to higher integral type and perform comparison
// cannot cast long to higher integral type
// and on avx1 cannot cast 128 bit integral vectors to higher size
if ((bt != T_LONG) &&
((UseAVX >= 2) || (vlen_in_bytes <= 8)))
return true;
}
return false;
return true;
}
// Some microarchitectures have mask registers used on vectors

@ -2560,6 +2560,18 @@ static inline jlong replicate8_imm(int con, int width) {
return val;
}
// Returns a 64-bit pattern in which only the most significant bit of every
// bt-sized element is set (T_BYTE, T_SHORT, T_INT or T_LONG).
// Any other type hits ShouldNotReachHere() and yields 0.
static inline jlong high_bit_set(BasicType bt) {
  jlong sign_bits = 0;
  switch (bt) {
    case T_BYTE:
      sign_bits = 0x8080808080808080;
      break;
    case T_SHORT:
      sign_bits = 0x8000800080008000;
      break;
    case T_INT:
      sign_bits = 0x8000000080000000;
      break;
    case T_LONG:
      sign_bits = 0x8000000000000000;
      break;
    default:
      ShouldNotReachHere();
      break;
  }
  return sign_bits;
}
#ifndef PRODUCT
void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
st->print("nop \t# %d bytes pad for loops and calls", _count);
@ -7313,62 +7325,75 @@ instruct evcmpFD(kReg dst, vec src1, vec src2, immI8 cond) %{
ins_pipe( pipe_slow );
%}
instruct vcmp(legVec dst, legVec src1, legVec src2, immI8 cond, rRegP scratch) %{
instruct vcmp_direct(legVec dst, legVec src1, legVec src2, immI8 cond) %{
predicate(n->bottom_type()->isa_vectmask() == NULL &&
!is_unsigned_booltest_pred(n->in(2)->get_int()) &&
Matcher::vector_length_in_bytes(n->in(1)->in(1)) >= 4 && // src1
Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
(n->in(2)->get_int() == BoolTest::eq ||
n->in(2)->get_int() == BoolTest::lt ||
n->in(2)->get_int() == BoolTest::gt)); // cond
match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
effect(TEMP scratch);
format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
ins_encode %{
int vlen_enc = vector_length_encoding(this, $src1);
Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
__ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, ww, vlen_enc, $scratch$$Register);
__ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, cmp, ww, vlen_enc);
%}
ins_pipe( pipe_slow );
%}
instruct vcmpu(legVec dst, legVec src1, legVec src2, immI8 cond, legVec vtmp1, legVec vtmp2, rRegP scratch) %{
// Integral vector compare whose result is not a vectmask, for 4..32-byte
// inputs and the signed predicates ne, le and ge only.
// This rule reserves an extra vector temporary (xtmp) and passes it to
// vpcmpCCW; dst is declared TEMP as well.
// NOTE(review): presumably ne/le/ge are the predicates vpcmpCCW implements by
// inverting an eq/gt compare with an all-ones vector held in xtmp - confirm
// against booltest_pred_to_comparison_pred.
instruct vcmp_negate(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
predicate(n->bottom_type()->isa_vectmask() == NULL &&
!is_unsigned_booltest_pred(n->in(2)->get_int()) &&
Matcher::vector_length_in_bytes(n->in(1)->in(1)) >= 4 && // src1
Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
(n->in(2)->get_int() == BoolTest::ne ||
n->in(2)->get_int() == BoolTest::le ||
n->in(2)->get_int() == BoolTest::ge)); // cond
match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
effect(TEMP dst, TEMP xtmp);
format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
ins_encode %{
// Length encoding and element width are both taken from src1.
int vlen_enc = vector_length_encoding(this, $src1);
Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
__ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
%}
ins_pipe( pipe_slow );
%}
instruct vcmpu(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
predicate(n->bottom_type()->isa_vectmask() == NULL &&
is_unsigned_booltest_pred(n->in(2)->get_int()) &&
Matcher::vector_length_in_bytes(n->in(1)->in(1)) >= 8 && // src1
Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 16 && // src1
Matcher::vector_length_in_bytes(n->in(1)->in(1)) >= 4 && // src1
Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
effect(TEMP vtmp1, TEMP vtmp2, TEMP scratch);
format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
effect(TEMP dst, TEMP xtmp);
format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
ins_encode %{
int vlen = Matcher::vector_length_in_bytes(this, $src1);
InternalAddress flip_bit = $constantaddress(high_bit_set(Matcher::vector_element_basic_type(this, $src1)));
int vlen_enc = vector_length_encoding(this, $src1);
Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
BasicType bt = Matcher::vector_element_basic_type(this, $src1);
__ vpcmpu(bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen, $vtmp1$$XMMRegister,
$vtmp2$$XMMRegister, $scratch$$Register);
Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
if (vlen_enc == Assembler::AVX_128bit) {
__ vmovddup($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
} else {
__ vbroadcastsd($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
}
__ vpxor($dst$$XMMRegister, $xtmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
__ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $src2$$XMMRegister, vlen_enc);
__ vpcmpCCW($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
%}
ins_pipe( pipe_slow );
%}
// Unsigned integral vector compare whose result is not a vectmask, for inputs
// of exactly 32 bytes. Delegates to vpcmpu32, passing the element type and
// byte length of src1, three vector temporaries and a pointer scratch
// register. dst and all temporaries are declared TEMP.
instruct vcmpu32(legVec dst, legVec src1, legVec src2, immI8 cond, legVec vtmp1, legVec vtmp2, legVec vtmp3, rRegP scratch) %{
predicate(n->bottom_type()->isa_vectmask() == NULL &&
is_unsigned_booltest_pred(n->in(2)->get_int()) &&
Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 32 && // src1
is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP scratch);
format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
ins_encode %{
// vlen is the vector size in bytes (not a length encoding).
int vlen = Matcher::vector_length_in_bytes(this, $src1);
Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
BasicType bt = Matcher::vector_element_basic_type(this, $src1);
__ vpcmpu32(bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen, $vtmp1$$XMMRegister,
$vtmp2$$XMMRegister, $vtmp3$$XMMRegister, $scratch$$Register);
%}
ins_pipe( pipe_slow );
%}
instruct vcmpu64(vec dst, vec src1, vec src2, immI8 cond, rRegP scratch, kReg ktmp) %{
instruct vcmp64(vec dst, vec src1, vec src2, immI8 cond, rRegP scratch, kReg ktmp) %{
predicate((n->bottom_type()->isa_vectmask() == NULL &&
Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64) && // src1
is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1