8309583: AArch64: Optimize firstTrue() when amount of elements < 8

Reviewed-by: aph, eliu
This commit is contained in:
changpeng1997 2023-06-27 04:10:38 +00:00 committed by Eric Liu
parent 87e6fab2c4
commit 45b581b7d5
2 changed files with 26 additions and 70 deletions

@ -5534,39 +5534,10 @@ instruct vstoremask_truecount_neon(iRegINoSp dst, vReg src, immI_gt_1 size, vReg
// first true
instruct vmask_firsttrue_lt8e(iRegINoSp dst, vReg src, rFlagsReg cr) %{
predicate(UseSVE == 0 &&
Matcher::vector_length(n->in(1)) < 8);
instruct vmask_firsttrue_neon(iRegINoSp dst, vReg src) %{
predicate(UseSVE == 0);
match(Set dst (VectorMaskFirstTrue src));
effect(KILL cr);
format %{ "vmask_firsttrue_lt8e $dst, $src\t# vector < 8 elements (neon). KILL cr" %}
ins_encode %{
// Returns the index of the first active lane of the
// vector mask, or VLENGTH if no lane is active.
//
// Input "src" is a vector of boolean represented as
// bytes with 0x00/0x01 as element values.
//
// Computed by reversing the bits and counting the leading
// zero bytes.
BasicType bt = Matcher::vector_element_basic_type(this, $src);
assert(bt == T_BOOLEAN, "unsupported type");
__ fmovd($dst$$Register, $src$$FloatRegister);
__ rbit($dst$$Register, $dst$$Register);
__ clz($dst$$Register, $dst$$Register);
__ lsrw($dst$$Register, $dst$$Register, 3);
__ movw(rscratch1, Matcher::vector_length(this, $src));
__ cmpw($dst$$Register, rscratch1);
__ cselw($dst$$Register, rscratch1, $dst$$Register, Assembler::GE);
%}
ins_pipe(pipe_slow);
%}
instruct vmask_firsttrue_8or16e(iRegINoSp dst, vReg src) %{
predicate(UseSVE == 0 &&
(Matcher::vector_length(n->in(1)) == 8 || Matcher::vector_length(n->in(1)) == 16));
match(Set dst (VectorMaskFirstTrue src));
format %{ "vmask_firsttrue_8or16e $dst, $src\t# vector 8B/16B (neon)" %}
format %{ "vmask_firsttrue_neon $dst, $src" %}
ins_encode %{
// Returns the index of the first active lane of the
// vector mask, or VLENGTH if no lane is active.
@ -5579,14 +5550,21 @@ instruct vmask_firsttrue_8or16e(iRegINoSp dst, vReg src) %{
BasicType bt = Matcher::vector_element_basic_type(this, $src);
assert(bt == T_BOOLEAN, "unsupported type");
uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src);
if (length_in_bytes == 8) {
uint vlength = Matcher::vector_length(this, $src);
if (vlength <= 8) {
__ fmovd($dst$$Register, $src$$FloatRegister);
if (vlength == 2 || vlength == 4) {
// Special handling for 2B or 4B cases:
// The vector mask is moved to a 64-bit general register, but only the low 16/32 bits are
// significant for the 2B/4B cases. We set bit 16 (for 2B) or bit 32 (for 4B) to 1, so that
// the expected result (i.e. VLENGTH) is generated when all lanes are zero.
__ orr($dst$$Register, $dst$$Register, vlength == 2 ? 0x10000 : 0x100000000);
}
__ rbit($dst$$Register, $dst$$Register);
__ clz($dst$$Register, $dst$$Register);
__ lsrw($dst$$Register, $dst$$Register, 3);
} else {
assert(length_in_bytes == 16, "must be");
assert(vlength == 16, "must be");
Label FIRST_TRUE_INDEX;
// Try to compute the result from lower 64 bits.

@ -3844,39 +3844,10 @@ instruct vstoremask_truecount_neon(iRegINoSp dst, vReg src, immI_gt_1 size, vReg
// first true
instruct vmask_firsttrue_lt8e(iRegINoSp dst, vReg src, rFlagsReg cr) %{
predicate(UseSVE == 0 &&
Matcher::vector_length(n->in(1)) < 8);
instruct vmask_firsttrue_neon(iRegINoSp dst, vReg src) %{
predicate(UseSVE == 0);
match(Set dst (VectorMaskFirstTrue src));
effect(KILL cr);
format %{ "vmask_firsttrue_lt8e $dst, $src\t# vector < 8 elements (neon). KILL cr" %}
ins_encode %{
// Returns the index of the first active lane of the
// vector mask, or VLENGTH if no lane is active.
//
// Input "src" is a vector of boolean represented as
// bytes with 0x00/0x01 as element values.
//
// Computed by reversing the bits and counting the leading
// zero bytes.
BasicType bt = Matcher::vector_element_basic_type(this, $src);
assert(bt == T_BOOLEAN, "unsupported type");
__ fmovd($dst$$Register, $src$$FloatRegister);
__ rbit($dst$$Register, $dst$$Register);
__ clz($dst$$Register, $dst$$Register);
__ lsrw($dst$$Register, $dst$$Register, 3);
__ movw(rscratch1, Matcher::vector_length(this, $src));
__ cmpw($dst$$Register, rscratch1);
__ cselw($dst$$Register, rscratch1, $dst$$Register, Assembler::GE);
%}
ins_pipe(pipe_slow);
%}
instruct vmask_firsttrue_8or16e(iRegINoSp dst, vReg src) %{
predicate(UseSVE == 0 &&
(Matcher::vector_length(n->in(1)) == 8 || Matcher::vector_length(n->in(1)) == 16));
match(Set dst (VectorMaskFirstTrue src));
format %{ "vmask_firsttrue_8or16e $dst, $src\t# vector 8B/16B (neon)" %}
format %{ "vmask_firsttrue_neon $dst, $src" %}
ins_encode %{
// Returns the index of the first active lane of the
// vector mask, or VLENGTH if no lane is active.
@ -3889,14 +3860,21 @@ instruct vmask_firsttrue_8or16e(iRegINoSp dst, vReg src) %{
BasicType bt = Matcher::vector_element_basic_type(this, $src);
assert(bt == T_BOOLEAN, "unsupported type");
uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src);
if (length_in_bytes == 8) {
uint vlength = Matcher::vector_length(this, $src);
if (vlength <= 8) {
__ fmovd($dst$$Register, $src$$FloatRegister);
if (vlength == 2 || vlength == 4) {
// Special handling for 2B or 4B cases:
// The vector mask is moved to a 64-bit general register, but only the low 16/32 bits are
// significant for the 2B/4B cases. We set bit 16 (for 2B) or bit 32 (for 4B) to 1, so that
// the expected result (i.e. VLENGTH) is generated when all lanes are zero.
__ orr($dst$$Register, $dst$$Register, vlength == 2 ? 0x10000 : 0x100000000);
}
__ rbit($dst$$Register, $dst$$Register);
__ clz($dst$$Register, $dst$$Register);
__ lsrw($dst$$Register, $dst$$Register, 3);
} else {
assert(length_in_bytes == 16, "must be");
assert(vlength == 16, "must be");
Label FIRST_TRUE_INDEX;
// Try to compute the result from lower 64 bits.