8307795: AArch64: Optimize VectorMask.truecount() on Neon

Reviewed-by: aph, eliu
This commit is contained in:
changpeng1997 2023-05-30 12:45:33 +00:00 committed by Eric Liu
parent 07f2070411
commit f600d0369a
5 changed files with 235 additions and 0 deletions

View File

@ -5512,6 +5512,30 @@ instruct vmask_truecount_sve(iRegINoSp dst, pReg src) %{
ins_pipe(pipe_slow);
%}
// Combined rule for VectorMaskTrueCount (VectorStoreMask) when the vector element type is not T_BYTE.
//
// The mask lanes hold 0 (false) or -1 (true), so the true count is the negated
// sum of all lanes. The sum is reduced across the vector into $vtmp, its low
// byte is sign-extended into $dst, and $dst is then negated.
instruct vstoremask_truecount_neon(iRegINoSp dst, vReg src, immI_gt_1 size, vReg vtmp) %{
  match(Set dst (VectorMaskTrueCount (VectorStoreMask src size)));
  effect(TEMP vtmp);
  format %{ "vstoremask_truecount_neon $dst, $src\t# KILL $vtmp" %}
  ins_encode %{
    // Input "src" is a vector mask represented as lanes with
    // 0/-1 as element values.
    uint esize = (uint)$size$$constant;
    if (esize == 8) {
      // Two 64-bit lanes: a scalar pairwise add yields lane0 + lane1.
      __ addpd($vtmp$$FloatRegister, $src$$FloatRegister);
    } else {
      uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src);
      Assembler::SIMD_Arrangement arrangement = Assembler::esize2arrangement(esize,
                                                                             /* isQ */ length_in_bytes == 16);
      if (arrangement == __ T2S) {
        // ADDV has no 2S arrangement. A pairwise add of "src" with itself
        // leaves lane0 + lane1 in lane 0, which is all that is read below.
        __ addpv($vtmp$$FloatRegister, arrangement, $src$$FloatRegister, $src$$FloatRegister);
      } else {
        __ addv($vtmp$$FloatRegister, arrangement, $src$$FloatRegister);
      }
    }
    // Only byte 0 of the reduced sum is read (sign-extended), then negated
    // to turn the non-positive lane sum into the true count.
    __ smov($dst$$Register, $vtmp$$FloatRegister, __ B, 0);
    __ neg($dst$$Register, $dst$$Register);
  %}
  ins_pipe(pipe_slow);
%}
// first true
instruct vmask_firsttrue_lt8e(iRegINoSp dst, vReg src, rFlagsReg cr) %{

View File

@ -3822,6 +3822,30 @@ instruct vmask_truecount_sve(iRegINoSp dst, pReg src) %{
ins_pipe(pipe_slow);
%}
// Combined rule for VectorMaskTrueCount (VectorStoreMask) when the vector element type is not T_BYTE.
//
// The mask lanes hold 0 (false) or -1 (true), so the true count is the negated
// sum of all lanes. The sum is reduced across the vector into $vtmp, its low
// byte is sign-extended into $dst, and $dst is then negated.
instruct vstoremask_truecount_neon(iRegINoSp dst, vReg src, immI_gt_1 size, vReg vtmp) %{
  match(Set dst (VectorMaskTrueCount (VectorStoreMask src size)));
  effect(TEMP vtmp);
  format %{ "vstoremask_truecount_neon $dst, $src\t# KILL $vtmp" %}
  ins_encode %{
    // Input "src" is a vector mask represented as lanes with
    // 0/-1 as element values.
    uint esize = (uint)$size$$constant;
    if (esize == 8) {
      // Two 64-bit lanes: a scalar pairwise add yields lane0 + lane1.
      __ addpd($vtmp$$FloatRegister, $src$$FloatRegister);
    } else {
      uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src);
      Assembler::SIMD_Arrangement arrangement = Assembler::esize2arrangement(esize,
                                                                             /* isQ */ length_in_bytes == 16);
      if (arrangement == __ T2S) {
        // ADDV has no 2S arrangement. A pairwise add of "src" with itself
        // leaves lane0 + lane1 in lane 0, which is all that is read below.
        __ addpv($vtmp$$FloatRegister, arrangement, $src$$FloatRegister, $src$$FloatRegister);
      } else {
        __ addv($vtmp$$FloatRegister, arrangement, $src$$FloatRegister);
      }
    }
    // Only byte 0 of the reduced sum is read (sign-extended), then negated
    // to turn the non-positive lane sum into the true count.
    __ smov($dst$$Register, $vtmp$$FloatRegister, __ B, 0);
    __ neg($dst$$Register, $dst$$Register);
  %}
  ins_pipe(pipe_slow);
%}
// first true
instruct vmask_firsttrue_lt8e(iRegINoSp dst, vReg src, rFlagsReg cr) %{

View File

@ -1463,6 +1463,11 @@ public class IRNode {
machOnlyNameRegex(VNOT_L_MASKED, "vnotL_masked");
}
// IR placeholder used by @IR rules to refer to the AArch64 Neon rule that fuses
// VectorMaskTrueCount with VectorStoreMask.
public static final String VSTOREMASK_TRUECOUNT = PREFIX + "VSTOREMASK_TRUECOUNT" + POSTFIX;
static {
// The regex is the name of the matcher rule "vstoremask_truecount_neon".
// NOTE(review): judging by the helper name, this is matched only against
// machine nodes (after matching) - confirm against machOnlyNameRegex.
machOnlyNameRegex(VSTOREMASK_TRUECOUNT, "vstoremask_truecount_neon");
}
public static final String XOR = PREFIX + "XOR" + POSTFIX;
static {
beforeMatchingNameRegex(XOR, "Xor(I|L)");

View File

@ -0,0 +1,101 @@
/*
* Copyright (c) 2023, Arm Limited. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package compiler.vectorapi;
import compiler.lib.ir_framework.*;
import java.util.Random;
import jdk.incubator.vector.*;
import jdk.test.lib.Asserts;
import jdk.test.lib.Utils;
/**
* @test
* @bug 8307795
* @key randomness
* @library /test/lib /
* @requires os.arch=="aarch64"
* @summary AArch64: Optimize VectorMask.truecount() on Neon
* @modules jdk.incubator.vector
*
* @run driver compiler.vectorapi.TestVectorMaskTrueCount
*/
public class TestVectorMaskTrueCount {
    // IR test: VectorMask.trueCount() applied to the result of a mask AND must
    // be compiled to the combined VSTOREMASK_TRUECOUNT rule, and must return
    // the same count as a scalar reference computation.

    private static final VectorSpecies<Double> SPECIES = DoubleVector.SPECIES_PREFERRED;
    private static final int LENGTH = 1024;

    // The test is tagged "@key randomness", so take the generator from the test
    // library: it logs its seed and honours -Djdk.test.lib.random.seed, which
    // makes a failing run reproducible. A bare "new Random()" would not.
    private static final Random RD = Utils.getRandomInstance();

    private static final boolean[] ba = new boolean[LENGTH];
    private static final boolean[] bb = new boolean[LENGTH];

    static {
        for (int i = 0; i < LENGTH; i++) {
            ba[i] = RD.nextBoolean();
            bb[i] = RD.nextBoolean();
        }
    }

    /**
     * Scalar reference: counts the positions in {@code [idx, idx + SPECIES.length())}
     * at which both {@code a} and {@code b} are true.
     *
     * @param a   first boolean array
     * @param b   second boolean array
     * @param idx start offset of the vector-sized window
     * @return number of positions in the window where {@code a[i] & b[i]} holds
     */
    static int maskAndTrueCount(boolean[] a, boolean[] b, int idx) {
        int trueCount = 0;
        for (int i = idx; i < idx + SPECIES.length(); i++) {
            if (a[i] & b[i]) {
                trueCount++;
            }
        }
        return trueCount;
    }

    /**
     * Checks every vector-sized window of {@code a} and {@code b} against the
     * count stored in {@code r} at the window's start offset.
     *
     * @param r results, one entry per window, stored at the window start index
     * @param a first boolean array
     * @param b second boolean array
     */
    static void assertArrayEquals(int[] r, boolean[] a, boolean[] b) {
        for (int i = 0; i < a.length; i += SPECIES.length()) {
            Asserts.assertEquals(r[i], maskAndTrueCount(a, b, i));
        }
    }

    @Test
    @IR(counts = { IRNode.VSTOREMASK_TRUECOUNT, ">= 1" })
    public static void test() {
        int[] r = new int[LENGTH];
        for (int i = 0; i < LENGTH; i += SPECIES.length()) {
            VectorMask<Double> ma = VectorMask.fromArray(SPECIES, ba, i);
            VectorMask<Double> mb = VectorMask.fromArray(SPECIES, bb, i);
            // Only the slot at each window start is written; the rest stay 0.
            r[i] = ma.and(mb).trueCount();
        }
        assertArrayEquals(r, ba, bb);
    }

    public static void main(String[] args) {
        TestFramework testFramework = new TestFramework();
        // UseSVE=0 keeps SVE disabled so that the Neon rule is the one under test.
        testFramework.setDefaultWarmup(10000)
                     .addFlags("--add-modules=jdk.incubator.vector")
                     .addFlags("-XX:UseSVE=0")
                     .start();
    }
}

View File

@ -0,0 +1,81 @@
//
// Copyright (c) 2023, Arm Limited. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//
package org.openjdk.bench.jdk.incubator.vector;
import java.util.concurrent.TimeUnit;
import java.util.Random;
import jdk.incubator.vector.*;
import org.openjdk.jmh.annotations.*;
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Thread)
@Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
public class StoreMaskTrueCount {
    // Micro-benchmark for trueCount() on a negated mask loaded from a boolean
    // array, measured for short, int and long lanes of the preferred species.

    private static final VectorSpecies<Short> S_SPECIES = ShortVector.SPECIES_PREFERRED;
    private static final VectorSpecies<Integer> I_SPECIES = IntVector.SPECIES_PREFERRED;
    private static final VectorSpecies<Long> L_SPECIES = LongVector.SPECIES_PREFERRED;
    private static final int LENGTH = 128;
    private static final Random RD = new Random();

    // Random input shared by all three benchmarks.
    private static boolean[] ba;

    static {
        boolean[] flags = new boolean[LENGTH];
        int slot = 0;
        while (slot < LENGTH) {
            flags[slot] = RD.nextBoolean();
            slot++;
        }
        ba = flags;
    }

    /** Sums the false-lane counts over all short-lane windows of the input. */
    @Benchmark
    public static int testShort() {
        int falseLanes = 0;
        for (int offset = 0; offset < LENGTH; offset += S_SPECIES.length()) {
            VectorMask<Short> loaded = VectorMask.fromArray(S_SPECIES, ba, offset);
            VectorMask<Short> inverted = loaded.not();
            falseLanes += inverted.trueCount();
        }
        return falseLanes;
    }

    /** Sums the false-lane counts over all int-lane windows of the input. */
    @Benchmark
    public static int testInt() {
        int falseLanes = 0;
        for (int offset = 0; offset < LENGTH; offset += I_SPECIES.length()) {
            VectorMask<Integer> loaded = VectorMask.fromArray(I_SPECIES, ba, offset);
            VectorMask<Integer> inverted = loaded.not();
            falseLanes += inverted.trueCount();
        }
        return falseLanes;
    }

    /** Sums the false-lane counts over all long-lane windows of the input. */
    @Benchmark
    public static int testLong() {
        int falseLanes = 0;
        for (int offset = 0; offset < LENGTH; offset += L_SPECIES.length()) {
            VectorMask<Long> loaded = VectorMask.fromArray(L_SPECIES, ba, offset);
            VectorMask<Long> inverted = loaded.not();
            falseLanes += inverted.trueCount();
        }
        return falseLanes;
    }
}