8255949: AArch64: Add support for vectorized shift right and accumulate

Reviewed-by: aph
This commit is contained in:
Dong Bo 2020-11-10 01:24:25 +00:00 committed by Fei Yang
parent 1332ba3c3c
commit f71f9dc93a
3 changed files with 349 additions and 0 deletions

View File

@ -18922,6 +18922,216 @@ instruct vsrl2L_imm(vecX dst, vecX src, immI shift) %{
ins_pipe(vshift128_imm);
%}
// Signed shift right by immediate and accumulate, 8 byte lanes:
// dst[i] += src[i] >> shift.
instruct vsraa8B_imm(vecD dst, vecD src, immI shift) %{
  predicate(n->as_Vector()->length() == 8);
  match(Set dst (AddVB dst (RShiftVB src (RShiftCntV shift))));
  ins_cost(INSN_COST);
  format %{ "ssra $dst, $src, $shift\t# vector (8B)" %}
  ins_encode %{
    // An arithmetic shift of a byte by 8 or more gives the same result as a
    // shift by 7 (every bit becomes the sign bit), so cap the count at 7.
    const int requested = (int)$shift$$constant;
    const int amount = (requested >= 8) ? 7 : requested;
    FloatRegister acc_reg = as_FloatRegister($dst$$reg);
    FloatRegister val_reg = as_FloatRegister($src$$reg);
    __ ssra(acc_reg, __ T8B, val_reg, amount);
  %}
  ins_pipe(vshift64_imm);
%}
// Signed shift right by immediate and accumulate, 16 byte lanes:
// dst[i] += src[i] >> shift.
instruct vsraa16B_imm(vecX dst, vecX src, immI shift) %{
  predicate(n->as_Vector()->length() == 16);
  match(Set dst (AddVB dst (RShiftVB src (RShiftCntV shift))));
  ins_cost(INSN_COST);
  format %{ "ssra $dst, $src, $shift\t# vector (16B)" %}
  ins_encode %{
    // An arithmetic shift of a byte by 8 or more gives the same result as a
    // shift by 7 (every bit becomes the sign bit), so cap the count at 7.
    const int requested = (int)$shift$$constant;
    const int amount = (requested >= 8) ? 7 : requested;
    FloatRegister acc_reg = as_FloatRegister($dst$$reg);
    FloatRegister val_reg = as_FloatRegister($src$$reg);
    __ ssra(acc_reg, __ T16B, val_reg, amount);
  %}
  ins_pipe(vshift128_imm);
%}
// Signed shift right by immediate and accumulate, 4 short lanes:
// dst[i] += src[i] >> shift.
instruct vsraa4S_imm(vecD dst, vecD src, immI shift) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (AddVS dst (RShiftVS src (RShiftCntV shift))));
  ins_cost(INSN_COST);
  format %{ "ssra $dst, $src, $shift\t# vector (4H)" %}
  ins_encode %{
    // An arithmetic shift of a short by 16 or more gives the same result as
    // a shift by 15 (every bit becomes the sign bit), so cap the count at 15.
    const int requested = (int)$shift$$constant;
    const int amount = (requested >= 16) ? 15 : requested;
    FloatRegister acc_reg = as_FloatRegister($dst$$reg);
    FloatRegister val_reg = as_FloatRegister($src$$reg);
    __ ssra(acc_reg, __ T4H, val_reg, amount);
  %}
  ins_pipe(vshift64_imm);
%}
// Signed shift right by immediate and accumulate, 8 short lanes:
// dst[i] += src[i] >> shift.
instruct vsraa8S_imm(vecX dst, vecX src, immI shift) %{
  predicate(n->as_Vector()->length() == 8);
  match(Set dst (AddVS dst (RShiftVS src (RShiftCntV shift))));
  ins_cost(INSN_COST);
  format %{ "ssra $dst, $src, $shift\t# vector (8H)" %}
  ins_encode %{
    // An arithmetic shift of a short by 16 or more gives the same result as
    // a shift by 15 (every bit becomes the sign bit), so cap the count at 15.
    const int requested = (int)$shift$$constant;
    const int amount = (requested >= 16) ? 15 : requested;
    FloatRegister acc_reg = as_FloatRegister($dst$$reg);
    FloatRegister val_reg = as_FloatRegister($src$$reg);
    __ ssra(acc_reg, __ T8H, val_reg, amount);
  %}
  ins_pipe(vshift128_imm);
%}
// Signed shift right by immediate and accumulate, 2 int lanes:
// dst[i] += src[i] >> shift.
instruct vsraa2I_imm(vecD dst, vecD src, immI shift) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (AddVI dst (RShiftVI src (RShiftCntV shift))));
  ins_cost(INSN_COST);
  format %{ "ssra $dst, $src, $shift\t# vector (2S)" %}
  ins_encode %{
    // The immediate is handed to the assembler unchanged.
    const int amount = (int)$shift$$constant;
    FloatRegister acc_reg = as_FloatRegister($dst$$reg);
    FloatRegister val_reg = as_FloatRegister($src$$reg);
    __ ssra(acc_reg, __ T2S, val_reg, amount);
  %}
  ins_pipe(vshift64_imm);
%}
// Signed shift right by immediate and accumulate, 4 int lanes:
// dst[i] += src[i] >> shift.
instruct vsraa4I_imm(vecX dst, vecX src, immI shift) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (AddVI dst (RShiftVI src (RShiftCntV shift))));
  ins_cost(INSN_COST);
  format %{ "ssra $dst, $src, $shift\t# vector (4S)" %}
  ins_encode %{
    // The immediate is handed to the assembler unchanged.
    const int amount = (int)$shift$$constant;
    FloatRegister acc_reg = as_FloatRegister($dst$$reg);
    FloatRegister val_reg = as_FloatRegister($src$$reg);
    __ ssra(acc_reg, __ T4S, val_reg, amount);
  %}
  ins_pipe(vshift128_imm);
%}
// Signed shift right by immediate and accumulate, 2 long lanes:
// dst[i] += src[i] >> shift.
instruct vsraa2L_imm(vecX dst, vecX src, immI shift) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (AddVL dst (RShiftVL src (RShiftCntV shift))));
  ins_cost(INSN_COST);
  format %{ "ssra $dst, $src, $shift\t# vector (2D)" %}
  ins_encode %{
    // The immediate is handed to the assembler unchanged.
    const int amount = (int)$shift$$constant;
    FloatRegister acc_reg = as_FloatRegister($dst$$reg);
    FloatRegister val_reg = as_FloatRegister($src$$reg);
    __ ssra(acc_reg, __ T2D, val_reg, amount);
  %}
  ins_pipe(vshift128_imm);
%}
// Unsigned (logical) shift right by immediate and accumulate, 8 byte lanes:
// dst[i] += src[i] >>> shift.
instruct vsrla8B_imm(vecD dst, vecD src, immI shift) %{
  predicate(n->as_Vector()->length() == 8);
  match(Set dst (AddVB dst (URShiftVB src (RShiftCntV shift))));
  ins_cost(INSN_COST);
  format %{ "usra $dst, $src, $shift\t# vector (8B)" %}
  ins_encode %{
    int sh = (int)$shift$$constant;
    // A logical right shift of a byte lane by 8 or more produces zero, so
    // the accumulate would add zero and dst already holds the result: emit
    // nothing. src is only an input of the match rule (no effect declares
    // it killed), so it must not be overwritten here.
    if (sh < 8) {
      __ usra(as_FloatRegister($dst$$reg), __ T8B,
              as_FloatRegister($src$$reg), sh);
    }
  %}
  ins_pipe(vshift64_imm);
%}
// Unsigned (logical) shift right by immediate and accumulate, 16 byte lanes:
// dst[i] += src[i] >>> shift.
instruct vsrla16B_imm(vecX dst, vecX src, immI shift) %{
  predicate(n->as_Vector()->length() == 16);
  match(Set dst (AddVB dst (URShiftVB src (RShiftCntV shift))));
  ins_cost(INSN_COST);
  format %{ "usra $dst, $src, $shift\t# vector (16B)" %}
  ins_encode %{
    int sh = (int)$shift$$constant;
    // A logical right shift of a byte lane by 8 or more produces zero, so
    // the accumulate would add zero and dst already holds the result: emit
    // nothing. src is only an input of the match rule (no effect declares
    // it killed), so it must not be overwritten here.
    if (sh < 8) {
      __ usra(as_FloatRegister($dst$$reg), __ T16B,
              as_FloatRegister($src$$reg), sh);
    }
  %}
  ins_pipe(vshift128_imm);
%}
// Unsigned (logical) shift right by immediate and accumulate, 4 short lanes:
// dst[i] += src[i] >>> shift.
instruct vsrla4S_imm(vecD dst, vecD src, immI shift) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (AddVS dst (URShiftVS src (RShiftCntV shift))));
  ins_cost(INSN_COST);
  format %{ "usra $dst, $src, $shift\t# vector (4H)" %}
  ins_encode %{
    int sh = (int)$shift$$constant;
    // A logical right shift of a short lane by 16 or more produces zero, so
    // the accumulate would add zero and dst already holds the result: emit
    // nothing. src is only an input of the match rule (no effect declares
    // it killed), so it must not be overwritten here.
    if (sh < 16) {
      // Must be usra (shift and accumulate), not ushr: the match rule
      // includes the AddVS with dst, and a plain shift would overwrite dst
      // and drop the addend.
      __ usra(as_FloatRegister($dst$$reg), __ T4H,
              as_FloatRegister($src$$reg), sh);
    }
  %}
  ins_pipe(vshift64_imm);
%}
// Unsigned (logical) shift right by immediate and accumulate, 8 short lanes:
// dst[i] += src[i] >>> shift.
instruct vsrla8S_imm(vecX dst, vecX src, immI shift) %{
  predicate(n->as_Vector()->length() == 8);
  match(Set dst (AddVS dst (URShiftVS src (RShiftCntV shift))));
  ins_cost(INSN_COST);
  format %{ "usra $dst, $src, $shift\t# vector (8H)" %}
  ins_encode %{
    int sh = (int)$shift$$constant;
    // A logical right shift of a short lane by 16 or more produces zero, so
    // the accumulate would add zero and dst already holds the result: emit
    // nothing. src is only an input of the match rule (no effect declares
    // it killed), so it must not be overwritten here.
    if (sh < 16) {
      __ usra(as_FloatRegister($dst$$reg), __ T8H,
              as_FloatRegister($src$$reg), sh);
    }
  %}
  ins_pipe(vshift128_imm);
%}
// Unsigned (logical) shift right by immediate and accumulate, 2 int lanes:
// dst[i] += src[i] >>> shift.
instruct vsrla2I_imm(vecD dst, vecD src, immI shift) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (AddVI dst (URShiftVI src (RShiftCntV shift))));
  ins_cost(INSN_COST);
  format %{ "usra $dst, $src, $shift\t# vector (2S)" %}
  ins_encode %{
    // The immediate is handed to the assembler unchanged.
    const int amount = (int)$shift$$constant;
    FloatRegister acc_reg = as_FloatRegister($dst$$reg);
    FloatRegister val_reg = as_FloatRegister($src$$reg);
    __ usra(acc_reg, __ T2S, val_reg, amount);
  %}
  ins_pipe(vshift64_imm);
%}
// Unsigned (logical) shift right by immediate and accumulate, 4 int lanes:
// dst[i] += src[i] >>> shift.
instruct vsrla4I_imm(vecX dst, vecX src, immI shift) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (AddVI dst (URShiftVI src (RShiftCntV shift))));
  ins_cost(INSN_COST);
  format %{ "usra $dst, $src, $shift\t# vector (4S)" %}
  ins_encode %{
    // The immediate is handed to the assembler unchanged.
    const int amount = (int)$shift$$constant;
    FloatRegister acc_reg = as_FloatRegister($dst$$reg);
    FloatRegister val_reg = as_FloatRegister($src$$reg);
    __ usra(acc_reg, __ T4S, val_reg, amount);
  %}
  ins_pipe(vshift128_imm);
%}
// Unsigned (logical) shift right by immediate and accumulate, 2 long lanes:
// dst[i] += src[i] >>> shift.
instruct vsrla2L_imm(vecX dst, vecX src, immI shift) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (AddVL dst (URShiftVL src (RShiftCntV shift))));
  ins_cost(INSN_COST);
  format %{ "usra $dst, $src, $shift\t# vector (2D)" %}
  ins_encode %{
    // The immediate is handed to the assembler unchanged.
    const int amount = (int)$shift$$constant;
    FloatRegister acc_reg = as_FloatRegister($dst$$reg);
    FloatRegister val_reg = as_FloatRegister($src$$reg);
    __ usra(acc_reg, __ T2D, val_reg, amount);
  %}
  ins_pipe(vshift128_imm);
%}
instruct vmax2F(vecD dst, vecD src1, vecD src2)
%{
predicate(n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);

View File

@ -2688,6 +2688,8 @@ public:
INSN(shl, 0, 0b010101, /* isSHR = */ false);
INSN(sshr, 0, 0b000001, /* isSHR = */ true);
INSN(ushr, 1, 0b000001, /* isSHR = */ true);
// usra / ssra: unsigned / signed shift right by immediate and accumulate.
INSN(usra, 1, 0b000101, /* isSHR = */ true);
INSN(ssra, 0, 0b000101, /* isSHR = */ true);
#undef INSN

View File

@ -0,0 +1,137 @@
/*
* Copyright (c) 2020, Huawei Technologies Co. Ltd. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package org.openjdk.bench.vm.compiler;
import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.infra.*;
import java.util.concurrent.TimeUnit;
import java.util.Random;
/**
 * JMH micro-benchmark for loops of the shape
 * {@code d[i] = a[i] + (b[i] >> k)} and {@code d[i] = a[i] + (b[i] >>> k)}
 * over byte, short, char, int and long arrays.
 *
 * <p>Each benchmark method reads two input arrays and writes a third, so a
 * vectorizing JIT compiler may turn the loop body into a single
 * shift-right-and-accumulate vector instruction where the target has one.
 */
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@State(Scope.Thread)
public class VectorShiftAccumulate {
    /** Number of elements in every array. */
    @Param({"1028"})
    public int count;

    private byte[] bytesA, bytesB, bytesD;
    private short[] shortsA, shortsB, shortsD;
    private char[] charsA, charsB, charsD;
    private int[] intsA, intsB, intsD;
    private long[] longsA, longsB, longsD;

    /** Seed for the pseudo-random input data. */
    @Param("0")
    private int seed;

    // Created in init(): a field initializer would run before JMH injects the
    // @Param value, so it would always see seed == 0.
    private Random r;

    /**
     * Allocates all arrays with {@link #count} elements and fills the input
     * arrays (the A and B arrays) with pseudo-random values derived from
     * {@link #seed}. The D arrays are outputs and are left zeroed.
     */
    @Setup
    public void init() {
        r = new Random(seed);

        bytesA = new byte[count];
        shortsA = new short[count];
        charsA = new char[count];
        intsA = new int[count];
        longsA = new long[count];

        bytesB = new byte[count];
        shortsB = new short[count];
        charsB = new char[count];
        intsB = new int[count];
        longsB = new long[count];

        bytesD = new byte[count];
        shortsD = new short[count];
        charsD = new char[count];
        intsD = new int[count];
        longsD = new long[count];

        for (int i = 0; i < count; i++) {
            bytesA[i] = (byte) r.nextInt();
            shortsA[i] = (short) r.nextInt();
            intsA[i] = r.nextInt();
            longsA[i] = r.nextLong();

            bytesB[i] = (byte) r.nextInt();
            shortsB[i] = (short) r.nextInt();
            intsB[i] = r.nextInt();
            longsB[i] = r.nextLong();
        }

        // The char inputs are filled in a separate pass so that the values
        // drawn for the other arrays for a given seed do not change.
        for (int i = 0; i < count; i++) {
            charsA[i] = (char) r.nextInt();
            charsB[i] = (char) r.nextInt();
        }
    }

    /** bytesD[i] = bytesA[i] + (bytesB[i] >> 1), truncated to byte. */
    @Benchmark
    public void shiftRightAccumulateByte() {
        for (int i = 0; i < count; i++) {
            bytesD[i] = (byte) (bytesA[i] + (bytesB[i] >> 1));
        }
    }

    /** bytesD[i] = bytesA[i] + (byte) (bytesB[i] >>> 3), truncated to byte. */
    @Benchmark
    public void shiftURightAccumulateByte() {
        for (int i = 0; i < count; i++) {
            bytesD[i] = (byte) (bytesA[i] + (((byte) (bytesB[i] >>> 3))));
        }
    }

    /** shortsD[i] = shortsA[i] + (shortsB[i] >> 5), truncated to short. */
    @Benchmark
    public void shiftRightAccumulateShort() {
        for (int i = 0; i < count; i++) {
            shortsD[i] = (short) (shortsA[i] + (shortsB[i] >> 5));
        }
    }

    /** charsD[i] = charsA[i] + (charsB[i] >>> 4), truncated to char. */
    @Benchmark
    public void shiftURightAccumulateChar() {
        for (int i = 0; i < count; i++) {
            charsD[i] = (char) (charsA[i] + (charsB[i] >>> 4));
        }
    }

    /** intsD[i] = intsA[i] + (intsB[i] >> 2). */
    @Benchmark
    public void shiftRightAccumulateInt() {
        for (int i = 0; i < count; i++) {
            intsD[i] = intsA[i] + (intsB[i] >> 2);
        }
    }

    /** intsD[i] = (intsB[i] >>> 2) + intsA[i]. */
    @Benchmark
    public void shiftURightAccumulateInt() {
        for (int i = 0; i < count; i++) {
            intsD[i] = (intsB[i] >>> 2) + intsA[i];
        }
    }

    /** longsD[i] = longsA[i] + (longsB[i] >> 5). */
    @Benchmark
    public void shiftRightAccumulateLong() {
        for (int i = 0; i < count; i++) {
            longsD[i] = longsA[i] + (longsB[i] >> 5);
        }
    }

    /** longsD[i] = (longsB[i] >>> 2) + longsA[i]. */
    @Benchmark
    public void shiftURightAccumulateLong() {
        for (int i = 0; i < count; i++) {
            longsD[i] = (longsB[i] >>> 2) + longsA[i];
        }
    }
}