8322768: Optimize non-subword vector compress and expand APIs for AVX2 target.

Reviewed-by: epeter, sviswanathan
This commit is contained in:
Jatin Bhateja 2024-01-25 10:07:50 +00:00
parent 9d1a6d1484
commit 6d36eb78ad
10 changed files with 365 additions and 19 deletions

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -816,8 +816,8 @@ private:
void check_relocation(RelocationHolder const& rspec, int format);
#endif
void emit_data(jint data, relocInfo::relocType rtype, int format);
void emit_data(jint data, RelocationHolder const& rspec, int format);
void emit_data(jint data, relocInfo::relocType rtype, int format = 0);
void emit_data(jint data, RelocationHolder const& rspec, int format = 0);
void emit_data64(jlong data, relocInfo::relocType rtype, int format = 0);
void emit_data64(jlong data, RelocationHolder const& rspec, int format = 0);

View File

@ -5282,6 +5282,42 @@ void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Regis
kmov(dst, rtmp2);
}
#ifdef _LP64
// Emulates masked vector compress/expand (Op_CompressV / Op_ExpandV) for
// 32-bit and 64-bit element vectors on AVX2-only targets, where the AVX-512
// vcompress/vexpand instructions are unavailable. The mask is materialized
// into an integer, scaled into an offset into a stub-generated permute table
// (32 bytes per row), and the resulting row drives a vpermps; lanes whose
// table entry is -1 are then zeroed via a blend with a zero vector.
// rtmp/rscratch are clobbered; permv/xtmp are temporaries.
void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
                                                    XMMRegister mask, Register rtmp, Register rscratch,
                                                    XMMRegister permv, XMMRegister xtmp, BasicType bt,
                                                    int vec_enc) {
  // Only non-subword (4- or 8-byte) element types are handled here.
  assert(type2aelembytes(bt) >= 4, "");
  assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
  address compress_perm_table = nullptr;
  address expand_perm_table = nullptr;
  // Select the permute table matching the element size and extract one mask
  // bit per lane into rtmp (vmovmskpd for quadword lanes, vmovmskps for
  // doubleword lanes).
  if (type2aelembytes(bt) == 8) {
    compress_perm_table = StubRoutines::x86::compress_perm_table64();
    expand_perm_table = StubRoutines::x86::expand_perm_table64();
    vmovmskpd(rtmp, mask, vec_enc);
  } else {
    compress_perm_table = StubRoutines::x86::compress_perm_table32();
    expand_perm_table = StubRoutines::x86::expand_perm_table32();
    vmovmskps(rtmp, mask, vec_enc);
  }
  shlq(rtmp, 5); // for 32 byte permute row.
  if (opcode == Op_CompressV) {
    lea(rscratch, ExternalAddress(compress_perm_table));
  } else {
    lea(rscratch, ExternalAddress(expand_perm_table));
  }
  // rtmp = table base + mask * 32: address of the permute row for this mask.
  addptr(rtmp, rscratch);
  vmovdqu(permv, Address(rtmp));
  // Move selected lanes into their compressed/expanded positions.
  vpermps(dst, permv, src, Assembler::AVX_256bit);
  vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with zero vector using permute mask, each column entry
  // in a permute table row contains either a valid permute index or a -1 (default)
  // value, this can potentially be used as a blending mask after
  // compressing/expanding the source vector lanes.
  vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv);
}
#endif
void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
bool merge, BasicType bt, int vec_enc) {
if (opcode == Op_CompressV) {

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -390,6 +390,10 @@ public:
void vector_round_float_avx(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4);
void vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, XMMRegister mask,
Register rtmp, Register rscratch, XMMRegister permv, XMMRegister xtmp,
BasicType bt, int vec_enc);
#endif // _LP64
void udivI(Register rax, Register divisor, Register rdx);

View File

@ -951,6 +951,92 @@ address StubGenerator::generate_fp_mask(const char *stub_name, int64_t mask) {
return start;
}
// Materializes the compression permute-index table used by the AVX2
// compress/expand fallback. Each table row is 32 bytes and is selected by the
// integer value of the vector mask; a row entry is either the source-lane
// index of a set mask bit (packed toward the least significant lanes) or -1,
// which later doubles as a zeroing blend mask.
address StubGenerator::generate_compress_perm_table(const char *stub_name, int32_t esize) {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", stub_name);
  address start = __ pc();

  if (esize == 32) {
    // 256 rows x 8 dwords: one row per possible 8-bit mask over dword lanes.
    for (int row = 0; row < 256; row++) {
      int emitted = 0;
      for (int lane = 0; lane < 8; lane++) {
        if ((row >> lane) & 1) {
          __ emit_data(lane, relocInfo::none);
          emitted++;
        }
      }
      // Pad the remainder of the row with -1 sentinels.
      while (emitted < 8) {
        __ emit_data(-1, relocInfo::none);
        emitted++;
      }
    }
  } else {
    assert(esize == 64, "");
    // 16 rows x 4 qwords: one row per possible 4-bit mask over qword lanes.
    // Each qword entry is emitted as a dword pair (2*lane, 2*lane+1) so the
    // same vpermps-based permute can be reused for 64-bit elements.
    for (int row = 0; row < 16; row++) {
      int emitted = 0;
      for (int lane = 0; lane < 4; lane++) {
        if ((row >> lane) & 1) {
          __ emit_data(2 * lane, relocInfo::none);
          __ emit_data(2 * lane + 1, relocInfo::none);
          emitted++;
        }
      }
      // Pad the remainder of the row with -1 qword sentinels.
      while (emitted < 4) {
        __ emit_data64(-1L, relocInfo::none);
        emitted++;
      }
    }
  }
  return start;
}
// Materializes the expansion permute-index table used by the AVX2
// compress/expand fallback. Each 32-byte row is selected by the integer value
// of the vector mask; a lane whose mask bit is set receives the next
// consecutive source-lane index (starting from the least significant lane),
// every other lane receives -1, which later doubles as a zeroing blend mask.
address StubGenerator::generate_expand_perm_table(const char *stub_name, int32_t esize) {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", stub_name);
  address start = __ pc();

  if (esize == 32) {
    // 256 rows x 8 dwords: one row per possible 8-bit mask over dword lanes.
    for (int row = 0; row < 256; row++) {
      int next = 0;
      for (int lane = 0; lane < 8; lane++) {
        __ emit_data(((row >> lane) & 1) ? next++ : -1, relocInfo::none);
      }
    }
  } else {
    assert(esize == 64, "");
    // 16 rows x 4 qwords: one row per possible 4-bit mask over qword lanes.
    // A selected qword entry is emitted as a dword pair (2*next, 2*next+1) so
    // the same vpermps-based permute can be reused for 64-bit elements.
    for (int row = 0; row < 16; row++) {
      int next = 0;
      for (int lane = 0; lane < 4; lane++) {
        if ((row >> lane) & 1) {
          __ emit_data(2 * next, relocInfo::none);
          __ emit_data(2 * next + 1, relocInfo::none);
          next++;
        } else {
          __ emit_data64(-1L, relocInfo::none);
        }
      }
    }
  }
  return start;
}
address StubGenerator::generate_vector_mask(const char *stub_name, int64_t mask) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", stub_name);
@ -4095,6 +4181,13 @@ void StubGenerator::generate_compiler_stubs() {
StubRoutines::x86::_vector_reverse_byte_perm_mask_int = generate_vector_reverse_byte_perm_mask_int("perm_mask_int");
StubRoutines::x86::_vector_reverse_byte_perm_mask_short = generate_vector_reverse_byte_perm_mask_short("perm_mask_short");
if (VM_Version::supports_avx2() && !VM_Version::supports_avx512vl()) {
StubRoutines::x86::_compress_perm_table32 = generate_compress_perm_table("compress_perm_table32", 32);
StubRoutines::x86::_compress_perm_table64 = generate_compress_perm_table("compress_perm_table64", 64);
StubRoutines::x86::_expand_perm_table32 = generate_expand_perm_table("expand_perm_table32", 32);
StubRoutines::x86::_expand_perm_table64 = generate_expand_perm_table("expand_perm_table64", 64);
}
if (VM_Version::supports_avx2() && !VM_Version::supports_avx512_vpopcntdq()) {
// lut implementation influenced by counting 1s algorithm from section 5-1 of Hackers' Delight.
StubRoutines::x86::_vector_popcount_lut = generate_popcount_avx_lut("popcount_lut");

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -99,6 +99,10 @@ class StubGenerator: public StubCodeGenerator {
address generate_fp_mask(const char *stub_name, int64_t mask);
address generate_compress_perm_table(const char *stub_name, int32_t esize);
address generate_expand_perm_table(const char *stub_name, int32_t esize);
address generate_vector_mask(const char *stub_name, int64_t mask);
address generate_vector_byte_perm_mask(const char *stub_name);

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2013, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -82,6 +82,10 @@ address StubRoutines::x86::_join_0_1_base64 = nullptr;
address StubRoutines::x86::_join_1_2_base64 = nullptr;
address StubRoutines::x86::_join_2_3_base64 = nullptr;
address StubRoutines::x86::_decoding_table_base64 = nullptr;
address StubRoutines::x86::_compress_perm_table32 = nullptr;
address StubRoutines::x86::_compress_perm_table64 = nullptr;
address StubRoutines::x86::_expand_perm_table32 = nullptr;
address StubRoutines::x86::_expand_perm_table64 = nullptr;
#endif
address StubRoutines::x86::_pshuffle_byte_flip_mask_addr = nullptr;

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2013, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -37,7 +37,7 @@ enum platform_dependent_constants {
_continuation_stubs_code_size = 1000 LP64_ONLY(+1000),
// AVX512 intrinsics add more code in 64-bit VM,
// Windows have more code to save/restore registers
_compiler_stubs_code_size = 20000 LP64_ONLY(+32000) WINDOWS_ONLY(+2000),
_compiler_stubs_code_size = 20000 LP64_ONLY(+39000) WINDOWS_ONLY(+2000),
_final_stubs_code_size = 10000 LP64_ONLY(+20000) WINDOWS_ONLY(+2000) ZGC_ONLY(+20000)
};
@ -58,6 +58,10 @@ class x86 {
static address _float_sign_flip;
static address _double_sign_mask;
static address _double_sign_flip;
static address _compress_perm_table32;
static address _compress_perm_table64;
static address _expand_perm_table32;
static address _expand_perm_table64;
public:
@ -338,6 +342,10 @@ class x86 {
static address base64_decoding_table_addr() { return _decoding_table_base64; }
static address base64_AVX2_decode_tables_addr() { return _avx2_decode_tables_base64; }
static address base64_AVX2_decode_LUT_tables_addr() { return _avx2_decode_lut_tables_base64; }
static address compress_perm_table32() { return _compress_perm_table32; }
static address compress_perm_table64() { return _compress_perm_table64; }
static address expand_perm_table32() { return _expand_perm_table32; }
static address expand_perm_table64() { return _expand_perm_table64; }
#endif
static address pshuffle_byte_flip_mask_addr() { return _pshuffle_byte_flip_mask_addr; }
static address arrays_hashcode_powers_of_31() { return (address)_arrays_hashcode_powers_of_31; }

View File

@ -44,4 +44,3 @@ address StubRoutines::x86::_float_sign_mask = nullptr;
address StubRoutines::x86::_float_sign_flip = nullptr;
address StubRoutines::x86::_double_sign_mask = nullptr;
address StubRoutines::x86::_double_sign_flip = nullptr;

View File

@ -1425,6 +1425,8 @@ bool Matcher::match_rule_supported(int opcode) {
return false;
}
break;
case Op_CompressV:
case Op_ExpandV:
case Op_PopCountVL:
if (UseAVX < 2) {
return false;
@ -1659,12 +1661,6 @@ bool Matcher::match_rule_supported(int opcode) {
return false;
}
break;
case Op_CompressV:
case Op_ExpandV:
if (!VM_Version::supports_avx512vl()) {
return false;
}
break;
case Op_SqrtF:
if (UseSSE < 1) {
return false;
@ -1952,13 +1948,12 @@ bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
if (is_subword_type(bt) && !VM_Version::supports_avx512_vbmi2()) {
return false;
}
if (!is_LP64 && !VM_Version::supports_avx512vl() && size_in_bits < 512) {
return false;
}
if (size_in_bits < 128 ) {
return false;
}
if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
return false;
}
break;
case Op_VectorLongToMask:
if (UseAVX < 1 || !is_LP64) {
return false;
@ -9178,8 +9173,26 @@ instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp,
%}
// --------------------------------- Compress/Expand Operations ---------------------------
#ifdef _LP64
// AVX2 fallback for CompressV/ExpandV on vectors of up to 32 bytes when the
// AVX512VL masked compress/expand forms are unavailable. Delegates to
// C2_MacroAssembler::vector_compress_expand_avx2, which indexes
// stub-generated permute tables with the vector mask.
instruct vcompress_reg_avx(vec dst, vec src, vec mask, rRegI rtmp, rRegL rscratch, vec perm, vec xtmp, rFlagsReg cr) %{
  predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
  match(Set dst (CompressV src mask));
  match(Set dst (ExpandV src mask));
  effect(TEMP_DEF dst, TEMP perm, TEMP xtmp, TEMP rtmp, TEMP rscratch, KILL cr);
  format %{ "vector_compress $dst, $src, $mask \t!using $xtmp, $rtmp, $rscratch and $perm as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ vector_compress_expand_avx2(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$XMMRegister, $rtmp$$Register,
                                   $rscratch$$Register, $perm$$XMMRegister, $xtmp$$XMMRegister, bt, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
#endif
instruct vcompress_expand_reg_evex(vec dst, vec src, kReg mask) %{
predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
match(Set dst (CompressV src mask));
match(Set dst (ExpandV src mask));
format %{ "vector_compress_expand $dst, $src, $mask" %}

View File

@ -0,0 +1,185 @@
/*
* Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
package org.openjdk.bench.jdk.incubator.vector;
import java.util.concurrent.TimeUnit;
import java.util.Random;
import jdk.incubator.vector.*;
import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.infra.Blackhole;
/**
 * JMH benchmark exercising VectorMask-driven {@code compress} on int, long,
 * float and double columns, i.e. the column-filter pattern accelerated by the
 * AVX2 compress/expand intrinsics.
 *
 * Fix: the scalar tail loops of the filter* benchmarks used
 * {@code i < endIndex} as their bound; after the vector loop {@code i} is
 * already {@code >= endIndex}, so the tails were dead code and elements in
 * {@code [endIndex, size)} were never filtered. The bound is now
 * {@code i < size}.
 */
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Thread)
@Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector", "-XX:UseAVX=2"})
public class ColumnFilterBenchmark {
    @Param({"1024", "2047", "4096"})
    int size;

    // Input/output columns and pivot value per element type.
    float [] floatinCol;
    float [] floatoutCol;
    float fpivot;

    double [] doubleinCol;
    double [] doubleoutCol;
    double dpivot;

    int [] intinCol;
    int [] intoutCol;
    int ipivot;

    long [] longinCol;
    long [] longoutCol;
    long lpivot;

    static final VectorSpecies<Float> fspecies = FloatVector.SPECIES_256;
    static final VectorSpecies<Double> dspecies = DoubleVector.SPECIES_256;
    static final VectorSpecies<Integer> ispecies = IntVector.SPECIES_256;
    static final VectorSpecies<Long> lspecies = LongVector.SPECIES_256;

    @Setup(Level.Trial)
    public void BmSetup() {
        Random r = new Random(2048);
        floatinCol = new float[size];
        floatoutCol = new float[size];
        fpivot = (float) (size / 2);
        doubleinCol = new double[size];
        doubleoutCol = new double[size];
        dpivot = (double) (size / 2);
        intinCol = new int[size];
        intoutCol = new int[size];
        ipivot = size / 2;
        longinCol = new long[size];
        longoutCol = new long[size];
        lpivot = size / 2;
        // NOTE(review): population starts at index 4, leaving the first four
        // entries at their zero default — presumably intentional; confirm.
        for (int i = 4; i < size; i++) {
            floatinCol[i] = r.nextFloat() * size;
            doubleinCol[i] = r.nextDouble() * size;
            intinCol[i] = r.nextInt(size);
            longinCol[i] = (long)intinCol[i];
        }
    }

    /** Compresses int lanes selected by a rolling synthetic mask. */
    @Benchmark
    public void fuzzyFilterIntColumn() {
        int i = 0;
        int j = 0;
        long maskctr = 1;
        int endIndex = ispecies.loopBound(size);
        for (; i < endIndex; i += ispecies.length()) {
            IntVector vec = IntVector.fromArray(ispecies, intinCol, i);
            VectorMask<Integer> pred = VectorMask.fromLong(ispecies, maskctr++);
            vec.compress(pred).intoArray(intoutCol, j);
            j += pred.trueCount();
        }
    }

    /** Compresses long lanes selected by a rolling synthetic mask. */
    @Benchmark
    public void fuzzyFilterLongColumn() {
        int i = 0;
        int j = 0;
        long maskctr = 1;
        int endIndex = lspecies.loopBound(size);
        for (; i < endIndex; i += lspecies.length()) {
            LongVector vec = LongVector.fromArray(lspecies, longinCol, i);
            VectorMask<Long> pred = VectorMask.fromLong(lspecies, maskctr++);
            vec.compress(pred).intoArray(longoutCol, j);
            j += pred.trueCount();
        }
    }

    /** Filters int values greater than the pivot into the output column. */
    @Benchmark
    public void filterIntColumn() {
        int i = 0;
        int j = 0;
        int endIndex = ispecies.loopBound(size);
        for (; i < endIndex; i += ispecies.length()) {
            IntVector vec = IntVector.fromArray(ispecies, intinCol, i);
            VectorMask<Integer> pred = vec.compare(VectorOperators.GT, ipivot);
            vec.compress(pred).intoArray(intoutCol, j);
            j += pred.trueCount();
        }
        // Scalar tail for the remaining (size - endIndex) elements.
        for (; i < size; i++) {
            if (intinCol[i] > ipivot) {
                intoutCol[j++] = intinCol[i];
            }
        }
    }

    /** Filters long values greater than the pivot into the output column. */
    @Benchmark
    public void filterLongColumn() {
        int i = 0;
        int j = 0;
        int endIndex = lspecies.loopBound(size);
        for (; i < endIndex; i += lspecies.length()) {
            LongVector vec = LongVector.fromArray(lspecies, longinCol, i);
            VectorMask<Long> pred = vec.compare(VectorOperators.GT, lpivot);
            vec.compress(pred).intoArray(longoutCol, j);
            j += pred.trueCount();
        }
        // Scalar tail for the remaining (size - endIndex) elements.
        for (; i < size; i++) {
            if (longinCol[i] > lpivot) {
                longoutCol[j++] = longinCol[i];
            }
        }
    }

    /** Filters float values greater than the pivot into the output column. */
    @Benchmark
    public void filterFloatColumn() {
        int i = 0;
        int j = 0;
        int endIndex = fspecies.loopBound(size);
        for (; i < endIndex; i += fspecies.length()) {
            FloatVector vec = FloatVector.fromArray(fspecies, floatinCol, i);
            VectorMask<Float> pred = vec.compare(VectorOperators.GT, fpivot);
            vec.compress(pred).intoArray(floatoutCol, j);
            j += pred.trueCount();
        }
        // Scalar tail for the remaining (size - endIndex) elements.
        for (; i < size; i++) {
            if (floatinCol[i] > fpivot) {
                floatoutCol[j++] = floatinCol[i];
            }
        }
    }

    /** Filters double values greater than the pivot into the output column. */
    @Benchmark
    public void filterDoubleColumn() {
        int i = 0;
        int j = 0;
        int endIndex = dspecies.loopBound(size);
        for (; i < endIndex; i += dspecies.length()) {
            DoubleVector vec = DoubleVector.fromArray(dspecies, doubleinCol, i);
            VectorMask<Double> pred = vec.compare(VectorOperators.GT, dpivot);
            vec.compress(pred).intoArray(doubleoutCol, j);
            j += pred.trueCount();
        }
        // Scalar tail for the remaining (size - endIndex) elements.
        for (; i < size; i++) {
            if (doubleinCol[i] > dpivot) {
                doubleoutCol[j++] = doubleinCol[i];
            }
        }
    }
}