8322768: Optimize non-subword vector compress and expand APIs for AVX2 target.
Reviewed-by: epeter, sviswanathan
This commit is contained in:
parent
9d1a6d1484
commit
6d36eb78ad
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -816,8 +816,8 @@ private:
|
||||
void check_relocation(RelocationHolder const& rspec, int format);
|
||||
#endif
|
||||
|
||||
void emit_data(jint data, relocInfo::relocType rtype, int format);
|
||||
void emit_data(jint data, RelocationHolder const& rspec, int format);
|
||||
void emit_data(jint data, relocInfo::relocType rtype, int format = 0);
|
||||
void emit_data(jint data, RelocationHolder const& rspec, int format = 0);
|
||||
void emit_data64(jlong data, relocInfo::relocType rtype, int format = 0);
|
||||
void emit_data64(jlong data, RelocationHolder const& rspec, int format = 0);
|
||||
|
||||
|
@ -5282,6 +5282,42 @@ void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Regis
|
||||
kmov(dst, rtmp2);
|
||||
}
|
||||
|
||||
#ifdef _LP64
|
||||
void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
|
||||
XMMRegister mask, Register rtmp, Register rscratch,
|
||||
XMMRegister permv, XMMRegister xtmp, BasicType bt,
|
||||
int vec_enc) {
|
||||
assert(type2aelembytes(bt) >= 4, "");
|
||||
assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
|
||||
address compress_perm_table = nullptr;
|
||||
address expand_perm_table = nullptr;
|
||||
if (type2aelembytes(bt) == 8) {
|
||||
compress_perm_table = StubRoutines::x86::compress_perm_table64();
|
||||
expand_perm_table = StubRoutines::x86::expand_perm_table64();
|
||||
vmovmskpd(rtmp, mask, vec_enc);
|
||||
} else {
|
||||
compress_perm_table = StubRoutines::x86::compress_perm_table32();
|
||||
expand_perm_table = StubRoutines::x86::expand_perm_table32();
|
||||
vmovmskps(rtmp, mask, vec_enc);
|
||||
}
|
||||
shlq(rtmp, 5); // for 32 byte permute row.
|
||||
if (opcode == Op_CompressV) {
|
||||
lea(rscratch, ExternalAddress(compress_perm_table));
|
||||
} else {
|
||||
lea(rscratch, ExternalAddress(expand_perm_table));
|
||||
}
|
||||
addptr(rtmp, rscratch);
|
||||
vmovdqu(permv, Address(rtmp));
|
||||
vpermps(dst, permv, src, Assembler::AVX_256bit);
|
||||
vpxor(xtmp, xtmp, xtmp, vec_enc);
|
||||
// Blend the result with zero vector using permute mask, each column entry
|
||||
// in a permute table row contains either a valid permute index or a -1 (default)
|
||||
// value, this can potentially be used as a blending mask after
|
||||
// compressing/expanding the source vector lanes.
|
||||
vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv);
|
||||
}
|
||||
#endif
|
||||
|
||||
void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
|
||||
bool merge, BasicType bt, int vec_enc) {
|
||||
if (opcode == Op_CompressV) {
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -390,6 +390,10 @@ public:
|
||||
|
||||
void vector_round_float_avx(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
|
||||
Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4);
|
||||
|
||||
void vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, XMMRegister mask,
|
||||
Register rtmp, Register rscratch, XMMRegister permv, XMMRegister xtmp,
|
||||
BasicType bt, int vec_enc);
|
||||
#endif // _LP64
|
||||
|
||||
void udivI(Register rax, Register divisor, Register rdx);
|
||||
|
@ -951,6 +951,92 @@ address StubGenerator::generate_fp_mask(const char *stub_name, int64_t mask) {
|
||||
return start;
|
||||
}
|
||||
|
||||
address StubGenerator::generate_compress_perm_table(const char *stub_name, int32_t esize) {
|
||||
__ align(CodeEntryAlignment);
|
||||
StubCodeMark mark(this, "StubRoutines", stub_name);
|
||||
address start = __ pc();
|
||||
if (esize == 32) {
|
||||
// Loop to generate 256 x 8 int compression permute index table. A row is
|
||||
// accessed using 8 bit index computed using vector mask. An entry in
|
||||
// a row holds either a valid permute index corresponding to set bit position
|
||||
// or a -1 (default) value.
|
||||
for (int mask = 0; mask < 256; mask++) {
|
||||
int ctr = 0;
|
||||
for (int j = 0; j < 8; j++) {
|
||||
if (mask & (1 << j)) {
|
||||
__ emit_data(j, relocInfo::none);
|
||||
ctr++;
|
||||
}
|
||||
}
|
||||
for (; ctr < 8; ctr++) {
|
||||
__ emit_data(-1, relocInfo::none);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
assert(esize == 64, "");
|
||||
// Loop to generate 16 x 4 long compression permute index table. A row is
|
||||
// accessed using 4 bit index computed using vector mask. An entry in
|
||||
// a row holds either a valid permute index pair for a quadword corresponding
|
||||
// to set bit position or a -1 (default) value.
|
||||
for (int mask = 0; mask < 16; mask++) {
|
||||
int ctr = 0;
|
||||
for (int j = 0; j < 4; j++) {
|
||||
if (mask & (1 << j)) {
|
||||
__ emit_data(2 * j, relocInfo::none);
|
||||
__ emit_data(2 * j + 1, relocInfo::none);
|
||||
ctr++;
|
||||
}
|
||||
}
|
||||
for (; ctr < 4; ctr++) {
|
||||
__ emit_data64(-1L, relocInfo::none);
|
||||
}
|
||||
}
|
||||
}
|
||||
return start;
|
||||
}
|
||||
|
||||
address StubGenerator::generate_expand_perm_table(const char *stub_name, int32_t esize) {
|
||||
__ align(CodeEntryAlignment);
|
||||
StubCodeMark mark(this, "StubRoutines", stub_name);
|
||||
address start = __ pc();
|
||||
if (esize == 32) {
|
||||
// Loop to generate 256 x 8 int expand permute index table. A row is accessed
|
||||
// using 8 bit index computed using vector mask. An entry in a row holds either
|
||||
// a valid permute index (starting from least significant lane) placed at poisition
|
||||
// corresponding to set bit position or a -1 (default) value.
|
||||
for (int mask = 0; mask < 256; mask++) {
|
||||
int ctr = 0;
|
||||
for (int j = 0; j < 8; j++) {
|
||||
if (mask & (1 << j)) {
|
||||
__ emit_data(ctr++, relocInfo::none);
|
||||
} else {
|
||||
__ emit_data(-1, relocInfo::none);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
assert(esize == 64, "");
|
||||
// Loop to generate 16 x 4 long expand permute index table. A row is accessed
|
||||
// using 4 bit index computed using vector mask. An entry in a row holds either
|
||||
// a valid doubleword permute index pair representing a quadword index (starting
|
||||
// from least significant lane) placed at poisition corresponding to set bit
|
||||
// position or a -1 (default) value.
|
||||
for (int mask = 0; mask < 16; mask++) {
|
||||
int ctr = 0;
|
||||
for (int j = 0; j < 4; j++) {
|
||||
if (mask & (1 << j)) {
|
||||
__ emit_data(2 * ctr, relocInfo::none);
|
||||
__ emit_data(2 * ctr + 1, relocInfo::none);
|
||||
ctr++;
|
||||
} else {
|
||||
__ emit_data64(-1L, relocInfo::none);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return start;
|
||||
}
|
||||
|
||||
address StubGenerator::generate_vector_mask(const char *stub_name, int64_t mask) {
|
||||
__ align(CodeEntryAlignment);
|
||||
StubCodeMark mark(this, "StubRoutines", stub_name);
|
||||
@ -4095,6 +4181,13 @@ void StubGenerator::generate_compiler_stubs() {
|
||||
StubRoutines::x86::_vector_reverse_byte_perm_mask_int = generate_vector_reverse_byte_perm_mask_int("perm_mask_int");
|
||||
StubRoutines::x86::_vector_reverse_byte_perm_mask_short = generate_vector_reverse_byte_perm_mask_short("perm_mask_short");
|
||||
|
||||
if (VM_Version::supports_avx2() && !VM_Version::supports_avx512vl()) {
|
||||
StubRoutines::x86::_compress_perm_table32 = generate_compress_perm_table("compress_perm_table32", 32);
|
||||
StubRoutines::x86::_compress_perm_table64 = generate_compress_perm_table("compress_perm_table64", 64);
|
||||
StubRoutines::x86::_expand_perm_table32 = generate_expand_perm_table("expand_perm_table32", 32);
|
||||
StubRoutines::x86::_expand_perm_table64 = generate_expand_perm_table("expand_perm_table64", 64);
|
||||
}
|
||||
|
||||
if (VM_Version::supports_avx2() && !VM_Version::supports_avx512_vpopcntdq()) {
|
||||
// lut implementation influenced by counting 1s algorithm from section 5-1 of Hackers' Delight.
|
||||
StubRoutines::x86::_vector_popcount_lut = generate_popcount_avx_lut("popcount_lut");
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -99,6 +99,10 @@ class StubGenerator: public StubCodeGenerator {
|
||||
|
||||
address generate_fp_mask(const char *stub_name, int64_t mask);
|
||||
|
||||
address generate_compress_perm_table(const char *stub_name, int32_t esize);
|
||||
|
||||
address generate_expand_perm_table(const char *stub_name, int32_t esize);
|
||||
|
||||
address generate_vector_mask(const char *stub_name, int64_t mask);
|
||||
|
||||
address generate_vector_byte_perm_mask(const char *stub_name);
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2013, 2023, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2013, 2024, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -82,6 +82,10 @@ address StubRoutines::x86::_join_0_1_base64 = nullptr;
|
||||
address StubRoutines::x86::_join_1_2_base64 = nullptr;
|
||||
address StubRoutines::x86::_join_2_3_base64 = nullptr;
|
||||
address StubRoutines::x86::_decoding_table_base64 = nullptr;
|
||||
address StubRoutines::x86::_compress_perm_table32 = nullptr;
|
||||
address StubRoutines::x86::_compress_perm_table64 = nullptr;
|
||||
address StubRoutines::x86::_expand_perm_table32 = nullptr;
|
||||
address StubRoutines::x86::_expand_perm_table64 = nullptr;
|
||||
#endif
|
||||
address StubRoutines::x86::_pshuffle_byte_flip_mask_addr = nullptr;
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2013, 2023, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2013, 2024, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -37,7 +37,7 @@ enum platform_dependent_constants {
|
||||
_continuation_stubs_code_size = 1000 LP64_ONLY(+1000),
|
||||
// AVX512 intrinsics add more code in 64-bit VM,
|
||||
// Windows has more code to save/restore registers
|
||||
_compiler_stubs_code_size = 20000 LP64_ONLY(+32000) WINDOWS_ONLY(+2000),
|
||||
_compiler_stubs_code_size = 20000 LP64_ONLY(+39000) WINDOWS_ONLY(+2000),
|
||||
_final_stubs_code_size = 10000 LP64_ONLY(+20000) WINDOWS_ONLY(+2000) ZGC_ONLY(+20000)
|
||||
};
|
||||
|
||||
@ -58,6 +58,10 @@ class x86 {
|
||||
static address _float_sign_flip;
|
||||
static address _double_sign_mask;
|
||||
static address _double_sign_flip;
|
||||
static address _compress_perm_table32;
|
||||
static address _compress_perm_table64;
|
||||
static address _expand_perm_table32;
|
||||
static address _expand_perm_table64;
|
||||
|
||||
public:
|
||||
|
||||
@ -338,6 +342,10 @@ class x86 {
|
||||
static address base64_decoding_table_addr() { return _decoding_table_base64; }
|
||||
static address base64_AVX2_decode_tables_addr() { return _avx2_decode_tables_base64; }
|
||||
static address base64_AVX2_decode_LUT_tables_addr() { return _avx2_decode_lut_tables_base64; }
|
||||
static address compress_perm_table32() { return _compress_perm_table32; }
|
||||
static address compress_perm_table64() { return _compress_perm_table64; }
|
||||
static address expand_perm_table32() { return _expand_perm_table32; }
|
||||
static address expand_perm_table64() { return _expand_perm_table64; }
|
||||
#endif
|
||||
static address pshuffle_byte_flip_mask_addr() { return _pshuffle_byte_flip_mask_addr; }
|
||||
static address arrays_hashcode_powers_of_31() { return (address)_arrays_hashcode_powers_of_31; }
|
||||
|
@ -44,4 +44,3 @@ address StubRoutines::x86::_float_sign_mask = nullptr;
|
||||
address StubRoutines::x86::_float_sign_flip = nullptr;
|
||||
address StubRoutines::x86::_double_sign_mask = nullptr;
|
||||
address StubRoutines::x86::_double_sign_flip = nullptr;
|
||||
|
||||
|
@ -1425,6 +1425,8 @@ bool Matcher::match_rule_supported(int opcode) {
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case Op_CompressV:
|
||||
case Op_ExpandV:
|
||||
case Op_PopCountVL:
|
||||
if (UseAVX < 2) {
|
||||
return false;
|
||||
@ -1659,12 +1661,6 @@ bool Matcher::match_rule_supported(int opcode) {
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case Op_CompressV:
|
||||
case Op_ExpandV:
|
||||
if (!VM_Version::supports_avx512vl()) {
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case Op_SqrtF:
|
||||
if (UseSSE < 1) {
|
||||
return false;
|
||||
@ -1952,13 +1948,12 @@ bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
|
||||
if (is_subword_type(bt) && !VM_Version::supports_avx512_vbmi2()) {
|
||||
return false;
|
||||
}
|
||||
if (!is_LP64 && !VM_Version::supports_avx512vl() && size_in_bits < 512) {
|
||||
return false;
|
||||
}
|
||||
if (size_in_bits < 128 ) {
|
||||
return false;
|
||||
}
|
||||
if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case Op_VectorLongToMask:
|
||||
if (UseAVX < 1 || !is_LP64) {
|
||||
return false;
|
||||
@ -9178,8 +9173,26 @@ instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp,
|
||||
%}
|
||||
|
||||
// --------------------------------- Compress/Expand Operations ---------------------------
|
||||
#ifdef _LP64
|
||||
instruct vcompress_reg_avx(vec dst, vec src, vec mask, rRegI rtmp, rRegL rscratch, vec perm, vec xtmp, rFlagsReg cr) %{
|
||||
predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
|
||||
match(Set dst (CompressV src mask));
|
||||
match(Set dst (ExpandV src mask));
|
||||
effect(TEMP_DEF dst, TEMP perm, TEMP xtmp, TEMP rtmp, TEMP rscratch, KILL cr);
|
||||
format %{ "vector_compress $dst, $src, $mask \t!using $xtmp, $rtmp, $rscratch and $perm as TEMP" %}
|
||||
ins_encode %{
|
||||
int opcode = this->ideal_Opcode();
|
||||
int vlen_enc = vector_length_encoding(this);
|
||||
BasicType bt = Matcher::vector_element_basic_type(this);
|
||||
__ vector_compress_expand_avx2(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$XMMRegister, $rtmp$$Register,
|
||||
$rscratch$$Register, $perm$$XMMRegister, $xtmp$$XMMRegister, bt, vlen_enc);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
#endif
|
||||
|
||||
instruct vcompress_expand_reg_evex(vec dst, vec src, kReg mask) %{
|
||||
predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
|
||||
match(Set dst (CompressV src mask));
|
||||
match(Set dst (ExpandV src mask));
|
||||
format %{ "vector_compress_expand $dst, $src, $mask" %}
|
||||
|
@ -0,0 +1,185 @@
|
||||
/*
|
||||
* Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*
|
||||
*/
|
||||
|
||||
package org.openjdk.bench.jdk.incubator.vector;
|
||||
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.Random;
|
||||
import jdk.incubator.vector.*;
|
||||
import org.openjdk.jmh.annotations.*;
|
||||
import org.openjdk.jmh.infra.Blackhole;
|
||||
|
||||
@OutputTimeUnit(TimeUnit.MILLISECONDS)
|
||||
@State(Scope.Thread)
|
||||
@Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector", "-XX:UseAVX=2"})
|
||||
public class ColumnFilterBenchmark {
|
||||
@Param({"1024", "2047", "4096"})
|
||||
int size;
|
||||
|
||||
float [] floatinCol;
|
||||
float [] floatoutCol;
|
||||
float fpivot;
|
||||
|
||||
double [] doubleinCol;
|
||||
double [] doubleoutCol;
|
||||
double dpivot;
|
||||
|
||||
int [] intinCol;
|
||||
int [] intoutCol;
|
||||
int ipivot;
|
||||
|
||||
long [] longinCol;
|
||||
long [] longoutCol;
|
||||
long lpivot;
|
||||
|
||||
static final VectorSpecies<Float> fspecies = FloatVector.SPECIES_256;
|
||||
static final VectorSpecies<Double> dspecies = DoubleVector.SPECIES_256;
|
||||
static final VectorSpecies<Integer> ispecies = IntVector.SPECIES_256;
|
||||
static final VectorSpecies<Long> lspecies = LongVector.SPECIES_256;
|
||||
|
||||
@Setup(Level.Trial)
|
||||
public void BmSetup() {
|
||||
Random r = new Random(2048);
|
||||
|
||||
floatinCol = new float[size];
|
||||
floatoutCol = new float[size];
|
||||
fpivot = (float) (size / 2);
|
||||
doubleinCol = new double[size];
|
||||
doubleoutCol = new double[size];
|
||||
dpivot = (double) (size / 2);
|
||||
intinCol = new int[size];
|
||||
intoutCol = new int[size];
|
||||
ipivot = size / 2;
|
||||
longinCol = new long[size];
|
||||
longoutCol = new long[size];
|
||||
lpivot = size / 2;
|
||||
|
||||
for (int i = 4; i < size; i++) {
|
||||
floatinCol[i] = r.nextFloat() * size;
|
||||
doubleinCol[i] = r.nextDouble() * size;
|
||||
intinCol[i] = r.nextInt(size);
|
||||
longinCol[i] = (long)intinCol[i];
|
||||
}
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void fuzzyFilterIntColumn() {
|
||||
int i = 0;
|
||||
int j = 0;
|
||||
long maskctr = 1;
|
||||
int endIndex = ispecies.loopBound(size);
|
||||
for (; i < endIndex; i += ispecies.length()) {
|
||||
IntVector vec = IntVector.fromArray(ispecies, intinCol, i);
|
||||
VectorMask<Integer> pred = VectorMask.fromLong(ispecies, maskctr++);
|
||||
vec.compress(pred).intoArray(intoutCol, j);
|
||||
j += pred.trueCount();
|
||||
}
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void fuzzyFilterLongColumn() {
|
||||
int i = 0;
|
||||
int j = 0;
|
||||
long maskctr = 1;
|
||||
int endIndex = lspecies.loopBound(size);
|
||||
for (; i < endIndex; i += lspecies.length()) {
|
||||
LongVector vec = LongVector.fromArray(lspecies, longinCol, i);
|
||||
VectorMask<Long> pred = VectorMask.fromLong(lspecies, maskctr++);
|
||||
vec.compress(pred).intoArray(longoutCol, j);
|
||||
j += pred.trueCount();
|
||||
}
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void filterIntColumn() {
|
||||
int i = 0;
|
||||
int j = 0;
|
||||
int endIndex = ispecies.loopBound(size);
|
||||
for (; i < endIndex; i += ispecies.length()) {
|
||||
IntVector vec = IntVector.fromArray(ispecies, intinCol, i);
|
||||
VectorMask<Integer> pred = vec.compare(VectorOperators.GT, ipivot);
|
||||
vec.compress(pred).intoArray(intoutCol, j);
|
||||
j += pred.trueCount();
|
||||
}
|
||||
for (; i < endIndex; i++) {
|
||||
if (intinCol[i] > ipivot) {
|
||||
intoutCol[j++] = intinCol[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void filterLongColumn() {
|
||||
int i = 0;
|
||||
int j = 0;
|
||||
int endIndex = lspecies.loopBound(size);
|
||||
for (; i < endIndex; i += lspecies.length()) {
|
||||
LongVector vec = LongVector.fromArray(lspecies, longinCol, i);
|
||||
VectorMask<Long> pred = vec.compare(VectorOperators.GT, lpivot);
|
||||
vec.compress(pred).intoArray(longoutCol, j);
|
||||
j += pred.trueCount();
|
||||
}
|
||||
for (; i < endIndex; i++) {
|
||||
if (longinCol[i] > lpivot) {
|
||||
longoutCol[j++] = longinCol[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void filterFloatColumn() {
|
||||
int i = 0;
|
||||
int j = 0;
|
||||
int endIndex = fspecies.loopBound(size);
|
||||
for (; i < endIndex; i += fspecies.length()) {
|
||||
FloatVector vec = FloatVector.fromArray(fspecies, floatinCol, i);
|
||||
VectorMask<Float> pred = vec.compare(VectorOperators.GT, fpivot);
|
||||
vec.compress(pred).intoArray(floatoutCol, j);
|
||||
j += pred.trueCount();
|
||||
}
|
||||
for (; i < endIndex; i++) {
|
||||
if (floatinCol[i] > fpivot) {
|
||||
floatoutCol[j++] = floatinCol[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
public void filterDoubleColumn() {
|
||||
int i = 0;
|
||||
int j = 0;
|
||||
int endIndex = dspecies.loopBound(size);
|
||||
for (; i < endIndex; i += dspecies.length()) {
|
||||
DoubleVector vec = DoubleVector.fromArray(dspecies, doubleinCol, i);
|
||||
VectorMask<Double> pred = vec.compare(VectorOperators.GT, dpivot);
|
||||
vec.compress(pred).intoArray(doubleoutCol, j);
|
||||
j += pred.trueCount();
|
||||
}
|
||||
for (; i < endIndex; i++) {
|
||||
if (doubleinCol[i] > dpivot) {
|
||||
doubleoutCol[j++] = doubleinCol[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user