8322768: Optimize non-subword vector compress and expand APIs for AVX2 target.

Reviewed-by: epeter, sviswanathan
This commit is contained in:
Jatin Bhateja 2024-01-25 10:07:50 +00:00
parent 9d1a6d1484
commit 6d36eb78ad
10 changed files with 365 additions and 19 deletions

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -816,8 +816,8 @@ private:
void check_relocation(RelocationHolder const& rspec, int format);
#endif
void emit_data(jint data, relocInfo::relocType rtype, int format);
void emit_data(jint data, RelocationHolder const& rspec, int format);
void emit_data(jint data, relocInfo::relocType rtype, int format = 0);
void emit_data(jint data, RelocationHolder const& rspec, int format = 0);
void emit_data64(jlong data, relocInfo::relocType rtype, int format = 0);
void emit_data64(jlong data, RelocationHolder const& rspec, int format = 0);

View File

@ -5282,6 +5282,42 @@ void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Regis
kmov(dst, rtmp2);
}
#ifdef _LP64
// Emulates masked vector compress/expand (Op_CompressV / Op_ExpandV) for
// 32-bit and 64-bit element vectors on AVX2-only targets, where the AVX-512
// vcompress/vexpand instructions are unavailable. The mask is materialized
// into an integer, scaled into an offset into a stub-generated permute table
// (32 bytes per row), and the resulting row drives a vpermps; lanes whose
// table entry is -1 are then zeroed via a blend with a zero vector.
// rtmp/rscratch are clobbered; permv/xtmp are temporaries.
void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
                                                    XMMRegister mask, Register rtmp, Register rscratch,
                                                    XMMRegister permv, XMMRegister xtmp, BasicType bt,
                                                    int vec_enc) {
  // Only non-subword (4- or 8-byte) element types are handled here.
  assert(type2aelembytes(bt) >= 4, "");
  assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
  address compress_perm_table = nullptr;
  address expand_perm_table = nullptr;
  // Select the permute table matching the element size and extract one mask
  // bit per lane into rtmp (vmovmskpd for quadword lanes, vmovmskps for
  // doubleword lanes).
  if (type2aelembytes(bt) == 8) {
    compress_perm_table = StubRoutines::x86::compress_perm_table64();
    expand_perm_table = StubRoutines::x86::expand_perm_table64();
    vmovmskpd(rtmp, mask, vec_enc);
  } else {
    compress_perm_table = StubRoutines::x86::compress_perm_table32();
    expand_perm_table = StubRoutines::x86::expand_perm_table32();
    vmovmskps(rtmp, mask, vec_enc);
  }
  shlq(rtmp, 5); // for 32 byte permute row.
  if (opcode == Op_CompressV) {
    lea(rscratch, ExternalAddress(compress_perm_table));
  } else {
    lea(rscratch, ExternalAddress(expand_perm_table));
  }
  // rtmp = table base + mask * 32: address of the permute row for this mask.
  addptr(rtmp, rscratch);
  vmovdqu(permv, Address(rtmp));
  // Move selected lanes into their compressed/expanded positions.
  vpermps(dst, permv, src, Assembler::AVX_256bit);
  vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with zero vector using permute mask, each column entry
  // in a permute table row contains either a valid permute index or a -1 (default)
  // value, this can potentially be used as a blending mask after
  // compressing/expanding the source vector lanes.
  vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv);
}
#endif
void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
bool merge, BasicType bt, int vec_enc) {
if (opcode == Op_CompressV) {

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -390,6 +390,10 @@ public:
void vector_round_float_avx(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4);
void vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, XMMRegister mask,
Register rtmp, Register rscratch, XMMRegister permv, XMMRegister xtmp,
BasicType bt, int vec_enc);
#endif // _LP64
void udivI(Register rax, Register divisor, Register rdx);

View File

@ -951,6 +951,92 @@ address StubGenerator::generate_fp_mask(const char *stub_name, int64_t mask) {
return start;
}
// Materializes the compression permute-index table used by the AVX2
// compress/expand fallback. Each table row is 32 bytes and is selected by the
// integer value of the vector mask; a row entry is either the source-lane
// index of a set mask bit (packed toward the least significant lanes) or -1,
// which later doubles as a zeroing blend mask.
address StubGenerator::generate_compress_perm_table(const char *stub_name, int32_t esize) {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", stub_name);
  address start = __ pc();

  if (esize == 32) {
    // 256 rows x 8 dwords: one row per possible 8-bit mask over dword lanes.
    for (int row = 0; row < 256; row++) {
      int emitted = 0;
      for (int lane = 0; lane < 8; lane++) {
        if ((row >> lane) & 1) {
          __ emit_data(lane, relocInfo::none);
          emitted++;
        }
      }
      // Pad the remainder of the row with -1 sentinels.
      while (emitted < 8) {
        __ emit_data(-1, relocInfo::none);
        emitted++;
      }
    }
  } else {
    assert(esize == 64, "");
    // 16 rows x 4 qwords: one row per possible 4-bit mask over qword lanes.
    // Each qword entry is emitted as a dword pair (2*lane, 2*lane+1) so the
    // same vpermps-based permute can be reused for 64-bit elements.
    for (int row = 0; row < 16; row++) {
      int emitted = 0;
      for (int lane = 0; lane < 4; lane++) {
        if ((row >> lane) & 1) {
          __ emit_data(2 * lane, relocInfo::none);
          __ emit_data(2 * lane + 1, relocInfo::none);
          emitted++;
        }
      }
      // Pad the remainder of the row with -1 qword sentinels.
      while (emitted < 4) {
        __ emit_data64(-1L, relocInfo::none);
        emitted++;
      }
    }
  }
  return start;
}
// Materializes the expansion permute-index table used by the AVX2
// compress/expand fallback. Each 32-byte row is selected by the integer value
// of the vector mask; a lane whose mask bit is set receives the next
// consecutive source-lane index (starting from the least significant lane),
// every other lane receives -1, which later doubles as a zeroing blend mask.
address StubGenerator::generate_expand_perm_table(const char *stub_name, int32_t esize) {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", stub_name);
  address start = __ pc();

  if (esize == 32) {
    // 256 rows x 8 dwords: one row per possible 8-bit mask over dword lanes.
    for (int row = 0; row < 256; row++) {
      int next = 0;
      for (int lane = 0; lane < 8; lane++) {
        __ emit_data(((row >> lane) & 1) ? next++ : -1, relocInfo::none);
      }
    }
  } else {
    assert(esize == 64, "");
    // 16 rows x 4 qwords: one row per possible 4-bit mask over qword lanes.
    // A selected qword entry is emitted as a dword pair (2*next, 2*next+1) so
    // the same vpermps-based permute can be reused for 64-bit elements.
    for (int row = 0; row < 16; row++) {
      int next = 0;
      for (int lane = 0; lane < 4; lane++) {
        if ((row >> lane) & 1) {
          __ emit_data(2 * next, relocInfo::none);
          __ emit_data(2 * next + 1, relocInfo::none);
          next++;
        } else {
          __ emit_data64(-1L, relocInfo::none);
        }
      }
    }
  }
  return start;
}
address StubGenerator::generate_vector_mask(const char *stub_name, int64_t mask) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", stub_name);
@ -4095,6 +4181,13 @@ void StubGenerator::generate_compiler_stubs() {
StubRoutines::x86::_vector_reverse_byte_perm_mask_int = generate_vector_reverse_byte_perm_mask_int("perm_mask_int");
StubRoutines::x86::_vector_reverse_byte_perm_mask_short = generate_vector_reverse_byte_perm_mask_short("perm_mask_short");
if (VM_Version::supports_avx2() && !VM_Version::supports_avx512vl()) {
StubRoutines::x86::_compress_perm_table32 = generate_compress_perm_table("compress_perm_table32", 32);
StubRoutines::x86::_compress_perm_table64 = generate_compress_perm_table("compress_perm_table64", 64);
StubRoutines::x86::_expand_perm_table32 = generate_expand_perm_table("expand_perm_table32", 32);
StubRoutines::x86::_expand_perm_table64 = generate_expand_perm_table("expand_perm_table64", 64);
}
if (VM_Version::supports_avx2() && !VM_Version::supports_avx512_vpopcntdq()) {
// lut implementation influenced by counting 1s algorithm from section 5-1 of Hackers' Delight.
StubRoutines::x86::_vector_popcount_lut = generate_popcount_avx_lut("popcount_lut");

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -99,6 +99,10 @@ class StubGenerator: public StubCodeGenerator {
address generate_fp_mask(const char *stub_name, int64_t mask);
address generate_compress_perm_table(const char *stub_name, int32_t esize);
address generate_expand_perm_table(const char *stub_name, int32_t esize);
address generate_vector_mask(const char *stub_name, int64_t mask);
address generate_vector_byte_perm_mask(const char *stub_name);

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2013, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -82,6 +82,10 @@ address StubRoutines::x86::_join_0_1_base64 = nullptr;
address StubRoutines::x86::_join_1_2_base64 = nullptr;
address StubRoutines::x86::_join_2_3_base64 = nullptr;
address StubRoutines::x86::_decoding_table_base64 = nullptr;
address StubRoutines::x86::_compress_perm_table32 = nullptr;
address StubRoutines::x86::_compress_perm_table64 = nullptr;
address StubRoutines::x86::_expand_perm_table32 = nullptr;
address StubRoutines::x86::_expand_perm_table64 = nullptr;
#endif
address StubRoutines::x86::_pshuffle_byte_flip_mask_addr = nullptr;

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2013, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -37,7 +37,7 @@ enum platform_dependent_constants {
_continuation_stubs_code_size = 1000 LP64_ONLY(+1000),
// AVX512 intrinsics add more code in 64-bit VM,
// Windows have more code to save/restore registers
_compiler_stubs_code_size = 20000 LP64_ONLY(+32000) WINDOWS_ONLY(+2000),
_compiler_stubs_code_size = 20000 LP64_ONLY(+39000) WINDOWS_ONLY(+2000),
_final_stubs_code_size = 10000 LP64_ONLY(+20000) WINDOWS_ONLY(+2000) ZGC_ONLY(+20000)
};
@ -58,6 +58,10 @@ class x86 {
static address _float_sign_flip;
static address _double_sign_mask;
static address _double_sign_flip;
static address _compress_perm_table32;
static address _compress_perm_table64;
static address _expand_perm_table32;
static address _expand_perm_table64;
public:
@ -338,6 +342,10 @@ class x86 {
static address base64_decoding_table_addr() { return _decoding_table_base64; }
static address base64_AVX2_decode_tables_addr() { return _avx2_decode_tables_base64; }
static address base64_AVX2_decode_LUT_tables_addr() { return _avx2_decode_lut_tables_base64; }
static address compress_perm_table32() { return _compress_perm_table32; }
static address compress_perm_table64() { return _compress_perm_table64; }
static address expand_perm_table32() { return _expand_perm_table32; }
static address expand_perm_table64() { return _expand_perm_table64; }
#endif
static address pshuffle_byte_flip_mask_addr() { return _pshuffle_byte_flip_mask_addr; }
static address arrays_hashcode_powers_of_31() { return (address)_arrays_hashcode_powers_of_31; }

View File

@ -44,4 +44,3 @@ address StubRoutines::x86::_float_sign_mask = nullptr;
address StubRoutines::x86::_float_sign_flip = nullptr;
address StubRoutines::x86::_double_sign_mask = nullptr;
address StubRoutines::x86::_double_sign_flip = nullptr;

View File

@ -1425,6 +1425,8 @@ bool Matcher::match_rule_supported(int opcode) {
return false;
}
break;
case Op_CompressV:
case Op_ExpandV:
case Op_PopCountVL:
if (UseAVX < 2) {
return false;
@ -1659,12 +1661,6 @@ bool Matcher::match_rule_supported(int opcode) {
return false;
}
break;
case Op_CompressV:
case Op_ExpandV:
if (!VM_Version::supports_avx512vl()) {
return false;
}
break;
case Op_SqrtF:
if (UseSSE < 1) {
return false;
@ -1952,13 +1948,12 @@ bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
if (is_subword_type(bt) && !VM_Version::supports_avx512_vbmi2()) {
return false;
}
if (!is_LP64 && !VM_Version::supports_avx512vl() && size_in_bits < 512) {
return false;
}
if (size_in_bits < 128 ) {
return false;
}
if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
return false;
}
break;
case Op_VectorLongToMask:
if (UseAVX < 1 || !is_LP64) {
return false;
@ -9178,8 +9173,26 @@ instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp,
%}
// --------------------------------- Compress/Expand Operations ---------------------------
#ifdef _LP64
// AVX2 fallback for CompressV/ExpandV on vectors of up to 32 bytes when the
// AVX512VL masked compress/expand forms are unavailable. Delegates to
// C2_MacroAssembler::vector_compress_expand_avx2, which indexes
// stub-generated permute tables with the vector mask.
instruct vcompress_reg_avx(vec dst, vec src, vec mask, rRegI rtmp, rRegL rscratch, vec perm, vec xtmp, rFlagsReg cr) %{
  predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
  match(Set dst (CompressV src mask));
  match(Set dst (ExpandV src mask));
  effect(TEMP_DEF dst, TEMP perm, TEMP xtmp, TEMP rtmp, TEMP rscratch, KILL cr);
  format %{ "vector_compress $dst, $src, $mask \t!using $xtmp, $rtmp, $rscratch and $perm as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ vector_compress_expand_avx2(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$XMMRegister, $rtmp$$Register,
                                   $rscratch$$Register, $perm$$XMMRegister, $xtmp$$XMMRegister, bt, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
#endif
instruct vcompress_expand_reg_evex(vec dst, vec src, kReg mask) %{
predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
match(Set dst (CompressV src mask));
match(Set dst (ExpandV src mask));
format %{ "vector_compress_expand $dst, $src, $mask" %}

View File

@ -0,0 +1,185 @@
/*
* Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
package org.openjdk.bench.jdk.incubator.vector;
import java.util.concurrent.TimeUnit;
import java.util.Random;
import jdk.incubator.vector.*;
import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.infra.Blackhole;
/**
 * JMH benchmark exercising VectorMask-driven {@code compress} on int, long,
 * float and double columns, i.e. the column-filter pattern accelerated by the
 * AVX2 compress/expand intrinsics.
 *
 * Fix: the scalar tail loops of the filter* benchmarks used
 * {@code i < endIndex} as their bound; after the vector loop {@code i} is
 * already {@code >= endIndex}, so the tails were dead code and elements in
 * {@code [endIndex, size)} were never filtered. The bound is now
 * {@code i < size}.
 */
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Thread)
@Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector", "-XX:UseAVX=2"})
public class ColumnFilterBenchmark {
    @Param({"1024", "2047", "4096"})
    int size;

    // Input/output columns and pivot value per element type.
    float [] floatinCol;
    float [] floatoutCol;
    float fpivot;

    double [] doubleinCol;
    double [] doubleoutCol;
    double dpivot;

    int [] intinCol;
    int [] intoutCol;
    int ipivot;

    long [] longinCol;
    long [] longoutCol;
    long lpivot;

    static final VectorSpecies<Float> fspecies = FloatVector.SPECIES_256;
    static final VectorSpecies<Double> dspecies = DoubleVector.SPECIES_256;
    static final VectorSpecies<Integer> ispecies = IntVector.SPECIES_256;
    static final VectorSpecies<Long> lspecies = LongVector.SPECIES_256;

    @Setup(Level.Trial)
    public void BmSetup() {
        Random r = new Random(2048);
        floatinCol = new float[size];
        floatoutCol = new float[size];
        fpivot = (float) (size / 2);
        doubleinCol = new double[size];
        doubleoutCol = new double[size];
        dpivot = (double) (size / 2);
        intinCol = new int[size];
        intoutCol = new int[size];
        ipivot = size / 2;
        longinCol = new long[size];
        longoutCol = new long[size];
        lpivot = size / 2;
        // NOTE(review): population starts at index 4, leaving the first four
        // entries at their zero default — presumably intentional; confirm.
        for (int i = 4; i < size; i++) {
            floatinCol[i] = r.nextFloat() * size;
            doubleinCol[i] = r.nextDouble() * size;
            intinCol[i] = r.nextInt(size);
            longinCol[i] = (long)intinCol[i];
        }
    }

    /** Compresses int lanes selected by a rolling synthetic mask. */
    @Benchmark
    public void fuzzyFilterIntColumn() {
        int i = 0;
        int j = 0;
        long maskctr = 1;
        int endIndex = ispecies.loopBound(size);
        for (; i < endIndex; i += ispecies.length()) {
            IntVector vec = IntVector.fromArray(ispecies, intinCol, i);
            VectorMask<Integer> pred = VectorMask.fromLong(ispecies, maskctr++);
            vec.compress(pred).intoArray(intoutCol, j);
            j += pred.trueCount();
        }
    }

    /** Compresses long lanes selected by a rolling synthetic mask. */
    @Benchmark
    public void fuzzyFilterLongColumn() {
        int i = 0;
        int j = 0;
        long maskctr = 1;
        int endIndex = lspecies.loopBound(size);
        for (; i < endIndex; i += lspecies.length()) {
            LongVector vec = LongVector.fromArray(lspecies, longinCol, i);
            VectorMask<Long> pred = VectorMask.fromLong(lspecies, maskctr++);
            vec.compress(pred).intoArray(longoutCol, j);
            j += pred.trueCount();
        }
    }

    /** Filters int values greater than the pivot into the output column. */
    @Benchmark
    public void filterIntColumn() {
        int i = 0;
        int j = 0;
        int endIndex = ispecies.loopBound(size);
        for (; i < endIndex; i += ispecies.length()) {
            IntVector vec = IntVector.fromArray(ispecies, intinCol, i);
            VectorMask<Integer> pred = vec.compare(VectorOperators.GT, ipivot);
            vec.compress(pred).intoArray(intoutCol, j);
            j += pred.trueCount();
        }
        // Scalar tail for the remaining (size - endIndex) elements.
        for (; i < size; i++) {
            if (intinCol[i] > ipivot) {
                intoutCol[j++] = intinCol[i];
            }
        }
    }

    /** Filters long values greater than the pivot into the output column. */
    @Benchmark
    public void filterLongColumn() {
        int i = 0;
        int j = 0;
        int endIndex = lspecies.loopBound(size);
        for (; i < endIndex; i += lspecies.length()) {
            LongVector vec = LongVector.fromArray(lspecies, longinCol, i);
            VectorMask<Long> pred = vec.compare(VectorOperators.GT, lpivot);
            vec.compress(pred).intoArray(longoutCol, j);
            j += pred.trueCount();
        }
        // Scalar tail for the remaining (size - endIndex) elements.
        for (; i < size; i++) {
            if (longinCol[i] > lpivot) {
                longoutCol[j++] = longinCol[i];
            }
        }
    }

    /** Filters float values greater than the pivot into the output column. */
    @Benchmark
    public void filterFloatColumn() {
        int i = 0;
        int j = 0;
        int endIndex = fspecies.loopBound(size);
        for (; i < endIndex; i += fspecies.length()) {
            FloatVector vec = FloatVector.fromArray(fspecies, floatinCol, i);
            VectorMask<Float> pred = vec.compare(VectorOperators.GT, fpivot);
            vec.compress(pred).intoArray(floatoutCol, j);
            j += pred.trueCount();
        }
        // Scalar tail for the remaining (size - endIndex) elements.
        for (; i < size; i++) {
            if (floatinCol[i] > fpivot) {
                floatoutCol[j++] = floatinCol[i];
            }
        }
    }

    /** Filters double values greater than the pivot into the output column. */
    @Benchmark
    public void filterDoubleColumn() {
        int i = 0;
        int j = 0;
        int endIndex = dspecies.loopBound(size);
        for (; i < endIndex; i += dspecies.length()) {
            DoubleVector vec = DoubleVector.fromArray(dspecies, doubleinCol, i);
            VectorMask<Double> pred = vec.compare(VectorOperators.GT, dpivot);
            vec.compress(pred).intoArray(doubleoutCol, j);
            j += pred.trueCount();
        }
        // Scalar tail for the remaining (size - endIndex) elements.
        for (; i < size; i++) {
            if (doubleinCol[i] > dpivot) {
                doubleoutCol[j++] = doubleinCol[i];
            }
        }
    }
}