From cb7875d57db652cd49cdc09a92d2c1be2b5ec66a Mon Sep 17 00:00:00 2001
From: Hamlin Li
Date: Tue, 14 Nov 2023 09:59:08 +0000
Subject: [PATCH] 8318218: RISC-V: C2 CompressBits

Reviewed-by: fyang, fjiang
---
 .../cpu/riscv/c2_MacroAssembler_riscv.cpp     | 38 +++++++++++++
 .../cpu/riscv/c2_MacroAssembler_riscv.hpp     |  8 +++
 src/hotspot/cpu/riscv/riscv.ad                |  4 ++
 src/hotspot/cpu/riscv/riscv_v.ad              | 55 +++++++++++++++++++
 4 files changed, 105 insertions(+)

diff --git a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp
index d2f037df6ec..bb65a328871 100644
--- a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp
+++ b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp
@@ -1676,6 +1676,44 @@ void C2_MacroAssembler::signum_fp(FloatRegister dst, FloatRegister one, bool is_
   bind(done);
 }
 
+void C2_MacroAssembler::compress_bits_v(Register dst, Register src, Register mask, bool is_long) {
+  Assembler::SEW sew = is_long ? Assembler::e64 : Assembler::e32;
+  // intrinsic is enabled when MaxVectorSize >= 16
+  Assembler::LMUL lmul = is_long ? Assembler::m4 : Assembler::m2;
+  long len = is_long ? 64 : 32;
+
+  // load the src data (in bits) to be compressed.
+  vsetivli(x0, 1, sew, Assembler::m1);
+  vmv_s_x(v0, src);
+  // reset the src data (in bytes) to zero.
+  mv(t0, len);
+  vsetvli(x0, t0, Assembler::e8, lmul);
+  vmv_v_i(v4, 0);
+  // convert the src data from bits to bytes.
+  vmerge_vim(v4, v4, 1); // v0 as the implicit mask register
+  // reset the dst data (in bytes) to zero.
+  vmv_v_i(v8, 0);
+  // load the mask data (in bits).
+  vsetivli(x0, 1, sew, Assembler::m1);
+  vmv_s_x(v0, mask);
+  // compress the src data (in bytes) to dst (in bytes).
+  vsetvli(x0, t0, Assembler::e8, lmul);
+  vcompress_vm(v8, v4, v0);
+  // convert the dst data from bytes to bits.
+  vmseq_vi(v0, v8, 1);
+  // store result back.
+  vsetivli(x0, 1, sew, Assembler::m1);
+  vmv_x_s(dst, v0);
+}
+
+void C2_MacroAssembler::compress_bits_i_v(Register dst, Register src, Register mask) {
+  compress_bits_v(dst, src, mask, /* is_long */ false);
+}
+
+void C2_MacroAssembler::compress_bits_l_v(Register dst, Register src, Register mask) {
+  compress_bits_v(dst, src, mask, /* is_long */ true);
+}
+
 void C2_MacroAssembler::element_compare(Register a1, Register a2, Register result, Register cnt, Register tmp1, Register tmp2,
                                         VectorRegister vr1, VectorRegister vr2, VectorRegister vrs, bool islatin, Label &DONE) {
   Label loop;
diff --git a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.hpp b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.hpp
index 32fda93c3cb..341fe1c16ba 100644
--- a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.hpp
+++ b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.hpp
@@ -38,6 +38,9 @@
                        VectorRegister vr1, VectorRegister vr2, VectorRegister vrs,
                        bool is_latin,
                        Label& DONE);
+
+  void compress_bits_v(Register dst, Register src, Register mask, bool is_long);
+
 public:
  // Code used by cmpFastLock and cmpFastUnlock mach instructions in .ad file.
  // See full description in macroAssembler_riscv.cpp.
@@ -160,6 +163,11 @@
   void signum_fp(FloatRegister dst, FloatRegister one, bool is_double);
 
   // intrinsic methods implemented by rvv instructions
+
+  // compress bits, i.e. j.l.Integer/Long::compress.
+  void compress_bits_i_v(Register dst, Register src, Register mask);
+  void compress_bits_l_v(Register dst, Register src, Register mask);
+
   void string_equals_v(Register r1, Register r2,
                        Register result, Register cnt1,
                        int elem_size);
diff --git a/src/hotspot/cpu/riscv/riscv.ad b/src/hotspot/cpu/riscv/riscv.ad
index dfd4c33d8c2..022ee391f47 100644
--- a/src/hotspot/cpu/riscv/riscv.ad
+++ b/src/hotspot/cpu/riscv/riscv.ad
@@ -1893,6 +1893,10 @@ bool Matcher::match_rule_supported(int opcode) {
     case Op_CountPositives:
       return UseRVV;
 
+    case Op_CompressBits:
+      guarantee(UseRVV == (MaxVectorSize >= 16), "UseRVV and MaxVectorSize not matched");
+      return UseRVV;
+
     case Op_EncodeISOArray:
       return UseRVV && SpecialEncodeISOArray;
 
diff --git a/src/hotspot/cpu/riscv/riscv_v.ad b/src/hotspot/cpu/riscv/riscv_v.ad
index 7c9e3ca81d7..7f266def6c7 100644
--- a/src/hotspot/cpu/riscv/riscv_v.ad
+++ b/src/hotspot/cpu/riscv/riscv_v.ad
@@ -2880,6 +2880,61 @@ instruct vclearArray_reg_reg(iRegL_R29 cnt, iRegP_R28 base, Universe dummy,
   ins_pipe(pipe_class_memory);
 %}
 
+// CompressBits of Long & Integer
+
+instruct compressBitsI(iRegINoSp dst, iRegIorL2I src, iRegIorL2I mask, vRegMask_V0 v0,
+                       vReg_V4 v4, vReg_V5 v5, vReg_V8 v8, vReg_V9 v9) %{
+  predicate(UseRVV);
+  match(Set dst (CompressBits src mask));
+  effect(TEMP v0, TEMP v4, TEMP v5, TEMP v8, TEMP v9);
+  format %{ "vsetivli x0, 1, e32, m1, tu, mu\t#@compressBitsI\n\t"
+            "vmv.s.x $v0, $src\n\t"
+            "mv t0, 32\n\t"
+            "vsetvli x0, t0, e8, m2, tu, mu\n\t"
+            "vmv.v.i $v4, 0\n\t"
+            "vmerge.vim $v4, $v4, 1, $v0\n\t"
+            "vmv.v.i $v8, 0\n\t"
+            "vsetivli x0, 1, e32, m1, tu, mu\n\t"
+            "vmv.s.x $v0, $mask\n\t"
+            "vsetvli x0, t0, e8, m2, tu, mu\n\t"
+            "vcompress.vm $v8, $v4, $v0\n\t"
+            "vmseq.vi $v0, $v8, 1\n\t"
+            "vsetivli x0, 1, e32, m1, tu, mu\n\t"
+            "vmv.x.s $dst, $v0\t#@compressBitsI\n\t"
+  %}
+  ins_encode %{
+    __ compress_bits_i_v(as_Register($dst$$reg), as_Register($src$$reg), as_Register($mask$$reg));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct compressBitsL(iRegLNoSp dst, iRegL src, iRegL mask, vRegMask_V0 v0,
+                       vReg_V4 v4, vReg_V5 v5, vReg_V6 v6, vReg_V7 v7,
+                       vReg_V8 v8, vReg_V9 v9, vReg_V10 v10, vReg_V11 v11) %{
+  predicate(UseRVV);
+  match(Set dst (CompressBits src mask));
+  effect(TEMP v0, TEMP v4, TEMP v5, TEMP v6, TEMP v7, TEMP v8, TEMP v9, TEMP v10, TEMP v11);
+  format %{ "vsetivli x0, 1, e64, m1, tu, mu\t#@compressBitsL\n\t"
+            "vmv.s.x $v0, $src\n\t"
+            "mv t0, 64\n\t"
+            "vsetvli x0, t0, e8, m4, tu, mu\n\t"
+            "vmv.v.i $v4, 0\n\t"
+            "vmerge.vim $v4, $v4, 1, $v0\n\t"
+            "vmv.v.i $v8, 0\n\t"
+            "vsetivli x0, 1, e64, m1, tu, mu\n\t"
+            "vmv.s.x $v0, $mask\n\t"
+            "vsetvli x0, t0, e8, m4, tu, mu\n\t"
+            "vcompress.vm $v8, $v4, $v0\n\t"
+            "vmseq.vi $v0, $v8, 1\n\t"
+            "vsetivli x0, 1, e64, m1, tu, mu\n\t"
+            "vmv.x.s $dst, $v0\t#@compressBitsL\n\t"
+  %}
+  ins_encode %{
+    __ compress_bits_l_v(as_Register($dst$$reg), as_Register($src$$reg), as_Register($mask$$reg));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
 // Vector Load Const
 instruct vloadcon(vReg dst, immI0 src) %{
   match(Set dst (VectorLoadConst src));
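
For reference, a scalar sketch of the semantics the new intrinsic implements
(java.lang.Integer::compress / java.lang.Long::compress: the bits of src
selected by set bits in mask are packed contiguously toward bit 0). The RVV
sequence in compress_bits_v computes the same result by widening the src bits
to bytes with vmerge.vim, compressing those bytes under the mask with
vcompress.vm, and converting back to bits with vmseq.vi. The snippet below is
illustrative only, not part of the patch; the class and method names are made
up for the demo:

    public class CompressBitsDemo {
        // Scalar reference for CompressBits: walk the mask from LSB to MSB
        // and pack each mask-selected src bit at the next free low position.
        static int compressBits(int src, int mask) {
            int result = 0;
            int outPos = 0;
            for (int bit = 0; bit < 32; bit++) {
                if ((mask & (1 << bit)) != 0) {       // mask selects this bit
                    if ((src & (1 << bit)) != 0) {
                        result |= 1 << outPos;        // pack it at outPos
                    }
                    outPos++;                         // next packed position
                }
            }
            return result;
        }

        public static void main(String[] args) {
            int src  = 0b1011_0110;
            int mask = 0b0110_1100;                   // selects bits 2, 3, 5, 6
            // Both print "101": src bits 2,3,5,6 are 1,0,1,0, packed low.
            System.out.println(Integer.toBinaryString(compressBits(src, mask)));
            System.out.println(Integer.toBinaryString(Integer.compress(src, mask)));
        }
    }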