8333382: [s390x] Enhance popcnt Instruction to use Z15 facilities

Reviewed-by: lucy, aph
This commit is contained in:
Amit Kumar 2024-06-12 13:24:58 +00:00
parent 81083a0e10
commit 5a8a9fdfa5
6 changed files with 207 additions and 29 deletions

View File

@ -1610,6 +1610,9 @@ class Assembler : public AbstractAssembler {
static int inv_simm32(long x) { return (inv_s_field(x, 31, 0)); } // 6-byte instructions only
static int inv_uimm12(long x) { return (inv_u_field(x, 11, 0)); } // 4-byte instructions only
// NOTE: PLEASE DON'T USE IT NAKED UNTIL WE DROP SUPPORT FOR MACHINES OLDER THAN Z15!!!!
inline void z_popcnt(Register r1, Register r2, int64_t m3); // population count
private:
// Encode u_field from long value.
@ -3106,7 +3109,6 @@ class Assembler : public AbstractAssembler {
// Population count intrinsics.
inline void z_flogr(Register r1, Register r2); // find leftmost one
inline void z_popcnt(Register r1, Register r2); // population count
inline void z_ahhhr(Register r1, Register r2, Register r3); // ADD halfword high high
inline void z_ahhlr(Register r1, Register r2, Register r3); // ADD halfword high low

View File

@ -748,7 +748,7 @@ inline void Assembler::z_brxhg(Register r1, Register r3, Label& L) {z_brxhg(r1,
inline void Assembler::z_brxlg(Register r1, Register r3, Label& L) {z_brxlg(r1, r3, target(L)); }
inline void Assembler::z_flogr( Register r1, Register r2) { emit_32( FLOGR_ZOPC | reg(r1, 24, 32) | reg(r2, 28, 32)); }
// Emits a 4-byte POPCNT with r1 and r2 placed via reg(); this form sets no m3 field.
inline void Assembler::z_popcnt(Register r1, Register r2) { emit_32( POPCNT_ZOPC | reg(r1, 24, 32) | reg(r2, 28, 32)); }
// Emits a 4-byte POPCNT that additionally encodes m3 as a 4-bit unsigned field via uimm4(m3, 16, 32).
// The MacroAssembler pop_count_* helpers pass m3 = 0 on the pre-z15 path (and then sum the result
// themselves) and m3 = 8 on the MiscInstrExt3 path (result used as is).
// NOTE(review): presumably m3 = 8 selects the "total count" mode added by
// miscellaneous-instruction-extensions facility 3 -- confirm against the z/Architecture PoP.
inline void Assembler::z_popcnt(Register r1, Register r2, int64_t m3) { emit_32( POPCNT_ZOPC | reg(r1, 24, 32) | reg(r2, 28, 32) | uimm4(m3, 16, 32)); }
inline void Assembler::z_ahhhr( Register r1, Register r2, Register r3) { emit_32( AHHHR_ZOPC | reg(r3, 16, 32) | reg(r1, 24, 32) | reg(r2, 28, 32)); }
inline void Assembler::z_ahhlr( Register r1, Register r2, Register r3) { emit_32( AHHLR_ZOPC | reg(r3, 16, 32) | reg(r1, 24, 32) | reg(r2, 28, 32)); }

View File

@ -5803,3 +5803,87 @@ void MacroAssembler::lightweight_unlock(Register obj, Register hdr, Register tmp
z_alsi(in_bytes(JavaThread::lock_stack_top_offset()), Z_thread, -oopSize); // pop object
z_cr(tmp, tmp); // set CC to EQ
}
// Population count of an int operand: picks the facility-3 variant when the
// hardware offers it and the legacy variant otherwise.
//   r_dst - receives the bit count
//   r_src - operand to count
//   r_tmp - scratch register; always required because the legacy variant may
//           be selected. Must differ from r_dst.
void MacroAssembler::pop_count_int(Register r_dst, Register r_src, Register r_tmp) {
  BLOCK_COMMENT("pop_count_int {");

  assert(r_tmp != noreg, "temp register required for pop_count_int, as code may run on machine older than z15");
  assert_different_registers(r_dst, r_tmp); // r_src aliasing r_tmp is tolerated

  const bool ext3_available = VM_Version::has_MiscInstrExt3();
  if (!ext3_available) {
    // Legacy path: needs the scratch register.
    pop_count_int_without_ext3(r_dst, r_src, r_tmp);
  } else {
    // Facility-3 path: r_tmp stays untouched.
    pop_count_int_with_ext3(r_dst, r_src);
  }

  BLOCK_COMMENT("} pop_count_int");
}
// Population count of a long operand: picks the facility-3 variant when the
// hardware offers it and the legacy variant otherwise.
//   r_dst - receives the bit count
//   r_src - operand to count
//   r_tmp - scratch register; always required because the legacy variant may
//           be selected. Must differ from r_dst.
void MacroAssembler::pop_count_long(Register r_dst, Register r_src, Register r_tmp) {
  BLOCK_COMMENT("pop_count_long {");

  assert(r_tmp != noreg, "temp register required for pop_count_long, as code may run on machine older than z15");
  assert_different_registers(r_dst, r_tmp); // r_src aliasing r_tmp is tolerated

  const bool ext3_available = VM_Version::has_MiscInstrExt3();
  if (!ext3_available) {
    // Legacy path: needs the scratch register.
    pop_count_long_without_ext3(r_dst, r_src, r_tmp);
  } else {
    // Facility-3 path: r_tmp stays untouched.
    pop_count_long_with_ext3(r_dst, r_src);
  }

  BLOCK_COMMENT("} pop_count_long");
}
// Emits the population-count sequence for an int operand that does not depend on
// miscellaneous-instruction-extensions facility 3 (i.e. usable on machines older than z15).
//   r_dst - receives the bit count
//   r_src - operand to count; may be the same register as r_tmp
//   r_tmp - scratch register, overwritten; must differ from r_dst
void MacroAssembler::pop_count_int_without_ext3(Register r_dst, Register r_src, Register r_tmp) {
BLOCK_COMMENT("pop_count_int_without_ext3 {");
assert(r_tmp != noreg, "temp register required for popcnt, for machines < z15");
assert_different_registers(r_dst, r_tmp); // if r_src is same as r_tmp, it should be fine
// m3 = 0: the steps below treat the result as per-byte counts that still have to be added up.
z_popcnt(r_dst, r_src, 0);
z_srlg(r_tmp, r_dst, 16); // calc byte4+byte6 and byte5+byte7
z_alr(r_dst, r_tmp); // into byte6 and byte7
z_srlg(r_tmp, r_dst, 8); // calc (byte4+byte6) + (byte5+byte7)
z_alr(r_dst, r_tmp); // into byte7
z_llgcr(r_dst, r_dst); // zero-extend sum
BLOCK_COMMENT("} pop_count_int_without_ext3");
}
// Emits the population-count sequence for a long operand that does not depend on
// miscellaneous-instruction-extensions facility 3 (i.e. usable on machines older than z15).
//   r_dst - receives the bit count
//   r_src - operand to count; may be the same register as r_tmp
//   r_tmp - scratch register, overwritten; must differ from r_dst
void MacroAssembler::pop_count_long_without_ext3(Register r_dst, Register r_src, Register r_tmp) {
BLOCK_COMMENT("pop_count_long_without_ext3 {");
assert(r_tmp != noreg, "temp register required for popcnt, for machines < z15");
assert_different_registers(r_dst, r_tmp); // if r_src is same as r_tmp, it should be fine
// m3 = 0: the steps below treat the result as per-byte counts that still have to be added up.
z_popcnt(r_dst, r_src, 0);
// NOTE(review): presumably folds the low-word counts into the high word -- confirm AHHLR semantics.
z_ahhlr(r_dst, r_dst, r_dst);
z_sllg(r_tmp, r_dst, 16); // r_tmp = r_dst shifted left by 16 bits
z_algr(r_dst, r_tmp); // add the shifted copy into r_dst
z_sllg(r_tmp, r_dst, 8); // r_tmp = r_dst shifted left by 8 bits
z_algr(r_dst, r_tmp); // add again; the running total collects in the most significant byte
z_srlg(r_dst, r_dst, 56); // shift right by 56 so only that most significant byte remains as the result
BLOCK_COMMENT("} pop_count_long_without_ext3");
}
// Population count of a long operand using a single POPCNT with m3 = 8.
// Must only be called when miscellaneous-instruction-extensions facility 3 is
// present; this is enforced with a guarantee (active in product builds too).
//   r_dst - receives the bit count
//   r_src - operand to count
void MacroAssembler::pop_count_long_with_ext3(Register r_dst, Register r_src) {
  BLOCK_COMMENT("pop_count_long_with_ext3 {");

  guarantee(VM_Version::has_MiscInstrExt3(),
            "this hardware doesn't support miscellaneous-instruction-extensions facility 3, still pop_count_long_with_ext3 is used");

  const int64_t m3 = 8; // mask value used by the facility-3 variants
  z_popcnt(r_dst, r_src, m3);

  BLOCK_COMMENT("} pop_count_long_with_ext3");
}
// Population count of an int operand on hardware that provides
// miscellaneous-instruction-extensions facility 3. The operand is first
// widened into r_dst with LLGFR so that the following POPCNT (m3 = 8), which
// operates on r_dst as a whole, only sees the bits of the int value.
// Must only be called when facility 3 is present; this is enforced with a
// guarantee (active in product builds too).
//   r_dst - receives the bit count
//   r_src - operand to count
void MacroAssembler::pop_count_int_with_ext3(Register r_dst, Register r_src) {
  BLOCK_COMMENT("pop_count_int_with_ext3 {");

  // The message names this function (it previously named the long variant,
  // which would have sent a failure investigation to the wrong place).
  guarantee(VM_Version::has_MiscInstrExt3(),
            "this hardware doesn't support miscellaneous-instruction-extensions facility 3, still pop_count_int_with_ext3 is used");

  z_llgfr(r_dst, r_src);     // widen the int operand into r_dst
  z_popcnt(r_dst, r_dst, 8); // count in place

  BLOCK_COMMENT("} pop_count_int_with_ext3");
}

View File

@ -1,6 +1,7 @@
/*
* Copyright (c) 2016, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2016, 2023 SAP SE. All rights reserved.
* Copyright (c) 2016, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2016, 2024 SAP SE. All rights reserved.
* Copyright (c) 2024 IBM Corporation. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -1021,6 +1022,19 @@ class MacroAssembler: public Assembler {
Register z,
Register tmp1, Register tmp2,
Register tmp3, Register tmp4, Register tmp5);
// These generate optimized code for all supported s390 implementations, and are preferred for most uses.
void pop_count_int(Register dst, Register src, Register tmp);
void pop_count_long(Register dst, Register src, Register tmp);
// For legacy (pre-z15) use, but will work on all supported s390 implementations.
void pop_count_int_without_ext3(Register dst, Register src, Register tmp);
void pop_count_long_without_ext3(Register dst, Register src, Register tmp);
// Only for use on z15 or later s390 implementations.
void pop_count_int_with_ext3(Register dst, Register src);
void pop_count_long_with_ext3(Register dst, Register src);
};
/**

View File

@ -10675,10 +10675,49 @@ instruct countTrailingZerosL(revenRegI dst, iRegL src, roddRegL tmp, flagsReg cr
// bit count
// PopCountI for hardware with miscellaneous-instruction-extensions facility 3.
// No temp operand is needed; the condition code is killed.
instruct popCountI_Ext3(iRegI dst, iRegI src, flagsReg cr) %{
  match(Set dst (PopCountI src));
  effect(TEMP_DEF dst, KILL cr);
  predicate(UsePopCountInstruction &&
            VM_Version::has_PopCount() &&
            VM_Version::has_MiscInstrExt3());
  ins_cost(DEFAULT_COST);
  size(8); // llgfr + popcnt, as emitted by pop_count_int_with_ext3
  format %{ "POPCNT $dst,$src\t # pop count int" %}
  ins_encode %{
    __ pop_count_int_with_ext3($dst$$Register, $src$$Register);
  %}
  ins_pipe(pipe_class_dummy);
%}
// PopCountL for hardware with miscellaneous-instruction-extensions facility 3.
// No temp operand is needed; the condition code is killed.
instruct popCountL_Ext3(iRegI dst, iRegL src, flagsReg cr) %{
  match(Set dst (PopCountL src));
  effect(TEMP_DEF dst, KILL cr);
  predicate(UsePopCountInstruction &&
            VM_Version::has_PopCount() &&
            VM_Version::has_MiscInstrExt3());
  ins_cost(DEFAULT_COST);
  size(4); // a single popcnt, as emitted by pop_count_long_with_ext3
  format %{ "POPCNT $dst,$src\t # pop count long" %}
  ins_encode %{
    __ pop_count_long_with_ext3($dst$$Register, $src$$Register);
  %}
  ins_pipe(pipe_class_dummy);
%}
instruct popCountI(iRegI dst, iRegI src, iRegI tmp, flagsReg cr) %{
match(Set dst (PopCountI src));
effect(TEMP_DEF dst, TEMP tmp, KILL cr);
predicate(UsePopCountInstruction && VM_Version::has_PopCount());
predicate(UsePopCountInstruction &&
VM_Version::has_PopCount() &&
(!VM_Version::has_MiscInstrExt3()));
ins_cost(DEFAULT_COST);
size(24);
format %{ "POPCNT $dst,$src\t # pop count int" %}
@ -10687,17 +10726,8 @@ instruct popCountI(iRegI dst, iRegI src, iRegI tmp, flagsReg cr) %{
Register Rsrc = $src$$Register;
Register Rtmp = $tmp$$Register;
// Prefer compile-time assertion over run-time SIGILL.
assert(VM_Version::has_PopCount(), "bad predicate for countLeadingZerosI");
assert_different_registers(Rdst, Rtmp);
__ pop_count_int_without_ext3(Rdst, Rsrc, Rtmp);
// Version 2: shows 10%(z196) improvement over original.
__ z_popcnt(Rdst, Rsrc);
__ z_srlg(Rtmp, Rdst, 16); // calc byte4+byte6 and byte5+byte7
__ z_alr(Rdst, Rtmp); // into byte6 and byte7
__ z_srlg(Rtmp, Rdst, 8); // calc (byte4+byte6) + (byte5+byte7)
__ z_alr(Rdst, Rtmp); // into byte7
__ z_llgcr(Rdst, Rdst); // zero-extend sum
%}
ins_pipe(pipe_class_dummy);
%}
@ -10705,27 +10735,18 @@ instruct popCountI(iRegI dst, iRegI src, iRegI tmp, flagsReg cr) %{
// PopCountL for hardware that has POPCNT but lacks
// miscellaneous-instruction-extensions facility 3 (the facility-3 case is
// matched by popCountL_Ext3). Needs a temp register; the condition code is killed.
instruct popCountL(iRegI dst, iRegL src, iRegL tmp, flagsReg cr) %{
  match(Set dst (PopCountL src));
  effect(TEMP_DEF dst, TEMP tmp, KILL cr);
  // Exactly one predicate: mutually exclusive with popCountL_Ext3.
  predicate(UsePopCountInstruction &&
            VM_Version::has_PopCount() &&
            (!VM_Version::has_MiscInstrExt3()));
  ins_cost(DEFAULT_COST);
  // TODO: s390 port size(FIXED_SIZE);
  size(34); // must match the one sequence emitted by pop_count_long_without_ext3
  format %{ "POPCNT $dst,$src\t # pop count long" %}
  ins_encode %{
    Register Rdst = $dst$$Register;
    Register Rsrc = $src$$Register;
    Register Rtmp = $tmp$$Register;

    // The helper asserts Rdst != Rtmp itself and emits the whole sequence;
    // nothing else may be emitted here or size(34) no longer holds.
    __ pop_count_long_without_ext3(Rdst, Rsrc, Rtmp);
  %}
  ins_pipe(pipe_class_dummy);
%}

View File

@ -0,0 +1,57 @@
/*
* Copyright (c) 2024 IBM Corporation. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package org.openjdk.bench.vm.compiler;
import org.openjdk.jmh.annotations.*;
import java.util.concurrent.TimeUnit;
/**
 * JMH micro-benchmark for {@link Long#bitCount(long)}.
 *
 * <p>Each invocation of {@link #test()} runs {@code numTests} loop iterations,
 * and every iteration updates ten accumulators, each through its own
 * {@code Long.bitCount} call. The accumulators start from the distinct seeds
 * 1..10 so that no two chains compute the same values.
 */
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Thread)
@Warmup(iterations = 10, time = 1)
@Measurement(iterations = 5, time = 1)
@Fork(value = 5)
public class PopCount {
    /** Number of loop iterations performed per benchmark invocation. */
    int numTests = 100_000;

    /**
     * Runs the bit-count workload.
     *
     * @return the sum of all ten accumulators, so every chain contributes to the result
     */
    @Benchmark
    public long test() {
        // l8 is seeded with 8 (it used to be 9, which made the l8 and l9 chains identical).
        long l1 = 1, l2 = 2, l3 = 3, l4 = 4, l5 = 5, l6 = 6, l7 = 7, l8 = 8, l9 = 9, l10 = 10;
        for (long i = 0; i < numTests; i++) {
            l1 ^= Long.bitCount(l1) + i;
            l2 ^= Long.bitCount(l2) + i;
            l3 ^= Long.bitCount(l3) + i;
            l4 ^= Long.bitCount(l4) + i;
            l5 ^= Long.bitCount(l5) + i;
            l6 ^= Long.bitCount(l6) + i;
            l7 ^= Long.bitCount(l7) + i;
            l8 ^= Long.bitCount(l8) + i;
            l9 ^= Long.bitCount(l9) + i;
            l10 ^= Long.bitCount(l10) + i;
        }
        return l1 + l2 + l3 + l4 + l5 + l6 + l7 + l8 + l9 + l10;
    }
}