8333382: [s390x] Enhance popcnt Instruction to use Z15 facilities
Reviewed-by: lucy, aph
This commit is contained in:
parent
81083a0e10
commit
5a8a9fdfa5
@ -1610,6 +1610,9 @@ class Assembler : public AbstractAssembler {
|
||||
static int inv_simm32(long x) { return (inv_s_field(x, 31, 0)); } // 6-byte instructions only
|
||||
static int inv_uimm12(long x) { return (inv_u_field(x, 11, 0)); } // 4-byte instructions only
|
||||
|
||||
// NOTE: Please do not use z_popcnt directly ("naked") until support for machines older than z15 is dropped; prefer the MacroAssembler::pop_count_* helpers.
|
||||
inline void z_popcnt(Register r1, Register r2, int64_t m3); // population count
|
||||
|
||||
private:
|
||||
|
||||
// Encode u_field from long value.
|
||||
@ -3106,7 +3109,6 @@ class Assembler : public AbstractAssembler {
|
||||
|
||||
// Population count intrinsics.
|
||||
inline void z_flogr(Register r1, Register r2); // find leftmost one
|
||||
inline void z_popcnt(Register r1, Register r2); // population count
|
||||
inline void z_ahhhr(Register r1, Register r2, Register r3); // ADD halfword high high
|
||||
inline void z_ahhlr(Register r1, Register r2, Register r3); // ADD halfword high low
|
||||
|
||||
|
@ -748,7 +748,7 @@ inline void Assembler::z_brxhg(Register r1, Register r3, Label& L) {z_brxhg(r1,
|
||||
inline void Assembler::z_brxlg(Register r1, Register r3, Label& L) {z_brxlg(r1, r3, target(L)); }
|
||||
|
||||
inline void Assembler::z_flogr( Register r1, Register r2) { emit_32( FLOGR_ZOPC | reg(r1, 24, 32) | reg(r2, 28, 32)); }
|
||||
inline void Assembler::z_popcnt(Register r1, Register r2) { emit_32( POPCNT_ZOPC | reg(r1, 24, 32) | reg(r2, 28, 32)); }
|
||||
// POPCNT with an explicit m3 operand, merged into the instruction word via uimm4(m3, 16, 32)
// (presumably a 4-bit unsigned immediate field -- confirm against the uimm4 helper).
// The MacroAssembler pop_count_* helpers pass m3 == 0 for the pre-z15 sequences and m3 == 8
// when miscellaneous-instruction-extensions facility 3 is available.
inline void Assembler::z_popcnt(Register r1, Register r2, int64_t m3) { emit_32( POPCNT_ZOPC | reg(r1, 24, 32) | reg(r2, 28, 32) | uimm4(m3, 16, 32)); }
|
||||
inline void Assembler::z_ahhhr( Register r1, Register r2, Register r3) { emit_32( AHHHR_ZOPC | reg(r3, 16, 32) | reg(r1, 24, 32) | reg(r2, 28, 32)); }
|
||||
inline void Assembler::z_ahhlr( Register r1, Register r2, Register r3) { emit_32( AHHLR_ZOPC | reg(r3, 16, 32) | reg(r1, 24, 32) | reg(r2, 28, 32)); }
|
||||
|
||||
|
@ -5803,3 +5803,87 @@ void MacroAssembler::lightweight_unlock(Register obj, Register hdr, Register tmp
|
||||
z_alsi(in_bytes(JavaThread::lock_stack_top_offset()), Z_thread, -oopSize); // pop object
|
||||
z_cr(tmp, tmp); // set CC to EQ
|
||||
}
|
||||
|
||||
// Emit a population count of the int held in r_src, leaving the result in r_dst.
//
// Dispatches on VM_Version::has_MiscInstrExt3(): with the facility the short
// pop_count_int_with_ext3 form is emitted, otherwise the fallback sequence
// pop_count_int_without_ext3, which needs r_tmp as scratch.
//
// r_tmp must always be a real register and must differ from r_dst, even when
// the facility is present, because the same caller may run on an older machine.
void MacroAssembler::pop_count_int(Register r_dst, Register r_src, Register r_tmp) {
  BLOCK_COMMENT("pop_count_int {");

  assert(r_tmp != noreg, "temp register required for pop_count_int, as code may run on machine older than z15");
  assert_different_registers(r_dst, r_tmp); // r_src is allowed to alias r_tmp

  const bool ext3_available = VM_Version::has_MiscInstrExt3();
  if (!ext3_available) {
    pop_count_int_without_ext3(r_dst, r_src, r_tmp);
  } else {
    pop_count_int_with_ext3(r_dst, r_src);
  }

  BLOCK_COMMENT("} pop_count_int");
}
|
||||
|
||||
// Emit a population count of the long held in r_src, leaving the result in r_dst.
//
// Dispatches on VM_Version::has_MiscInstrExt3(): with the facility the short
// pop_count_long_with_ext3 form is emitted, otherwise the fallback sequence
// pop_count_long_without_ext3, which needs r_tmp as scratch.
//
// r_tmp must always be a real register and must differ from r_dst, even when
// the facility is present, because the same caller may run on an older machine.
void MacroAssembler::pop_count_long(Register r_dst, Register r_src, Register r_tmp) {
  BLOCK_COMMENT("pop_count_long {");

  assert(r_tmp != noreg, "temp register required for pop_count_long, as code may run on machine older than z15");
  assert_different_registers(r_dst, r_tmp); // r_src is allowed to alias r_tmp

  const bool ext3_available = VM_Version::has_MiscInstrExt3();
  if (!ext3_available) {
    pop_count_long_without_ext3(r_dst, r_src, r_tmp);
  } else {
    pop_count_long_with_ext3(r_dst, r_src);
  }

  BLOCK_COMMENT("} pop_count_long");
}
|
||||
|
||||
// Population count of an int without using miscellaneous-instruction-extensions
// facility 3.
//
// POPCNT is issued with m3 == 0 and the partial counts left in r_dst are then
// folded together with two shift/add steps; r_tmp holds the shifted copy.
// The instruction order below is significant and must not be changed.
void MacroAssembler::pop_count_int_without_ext3(Register r_dst, Register r_src, Register r_tmp) {
  BLOCK_COMMENT("pop_count_int_without_ext3 {");

  assert(r_tmp != noreg, "temp register required for popcnt, for machines < z15");
  // r_src may alias r_tmp: r_src is consumed by POPCNT before r_tmp is first written.
  assert_different_registers(r_dst, r_tmp);

  z_popcnt(r_dst, r_src, 0);  // m3 == 0: legacy form (presumably one count per byte -- see POPCNT in the PoP)
  z_srlg(r_tmp, r_dst, 16);   // calc byte4+byte6 and byte5+byte7
  z_alr(r_dst, r_tmp);        //   into byte6 and byte7
  z_srlg(r_tmp, r_dst, 8);    // calc (byte4+byte6) + (byte5+byte7)
  z_alr(r_dst, r_tmp);        //   into byte7
  z_llgcr(r_dst, r_dst);      // zero-extend sum

  BLOCK_COMMENT("} pop_count_int_without_ext3");
}
|
||||
|
||||
// Population count of a long without using miscellaneous-instruction-extensions
// facility 3.
//
// POPCNT is issued with m3 == 0; the partial counts in r_dst are then combined
// with AHHLR and two shift-left/add steps, and the final value is taken from
// the most significant byte. r_tmp holds the shifted copy.
// The instruction order below is significant and must not be changed.
void MacroAssembler::pop_count_long_without_ext3(Register r_dst, Register r_src, Register r_tmp) {
  BLOCK_COMMENT("pop_count_long_without_ext3 {");

  assert(r_tmp != noreg, "temp register required for popcnt, for machines < z15");
  // r_src may alias r_tmp: r_src is consumed by POPCNT before r_tmp is first written.
  assert_different_registers(r_dst, r_tmp);

  z_popcnt(r_dst, r_src, 0);     // m3 == 0: legacy form (presumably one count per byte -- see POPCNT in the PoP)
  z_ahhlr(r_dst, r_dst, r_dst);  // ADD halfword high low
  z_sllg(r_tmp, r_dst, 16);
  z_algr(r_dst, r_tmp);          // algr on purpose: using LA instead was measured to be much slower (-35%)
  z_sllg(r_tmp, r_dst, 8);
  z_algr(r_dst, r_tmp);
  z_srlg(r_dst, r_dst, 56);      // move the most significant byte down to the low end

  BLOCK_COMMENT("} pop_count_long_without_ext3");
}
|
||||
|
||||
// Population count of a long using a single POPCNT with m3 == 8.
//
// Only valid when miscellaneous-instruction-extensions facility 3 is present;
// this is enforced with a guarantee() so that misuse is caught in product
// builds as well.
void MacroAssembler::pop_count_long_with_ext3(Register r_dst, Register r_src) {
  BLOCK_COMMENT("pop_count_long_with_ext3 {");

  guarantee(VM_Version::has_MiscInstrExt3(),
            "this hardware doesn't support miscellaneous-instruction-extensions facility 3, still pop_count_long_with_ext3 is used");

  // m3 == 8: presumably selects the whole-register count -- see POPCNT in the PoP.
  z_popcnt(r_dst, r_src, 8);

  BLOCK_COMMENT("} pop_count_long_with_ext3");
}
|
||||
|
||||
// Population count of an int using POPCNT with m3 == 8.
//
// Only valid when miscellaneous-instruction-extensions facility 3 is present;
// this is enforced with a guarantee() so that misuse is caught in product
// builds as well.
//
// r_src is first copied into r_dst with LLGFR so that POPCNT operates on r_dst
// only; r_src itself is not modified unless it is the same register as r_dst.
void MacroAssembler::pop_count_int_with_ext3(Register r_dst, Register r_src) {
  BLOCK_COMMENT("pop_count_int_with_ext3 {");

  // The message must name this function (it previously named the long variant).
  guarantee(VM_Version::has_MiscInstrExt3(),
            "this hardware doesn't support miscellaneous-instruction-extensions facility 3, still pop_count_int_with_ext3 is used");

  z_llgfr(r_dst, r_src);      // presumably zero-extends the low 32 bits so the upper half does not contribute
  z_popcnt(r_dst, r_dst, 8);  // m3 == 8: presumably selects the whole-register count -- see POPCNT in the PoP

  BLOCK_COMMENT("} pop_count_int_with_ext3");
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
/*
|
||||
* Copyright (c) 2016, 2023, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2016, 2023 SAP SE. All rights reserved.
|
||||
* Copyright (c) 2016, 2024, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2016, 2024 SAP SE. All rights reserved.
|
||||
* Copyright (c) 2024 IBM Corporation. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -1021,6 +1022,19 @@ class MacroAssembler: public Assembler {
|
||||
Register z,
|
||||
Register tmp1, Register tmp2,
|
||||
Register tmp3, Register tmp4, Register tmp5);
|
||||
|
||||
// These generate optimized code for all supported s390 implementations, and are preferred for most uses.
|
||||
void pop_count_int(Register dst, Register src, Register tmp);
|
||||
void pop_count_long(Register dst, Register src, Register tmp);
|
||||
|
||||
// For legacy (pre-z15) use, but will work on all supported s390 implementations.
|
||||
void pop_count_int_without_ext3(Register dst, Register src, Register tmp);
|
||||
void pop_count_long_without_ext3(Register dst, Register src, Register tmp);
|
||||
|
||||
// Only for use on z15 or later s390 implementations.
|
||||
void pop_count_int_with_ext3(Register dst, Register src);
|
||||
void pop_count_long_with_ext3(Register dst, Register src);
|
||||
|
||||
};
|
||||
|
||||
/**
|
||||
|
@ -10675,10 +10675,49 @@ instruct countTrailingZerosL(revenRegI dst, iRegL src, roddRegL tmp, flagsReg cr
|
||||
|
||||
// bit count
|
||||
|
||||
// PopCountI for machines that report miscellaneous-instruction-extensions
// facility 3. The encoding is delegated to
// MacroAssembler::pop_count_int_with_ext3, which emits LLGFR followed by POPCNT.
instruct popCountI_Ext3(iRegI dst, iRegI src, flagsReg cr) %{
  match(Set dst (PopCountI src));
  effect(TEMP_DEF dst, KILL cr);
  predicate(UsePopCountInstruction &&
            VM_Version::has_PopCount() &&
            VM_Version::has_MiscInstrExt3());
  ins_cost(DEFAULT_COST);
  size(8); // popcnt + llgfr
  format %{ "POPCNT $dst,$src\t # pop count int" %}
  ins_encode %{
    __ pop_count_int_with_ext3($dst$$Register, $src$$Register);
  %}
  ins_pipe(pipe_class_dummy);
%}
|
||||
|
||||
// PopCountL for machines that report miscellaneous-instruction-extensions
// facility 3. The encoding is delegated to
// MacroAssembler::pop_count_long_with_ext3, which emits a single POPCNT.
instruct popCountL_Ext3(iRegI dst, iRegL src, flagsReg cr) %{
  match(Set dst (PopCountL src));
  effect(TEMP_DEF dst, KILL cr);
  predicate(UsePopCountInstruction &&
            VM_Version::has_PopCount() &&
            VM_Version::has_MiscInstrExt3());
  ins_cost(DEFAULT_COST);
  size(4); // popcnt
  format %{ "POPCNT $dst,$src\t # pop count long" %}
  ins_encode %{
    __ pop_count_long_with_ext3($dst$$Register, $src$$Register);
  %}
  ins_pipe(pipe_class_dummy);
%}
|
||||
|
||||
instruct popCountI(iRegI dst, iRegI src, iRegI tmp, flagsReg cr) %{
|
||||
match(Set dst (PopCountI src));
|
||||
effect(TEMP_DEF dst, TEMP tmp, KILL cr);
|
||||
predicate(UsePopCountInstruction && VM_Version::has_PopCount());
|
||||
predicate(UsePopCountInstruction &&
|
||||
VM_Version::has_PopCount() &&
|
||||
(!VM_Version::has_MiscInstrExt3()));
|
||||
ins_cost(DEFAULT_COST);
|
||||
size(24);
|
||||
format %{ "POPCNT $dst,$src\t # pop count int" %}
|
||||
@ -10687,17 +10726,8 @@ instruct popCountI(iRegI dst, iRegI src, iRegI tmp, flagsReg cr) %{
|
||||
Register Rsrc = $src$$Register;
|
||||
Register Rtmp = $tmp$$Register;
|
||||
|
||||
// Prefer compile-time assertion over run-time SIGILL.
|
||||
assert(VM_Version::has_PopCount(), "bad predicate for countLeadingZerosI");
|
||||
assert_different_registers(Rdst, Rtmp);
|
||||
__ pop_count_int_without_ext3(Rdst, Rsrc, Rtmp);
|
||||
|
||||
// Version 2: shows 10%(z196) improvement over original.
|
||||
__ z_popcnt(Rdst, Rsrc);
|
||||
__ z_srlg(Rtmp, Rdst, 16); // calc byte4+byte6 and byte5+byte7
|
||||
__ z_alr(Rdst, Rtmp); // into byte6 and byte7
|
||||
__ z_srlg(Rtmp, Rdst, 8); // calc (byte4+byte6) + (byte5+byte7)
|
||||
__ z_alr(Rdst, Rtmp); // into byte7
|
||||
__ z_llgcr(Rdst, Rdst); // zero-extend sum
|
||||
%}
|
||||
ins_pipe(pipe_class_dummy);
|
||||
%}
|
||||
@ -10705,27 +10735,18 @@ instruct popCountI(iRegI dst, iRegI src, iRegI tmp, flagsReg cr) %{
|
||||
instruct popCountL(iRegI dst, iRegL src, iRegL tmp, flagsReg cr) %{
|
||||
match(Set dst (PopCountL src));
|
||||
effect(TEMP_DEF dst, TEMP tmp, KILL cr);
|
||||
predicate(UsePopCountInstruction && VM_Version::has_PopCount());
|
||||
predicate(UsePopCountInstruction &&
|
||||
VM_Version::has_PopCount() &&
|
||||
(!VM_Version::has_MiscInstrExt3()));
|
||||
ins_cost(DEFAULT_COST);
|
||||
// TODO: s390 port size(FIXED_SIZE);
|
||||
size(34);
|
||||
format %{ "POPCNT $dst,$src\t # pop count long" %}
|
||||
ins_encode %{
|
||||
Register Rdst = $dst$$Register;
|
||||
Register Rsrc = $src$$Register;
|
||||
Register Rtmp = $tmp$$Register;
|
||||
|
||||
// Prefer compile-time assertion over run-time SIGILL.
|
||||
assert(VM_Version::has_PopCount(), "bad predicate for countLeadingZerosI");
|
||||
assert_different_registers(Rdst, Rtmp);
|
||||
|
||||
// Original version. Using LA instead of algr seems to be a really bad idea (-35%).
|
||||
__ z_popcnt(Rdst, Rsrc);
|
||||
__ z_ahhlr(Rdst, Rdst, Rdst);
|
||||
__ z_sllg(Rtmp, Rdst, 16);
|
||||
__ z_algr(Rdst, Rtmp);
|
||||
__ z_sllg(Rtmp, Rdst, 8);
|
||||
__ z_algr(Rdst, Rtmp);
|
||||
__ z_srlg(Rdst, Rdst, 56);
|
||||
__ pop_count_long_without_ext3(Rdst, Rsrc, Rtmp);
|
||||
%}
|
||||
ins_pipe(pipe_class_dummy);
|
||||
%}
|
||||
|
57
test/micro/org/openjdk/bench/vm/compiler/PopCount.java
Normal file
57
test/micro/org/openjdk/bench/vm/compiler/PopCount.java
Normal file
@ -0,0 +1,57 @@
|
||||
/*
|
||||
* Copyright (c) 2024 IBM Corporation. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
package org.openjdk.bench.vm.compiler;
|
||||
|
||||
import org.openjdk.jmh.annotations.*;
|
||||
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
@BenchmarkMode(Mode.AverageTime)
|
||||
@OutputTimeUnit(TimeUnit.MILLISECONDS)
|
||||
@State(Scope.Thread)
|
||||
@Warmup(iterations = 10, time = 1)
|
||||
@Measurement(iterations = 5, time = 1)
|
||||
@Fork(value = 5)
|
||||
public class PopCount {
|
||||
int numTests = 100_000;
|
||||
|
||||
@Benchmark
|
||||
public long test() {
|
||||
long l1 = 1, l2 = 2, l3 = 3, l4 = 4, l5 = 5, l6 = 6, l7 = 7, l8 = 9, l9 = 9, l10 = 10;
|
||||
for (long i = 0; i < numTests; i++) {
|
||||
l1 ^= Long.bitCount(l1) + i;
|
||||
l2 ^= Long.bitCount(l2) + i;
|
||||
l3 ^= Long.bitCount(l3) + i;
|
||||
l4 ^= Long.bitCount(l4) + i;
|
||||
l5 ^= Long.bitCount(l5) + i;
|
||||
l6 ^= Long.bitCount(l6) + i;
|
||||
l7 ^= Long.bitCount(l7) + i;
|
||||
l8 ^= Long.bitCount(l8) + i;
|
||||
l9 ^= Long.bitCount(l9) + i;
|
||||
l10 ^= Long.bitCount(l10) + i;
|
||||
}
|
||||
return l1 + l2 + l3 + l4 + l5 + l6 + l7 + l8 + l9 + l10;
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue
Block a user