diff --git a/src/hotspot/cpu/s390/assembler_s390.hpp b/src/hotspot/cpu/s390/assembler_s390.hpp index f472af134a3..58200958289 100644 --- a/src/hotspot/cpu/s390/assembler_s390.hpp +++ b/src/hotspot/cpu/s390/assembler_s390.hpp @@ -1610,6 +1610,9 @@ class Assembler : public AbstractAssembler { static int inv_simm32(long x) { return (inv_s_field(x, 31, 0)); } // 6-byte instructions only static int inv_uimm12(long x) { return (inv_u_field(x, 11, 0)); } // 4-byte instructions only + // NOTE: PLEASE DON'T USE IT NAKED UNTIL WE DROP SUPPORT FOR MACHINES OLDER THAN Z15!!!! + inline void z_popcnt(Register r1, Register r2, int64_t m3); // population count + private: // Encode u_field from long value. @@ -3106,7 +3109,6 @@ class Assembler : public AbstractAssembler { // Ppopulation count intrinsics. inline void z_flogr(Register r1, Register r2); // find leftmost one - inline void z_popcnt(Register r1, Register r2); // population count inline void z_ahhhr(Register r1, Register r2, Register r3); // ADD halfword high high inline void z_ahhlr(Register r1, Register r2, Register r3); // ADD halfword high low diff --git a/src/hotspot/cpu/s390/assembler_s390.inline.hpp b/src/hotspot/cpu/s390/assembler_s390.inline.hpp index 51b2cbe0a3e..2649d0f7a34 100644 --- a/src/hotspot/cpu/s390/assembler_s390.inline.hpp +++ b/src/hotspot/cpu/s390/assembler_s390.inline.hpp @@ -748,7 +748,7 @@ inline void Assembler::z_brxhg(Register r1, Register r3, Label& L) {z_brxhg(r1, inline void Assembler::z_brxlg(Register r1, Register r3, Label& L) {z_brxlg(r1, r3, target(L)); } inline void Assembler::z_flogr( Register r1, Register r2) { emit_32( FLOGR_ZOPC | reg(r1, 24, 32) | reg(r2, 28, 32)); } -inline void Assembler::z_popcnt(Register r1, Register r2) { emit_32( POPCNT_ZOPC | reg(r1, 24, 32) | reg(r2, 28, 32)); } +inline void Assembler::z_popcnt(Register r1, Register r2, int64_t m3) { emit_32( POPCNT_ZOPC | reg(r1, 24, 32) | reg(r2, 28, 32) | uimm4(m3, 16, 32)); } inline void Assembler::z_ahhhr( Register r1, 
Register r2, Register r3) { emit_32( AHHHR_ZOPC | reg(r3, 16, 32) | reg(r1, 24, 32) | reg(r2, 28, 32)); } inline void Assembler::z_ahhlr( Register r1, Register r2, Register r3) { emit_32( AHHLR_ZOPC | reg(r3, 16, 32) | reg(r1, 24, 32) | reg(r2, 28, 32)); } diff --git a/src/hotspot/cpu/s390/macroAssembler_s390.cpp b/src/hotspot/cpu/s390/macroAssembler_s390.cpp index ef5216a12ba..275f4a8d832 100644 --- a/src/hotspot/cpu/s390/macroAssembler_s390.cpp +++ b/src/hotspot/cpu/s390/macroAssembler_s390.cpp @@ -5803,3 +5803,87 @@ void MacroAssembler::lightweight_unlock(Register obj, Register hdr, Register tmp z_alsi(in_bytes(JavaThread::lock_stack_top_offset()), Z_thread, -oopSize); // pop object z_cr(tmp, tmp); // set CC to EQ } + +void MacroAssembler::pop_count_int(Register r_dst, Register r_src, Register r_tmp) { + BLOCK_COMMENT("pop_count_int {"); + + assert(r_tmp != noreg, "temp register required for pop_count_int, as code may run on machine older than z15"); + assert_different_registers(r_dst, r_tmp); // if r_src is same as r_tmp, it should be fine + + if (VM_Version::has_MiscInstrExt3()) { + pop_count_int_with_ext3(r_dst, r_src); + } else { + pop_count_int_without_ext3(r_dst, r_src, r_tmp); + } + + BLOCK_COMMENT("} pop_count_int"); +} + +void MacroAssembler::pop_count_long(Register r_dst, Register r_src, Register r_tmp) { + BLOCK_COMMENT("pop_count_long {"); + + assert(r_tmp != noreg, "temp register required for pop_count_long, as code may run on machine older than z15"); + assert_different_registers(r_dst, r_tmp); // if r_src is same as r_tmp, it should be fine + + if (VM_Version::has_MiscInstrExt3()) { + pop_count_long_with_ext3(r_dst, r_src); + } else { + pop_count_long_without_ext3(r_dst, r_src, r_tmp); + } + + BLOCK_COMMENT("} pop_count_long"); +} + +void MacroAssembler::pop_count_int_without_ext3(Register r_dst, Register r_src, Register r_tmp) { + BLOCK_COMMENT("pop_count_int_without_ext3 {"); + + assert(r_tmp != noreg, "temp register required for popcnt, for 
machines < z15"); + assert_different_registers(r_dst, r_tmp); // if r_src is same as r_tmp, it should be fine + + z_popcnt(r_dst, r_src, 0); + z_srlg(r_tmp, r_dst, 16); + z_alr(r_dst, r_tmp); + z_srlg(r_tmp, r_dst, 8); + z_alr(r_dst, r_tmp); + z_llgcr(r_dst, r_dst); + + BLOCK_COMMENT("} pop_count_int_without_ext3"); +} + +void MacroAssembler::pop_count_long_without_ext3(Register r_dst, Register r_src, Register r_tmp) { + BLOCK_COMMENT("pop_count_long_without_ext3 {"); + + assert(r_tmp != noreg, "temp register required for popcnt, for machines < z15"); + assert_different_registers(r_dst, r_tmp); // if r_src is same as r_tmp, it should be fine + + z_popcnt(r_dst, r_src, 0); + z_ahhlr(r_dst, r_dst, r_dst); + z_sllg(r_tmp, r_dst, 16); + z_algr(r_dst, r_tmp); + z_sllg(r_tmp, r_dst, 8); + z_algr(r_dst, r_tmp); + z_srlg(r_dst, r_dst, 56); + + BLOCK_COMMENT("} pop_count_long_without_ext3"); +} + +void MacroAssembler::pop_count_long_with_ext3(Register r_dst, Register r_src) { + BLOCK_COMMENT("pop_count_long_with_ext3 {"); + + guarantee(VM_Version::has_MiscInstrExt3(), + "this hardware doesn't support miscellaneous-instruction-extensions facility 3, still pop_count_long_with_ext3 is used"); + z_popcnt(r_dst, r_src, 8); + + BLOCK_COMMENT("} pop_count_long_with_ext3"); +} + +void MacroAssembler::pop_count_int_with_ext3(Register r_dst, Register r_src) { + BLOCK_COMMENT("pop_count_int_with_ext3 {"); + + guarantee(VM_Version::has_MiscInstrExt3(), + "this hardware doesn't support miscellaneous-instruction-extensions facility 3, still pop_count_int_with_ext3 is used"); + z_llgfr(r_dst, r_src); + z_popcnt(r_dst, r_dst, 8); + + BLOCK_COMMENT("} pop_count_int_with_ext3"); +} diff --git a/src/hotspot/cpu/s390/macroAssembler_s390.hpp b/src/hotspot/cpu/s390/macroAssembler_s390.hpp index 924583abdf5..9f45542dd65 100644 --- a/src/hotspot/cpu/s390/macroAssembler_s390.hpp +++ b/src/hotspot/cpu/s390/macroAssembler_s390.hpp @@ -1,6 +1,7 @@ /* - * Copyright (c) 2016, 2023, Oracle and/or its 
affiliates. All rights reserved. - * Copyright (c) 2016, 2023 SAP SE. All rights reserved. + * Copyright (c) 2016, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2016, 2024 SAP SE. All rights reserved. + * Copyright (c) 2024 IBM Corporation. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -1021,6 +1022,19 @@ class MacroAssembler: public Assembler { Register z, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5); + + // These generate optimized code for all supported s390 implementations, and are preferred for most uses. + void pop_count_int(Register dst, Register src, Register tmp); + void pop_count_long(Register dst, Register src, Register tmp); + + // For legacy (pre-z15) use, but will work on all supported s390 implementations. + void pop_count_int_without_ext3(Register dst, Register src, Register tmp); + void pop_count_long_without_ext3(Register dst, Register src, Register tmp); + + // Only for use on z15 or later s390 implementations. 
+ void pop_count_int_with_ext3(Register dst, Register src); + void pop_count_long_with_ext3(Register dst, Register src); + }; /** diff --git a/src/hotspot/cpu/s390/s390.ad b/src/hotspot/cpu/s390/s390.ad index 28cac16864d..56cf494d27e 100644 --- a/src/hotspot/cpu/s390/s390.ad +++ b/src/hotspot/cpu/s390/s390.ad @@ -10675,10 +10675,49 @@ instruct countTrailingZerosL(revenRegI dst, iRegL src, roddRegL tmp, flagsReg cr // bit count +instruct popCountI_Ext3(iRegI dst, iRegI src, flagsReg cr) %{ + match(Set dst (PopCountI src)); + effect(TEMP_DEF dst, KILL cr); + predicate(UsePopCountInstruction && + VM_Version::has_PopCount() && + VM_Version::has_MiscInstrExt3()); + ins_cost(DEFAULT_COST); + size(8); // popcnt + llgfr + format %{ "POPCNT $dst,$src\t # pop count int" %} + ins_encode %{ + Register Rdst = $dst$$Register; + Register Rsrc = $src$$Register; + + __ pop_count_int_with_ext3(Rdst, Rsrc); + + %} + ins_pipe(pipe_class_dummy); +%} + +instruct popCountL_Ext3(iRegI dst, iRegL src, flagsReg cr) %{ + match(Set dst (PopCountL src)); + effect(TEMP_DEF dst, KILL cr); + predicate(UsePopCountInstruction && + VM_Version::has_PopCount() && + VM_Version::has_MiscInstrExt3()); + ins_cost(DEFAULT_COST); + size(4); // popcnt + format %{ "POPCNT $dst,$src\t # pop count long" %} + ins_encode %{ + Register Rdst = $dst$$Register; + Register Rsrc = $src$$Register; + + __ pop_count_long_with_ext3(Rdst, Rsrc); + %} + ins_pipe(pipe_class_dummy); +%} + instruct popCountI(iRegI dst, iRegI src, iRegI tmp, flagsReg cr) %{ match(Set dst (PopCountI src)); effect(TEMP_DEF dst, TEMP tmp, KILL cr); - predicate(UsePopCountInstruction && VM_Version::has_PopCount()); + predicate(UsePopCountInstruction && + VM_Version::has_PopCount() && + (!VM_Version::has_MiscInstrExt3())); ins_cost(DEFAULT_COST); size(24); format %{ "POPCNT $dst,$src\t # pop count int" %} @@ -10687,17 +10726,8 @@ instruct popCountI(iRegI dst, iRegI src, iRegI tmp, flagsReg cr) %{ Register Rsrc = $src$$Register; Register Rtmp = 
$tmp$$Register; - // Prefer compile-time assertion over run-time SIGILL. - assert(VM_Version::has_PopCount(), "bad predicate for countLeadingZerosI"); - assert_different_registers(Rdst, Rtmp); + __ pop_count_int_without_ext3(Rdst, Rsrc, Rtmp); - // Version 2: shows 10%(z196) improvement over original. - __ z_popcnt(Rdst, Rsrc); - __ z_srlg(Rtmp, Rdst, 16); // calc byte4+byte6 and byte5+byte7 - __ z_alr(Rdst, Rtmp); // into byte6 and byte7 - __ z_srlg(Rtmp, Rdst, 8); // calc (byte4+byte6) + (byte5+byte7) - __ z_alr(Rdst, Rtmp); // into byte7 - __ z_llgcr(Rdst, Rdst); // zero-extend sum %} ins_pipe(pipe_class_dummy); %} @@ -10705,27 +10735,18 @@ instruct popCountI(iRegI dst, iRegI src, iRegI tmp, flagsReg cr) %{ instruct popCountL(iRegI dst, iRegL src, iRegL tmp, flagsReg cr) %{ match(Set dst (PopCountL src)); effect(TEMP_DEF dst, TEMP tmp, KILL cr); - predicate(UsePopCountInstruction && VM_Version::has_PopCount()); + predicate(UsePopCountInstruction && + VM_Version::has_PopCount() && + (!VM_Version::has_MiscInstrExt3())); ins_cost(DEFAULT_COST); - // TODO: s390 port size(FIXED_SIZE); + size(34); format %{ "POPCNT $dst,$src\t # pop count long" %} ins_encode %{ Register Rdst = $dst$$Register; Register Rsrc = $src$$Register; Register Rtmp = $tmp$$Register; - // Prefer compile-time assertion over run-time SIGILL. - assert(VM_Version::has_PopCount(), "bad predicate for countLeadingZerosI"); - assert_different_registers(Rdst, Rtmp); - - // Original version. Using LA instead of algr seems to be a really bad idea (-35%). 
- __ z_popcnt(Rdst, Rsrc); - __ z_ahhlr(Rdst, Rdst, Rdst); - __ z_sllg(Rtmp, Rdst, 16); - __ z_algr(Rdst, Rtmp); - __ z_sllg(Rtmp, Rdst, 8); - __ z_algr(Rdst, Rtmp); - __ z_srlg(Rdst, Rdst, 56); + __ pop_count_long_without_ext3(Rdst, Rsrc, Rtmp); %} ins_pipe(pipe_class_dummy); %} diff --git a/test/micro/org/openjdk/bench/vm/compiler/PopCount.java b/test/micro/org/openjdk/bench/vm/compiler/PopCount.java new file mode 100644 index 00000000000..cbf44023c1e --- /dev/null +++ b/test/micro/org/openjdk/bench/vm/compiler/PopCount.java @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024 IBM Corporation. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ + +package org.openjdk.bench.vm.compiler; + +import org.openjdk.jmh.annotations.*; + +import java.util.concurrent.TimeUnit; + +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.MILLISECONDS) +@State(Scope.Thread) +@Warmup(iterations = 10, time = 1) +@Measurement(iterations = 5, time = 1) +@Fork(value = 5) +public class PopCount { + int numTests = 100_000; + + @Benchmark + public long test() { + long l1 = 1, l2 = 2, l3 = 3, l4 = 4, l5 = 5, l6 = 6, l7 = 7, l8 = 8, l9 = 9, l10 = 10; + for (long i = 0; i < numTests; i++) { + l1 ^= Long.bitCount(l1) + i; + l2 ^= Long.bitCount(l2) + i; + l3 ^= Long.bitCount(l3) + i; + l4 ^= Long.bitCount(l4) + i; + l5 ^= Long.bitCount(l5) + i; + l6 ^= Long.bitCount(l6) + i; + l7 ^= Long.bitCount(l7) + i; + l8 ^= Long.bitCount(l8) + i; + l9 ^= Long.bitCount(l9) + i; + l10 ^= Long.bitCount(l10) + i; + } + return l1 + l2 + l3 + l4 + l5 + l6 + l7 + l8 + l9 + l10; + } + +}