8179527: Implement intrinsic code for reverseBytes with load/store

Reviewed-by: simonis, mdoerr
This commit is contained in:
Michihiro Horie 2017-06-02 16:32:39 +02:00 committed by Martin Doerr
parent f2b486212e
commit 3312fc717b
5 changed files with 224 additions and 5 deletions

View File

@ -376,10 +376,12 @@ class Assembler : public AbstractAssembler {
STWX_OPCODE = (31u << OPCODE_SHIFT | 151u << 1),
STWU_OPCODE = (37u << OPCODE_SHIFT),
STWUX_OPCODE = (31u << OPCODE_SHIFT | 183u << 1),
STWBRX_OPCODE = (31u << OPCODE_SHIFT | 662u << 1),
STH_OPCODE = (44u << OPCODE_SHIFT),
STHX_OPCODE = (31u << OPCODE_SHIFT | 407u << 1),
STHU_OPCODE = (45u << OPCODE_SHIFT),
STHBRX_OPCODE = (31u << OPCODE_SHIFT | 918u << 1),
STB_OPCODE = (38u << OPCODE_SHIFT),
STBX_OPCODE = (31u << OPCODE_SHIFT | 215u << 1),
@ -401,11 +403,13 @@ class Assembler : public AbstractAssembler {
LD_OPCODE = (58u << OPCODE_SHIFT | 0u << XO_30_31_SHIFT), // DS-FORM
LDU_OPCODE = (58u << OPCODE_SHIFT | 1u << XO_30_31_SHIFT), // DS-FORM
LDX_OPCODE = (31u << OPCODE_SHIFT | 21u << XO_21_30_SHIFT), // X-FORM
LDBRX_OPCODE = (31u << OPCODE_SHIFT | 532u << 1), // X-FORM
STD_OPCODE = (62u << OPCODE_SHIFT | 0u << XO_30_31_SHIFT), // DS-FORM
STDU_OPCODE = (62u << OPCODE_SHIFT | 1u << XO_30_31_SHIFT), // DS-FORM
STDUX_OPCODE = (31u << OPCODE_SHIFT | 181u << 1), // X-FORM
STDUX_OPCODE = (31u << OPCODE_SHIFT | 181u << 1), // X-FORM
STDX_OPCODE = (31u << OPCODE_SHIFT | 149u << XO_21_30_SHIFT), // X-FORM
STDBRX_OPCODE = (31u << OPCODE_SHIFT | 660u << 1), // X-FORM
RLDICR_OPCODE = (30u << OPCODE_SHIFT | 1u << XO_27_29_SHIFT), // MD-FORM
RLDICL_OPCODE = (30u << OPCODE_SHIFT | 0u << XO_27_29_SHIFT), // MD-FORM
@ -1552,6 +1556,9 @@ class Assembler : public AbstractAssembler {
inline void ld( Register d, int si16, Register s1);
inline void ldu( Register d, int si16, Register s1);
// 8 bytes reversed
inline void ldbrx( Register d, Register s1, Register s2);
// For convenience. Load pointer into d from b+s1.
inline void ld_ptr(Register d, int b, Register s1);
DEBUG_ONLY(inline void ld_ptr(Register d, ByteSize b, Register s1);)
@ -1560,10 +1567,12 @@ class Assembler : public AbstractAssembler {
inline void stwx( Register d, Register s1, Register s2);
inline void stw( Register d, int si16, Register s1);
inline void stwu( Register d, int si16, Register s1);
inline void stwbrx( Register d, Register s1, Register s2);
inline void sthx( Register d, Register s1, Register s2);
inline void sth( Register d, int si16, Register s1);
inline void sthu( Register d, int si16, Register s1);
inline void sthbrx( Register d, Register s1, Register s2);
inline void stbx( Register d, Register s1, Register s2);
inline void stb( Register d, int si16, Register s1);
@ -1573,6 +1582,7 @@ class Assembler : public AbstractAssembler {
inline void std( Register d, int si16, Register s1);
inline void stdu( Register d, int si16, Register s1);
inline void stdux(Register s, Register a, Register b);
inline void stdbrx( Register d, Register s1, Register s2);
inline void st_ptr(Register d, int si16, Register s1);
DEBUG_ONLY(inline void st_ptr(Register d, ByteSize b, Register s1);)
@ -2182,14 +2192,18 @@ class Assembler : public AbstractAssembler {
inline void lbz( Register d, int si16);
inline void ldx( Register d, Register s2);
inline void ld( Register d, int si16);
inline void ldbrx(Register d, Register s2);
inline void stwx( Register d, Register s2);
inline void stw( Register d, int si16);
inline void stwbrx( Register d, Register s2);
inline void sthx( Register d, Register s2);
inline void sth( Register d, int si16);
inline void sthbrx( Register d, Register s2);
inline void stbx( Register d, Register s2);
inline void stb( Register d, int si16);
inline void stdx( Register d, Register s2);
inline void std( Register d, int si16);
inline void stdbrx( Register d, Register s2);
// PPC 2, section 3.2.1 Instruction Cache Instructions
inline void icbi( Register s2);

View File

@ -327,6 +327,7 @@ inline void Assembler::lbzu( Register d, int si16, Register s1) { assert(d !=
inline void Assembler::ld( Register d, int si16, Register s1) { emit_int32(LD_OPCODE | rt(d) | ds(si16) | ra0mem(s1));}
inline void Assembler::ldx( Register d, Register s1, Register s2) { emit_int32(LDX_OPCODE | rt(d) | ra0mem(s1) | rb(s2));}
inline void Assembler::ldu( Register d, int si16, Register s1) { assert(d != s1, "according to ibm manual"); emit_int32(LDU_OPCODE | rt(d) | ds(si16) | rta0mem(s1));}
inline void Assembler::ldbrx( Register d, Register s1, Register s2) { emit_int32(LDBRX_OPCODE | rt(d) | ra0mem(s1) | rb(s2));}
inline void Assembler::ld_ptr(Register d, int b, Register s1) { ld(d, b, s1); }
DEBUG_ONLY(inline void Assembler::ld_ptr(Register d, ByteSize b, Register s1) { ld(d, in_bytes(b), s1); })
@ -335,10 +336,12 @@ DEBUG_ONLY(inline void Assembler::ld_ptr(Register d, ByteSize b, Register s1) {
inline void Assembler::stwx( Register d, Register s1, Register s2) { emit_int32(STWX_OPCODE | rs(d) | ra0mem(s1) | rb(s2));}
inline void Assembler::stw( Register d, int si16, Register s1) { emit_int32(STW_OPCODE | rs(d) | d1(si16) | ra0mem(s1));}
inline void Assembler::stwu( Register d, int si16, Register s1) { emit_int32(STWU_OPCODE | rs(d) | d1(si16) | rta0mem(s1));}
inline void Assembler::stwbrx( Register d, Register s1, Register s2) { emit_int32(STWBRX_OPCODE | rs(d) | ra0mem(s1) | rb(s2));}
inline void Assembler::sthx( Register d, Register s1, Register s2) { emit_int32(STHX_OPCODE | rs(d) | ra0mem(s1) | rb(s2));}
inline void Assembler::sth( Register d, int si16, Register s1) { emit_int32(STH_OPCODE | rs(d) | d1(si16) | ra0mem(s1));}
inline void Assembler::sthu( Register d, int si16, Register s1) { emit_int32(STHU_OPCODE | rs(d) | d1(si16) | rta0mem(s1));}
inline void Assembler::sthbrx( Register d, Register s1, Register s2) { emit_int32(STHBRX_OPCODE | rs(d) | ra0mem(s1) | rb(s2));}
inline void Assembler::stbx( Register d, Register s1, Register s2) { emit_int32(STBX_OPCODE | rs(d) | ra0mem(s1) | rb(s2));}
inline void Assembler::stb( Register d, int si16, Register s1) { emit_int32(STB_OPCODE | rs(d) | d1(si16) | ra0mem(s1));}
@ -348,6 +351,7 @@ inline void Assembler::std( Register d, int si16, Register s1) { emit_int32(
inline void Assembler::stdx( Register d, Register s1, Register s2) { emit_int32(STDX_OPCODE | rs(d) | ra0mem(s1) | rb(s2));}
inline void Assembler::stdu( Register d, int si16, Register s1) { emit_int32(STDU_OPCODE | rs(d) | ds(si16) | rta0mem(s1));}
inline void Assembler::stdux(Register s, Register a, Register b) { emit_int32(STDUX_OPCODE| rs(s) | rta0mem(a) | rb(b));}
inline void Assembler::stdbrx( Register d, Register s1, Register s2) { emit_int32(STDBRX_OPCODE | rs(d) | ra0mem(s1) | rb(s2));}
inline void Assembler::st_ptr(Register d, int b, Register s1) { std(d, b, s1); }
DEBUG_ONLY(inline void Assembler::st_ptr(Register d, ByteSize b, Register s1) { std(d, in_bytes(b), s1); })
@ -944,14 +948,18 @@ inline void Assembler::lbzx( Register d, Register s2) { emit_int32( LBZX_OPCODE
inline void Assembler::lbz( Register d, int si16 ) { emit_int32( LBZ_OPCODE | rt(d) | d1(si16));}
inline void Assembler::ld( Register d, int si16 ) { emit_int32( LD_OPCODE | rt(d) | ds(si16));}
inline void Assembler::ldx( Register d, Register s2) { emit_int32( LDX_OPCODE | rt(d) | rb(s2));}
inline void Assembler::ldbrx(Register d, Register s2) { emit_int32( LDBRX_OPCODE| rt(d) | rb(s2));}
inline void Assembler::stwx( Register d, Register s2) { emit_int32( STWX_OPCODE | rs(d) | rb(s2));}
inline void Assembler::stw( Register d, int si16 ) { emit_int32( STW_OPCODE | rs(d) | d1(si16));}
inline void Assembler::stwbrx(Register d, Register s2){ emit_int32(STWBRX_OPCODE| rs(d) | rb(s2));}
inline void Assembler::sthx( Register d, Register s2) { emit_int32( STHX_OPCODE | rs(d) | rb(s2));}
inline void Assembler::sth( Register d, int si16 ) { emit_int32( STH_OPCODE | rs(d) | d1(si16));}
inline void Assembler::sthbrx(Register d, Register s2){ emit_int32(STHBRX_OPCODE| rs(d) | rb(s2));}
inline void Assembler::stbx( Register d, Register s2) { emit_int32( STBX_OPCODE | rs(d) | rb(s2));}
inline void Assembler::stb( Register d, int si16 ) { emit_int32( STB_OPCODE | rs(d) | d1(si16));}
inline void Assembler::std( Register d, int si16 ) { emit_int32( STD_OPCODE | rs(d) | ds(si16));}
inline void Assembler::stdx( Register d, Register s2) { emit_int32( STDX_OPCODE | rs(d) | rb(s2));}
inline void Assembler::stdbrx(Register d, Register s2){ emit_int32(STDBRX_OPCODE| rs(d) | rb(s2));}
// ra0 version
inline void Assembler::icbi( Register s2) { emit_int32( ICBI_OPCODE | rb(s2) ); }

View File

@ -5842,6 +5842,16 @@ instruct loadConN_lo(iRegNdst dst, iRegNsrc src1, immN src2) %{
ins_pipe(pipe_class_default);
%}
instruct rldicl(iRegLdst dst, iRegLsrc src, immI16 shift, immI16 mask_begin) %{
effect(DEF dst, USE src, USE shift, USE mask_begin);
size(4);
ins_encode %{
__ rldicl($dst$$Register, $src$$Register, $shift$$constant, $mask_begin$$constant);
%}
ins_pipe(pipe_class_default);
%}
// Needed to postalloc expand loadConN: ConN is loaded as ConI
// leaving the upper 32 bits with sign-extension bits.
// This clears these bits: dst = src & 0xFFFFFFFF.
@ -10519,6 +10529,16 @@ instruct convB2I_reg(iRegIdst dst, iRegIsrc src, immI_24 amount) %{
ins_pipe(pipe_class_default);
%}
instruct extsh(iRegIdst dst, iRegIsrc src) %{
effect(DEF dst, USE src);
size(4);
ins_encode %{
__ extsh($dst$$Register, $src$$Register);
%}
ins_pipe(pipe_class_default);
%}
// LShiftI 16 + RShiftI 16 converts short to int.
instruct convS2I_reg(iRegIdst dst, iRegIsrc src, immI_16 amount) %{
match(Set dst (RShiftI (LShiftI src amount) amount));
@ -12682,8 +12702,7 @@ instruct insrwi(iRegIdst dst, iRegIsrc src, immI16 pos, immI16 shift) %{
// Just slightly faster than java implementation.
instruct bytes_reverse_int_Ex(iRegIdst dst, iRegIsrc src) %{
match(Set dst (ReverseBytesI src));
predicate(UseCountLeadingZerosInstructionsPPC64);
ins_cost(DEFAULT_COST);
ins_cost(7*DEFAULT_COST);
expand %{
immI16 imm24 %{ (int) 24 %}
@ -12705,6 +12724,172 @@ instruct bytes_reverse_int_Ex(iRegIdst dst, iRegIsrc src) %{
%}
%}
instruct bytes_reverse_long_Ex(iRegLdst dst, iRegLsrc src) %{
match(Set dst (ReverseBytesL src));
ins_cost(15*DEFAULT_COST);
expand %{
immI16 imm56 %{ (int) 56 %}
immI16 imm48 %{ (int) 48 %}
immI16 imm40 %{ (int) 40 %}
immI16 imm32 %{ (int) 32 %}
immI16 imm24 %{ (int) 24 %}
immI16 imm16 %{ (int) 16 %}
immI16 imm8 %{ (int) 8 %}
immI16 imm0 %{ (int) 0 %}
iRegLdst tmpL1;
iRegLdst tmpL2;
iRegLdst tmpL3;
iRegLdst tmpL4;
iRegLdst tmpL5;
iRegLdst tmpL6;
// src : |a|b|c|d|e|f|g|h|
rldicl(tmpL1, src, imm8, imm24); // tmpL1 : | | | |e|f|g|h|a|
rldicl(tmpL2, tmpL1, imm32, imm24); // tmpL2 : | | | |a| | | |e|
rldicl(tmpL3, tmpL2, imm32, imm0); // tmpL3 : | | | |e| | | |a|
rldicl(tmpL1, src, imm16, imm24); // tmpL1 : | | | |f|g|h|a|b|
rldicl(tmpL2, tmpL1, imm32, imm24); // tmpL2 : | | | |b| | | |f|
rldicl(tmpL4, tmpL2, imm40, imm0); // tmpL4 : | | |f| | | |b| |
orL_reg_reg(tmpL5, tmpL3, tmpL4); // tmpL5 : | | |f|e| | |b|a|
rldicl(tmpL1, src, imm24, imm24); // tmpL1 : | | | |g|h|a|b|c|
rldicl(tmpL2, tmpL1, imm32, imm24); // tmpL2 : | | | |c| | | |g|
rldicl(tmpL3, tmpL2, imm48, imm0); // tmpL3 : | |g| | | |c| | |
rldicl(tmpL1, src, imm32, imm24); // tmpL1 : | | | |h|a|b|c|d|
rldicl(tmpL2, tmpL1, imm32, imm24); // tmpL2 : | | | |d| | | |h|
rldicl(tmpL4, tmpL2, imm56, imm0); // tmpL4 : |h| | | |d| | | |
orL_reg_reg(tmpL6, tmpL3, tmpL4); // tmpL6 : |h|g| | |d|c| | |
orL_reg_reg(dst, tmpL5, tmpL6); // dst : |h|g|f|e|d|c|b|a|
%}
%}
instruct bytes_reverse_ushort_Ex(iRegIdst dst, iRegIsrc src) %{
match(Set dst (ReverseBytesUS src));
ins_cost(2*DEFAULT_COST);
expand %{
immI16 imm16 %{ (int) 16 %}
immI16 imm8 %{ (int) 8 %}
urShiftI_reg_imm(dst, src, imm8);
insrwi(dst, src, imm16, imm8);
%}
%}
instruct bytes_reverse_short_Ex(iRegIdst dst, iRegIsrc src) %{
match(Set dst (ReverseBytesS src));
ins_cost(3*DEFAULT_COST);
expand %{
immI16 imm16 %{ (int) 16 %}
immI16 imm8 %{ (int) 8 %}
iRegLdst tmpI1;
urShiftI_reg_imm(tmpI1, src, imm8);
insrwi(tmpI1, src, imm16, imm8);
extsh(dst, tmpI1);
%}
%}
// Load Integer reversed byte order
instruct loadI_reversed(iRegIdst dst, indirect mem) %{
match(Set dst (ReverseBytesI (LoadI mem)));
ins_cost(MEMORY_REF_COST);
size(4);
ins_encode %{
__ lwbrx($dst$$Register, $mem$$Register);
%}
ins_pipe(pipe_class_default);
%}
// Load Long - aligned and reversed
instruct loadL_reversed(iRegLdst dst, indirect mem) %{
match(Set dst (ReverseBytesL (LoadL mem)));
predicate(VM_Version::has_ldbrx());
ins_cost(MEMORY_REF_COST);
size(4);
ins_encode %{
__ ldbrx($dst$$Register, $mem$$Register);
%}
ins_pipe(pipe_class_default);
%}
// Load unsigned short / char reversed byte order
instruct loadUS_reversed(iRegIdst dst, indirect mem) %{
match(Set dst (ReverseBytesUS (LoadUS mem)));
ins_cost(MEMORY_REF_COST);
size(4);
ins_encode %{
__ lhbrx($dst$$Register, $mem$$Register);
%}
ins_pipe(pipe_class_default);
%}
// Load short reversed byte order
instruct loadS_reversed(iRegIdst dst, indirect mem) %{
match(Set dst (ReverseBytesS (LoadS mem)));
ins_cost(MEMORY_REF_COST + DEFAULT_COST);
size(8);
ins_encode %{
__ lhbrx($dst$$Register, $mem$$Register);
__ extsh($dst$$Register, $dst$$Register);
%}
ins_pipe(pipe_class_default);
%}
// Store Integer reversed byte order
instruct storeI_reversed(iRegIsrc src, indirect mem) %{
match(Set mem (StoreI mem (ReverseBytesI src)));
ins_cost(MEMORY_REF_COST);
size(4);
ins_encode %{
__ stwbrx($src$$Register, $mem$$Register);
%}
ins_pipe(pipe_class_default);
%}
// Store Long reversed byte order
instruct storeL_reversed(iRegLsrc src, indirect mem) %{
match(Set mem (StoreL mem (ReverseBytesL src)));
predicate(VM_Version::has_stdbrx());
ins_cost(MEMORY_REF_COST);
size(4);
ins_encode %{
__ stdbrx($src$$Register, $mem$$Register);
%}
ins_pipe(pipe_class_default);
%}
// Store unsigned short / char reversed byte order
instruct storeUS_reversed(iRegIsrc src, indirect mem) %{
match(Set mem (StoreC mem (ReverseBytesUS src)));
ins_cost(MEMORY_REF_COST);
size(4);
ins_encode %{
__ sthbrx($src$$Register, $mem$$Register);
%}
ins_pipe(pipe_class_default);
%}
// Store short reversed byte order
instruct storeS_reversed(iRegIsrc src, indirect mem) %{
match(Set mem (StoreC mem (ReverseBytesS src)));
ins_cost(MEMORY_REF_COST);
size(4);
ins_encode %{
__ sthbrx($src$$Register, $mem$$Register);
%}
ins_pipe(pipe_class_default);
%}
//---------- Replicate Vector Instructions ------------------------------------
// Insrdi does replicate if src == dst.

View File

@ -111,7 +111,7 @@ void VM_Version::initialize() {
// Create and print feature-string.
char buf[(num_features+1) * 16]; // Max 16 chars per feature.
jio_snprintf(buf, sizeof(buf),
"ppc64%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
"ppc64%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
(has_fsqrt() ? " fsqrt" : ""),
(has_isel() ? " isel" : ""),
(has_lxarxeh() ? " lxarxeh" : ""),
@ -126,7 +126,9 @@ void VM_Version::initialize() {
(has_vpmsumb() ? " vpmsumb" : ""),
(has_tcheck() ? " tcheck" : ""),
(has_mfdscr() ? " mfdscr" : ""),
(has_vsx() ? " vsx" : "")
(has_vsx() ? " vsx" : ""),
(has_ldbrx() ? " ldbrx" : ""),
(has_stdbrx() ? " stdbrx" : "")
// Make sure number of %s matches num_features!
);
_features_string = os::strdup(buf);
@ -663,6 +665,8 @@ void VM_Version::determine_features() {
a->tcheck(0); // code[12] -> tcheck
a->mfdscr(R0); // code[13] -> mfdscr
a->lxvd2x(VSR0, R3_ARG1); // code[14] -> vsx
a->ldbrx(R7, R3_ARG1, R4_ARG2); // code[15] -> ldbrx
a->stdbrx(R7, R3_ARG1, R4_ARG2); // code[16] -> stdbrx
a->blr();
// Emit function to set one cache line to zero. Emit function descriptor and get pointer to it.
@ -712,6 +716,8 @@ void VM_Version::determine_features() {
if (code[feature_cntr++]) features |= tcheck_m;
if (code[feature_cntr++]) features |= mfdscr_m;
if (code[feature_cntr++]) features |= vsx_m;
if (code[feature_cntr++]) features |= ldbrx_m;
if (code[feature_cntr++]) features |= stdbrx_m;
// Print the detection code.
if (PrintAssembly) {

View File

@ -47,6 +47,8 @@ protected:
tcheck,
mfdscr,
vsx,
ldbrx,
stdbrx,
num_features // last entry to count features
};
enum Feature_Flag_Set {
@ -66,6 +68,8 @@ protected:
tcheck_m = (1 << tcheck ),
mfdscr_m = (1 << mfdscr ),
vsx_m = (1 << vsx ),
ldbrx_m = (1 << ldbrx ),
stdbrx_m = (1 << stdbrx ),
all_features_m = (unsigned long)-1
};
@ -100,6 +104,8 @@ public:
static bool has_tcheck() { return (_features & tcheck_m) != 0; }
static bool has_mfdscr() { return (_features & mfdscr_m) != 0; }
static bool has_vsx() { return (_features & vsx_m) != 0; }
static bool has_ldbrx() { return (_features & ldbrx_m) != 0; }
static bool has_stdbrx() { return (_features & stdbrx_m) != 0; }
// Assembler testing
static void allow_all();