8189793: [s390]: Improve String compress/inflate by exploiting vector instructions

Reviewed-by: mdoerr, goetz
This commit is contained in:
Lutz Schmidt 2017-11-22 17:10:38 +01:00
parent c8bce04078
commit d5adf1df92
5 changed files with 788 additions and 128 deletions

@ -582,7 +582,11 @@ class Assembler : public AbstractAssembler {
#define LOC_ZOPC (unsigned long)(0xebL << 40 | 0xf2L) // z196
#define LOCG_ZOPC (unsigned long)(0xebL << 40 | 0xe2L) // z196
#define LMG_ZOPC (unsigned long)(235L << 40 | 4L)
// LOAD multiple registers at once
#define LM_ZOPC (unsigned int)(0x98 << 24)
#define LMY_ZOPC (unsigned long)(0xebL << 40 | 0x98L)
#define LMG_ZOPC (unsigned long)(0xebL << 40 | 0x04L)
#define LE_ZOPC (unsigned int)(0x78 << 24)
#define LEY_ZOPC (unsigned long)(237L << 40 | 100L)
@ -613,7 +617,10 @@ class Assembler : public AbstractAssembler {
#define STOC_ZOPC (unsigned long)(0xebL << 40 | 0xf3L) // z196
#define STOCG_ZOPC (unsigned long)(0xebL << 40 | 0xe3L) // z196
#define STMG_ZOPC (unsigned long)(235L << 40 | 36L)
// STORE multiple registers at once
#define STM_ZOPC (unsigned int)(0x90 << 24)
#define STMY_ZOPC (unsigned long)(0xebL << 40 | 0x90L)
#define STMG_ZOPC (unsigned long)(0xebL << 40 | 0x24L)
#define STE_ZOPC (unsigned int)(0x70 << 24)
#define STEY_ZOPC (unsigned long)(237L << 40 | 102L)
@ -874,15 +881,19 @@ class Assembler : public AbstractAssembler {
// Shift
// arithmetic
#define SLA_ZOPC (unsigned int)(139 << 24)
#define SLAG_ZOPC (unsigned long)(235L << 40 | 11L)
#define SRA_ZOPC (unsigned int)(138 << 24)
#define SRAG_ZOPC (unsigned long)(235L << 40 | 10L)
#define SLA_ZOPC (unsigned int)(0x8b << 24)
#define SLAK_ZOPC (unsigned long)(0xebL << 40 | 0xddL)
#define SLAG_ZOPC (unsigned long)(0xebL << 40 | 0x0bL)
#define SRA_ZOPC (unsigned int)(0x8a << 24)
#define SRAK_ZOPC (unsigned long)(0xebL << 40 | 0xdcL)
#define SRAG_ZOPC (unsigned long)(0xebL << 40 | 0x0aL)
// logical
#define SLL_ZOPC (unsigned int)(137 << 24)
#define SLLG_ZOPC (unsigned long)(235L << 40 | 13L)
#define SRL_ZOPC (unsigned int)(136 << 24)
#define SRLG_ZOPC (unsigned long)(235L << 40 | 12L)
#define SLL_ZOPC (unsigned int)(0x89 << 24)
#define SLLK_ZOPC (unsigned long)(0xebL << 40 | 0xdfL)
#define SLLG_ZOPC (unsigned long)(0xebL << 40 | 0x0dL)
#define SRL_ZOPC (unsigned int)(0x88 << 24)
#define SRLK_ZOPC (unsigned long)(0xebL << 40 | 0xdeL)
#define SRLG_ZOPC (unsigned long)(0xebL << 40 | 0x0cL)
// Rotate, then AND/XOR/OR/insert
// rotate
@ -2262,12 +2273,16 @@ class Assembler : public AbstractAssembler {
// shift
inline void z_sla( Register r1, int64_t d2, Register b2=Z_R0); // shift left r1 = r1 << ((d2+b2)&0x3f) ; int32, only 31 bits shifted, sign preserved!
inline void z_slak(Register r1, Register r3, int64_t d2, Register b2=Z_R0); // shift left r1 = r3 << ((d2+b2)&0x3f) ; int32, only 31 bits shifted, sign preserved!
inline void z_slag(Register r1, Register r3, int64_t d2, Register b2=Z_R0); // shift left r1 = r3 << ((d2+b2)&0x3f) ; int64, only 63 bits shifted, sign preserved!
inline void z_sra( Register r1, int64_t d2, Register b2=Z_R0); // shift right r1 = r1 >> ((d2+b2)&0x3f) ; int32, sign extended
inline void z_srak(Register r1, Register r3, int64_t d2, Register b2=Z_R0); // shift right r1 = r3 >> ((d2+b2)&0x3f) ; int32, sign extended
inline void z_srag(Register r1, Register r3, int64_t d2, Register b2=Z_R0); // shift right r1 = r3 >> ((d2+b2)&0x3f) ; int64, sign extended
inline void z_sll( Register r1, int64_t d2, Register b2=Z_R0); // shift left r1 = r1 << ((d2+b2)&0x3f) ; int32, zeros added
inline void z_sllk(Register r1, Register r3, int64_t d2, Register b2=Z_R0); // shift left r1 = r3 << ((d2+b2)&0x3f) ; int32, zeros added
inline void z_sllg(Register r1, Register r3, int64_t d2, Register b2=Z_R0); // shift left r1 = r3 << ((d2+b2)&0x3f) ; int64, zeros added
inline void z_srl( Register r1, int64_t d2, Register b2=Z_R0); // shift right r1 = r1 >> ((d2+b2)&0x3f) ; int32, zero extended
inline void z_srlk(Register r1, Register r3, int64_t d2, Register b2=Z_R0); // shift right r1 = r3 >> ((d2+b2)&0x3f) ; int32, zero extended
inline void z_srlg(Register r1, Register r3, int64_t d2, Register b2=Z_R0); // shift right r1 = r3 >> ((d2+b2)&0x3f) ; int64, zero extended
// rotate
@ -3035,7 +3050,11 @@ class Assembler : public AbstractAssembler {
inline void z_tam();
inline void z_stckf(int64_t d2, Register b2);
inline void z_stm( Register r1, Register r3, int64_t d2, Register b2);
inline void z_stmy(Register r1, Register r3, int64_t d2, Register b2);
inline void z_stmg(Register r1, Register r3, int64_t d2, Register b2);
inline void z_lm( Register r1, Register r3, int64_t d2, Register b2);
inline void z_lmy(Register r1, Register r3, int64_t d2, Register b2);
inline void z_lmg(Register r1, Register r3, int64_t d2, Register b2);
inline void z_cs( Register r1, Register r3, int64_t d2, Register b2);

@ -334,12 +334,16 @@ inline void Assembler::z_stfle(int64_t d2, Register b2) { emit_32(STFLE_ZOPC | u
// SHIFT/ROTATE OPERATIONS
//-----------------------------------
inline void Assembler::z_sla( Register r1, int64_t d2, Register b2) { emit_32( SLA_ZOPC | regt(r1, 8, 32) | uimm12(d2, 20, 32) | reg(b2, 16, 32)); }
inline void Assembler::z_slak(Register r1, Register r3, int64_t d2, Register b2) { emit_48( SLAK_ZOPC | regt(r1, 8, 48) | simm20(d2) | reg(b2, 16, 48) | reg(r3, 12, 48)); }
inline void Assembler::z_slag(Register r1, Register r3, int64_t d2, Register b2) { emit_48( SLAG_ZOPC | regt(r1, 8, 48) | simm20(d2) | reg(b2, 16, 48) | reg(r3, 12, 48)); }
inline void Assembler::z_sra( Register r1, int64_t d2, Register b2) { emit_32( SRA_ZOPC | regt(r1, 8, 32) | uimm12(d2, 20, 32) | reg(b2, 16, 32)); }
inline void Assembler::z_srak(Register r1, Register r3, int64_t d2, Register b2) { emit_48( SRAK_ZOPC | regt(r1, 8, 48) | simm20(d2) | reg(b2, 16, 48) | reg(r3, 12, 48)); }
inline void Assembler::z_srag(Register r1, Register r3, int64_t d2, Register b2) { emit_48( SRAG_ZOPC | regt(r1, 8, 48) | simm20(d2) | reg(b2, 16, 48) | reg(r3, 12, 48)); }
inline void Assembler::z_sll( Register r1, int64_t d2, Register b2) { emit_32( SLL_ZOPC | regt(r1, 8, 32) | uimm12(d2, 20, 32) | reg(b2, 16, 32)); }
inline void Assembler::z_sllk(Register r1, Register r3, int64_t d2, Register b2) { emit_48( SLLK_ZOPC | regt(r1, 8, 48) | simm20(d2) | reg(b2, 16, 48) | reg(r3, 12, 48)); }
inline void Assembler::z_sllg(Register r1, Register r3, int64_t d2, Register b2) { emit_48( SLLG_ZOPC | regt(r1, 8, 48) | simm20(d2) | reg(b2, 16, 48) | reg(r3, 12, 48)); }
inline void Assembler::z_srl( Register r1, int64_t d2, Register b2) { emit_32( SRL_ZOPC | regt(r1, 8, 32) | uimm12(d2, 20, 32) | reg(b2, 16, 32)); }
inline void Assembler::z_srlk(Register r1, Register r3, int64_t d2, Register b2) { emit_48( SRLK_ZOPC | regt(r1, 8, 48) | simm20(d2) | reg(b2, 16, 48) | reg(r3, 12, 48)); }
inline void Assembler::z_srlg(Register r1, Register r3, int64_t d2, Register b2) { emit_48( SRLG_ZOPC | regt(r1, 8, 48) | simm20(d2) | reg(b2, 16, 48) | reg(r3, 12, 48)); }
// rotate left
@ -690,10 +694,14 @@ inline void Assembler::z_ahhlr(Register r1, Register r2, Register r3) { emit_32(
inline void Assembler::z_tam() { emit_16( TAM_ZOPC); }
inline void Assembler::z_stckf(int64_t d2, Register b2) { emit_32( STCKF_ZOPC | uimm12(d2, 20, 32) | regz(b2, 16, 32)); }
inline void Assembler::z_stmg(Register r1, Register r3, int64_t d2, Register b2) { emit_48( STMG_ZOPC | simm20(d2) | reg(r1, 8, 48) | reg(r3,12,48)| reg(b2,16,48) ); }
inline void Assembler::z_lmg(Register r1, Register r3, int64_t d2, Register b2) { emit_48( LMG_ZOPC | simm20(d2) | reg(r1, 8, 48) | reg(r3,12,48)| reg(b2,16,48) ); }
inline void Assembler::z_stm( Register r1, Register r3, int64_t d2, Register b2) { emit_32( STM_ZOPC | reg(r1, 8, 32) | reg(r3,12,32)| reg(b2,16,32) | uimm12(d2, 20,32)); }
inline void Assembler::z_stmy(Register r1, Register r3, int64_t d2, Register b2) { emit_48( STMY_ZOPC | reg(r1, 8, 48) | reg(r3,12,48)| reg(b2,16,48) | simm20(d2) ); }
inline void Assembler::z_stmg(Register r1, Register r3, int64_t d2, Register b2) { emit_48( STMG_ZOPC | reg(r1, 8, 48) | reg(r3,12,48)| reg(b2,16,48) | simm20(d2) ); }
inline void Assembler::z_lm( Register r1, Register r3, int64_t d2, Register b2) { emit_32( LM_ZOPC | reg(r1, 8, 32) | reg(r3,12,32)| reg(b2,16,32) | uimm12(d2, 20,32)); }
inline void Assembler::z_lmy( Register r1, Register r3, int64_t d2, Register b2) { emit_48( LMY_ZOPC | reg(r1, 8, 48) | reg(r3,12,48)| reg(b2,16,48) | simm20(d2) ); }
inline void Assembler::z_lmg( Register r1, Register r3, int64_t d2, Register b2) { emit_48( LMG_ZOPC | reg(r1, 8, 48) | reg(r3,12,48)| reg(b2,16,48) | simm20(d2) ); }
inline void Assembler::z_cs(Register r1, Register r3, int64_t d2, Register b2) { emit_32( CS_ZOPC | regt(r1, 8, 32) | reg(r3, 12, 32) | reg(b2, 16, 32) | uimm12(d2, 20, 32)); }
inline void Assembler::z_cs( Register r1, Register r3, int64_t d2, Register b2) { emit_32( CS_ZOPC | regt(r1, 8, 32) | reg(r3, 12, 32) | reg(b2, 16, 32) | uimm12(d2, 20, 32)); }
inline void Assembler::z_csy(Register r1, Register r3, int64_t d2, Register b2) { emit_48( CSY_ZOPC | regt(r1, 8, 48) | reg(r3, 12, 48) | reg(b2, 16, 48) | simm20(d2)); }
inline void Assembler::z_csg(Register r1, Register r3, int64_t d2, Register b2) { emit_48( CSG_ZOPC | regt(r1, 8, 48) | reg(r3, 12, 48) | reg(b2, 16, 48) | simm20(d2)); }
inline void Assembler::z_cs( Register r1, Register r3, const Address& a) { assert(!a.has_index(), "Cannot encode index"); z_cs( r1, r3, a.disp(), a.baseOrR0()); }

@ -936,7 +936,7 @@ void MacroAssembler::load_long_pcrelative(Register Rdst, address dataLocation) {
// Some extra safety net.
if (!RelAddr::is_in_range_of_RelAddr32(total_distance)) {
guarantee(RelAddr::is_in_range_of_RelAddr32(total_distance), "too far away");
guarantee(RelAddr::is_in_range_of_RelAddr32(total_distance), "load_long_pcrelative can't handle distance " INTPTR_FORMAT, total_distance);
}
(this)->relocate(rspec, relocInfo::pcrel_addr_format);
@ -956,7 +956,7 @@ void MacroAssembler::load_addr_pcrelative(Register Rdst, address addrLocation) {
// Some extra safety net.
if (!RelAddr::is_in_range_of_RelAddr32(total_distance)) {
guarantee(RelAddr::is_in_range_of_RelAddr32(total_distance), "too far away");
guarantee(RelAddr::is_in_range_of_RelAddr32(total_distance), "load_long_pcrelative can't handle distance " INTPTR_FORMAT, total_distance);
}
(this)->relocate(rspec, relocInfo::pcrel_addr_format);
@ -1025,6 +1025,13 @@ void MacroAssembler::testbit(Register r, unsigned int bitPos) {
}
}
void MacroAssembler::prefetch_read(Address a) {
z_pfd(1, a.disp20(), a.indexOrR0(), a.base());
}
void MacroAssembler::prefetch_update(Address a) {
z_pfd(2, a.disp20(), a.indexOrR0(), a.base());
}
// Clear a register, i.e. load const zero into reg.
// Return len (in bytes) of generated instruction(s).
// whole_reg: Clear 64 bits if true, 32 bits otherwise.
@ -4896,77 +4903,295 @@ unsigned int MacroAssembler::CopyRawMemory_AlignedDisjoint(Register src_reg, Reg
// Intrinsics for CompactStrings
// Compress char[] to byte[]. odd_reg contains cnt. Kills dst. Early clobber: result
// Compress char[] to byte[].
// Restores: src, dst
// Uses: cnt
// Kills: tmp, Z_R0, Z_R1.
// Early clobber: result.
// Note:
// cnt is signed int. Do not rely on high word!
// counts # characters, not bytes.
// The result is the number of characters copied before the first incompatible character was found.
// If tmp2 is provided and the compression fails, the compression stops exactly at this point and the result is precise.
// If precise is true, the processing stops exactly at this point. Otherwise, the result may be off
// by a few bytes. The result always indicates the number of copied characters.
//
// Note: Does not behave exactly like package private StringUTF16 compress java implementation in case of failure:
// - Different number of characters may have been written to dead array (if tmp2 not provided).
// - Different number of characters may have been written to dead array (if precise is false).
// - Returns a number <cnt instead of 0. (Result gets compared with cnt.)
unsigned int MacroAssembler::string_compress(Register result, Register src, Register dst, Register odd_reg,
Register even_reg, Register tmp, Register tmp2) {
int block_start = offset();
Label Lloop1, Lloop2, Lslow, Ldone;
const Register addr2 = dst, ind1 = result, mask = tmp;
const bool precise = (tmp2 != noreg);
unsigned int MacroAssembler::string_compress(Register result, Register src, Register dst, Register cnt,
Register tmp, bool precise) {
assert_different_registers(Z_R0, Z_R1, src, dst, cnt, tmp);
BLOCK_COMMENT("string_compress {");
z_sll(odd_reg, 1); // Number of bytes to read. (Must be a positive simm32.)
clear_reg(ind1); // Index to read.
z_llilf(mask, 0xFF00FF00);
z_ahi(odd_reg, -16); // Last possible index for fast loop.
z_brl(Lslow);
// ind1: index, even_reg: index increment, odd_reg: index limit
z_iihf(mask, 0xFF00FF00);
z_lhi(even_reg, 16);
bind(Lloop1); // 8 Characters per iteration.
z_lg(Z_R0, Address(src, ind1));
z_lg(Z_R1, Address(src, ind1, 8));
if (precise) {
BLOCK_COMMENT("encode_iso_array {");
} else {
BLOCK_COMMENT("string_compress {");
}
int block_start = offset();
Register Rsrc = src;
Register Rdst = dst;
Register Rix = tmp;
Register Rcnt = cnt;
Register Rmask = result; // holds incompatibility check mask until result value is stored.
Label ScalarShortcut, AllDone;
z_iilf(Rmask, 0xFF00FF00);
z_iihf(Rmask, 0xFF00FF00);
#if 0 // Sacrifice shortcuts for code compactness
{
//---< shortcuts for short strings (very frequent) >---
// Strings with 4 and 8 characters were found to occur very frequently.
// Therefore, we handle them right away with minimal overhead.
Label skipShortcut, skip4Shortcut, skip8Shortcut;
Register Rout = Z_R0;
z_chi(Rcnt, 4);
z_brne(skip4Shortcut); // 4 characters are very frequent
z_lg(Z_R0, 0, Rsrc); // Treat exactly 4 characters specially.
if (VM_Version::has_DistinctOpnds()) {
Rout = Z_R0;
z_ngrk(Rix, Z_R0, Rmask);
} else {
Rout = Rix;
z_lgr(Rix, Z_R0);
z_ngr(Z_R0, Rmask);
}
z_brnz(skipShortcut);
z_stcmh(Rout, 5, 0, Rdst);
z_stcm(Rout, 5, 2, Rdst);
z_lgfr(result, Rcnt);
z_bru(AllDone);
bind(skip4Shortcut);
z_chi(Rcnt, 8);
z_brne(skip8Shortcut); // There's more to do...
z_lmg(Z_R0, Z_R1, 0, Rsrc); // Treat exactly 8 characters specially.
if (VM_Version::has_DistinctOpnds()) {
Rout = Z_R0;
z_ogrk(Rix, Z_R0, Z_R1);
z_ngr(Rix, Rmask);
} else {
Rout = Rix;
z_lgr(Rix, Z_R0);
z_ogr(Z_R0, Z_R1);
z_ngr(Z_R0, Rmask);
}
z_brnz(skipShortcut);
z_stcmh(Rout, 5, 0, Rdst);
z_stcm(Rout, 5, 2, Rdst);
z_stcmh(Z_R1, 5, 4, Rdst);
z_stcm(Z_R1, 5, 6, Rdst);
z_lgfr(result, Rcnt);
z_bru(AllDone);
bind(skip8Shortcut);
clear_reg(Z_R0, true, false); // #characters already processed (none). Precond for scalar loop.
z_brl(ScalarShortcut); // Just a few characters
bind(skipShortcut);
}
#endif
clear_reg(Z_R0); // make sure register is properly initialized.
if (VM_Version::has_VectorFacility()) {
const int min_vcnt = 32; // Minimum #characters required to use vector instructions.
// Otherwise just do nothing in vector mode.
// Must be multiple of 2*(vector register length in chars (8 HW = 128 bits)).
const int log_min_vcnt = exact_log2(min_vcnt);
Label VectorLoop, VectorDone, VectorBreak;
VectorRegister Vtmp1 = Z_V16;
VectorRegister Vtmp2 = Z_V17;
VectorRegister Vmask = Z_V18;
VectorRegister Vzero = Z_V19;
VectorRegister Vsrc_first = Z_V20;
VectorRegister Vsrc_last = Z_V23;
assert((Vsrc_last->encoding() - Vsrc_first->encoding() + 1) == min_vcnt/8, "logic error");
assert(VM_Version::has_DistinctOpnds(), "Assumption when has_VectorFacility()");
z_srak(Rix, Rcnt, log_min_vcnt); // # vector loop iterations
z_brz(VectorDone); // not enough data for vector loop
z_vzero(Vzero); // all zeroes
z_vgmh(Vmask, 0, 7); // generate 0xff00 mask for all 2-byte elements
z_sllg(Z_R0, Rix, log_min_vcnt); // remember #chars that will be processed by vector loop
bind(VectorLoop);
z_vlm(Vsrc_first, Vsrc_last, 0, Rsrc);
add2reg(Rsrc, min_vcnt*2);
//---< check for incompatible character >---
z_vo(Vtmp1, Z_V20, Z_V21);
z_vo(Vtmp2, Z_V22, Z_V23);
z_vo(Vtmp1, Vtmp1, Vtmp2);
z_vn(Vtmp1, Vtmp1, Vmask);
z_vceqhs(Vtmp1, Vtmp1, Vzero); // high half of all chars must be zero for successful compress.
z_brne(VectorBreak); // break vector loop, incompatible character found.
// re-process data from current iteration in break handler.
//---< pack & store characters >---
z_vpkh(Vtmp1, Z_V20, Z_V21); // pack (src1, src2) -> tmp1
z_vpkh(Vtmp2, Z_V22, Z_V23); // pack (src3, src4) -> tmp2
z_vstm(Vtmp1, Vtmp2, 0, Rdst); // store packed string
add2reg(Rdst, min_vcnt);
z_brct(Rix, VectorLoop);
z_bru(VectorDone);
bind(VectorBreak);
z_sll(Rix, log_min_vcnt); // # chars processed so far in VectorLoop, excl. current iteration.
z_sr(Z_R0, Rix); // correct # chars processed in total.
bind(VectorDone);
}
{
const int min_cnt = 8; // Minimum #characters required to use unrolled loop.
// Otherwise just do nothing in unrolled loop.
// Must be multiple of 8.
const int log_min_cnt = exact_log2(min_cnt);
Label UnrolledLoop, UnrolledDone, UnrolledBreak;
if (VM_Version::has_DistinctOpnds()) {
z_ogrk(tmp2, Z_R0, Z_R1);
z_srk(Rix, Rcnt, Z_R0); // remaining # chars to compress in unrolled loop
} else {
z_lgr(tmp2, Z_R0);
z_ogr(tmp2, Z_R1);
z_lr(Rix, Rcnt);
z_sr(Rix, Z_R0);
}
z_ngr(tmp2, mask);
z_brne(Lslow); // Failed fast case, retry slowly.
z_sra(Rix, log_min_cnt); // unrolled loop count
z_brz(UnrolledDone);
bind(UnrolledLoop);
z_lmg(Z_R0, Z_R1, 0, Rsrc);
if (precise) {
z_ogr(Z_R1, Z_R0); // check all 8 chars for incompatibility
z_ngr(Z_R1, Rmask);
z_brnz(UnrolledBreak);
z_lg(Z_R1, 8, Rsrc); // reload destroyed register
z_stcmh(Z_R0, 5, 0, Rdst);
z_stcm(Z_R0, 5, 2, Rdst);
} else {
z_stcmh(Z_R0, 5, 0, Rdst);
z_stcm(Z_R0, 5, 2, Rdst);
z_ogr(Z_R0, Z_R1);
z_ngr(Z_R0, Rmask);
z_brnz(UnrolledBreak);
}
z_stcmh(Z_R1, 5, 4, Rdst);
z_stcm(Z_R1, 5, 6, Rdst);
add2reg(Rsrc, min_cnt*2);
add2reg(Rdst, min_cnt);
z_brct(Rix, UnrolledLoop);
z_lgfr(Z_R0, Rcnt); // # chars processed in total after unrolled loop.
z_nilf(Z_R0, ~(min_cnt-1));
z_tmll(Rcnt, min_cnt-1);
z_brnaz(ScalarShortcut); // if all bits zero, there is nothing left to do for scalar loop.
// Rix == 0 in all cases.
z_lgfr(result, Rcnt); // all characters processed.
z_sgfr(Rdst, Rcnt); // restore ptr
z_sgfr(Rsrc, Rcnt); // restore ptr, double the element count for Rsrc restore
z_sgfr(Rsrc, Rcnt);
z_bru(AllDone);
bind(UnrolledBreak);
z_lgfr(Z_R0, Rcnt); // # chars processed in total after unrolled loop
z_nilf(Z_R0, ~(min_cnt-1));
z_sll(Rix, log_min_cnt); // # chars processed so far in UnrolledLoop, excl. current iteration.
z_sr(Z_R0, Rix); // correct # chars processed in total.
if (!precise) {
z_lgfr(result, Z_R0);
z_aghi(result, min_cnt/2); // min_cnt/2 characters have already been written
// but ptrs were not updated yet.
z_sgfr(Rdst, Z_R0); // restore ptr
z_sgfr(Rsrc, Z_R0); // restore ptr, double the element count for Rsrc restore
z_sgfr(Rsrc, Z_R0);
z_bru(AllDone);
}
bind(UnrolledDone);
}
z_stcmh(Z_R0, 5, 0, addr2);
z_stcm(Z_R0, 5, 2, addr2);
if (!precise) { z_ogr(Z_R0, Z_R1); }
z_stcmh(Z_R1, 5, 4, addr2);
z_stcm(Z_R1, 5, 6, addr2);
if (!precise) {
z_ngr(Z_R0, mask);
z_brne(Ldone); // Failed (more than needed was written).
{
Label ScalarLoop, ScalarDone, ScalarBreak;
bind(ScalarShortcut);
z_ltgfr(result, Rcnt);
z_brz(AllDone);
#if 0 // Sacrifice shortcuts for code compactness
{
//---< Special treatment for very short strings (one or two characters) >---
// For these strings, we are sure that the above code was skipped.
// Thus, no registers were modified, register restore is not required.
Label ScalarDoit, Scalar2Char;
z_chi(Rcnt, 2);
z_brh(ScalarDoit);
z_llh(Z_R1, 0, Z_R0, Rsrc);
z_bre(Scalar2Char);
z_tmll(Z_R1, 0xff00);
z_lghi(result, 0); // cnt == 1, first char invalid, no chars successfully processed
z_brnaz(AllDone);
z_stc(Z_R1, 0, Z_R0, Rdst);
z_lghi(result, 1);
z_bru(AllDone);
bind(Scalar2Char);
z_llh(Z_R0, 2, Z_R0, Rsrc);
z_tmll(Z_R1, 0xff00);
z_lghi(result, 0); // cnt == 2, first char invalid, no chars successfully processed
z_brnaz(AllDone);
z_stc(Z_R1, 0, Z_R0, Rdst);
z_tmll(Z_R0, 0xff00);
z_lghi(result, 1); // cnt == 2, second char invalid, one char successfully processed
z_brnaz(AllDone);
z_stc(Z_R0, 1, Z_R0, Rdst);
z_lghi(result, 2);
z_bru(AllDone);
bind(ScalarDoit);
}
#endif
if (VM_Version::has_DistinctOpnds()) {
z_srk(Rix, Rcnt, Z_R0); // remaining # chars to compress in unrolled loop
} else {
z_lr(Rix, Rcnt);
z_sr(Rix, Z_R0);
}
z_lgfr(result, Rcnt); // # processed characters (if all runs ok).
z_brz(ScalarDone);
bind(ScalarLoop);
z_llh(Z_R1, 0, Z_R0, Rsrc);
z_tmll(Z_R1, 0xff00);
z_brnaz(ScalarBreak);
z_stc(Z_R1, 0, Z_R0, Rdst);
add2reg(Rsrc, 2);
add2reg(Rdst, 1);
z_brct(Rix, ScalarLoop);
z_bru(ScalarDone);
bind(ScalarBreak);
z_sr(result, Rix);
bind(ScalarDone);
z_sgfr(Rdst, result); // restore ptr
z_sgfr(Rsrc, result); // restore ptr, double the element count for Rsrc restore
z_sgfr(Rsrc, result);
}
z_aghi(addr2, 8);
z_brxle(ind1, even_reg, Lloop1);
bind(Lslow);
// Compute index limit and skip if negative.
z_ahi(odd_reg, 16-2); // Last possible index for slow loop.
z_lhi(even_reg, 2);
z_cr(ind1, odd_reg);
z_brh(Ldone);
bind(Lloop2); // 1 Character per iteration.
z_llh(Z_R0, Address(src, ind1));
z_tmll(Z_R0, 0xFF00);
z_brnaz(Ldone); // Failed slow case: Return number of written characters.
z_stc(Z_R0, Address(addr2));
z_aghi(addr2, 1);
z_brxle(ind1, even_reg, Lloop2);
bind(Ldone); // result = ind1 = 2*cnt
z_srl(ind1, 1);
BLOCK_COMMENT("} string_compress");
bind(AllDone);
if (precise) {
BLOCK_COMMENT("} encode_iso_array");
} else {
BLOCK_COMMENT("} string_compress");
}
return offset() - block_start;
}
@ -4997,53 +5222,432 @@ unsigned int MacroAssembler::string_inflate_trot(Register src, Register dst, Reg
return offset() - block_start;
}
// Inflate byte[] to char[]. odd_reg contains cnt. Kills src.
unsigned int MacroAssembler::string_inflate(Register src, Register dst, Register odd_reg,
Register even_reg, Register tmp) {
int block_start = offset();
// Inflate byte[] to char[].
// Restores: src, dst
// Uses: cnt
// Kills: tmp, Z_R0, Z_R1.
// Note:
// cnt is signed int. Do not rely on high word!
// counts # characters, not bytes.
unsigned int MacroAssembler::string_inflate(Register src, Register dst, Register cnt, Register tmp) {
assert_different_registers(Z_R0, Z_R1, src, dst, cnt, tmp);
BLOCK_COMMENT("string_inflate {");
int block_start = offset();
Label Lloop1, Lloop2, Lslow, Ldone;
const Register addr1 = src, ind2 = tmp;
Register Rcnt = cnt; // # characters (src: bytes, dst: char (2-byte)), remaining after current loop.
Register Rix = tmp; // loop index
Register Rsrc = src; // addr(src array)
Register Rdst = dst; // addr(dst array)
Label ScalarShortcut, AllDone;
z_sll(odd_reg, 1); // Number of bytes to write. (Must be a positive simm32.)
clear_reg(ind2); // Index to write.
z_ahi(odd_reg, -16); // Last possible index for fast loop.
z_brl(Lslow);
#if 0 // Sacrifice shortcuts for code compactness
{
//---< shortcuts for short strings (very frequent) >---
Label skipShortcut, skip4Shortcut;
z_ltr(Rcnt, Rcnt); // absolutely nothing to do for strings of len == 0.
z_brz(AllDone);
clear_reg(Z_R0); // make sure registers are properly initialized.
clear_reg(Z_R1);
z_chi(Rcnt, 4);
z_brne(skip4Shortcut); // 4 characters are very frequent
z_icm(Z_R0, 5, 0, Rsrc); // Treat exactly 4 characters specially.
z_icm(Z_R1, 5, 2, Rsrc);
z_stm(Z_R0, Z_R1, 0, Rdst);
z_bru(AllDone);
bind(skip4Shortcut);
// ind2: index, even_reg: index increment, odd_reg: index limit
clear_reg(Z_R0);
clear_reg(Z_R1);
z_lhi(even_reg, 16);
z_chi(Rcnt, 8);
z_brh(skipShortcut); // There's a lot to do...
z_lgfr(Z_R0, Rcnt); // remaining #characters (<= 8). Precond for scalar loop.
// This does not destroy the "register cleared" state of Z_R0.
z_brl(ScalarShortcut); // Just a few characters
z_icmh(Z_R0, 5, 0, Rsrc); // Treat exactly 8 characters specially.
z_icmh(Z_R1, 5, 4, Rsrc);
z_icm(Z_R0, 5, 2, Rsrc);
z_icm(Z_R1, 5, 6, Rsrc);
z_stmg(Z_R0, Z_R1, 0, Rdst);
z_bru(AllDone);
bind(skipShortcut);
}
#endif
clear_reg(Z_R0); // make sure register is properly initialized.
bind(Lloop1); // 8 Characters per iteration.
z_icmh(Z_R0, 5, 0, addr1);
z_icmh(Z_R1, 5, 4, addr1);
z_icm(Z_R0, 5, 2, addr1);
z_icm(Z_R1, 5, 6, addr1);
z_aghi(addr1, 8);
z_stg(Z_R0, Address(dst, ind2));
z_stg(Z_R1, Address(dst, ind2, 8));
z_brxle(ind2, even_reg, Lloop1);
if (VM_Version::has_VectorFacility()) {
const int min_vcnt = 32; // Minimum #characters required to use vector instructions.
// Otherwise just do nothing in vector mode.
// Must be multiple of vector register length (16 bytes = 128 bits).
const int log_min_vcnt = exact_log2(min_vcnt);
Label VectorLoop, VectorDone;
bind(Lslow);
// Compute index limit and skip if negative.
z_ahi(odd_reg, 16-2); // Last possible index for slow loop.
z_lhi(even_reg, 2);
z_cr(ind2, odd_reg);
z_brh(Ldone);
assert(VM_Version::has_DistinctOpnds(), "Assumption when has_VectorFacility()");
z_srak(Rix, Rcnt, log_min_vcnt); // calculate # vector loop iterations
z_brz(VectorDone); // skip if none
bind(Lloop2); // 1 Character per iteration.
z_llc(Z_R0, Address(addr1));
z_sth(Z_R0, Address(dst, ind2));
z_aghi(addr1, 1);
z_brxle(ind2, even_reg, Lloop2);
z_sllg(Z_R0, Rix, log_min_vcnt); // remember #chars that will be processed by vector loop
bind(Ldone);
bind(VectorLoop);
z_vlm(Z_V20, Z_V21, 0, Rsrc); // get next 32 characters (single-byte)
add2reg(Rsrc, min_vcnt);
z_vuplhb(Z_V22, Z_V20); // V2 <- (expand) V0(high)
z_vupllb(Z_V23, Z_V20); // V3 <- (expand) V0(low)
z_vuplhb(Z_V24, Z_V21); // V4 <- (expand) V1(high)
z_vupllb(Z_V25, Z_V21); // V5 <- (expand) V1(low)
z_vstm(Z_V22, Z_V25, 0, Rdst); // store next 32 bytes
add2reg(Rdst, min_vcnt*2);
z_brct(Rix, VectorLoop);
bind(VectorDone);
}
const int min_cnt = 8; // Minimum #characters required to use unrolled scalar loop.
// Otherwise just do nothing in unrolled scalar mode.
// Must be multiple of 8.
{
const int log_min_cnt = exact_log2(min_cnt);
Label UnrolledLoop, UnrolledDone;
if (VM_Version::has_DistinctOpnds()) {
z_srk(Rix, Rcnt, Z_R0); // remaining # chars to process in unrolled loop
} else {
z_lr(Rix, Rcnt);
z_sr(Rix, Z_R0);
}
z_sra(Rix, log_min_cnt); // unrolled loop count
z_brz(UnrolledDone);
clear_reg(Z_R0);
clear_reg(Z_R1);
bind(UnrolledLoop);
z_icmh(Z_R0, 5, 0, Rsrc);
z_icmh(Z_R1, 5, 4, Rsrc);
z_icm(Z_R0, 5, 2, Rsrc);
z_icm(Z_R1, 5, 6, Rsrc);
add2reg(Rsrc, min_cnt);
z_stmg(Z_R0, Z_R1, 0, Rdst);
add2reg(Rdst, min_cnt*2);
z_brct(Rix, UnrolledLoop);
bind(UnrolledDone);
z_lgfr(Z_R0, Rcnt); // # chars left over after unrolled loop.
z_nilf(Z_R0, min_cnt-1);
z_brnz(ScalarShortcut); // if zero, there is nothing left to do for scalar loop.
// Rix == 0 in all cases.
z_sgfr(Z_R0, Rcnt); // negative # characters the ptrs have been advanced previously.
z_agr(Rdst, Z_R0); // restore ptr, double the element count for Rdst restore.
z_agr(Rdst, Z_R0);
z_agr(Rsrc, Z_R0); // restore ptr.
z_bru(AllDone);
}
{
bind(ScalarShortcut);
// Z_R0 must contain remaining # characters as 64-bit signed int here.
// register contents is preserved over scalar processing (for register fixup).
#if 0 // Sacrifice shortcuts for code compactness
{
Label ScalarDefault;
z_chi(Rcnt, 2);
z_brh(ScalarDefault);
z_llc(Z_R0, 0, Z_R0, Rsrc); // 6 bytes
z_sth(Z_R0, 0, Z_R0, Rdst); // 4 bytes
z_brl(AllDone);
z_llc(Z_R0, 1, Z_R0, Rsrc); // 6 bytes
z_sth(Z_R0, 2, Z_R0, Rdst); // 4 bytes
z_bru(AllDone);
bind(ScalarDefault);
}
#endif
Label CodeTable;
// Some comments on Rix calculation:
// - Rcnt is small, therefore no bits shifted out of low word (sll(g) instructions).
// - high word of both Rix and Rcnt may contain garbage
// - the final lngfr takes care of that garbage, extending the sign to high word
z_sllg(Rix, Z_R0, 2); // calculate 10*Rix = (4*Rix + Rix)*2
z_ar(Rix, Z_R0);
z_larl(Z_R1, CodeTable);
z_sll(Rix, 1);
z_lngfr(Rix, Rix); // ix range: [0..7], after inversion & mult: [-(7*12)..(0*12)].
z_bc(Assembler::bcondAlways, 0, Rix, Z_R1);
z_llc(Z_R1, 6, Z_R0, Rsrc); // 6 bytes
z_sth(Z_R1, 12, Z_R0, Rdst); // 4 bytes
z_llc(Z_R1, 5, Z_R0, Rsrc);
z_sth(Z_R1, 10, Z_R0, Rdst);
z_llc(Z_R1, 4, Z_R0, Rsrc);
z_sth(Z_R1, 8, Z_R0, Rdst);
z_llc(Z_R1, 3, Z_R0, Rsrc);
z_sth(Z_R1, 6, Z_R0, Rdst);
z_llc(Z_R1, 2, Z_R0, Rsrc);
z_sth(Z_R1, 4, Z_R0, Rdst);
z_llc(Z_R1, 1, Z_R0, Rsrc);
z_sth(Z_R1, 2, Z_R0, Rdst);
z_llc(Z_R1, 0, Z_R0, Rsrc);
z_sth(Z_R1, 0, Z_R0, Rdst);
bind(CodeTable);
z_chi(Rcnt, 8); // no fixup for small strings. Rdst, Rsrc were not modified.
z_brl(AllDone);
z_sgfr(Z_R0, Rcnt); // # characters the ptrs have been advanced previously.
z_agr(Rdst, Z_R0); // restore ptr, double the element count for Rdst restore.
z_agr(Rdst, Z_R0);
z_agr(Rsrc, Z_R0); // restore ptr.
}
bind(AllDone);
BLOCK_COMMENT("} string_inflate");
return offset() - block_start;
}
// Inflate byte[] to char[], length known at compile time.
// Restores: src, dst
// Kills: tmp, Z_R0, Z_R1.
// Note:
// len is signed int. Counts # characters, not bytes.
unsigned int MacroAssembler::string_inflate_const(Register src, Register dst, Register tmp, int len) {
assert_different_registers(Z_R0, Z_R1, src, dst, tmp);
BLOCK_COMMENT("string_inflate_const {");
int block_start = offset();
Register Rix = tmp; // loop index
Register Rsrc = src; // addr(src array)
Register Rdst = dst; // addr(dst array)
Label ScalarShortcut, AllDone;
int nprocessed = 0;
int src_off = 0; // compensate for saved (optimized away) ptr advancement.
int dst_off = 0; // compensate for saved (optimized away) ptr advancement.
bool restore_inputs = false;
bool workreg_clear = false;
if ((len >= 32) && VM_Version::has_VectorFacility()) {
const int min_vcnt = 32; // Minimum #characters required to use vector instructions.
// Otherwise just do nothing in vector mode.
// Must be multiple of vector register length (16 bytes = 128 bits).
const int log_min_vcnt = exact_log2(min_vcnt);
const int iterations = (len - nprocessed) >> log_min_vcnt;
    // Tail of string_inflate_const: 'len' is a compile-time constant, so the
    // characters are inflated with straight-line code in chunks of decreasing
    // size: 32-char vector chunks (this block), at most one 16-char vector
    // chunk, 8-char unrolled scalar chunks, and finally a 1..8 char remainder.
    // src_off/dst_off track progress made via displacement-based addressing
    // (Rsrc/Rdst left unmodified); restore_inputs is set by the looping
    // variants, which bump Rsrc/Rdst and require them to be restored at the end.
    nprocessed += iterations << log_min_vcnt;  // 32 characters per iteration.
    Label VectorLoop;
    if (iterations == 1) {
      // Single chunk: pure displacement addressing, no pointer updates needed.
      z_vlm(Z_V20, Z_V21, 0+src_off, Rsrc);    // get next 32 characters (single-byte)
      z_vuplhb(Z_V22, Z_V20);                  // V2 <- (expand) V0(high)
      z_vupllb(Z_V23, Z_V20);                  // V3 <- (expand) V0(low)
      z_vuplhb(Z_V24, Z_V21);                  // V4 <- (expand) V1(high)
      z_vupllb(Z_V25, Z_V21);                  // V5 <- (expand) V1(low)
      z_vstm(Z_V22, Z_V25, 0+dst_off, Rdst);   // store next 32 bytes
      src_off += min_vcnt;
      dst_off += min_vcnt*2;                   // Each source byte expands to two dst bytes.
    } else {
      restore_inputs = true;                   // Loop bumps Rsrc/Rdst; must restore at the end.

      // NOTE(review): the iteration count is derived from 'len', not from
      // (len - nprocessed). That is only correct if this 32-char stage is the
      // first stage to process anything (nprocessed == 0, src_off == 0 here);
      // confirm against the function head, which is outside this view.
      z_lgfi(Rix, len>>log_min_vcnt);
      bind(VectorLoop);
      z_vlm(Z_V20, Z_V21, 0, Rsrc);            // get next 32 characters (single-byte)
      add2reg(Rsrc, min_vcnt);
      z_vuplhb(Z_V22, Z_V20);                  // V2 <- (expand) V0(high)
      z_vupllb(Z_V23, Z_V20);                  // V3 <- (expand) V0(low)
      z_vuplhb(Z_V24, Z_V21);                  // V4 <- (expand) V1(high)
      z_vupllb(Z_V25, Z_V21);                  // V5 <- (expand) V1(low)
      z_vstm(Z_V22, Z_V25, 0, Rdst);           // store next 32 bytes
      add2reg(Rdst, min_vcnt*2);
      z_brct(Rix, VectorLoop);                 // Decrement Rix; branch while != 0.
    }
  }

  // At most one 16-character chunk can remain after the 32-char stage above.
  if (((len-nprocessed) >= 16) && VM_Version::has_VectorFacility()) {
    const int min_vcnt = 16;                   // Minimum #characters required to use vector instructions.
                                               // Otherwise just do nothing in vector mode.
                                               // Must be multiple of vector register length (16 bytes = 128 bits).
    const int log_min_vcnt = exact_log2(min_vcnt);
    const int iterations = (len - nprocessed) >> log_min_vcnt;
    nprocessed += iterations << log_min_vcnt;
    assert(iterations == 1, "must be!");       // Remainder after the 32-char stage is < 32.
    z_vl(Z_V20, 0+src_off, Z_R0, Rsrc);        // get next 16 characters (single-byte)
                                               // (index register Z_R0 encodes "no index").
    z_vuplhb(Z_V22, Z_V20);                    // V2 <- (expand) V0(high)
    z_vupllb(Z_V23, Z_V20);                    // V3 <- (expand) V0(low)
    z_vstm(Z_V22, Z_V23, 0+dst_off, Rdst);     // store next 32 bytes
    src_off += min_vcnt;
    dst_off += min_vcnt*2;
  }

  // Scalar stage: inflate in 8-character units using insert-characters-under-mask.
  // Note the strict '>': a remainder of exactly 8 is left for the switch below.
  if ((len-nprocessed) > 8) {
    const int min_cnt = 8;                     // Minimum #characters required to use unrolled scalar loop.
                                               // Otherwise just do nothing in unrolled scalar mode.
                                               // Must be multiple of 8.
    const int log_min_cnt = exact_log2(min_cnt);
    const int iterations = (len - nprocessed) >> log_min_cnt;
    nprocessed += iterations << log_min_cnt;

    //---< avoid loop overhead/ptr increment for small # iterations >---
    // ICM/ICMH with mask 5 (0b0101) insert two source bytes into the odd byte
    // positions of the selected register word; with Z_R0/Z_R1 pre-cleared this
    // yields four zero-extended 16-bit chars per register (8 chars per STMG).
    if (iterations <= 2) {
      clear_reg(Z_R0);
      clear_reg(Z_R1);
      workreg_clear = true;                    // Remainder code below may skip re-clearing.
      z_icmh(Z_R0, 5, 0+src_off, Rsrc);        // Chars 0,1 -> high word of Z_R0.
      z_icmh(Z_R1, 5, 4+src_off, Rsrc);        // Chars 4,5 -> high word of Z_R1.
      z_icm(Z_R0, 5, 2+src_off, Rsrc);         // Chars 2,3 -> low word of Z_R0.
      z_icm(Z_R1, 5, 6+src_off, Rsrc);         // Chars 6,7 -> low word of Z_R1.
      z_stmg(Z_R0, Z_R1, 0+dst_off, Rdst);     // Store 16 bytes (8 inflated chars).
      src_off += min_cnt;
      dst_off += min_cnt*2;
    }

    // Second batch, identical pattern; src_off/dst_off were already advanced by
    // the first batch. No re-clearing needed: ICM writes only the masked byte
    // positions, the zero bytes from the clear above are still intact.
    if (iterations == 2) {
      z_icmh(Z_R0, 5, 0+src_off, Rsrc);
      z_icmh(Z_R1, 5, 4+src_off, Rsrc);
      z_icm(Z_R0, 5, 2+src_off, Rsrc);
      z_icm(Z_R1, 5, 6+src_off, Rsrc);
      z_stmg(Z_R0, Z_R1, 0+dst_off, Rdst);
      src_off += min_cnt;
      dst_off += min_cnt*2;
    }

    // Three or more batches: a counted loop with pointer bumping pays off.
    if (iterations > 2) {
      Label UnrolledLoop;
      restore_inputs = true;                   // Rsrc/Rdst get modified below.
      clear_reg(Z_R0);
      clear_reg(Z_R1);
      workreg_clear = true;
      z_lgfi(Rix, iterations);
      bind(UnrolledLoop);
      z_icmh(Z_R0, 5, 0, Rsrc);
      z_icmh(Z_R1, 5, 4, Rsrc);
      z_icm(Z_R0, 5, 2, Rsrc);
      z_icm(Z_R1, 5, 6, Rsrc);
      add2reg(Rsrc, min_cnt);
      z_stmg(Z_R0, Z_R1, 0, Rdst);
      add2reg(Rdst, min_cnt*2);
      z_brct(Rix, UnrolledLoop);               // Decrement Rix; branch while != 0.
    }
  }

  // Handle the last 1..8 characters with a size-specialized sequence.
  if ((len-nprocessed) > 0) {
    switch (len-nprocessed) {
      case 8:                                  // Same pattern as one unrolled-loop batch.
        if (!workreg_clear) {
          clear_reg(Z_R0);
          clear_reg(Z_R1);
        }
        z_icmh(Z_R0, 5, 0+src_off, Rsrc);
        z_icmh(Z_R1, 5, 4+src_off, Rsrc);
        z_icm(Z_R0, 5, 2+src_off, Rsrc);
        z_icm(Z_R1, 5, 6+src_off, Rsrc);
        z_stmg(Z_R0, Z_R1, 0+dst_off, Rdst);
        break;
      case 7:                                  // Chars 0-3 via Z_R0/Z_R1, 4-5 via Rix, 6 via LLC.
        if (!workreg_clear) {
          clear_reg(Z_R0);
          clear_reg(Z_R1);
        }
        clear_reg(Rix);
        z_icm(Z_R0, 5, 0+src_off, Rsrc);       // Chars 0,1.
        z_icm(Z_R1, 5, 2+src_off, Rsrc);       // Chars 2,3.
        z_icm(Rix, 5, 4+src_off, Rsrc);        // Chars 4,5.
        z_stm(Z_R0, Z_R1, 0+dst_off, Rdst);    // Store chars 0-3 (8 bytes).
        z_llc(Z_R0, 6+src_off, Z_R0, Rsrc);    // Char 6, zero-extended (Z_R0 index = none).
        z_st(Rix, 8+dst_off, Z_R0, Rdst);      // Store chars 4,5.
        z_sth(Z_R0, 12+dst_off, Z_R0, Rdst);   // Store char 6.
        break;
      case 6:                                  // Chars 0-3 via Z_R0/Z_R1, chars 4,5 via Rix.
        if (!workreg_clear) {
          clear_reg(Z_R0);
          clear_reg(Z_R1);
        }
        clear_reg(Rix);
        z_icm(Z_R0, 5, 0+src_off, Rsrc);
        z_icm(Z_R1, 5, 2+src_off, Rsrc);
        z_icm(Rix, 5, 4+src_off, Rsrc);
        z_stm(Z_R0, Z_R1, 0+dst_off, Rdst);
        z_st(Rix, 8+dst_off, Z_R0, Rdst);
        break;
      case 5:                                  // Chars 0-3 via Z_R0/Z_R1, char 4 via LLC/STH.
        if (!workreg_clear) {
          clear_reg(Z_R0);
          clear_reg(Z_R1);
        }
        z_icm(Z_R0, 5, 0+src_off, Rsrc);
        z_icm(Z_R1, 5, 2+src_off, Rsrc);
        z_llc(Rix, 4+src_off, Z_R0, Rsrc);
        z_stm(Z_R0, Z_R1, 0+dst_off, Rdst);
        z_sth(Rix, 8+dst_off, Z_R0, Rdst);
        break;
      case 4:                                  // Chars 0-3 via Z_R0/Z_R1.
        if (!workreg_clear) {
          clear_reg(Z_R0);
          clear_reg(Z_R1);
        }
        z_icm(Z_R0, 5, 0+src_off, Rsrc);
        z_icm(Z_R1, 5, 2+src_off, Rsrc);
        z_stm(Z_R0, Z_R1, 0+dst_off, Rdst);
        break;
      case 3:                                  // LLC zero-extends, so Z_R1 needs no clearing.
        if (!workreg_clear) {
          clear_reg(Z_R0);
        }
        z_llc(Z_R1, 2+src_off, Z_R0, Rsrc);    // Char 2.
        z_icm(Z_R0, 5, 0+src_off, Rsrc);       // Chars 0,1.
        z_sth(Z_R1, 4+dst_off, Z_R0, Rdst);
        z_st(Z_R0, 0+dst_off, Rdst);
        break;
      case 2:                                  // Both chars via LLC; no clearing needed.
        z_llc(Z_R0, 0+src_off, Z_R0, Rsrc);
        z_llc(Z_R1, 1+src_off, Z_R0, Rsrc);
        z_sth(Z_R0, 0+dst_off, Z_R0, Rdst);
        z_sth(Z_R1, 2+dst_off, Z_R0, Rdst);
        break;
      case 1:
        z_llc(Z_R0, 0+src_off, Z_R0, Rsrc);
        z_sth(Z_R0, 0+dst_off, Z_R0, Rdst);
        break;
      default:
        guarantee(false, "Impossible");        // Remainder is 1..8 by construction.
        break;
    }
    src_off += len-nprocessed;
    dst_off += (len-nprocessed)*2;
    nprocessed = len;
  }

  //---< restore modified input registers >---
  if ((nprocessed > 0) && restore_inputs) {
    // The loop stages advanced Rsrc by (nprocessed - src_off) bytes and Rdst by
    // (nprocessed*2 - dst_off) bytes (displacement-addressed portions never
    // touched the pointers); subtract to restore the callers' values.
    z_agfi(Rsrc, -(nprocessed-src_off));
    if (nprocessed < 1000000000) { // avoid int overflow
      z_agfi(Rdst, -(nprocessed*2-dst_off));
    } else {
      // nprocessed*2 would overflow a signed int: subtract in two steps,
      // (nprocessed - dst_off) + nprocessed == nprocessed*2 - dst_off.
      z_agfi(Rdst, -(nprocessed-dst_off));
      z_agfi(Rdst, -nprocessed);
    }
  }

  BLOCK_COMMENT("} string_inflate_const");
  return offset() - block_start;
}

@ -198,6 +198,9 @@ class MacroAssembler: public Assembler {
// Test a bit in a register. Result is reflected in CC.
void testbit(Register r, unsigned int bitPos);
void prefetch_read(Address a);
void prefetch_update(Address a);
// Clear a register, i.e. load const zero into reg. Return len (in bytes) of
// generated instruction(s).
// whole_reg: Clear 64 bits if true, 32 bits otherwise.
@ -836,7 +839,7 @@ class MacroAssembler: public Assembler {
void load_mirror(Register mirror, Register method);
//--------------------------
//--- perations on arrays.
//--- Operations on arrays.
//--------------------------
unsigned int Clear_Array(Register cnt_arg, Register base_pointer_arg, Register src_addr, Register src_len);
unsigned int Clear_Array_Const(long cnt, Register base);
@ -849,20 +852,34 @@ class MacroAssembler: public Assembler {
// Special String Intrinsics Implementation.
//-------------------------------------------
// Intrinsics for CompactStrings
// Compress char[] to byte[]. odd_reg contains cnt. tmp3 is only needed for precise behavior in failure case. Kills dst.
unsigned int string_compress(Register result, Register src, Register dst, Register odd_reg,
Register even_reg, Register tmp, Register tmp2 = noreg);
// Restores: src, dst
// Uses: cnt
// Kills: tmp, Z_R0, Z_R1.
// Early clobber: result.
// Boolean precise controls accuracy of result value.
unsigned int string_compress(Register result, Register src, Register dst, Register cnt,
Register tmp, bool precise);
// Inflate byte[] to char[].
unsigned int string_inflate_trot(Register src, Register dst, Register cnt, Register tmp);
// Inflate byte[] to char[].
// Restores: src, dst
// Uses: cnt
// Kills: tmp, Z_R0, Z_R1.
unsigned int string_inflate(Register src, Register dst, Register cnt, Register tmp);
// Inflate byte[] to char[], length known at compile time.
// Restores: src, dst
// Kills: tmp, Z_R0, Z_R1.
// Note:
// len is signed int. Counts # characters, not bytes.
unsigned int string_inflate_const(Register src, Register dst, Register tmp, int len);
// Kills src.
unsigned int has_negatives(Register result, Register src, Register cnt,
Register odd_reg, Register even_reg, Register tmp);
// Inflate byte[] to char[].
unsigned int string_inflate_trot(Register src, Register dst, Register cnt, Register tmp);
// Odd_reg contains cnt. Kills src.
unsigned int string_inflate(Register src, Register dst, Register odd_reg,
Register even_reg, Register tmp);
unsigned int string_compare(Register str1, Register str2, Register cnt1, Register cnt2,
Register odd_reg, Register even_reg, Register result, int ae);

@ -10267,14 +10267,14 @@ instruct indexOf_UL(iRegP haystack, rarg2RegI haycnt, iRegP needle, rarg5RegI ne
%}
// char[] to byte[] compression
instruct string_compress(iRegP src, rarg5RegP dst, iRegI result, roddRegI len, revenRegI evenReg, iRegI tmp, flagsReg cr) %{
instruct string_compress(iRegP src, iRegP dst, iRegI result, iRegI len, iRegI tmp, flagsReg cr) %{
match(Set result (StrCompressedCopy src (Binary dst len)));
effect(TEMP_DEF result, USE_KILL dst, USE_KILL len, TEMP evenReg, TEMP tmp, KILL cr); // R0, R1 are killed, too.
effect(TEMP_DEF result, TEMP tmp, KILL cr); // R0, R1 are killed, too.
ins_cost(300);
format %{ "String Compress $src->$dst($len) -> $result" %}
ins_encode %{
__ string_compress($result$$Register, $src$$Register, $dst$$Register, $len$$Register,
$evenReg$$Register, $tmp$$Register);
$tmp$$Register, false);
%}
ins_pipe(pipe_class_dummy);
%}
@ -10293,13 +10293,25 @@ instruct string_compress(iRegP src, rarg5RegP dst, iRegI result, roddRegI len, r
//%}
// byte[] to char[] inflation
instruct string_inflate(Universe dummy, rarg5RegP src, iRegP dst, roddRegI len, revenRegI evenReg, iRegI tmp, flagsReg cr) %{
instruct string_inflate(Universe dummy, iRegP src, iRegP dst, iRegI len, iRegI tmp, flagsReg cr) %{
match(Set dummy (StrInflatedCopy src (Binary dst len)));
effect(USE_KILL src, USE_KILL len, TEMP evenReg, TEMP tmp, KILL cr); // R0, R1 are killed, too.
effect(TEMP tmp, KILL cr); // R0, R1 are killed, too.
ins_cost(300);
format %{ "String Inflate $src->$dst($len)" %}
ins_encode %{
__ string_inflate($src$$Register, $dst$$Register, $len$$Register, $evenReg$$Register, $tmp$$Register);
__ string_inflate($src$$Register, $dst$$Register, $len$$Register, $tmp$$Register);
%}
ins_pipe(pipe_class_dummy);
%}
// byte[] to char[] inflation
// Constant-length variant: 'len' is an immI operand, so the matcher selects
// this rule only when the element count is a compile-time constant. Code is
// emitted by MacroAssembler::string_inflate_const(), which specializes the
// instruction sequence for the known length.
instruct string_inflate_const(Universe dummy, iRegP src, iRegP dst, iRegI tmp, immI len, flagsReg cr) %{
  match(Set dummy (StrInflatedCopy src (Binary dst len)));
  effect(TEMP tmp, KILL cr); // R0, R1 are killed, too.
  ins_cost(300);
  format %{ "String Inflate (constLen) $src->$dst($len)" %}
  ins_encode %{
    // $len$$constant: the compile-time length, passed as a plain int.
    __ string_inflate_const($src$$Register, $dst$$Register, $tmp$$Register, $len$$constant);
  %}
  ins_pipe(pipe_class_dummy);
%}
@ -10318,14 +10330,14 @@ instruct has_negatives(rarg5RegP ary1, iRegI len, iRegI result, roddRegI oddReg,
%}
// encode char[] to byte[] in ISO_8859_1
instruct encode_iso_array(rarg5RegP src, iRegP dst, iRegI result, roddRegI len, revenRegI evenReg, iRegI tmp, iRegI tmp2, flagsReg cr) %{
instruct encode_iso_array(iRegP src, iRegP dst, iRegI result, iRegI len, iRegI tmp, flagsReg cr) %{
match(Set result (EncodeISOArray src (Binary dst len)));
effect(TEMP_DEF result, USE_KILL src, USE_KILL len, TEMP evenReg, TEMP tmp, TEMP tmp2, KILL cr); // R0, R1 are killed, too.
effect(TEMP_DEF result, TEMP tmp, KILL cr); // R0, R1 are killed, too.
ins_cost(300);
format %{ "Encode array $src->$dst($len) -> $result" %}
ins_encode %{
__ string_compress($result$$Register, $src$$Register, $dst$$Register, $len$$Register,
$evenReg$$Register, $tmp$$Register, $tmp2$$Register);
$tmp$$Register, true);
%}
ins_pipe(pipe_class_dummy);
%}