8270340: Base64 decodeBlock intrinsic for Power64 needs cleanup
Reviewed-by: ogatak, mdoerr
parent 8cd0769ef2
commit 754352f4c9

@@ -824,6 +824,10 @@ class Assembler : public AbstractAssembler {
// Prefixed addi/li
PADDI_PREFIX_OPCODE = PREFIX_PRIMARY_OPCODE | (2u << PRE_TYPE_SHIFT),
PADDI_SUFFIX_OPCODE = ADDI_OPCODE,

// xxpermx
XXPERMX_PREFIX_OPCODE = PREFIX_PRIMARY_OPCODE | (1u << PRE_TYPE_SHIFT),
XXPERMX_SUFFIX_OPCODE = (34u << OPCODE_SHIFT),
};

// Trap instructions TO bits

@@ -2348,6 +2352,7 @@ class Assembler : public AbstractAssembler {
inline void mtvrd( VectorRegister d, Register a);
inline void mfvrd( Register a, VectorRegister d);
inline void xxperm( VectorSRegister d, VectorSRegister a, VectorSRegister b);
inline void xxpermx( VectorSRegister d, VectorSRegister a, VectorSRegister b, VectorSRegister c, int ui3);
inline void xxpermdi( VectorSRegister d, VectorSRegister a, VectorSRegister b, int dm);
inline void xxmrghw( VectorSRegister d, VectorSRegister a, VectorSRegister b);
inline void xxmrglw( VectorSRegister d, VectorSRegister a, VectorSRegister b);

@@ -142,6 +142,11 @@ inline void Assembler::paddi_r0ok(Register d, Register a, long si34, bool r = fa
emit_int32(PADDI_SUFFIX_OPCODE | rt(d) | ra(a) | d1_eo(si34));
}

inline void Assembler::xxpermx( VectorSRegister d, VectorSRegister a, VectorSRegister b, VectorSRegister c, int ui3) {
emit_int32(XXPERMX_PREFIX_OPCODE | uimm(ui3, 3));
emit_int32(XXPERMX_SUFFIX_OPCODE | vsrt(d) | vsra(a) | vsrb(b) | vsrc(c));
}

// Fixed-Point Arithmetic Instructions with Overflow detection
inline void Assembler::addo( Register d, Register a, Register b) { emit_int32(ADD_OPCODE | rt(d) | ra(a) | rb(b) | oe(1) | rc(0)); }
inline void Assembler::addo_( Register d, Register a, Register b) { emit_int32(ADD_OPCODE | rt(d) | ra(a) | rb(b) | oe(1) | rc(1)); }

@@ -105,6 +105,10 @@ void MacroAssembler::align(int modulus, int max, int rem) {
for (int c = (padding >> 2); c > 0; --c) { nop(); }
}

void MacroAssembler::align_prefix() {
if (is_aligned(offset() + BytesPerInstWord, 64)) { nop(); }
}

// Issue instructions that calculate given TOC from global TOC.
void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
bool add_relocation, bool emit_dummy_addr) {

@@ -88,6 +88,16 @@ class MacroAssembler: public Assembler {
// nop padding
void align(int modulus, int max = 252, int rem = 0);

// Align prefix opcode to make sure it's not on the last word of a
// 64-byte block.
//
// Note: do not call align_prefix() in a .ad file (e.g. ppc.ad). Instead
// add ins_alignment(2) to the instruct definition and implement the
// compute_padding() method of the instruct node to use
// compute_prefix_padding(). See loadConI32Node::compute_padding() in
// ppc.ad for an example.
void align_prefix();
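
The note above boils down to a simple rule: a prefixed instruction is 8 bytes and its 4-byte prefix must not land in the last word of a 64-byte block, which is exactly what align_prefix() checks. A minimal stand-alone sketch of that padding computation (illustrative only; the real hooks are align_prefix() above and the compute_padding()/compute_prefix_padding() pair mentioned in the comment):

// Sketch only: bytes of padding needed before emitting an 8-byte
// prefixed instruction whose first word would start at current_offset.
static int prefix_padding_bytes(int current_offset) {
  // If current_offset + 4 is 64-byte aligned, the prefix would occupy
  // the last word of a 64-byte block, so emit one 4-byte nop first.
  return ((current_offset + 4) % 64 == 0) ? 4 : 0;
}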

//
// Constants, loading constants, TOC support
//

@@ -3643,8 +3643,14 @@ class StubGenerator: public StubCodeGenerator {
// Underscore (URL = 1)
#define US (signed char)((-'_' + 63) & 0xff)
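
As a quick check of the offset arithmetic in the US macro (editorial note, not part of the patch): '_' is 0x5F (95) and must decode to 63, so the byte-wide offset is computed modulo 256.

// US = (-'_' + 63) & 0xff = (63 - 95) & 0xff = 0xE0 (i.e. -32 as a signed char)
// vaddubm adds bytes modulo 256: 0x5F + 0xE0 = 0x13F -> 0x3F = 63,
// the expected Base64 value of '_' in the URL-safe alphabet.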

// For P10 (or later) only
#define VALID_B64 0x80
#define VB64(x) (VALID_B64 | x)

#define VEC_ALIGN __attribute__ ((aligned(16)))

#define BLK_OFFSETOF(x) (offsetof(constant_block, x))
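
This macro supports the pattern the patch moves to: all of the lookup constants live in one aligned struct, so the stub materializes a single base address and reaches every table through an lxv displacement. A minimal stand-alone sketch of the idea (names here are illustrative, not the stub's):

#include <cstddef>

// Sketch: group per-stub vector constants so that one base pointer plus
// compile-time offsets replaces a separate load_const per table.
struct demo_block {
  unsigned char tableA[16];
  unsigned char tableB[16];
};
static const demo_block __attribute__((aligned(16))) demo = { {0}, {0} };

// With the macro assembler this becomes roughly:
//   __ load_const_optimized(const_ptr, (address)&demo, tmp_reg);
//   __ lxv(vsrA, offsetof(demo_block, tableA), const_ptr);
//   __ lxv(vsrB, offsetof(demo_block, tableB), const_ptr);
// lxv needs a displacement that is a multiple of 16, which holds because
// every member is a 16-byte array.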

// In little-endian mode, the lxv instruction loads the element at EA into
// element 15 of the vector register, EA+1 goes into element 14, and so
// on.
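
This is why every table above is wrapped in ARRAY_TO_LXV_ORDER: a table written in element order 0..15 has to be stored reversed in memory for lxv to place logical element 0 where xxperm expects it. A hedged sketch of what such a macro can look like (illustrative; the real definition lives elsewhere in the sources and may differ):

// Illustrative only: reverse the 16 initializers on little-endian so that,
// after lxv, logical element 0 ends up in vector element 0.
#ifdef VM_LITTLE_ENDIAN
#define DEMO_TO_LXV_ORDER(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15) \
        a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0
#else
#define DEMO_TO_LXV_ORDER(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15) \
        a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15
#endif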

@@ -3660,95 +3666,123 @@ class StubGenerator: public StubCodeGenerator {
StubCodeMark mark(this, "StubRoutines", "base64_decodeBlock");
address start = __ function_entry();

static const signed char VEC_ALIGN offsetLUT_val[16] = {
ARRAY_TO_LXV_ORDER(
0, 0, PLS, DIG, UC, UC, LC, LC,
0, 0, 0, 0, 0, 0, 0, 0 ) };
typedef struct {
signed char offsetLUT_val[16];
signed char offsetLUT_URL_val[16];
unsigned char maskLUT_val[16];
unsigned char maskLUT_URL_val[16];
unsigned char bitposLUT_val[16];
unsigned char table_32_47_val[16];
unsigned char table_32_47_URL_val[16];
unsigned char table_48_63_val[16];
unsigned char table_64_79_val[16];
unsigned char table_80_95_val[16];
unsigned char table_80_95_URL_val[16];
unsigned char table_96_111_val[16];
unsigned char table_112_127_val[16];
unsigned char pack_lshift_val[16];
unsigned char pack_rshift_val[16];
unsigned char pack_permute_val[16];
} constant_block;

static const signed char VEC_ALIGN offsetLUT_URL_val[16] = {
ARRAY_TO_LXV_ORDER(
0, 0, HYP, DIG, UC, UC, LC, LC,
0, 0, 0, 0, 0, 0, 0, 0 ) };
static const constant_block VEC_ALIGN const_block = {

static const unsigned char VEC_ALIGN maskLUT_val[16] = {
ARRAY_TO_LXV_ORDER(
/* 0 */ (unsigned char)0b10101000,
/* 1 .. 9 */ (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
(unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
(unsigned char)0b11111000,
/* 10 */ (unsigned char)0b11110000,
/* 11 */ (unsigned char)0b01010100,
/* 12 .. 14 */ (unsigned char)0b01010000, (unsigned char)0b01010000, (unsigned char)0b01010000,
/* 15 */ (unsigned char)0b01010100 ) };
.offsetLUT_val = {
ARRAY_TO_LXV_ORDER(
0, 0, PLS, DIG, UC, UC, LC, LC,
0, 0, 0, 0, 0, 0, 0, 0 ) },

static const unsigned char VEC_ALIGN maskLUT_URL_val[16] = {
ARRAY_TO_LXV_ORDER(
/* 0 */ (unsigned char)0b10101000,
/* 1 .. 9 */ (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
(unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
(unsigned char)0b11111000,
/* 10 */ (unsigned char)0b11110000,
/* 11 .. 12 */ (unsigned char)0b01010000, (unsigned char)0b01010000,
/* 13 */ (unsigned char)0b01010100,
/* 14 */ (unsigned char)0b01010000,
/* 15 */ (unsigned char)0b01110000 ) };
.offsetLUT_URL_val = {
ARRAY_TO_LXV_ORDER(
0, 0, HYP, DIG, UC, UC, LC, LC,
0, 0, 0, 0, 0, 0, 0, 0 ) },

static const unsigned char VEC_ALIGN bitposLUT_val[16] = {
ARRAY_TO_LXV_ORDER(
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, (unsigned char)0x80,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 ) };
.maskLUT_val = {
ARRAY_TO_LXV_ORDER(
/* 0 */ (unsigned char)0b10101000,
/* 1 .. 9 */ (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
(unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
(unsigned char)0b11111000,
/* 10 */ (unsigned char)0b11110000,
/* 11 */ (unsigned char)0b01010100,
/* 12 .. 14 */ (unsigned char)0b01010000, (unsigned char)0b01010000, (unsigned char)0b01010000,
/* 15 */ (unsigned char)0b01010100 ) },

static const unsigned char VEC_ALIGN pack_lshift_val[16] = {
ARRAY_TO_LXV_ORDER(
0, 6, 4, 2, 0, 6, 4, 2, 0, 6, 4, 2, 0, 6, 4, 2 ) };
.maskLUT_URL_val = {
ARRAY_TO_LXV_ORDER(
/* 0 */ (unsigned char)0b10101000,
/* 1 .. 9 */ (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
(unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000, (unsigned char)0b11111000,
(unsigned char)0b11111000,
/* 10 */ (unsigned char)0b11110000,
/* 11 .. 12 */ (unsigned char)0b01010000, (unsigned char)0b01010000,
/* 13 */ (unsigned char)0b01010100,
/* 14 */ (unsigned char)0b01010000,
/* 15 */ (unsigned char)0b01110000 ) },

static const unsigned char VEC_ALIGN pack_rshift_val[16] = {
ARRAY_TO_LXV_ORDER(
0, 2, 4, 0, 0, 2, 4, 0, 0, 2, 4, 0, 0, 2, 4, 0 ) };
.bitposLUT_val = {
ARRAY_TO_LXV_ORDER(
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, (unsigned char)0x80,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 ) },

// The first 4 index values are "don't care" because
// we only use the first 12 bytes of the vector,
// which are decoded from 16 bytes of Base64 characters.
static const unsigned char VEC_ALIGN pack_permute_val[16] = {
ARRAY_TO_LXV_ORDER(
0, 0, 0, 0,
0, 1, 2,
4, 5, 6,
8, 9, 10,
12, 13, 14 ) };
// In the following table_*_val constants, a 0 value means the
// character is not in the Base64 character set
.table_32_47_val = {
ARRAY_TO_LXV_ORDER (
/* space .. '*' = 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* '+' = 62 */ VB64(62), /* ',' .. '.' = 0 */ 0, 0, 0, /* '/' = 63 */ VB64(63) ) },

static const unsigned char VEC_ALIGN p10_pack_permute_val[16] = {
ARRAY_TO_LXV_ORDER(
0, 0, 0, 0, 7, 6, 5, 4,
3, 2, 15, 14, 13, 12, 11, 10 ) };
.table_32_47_URL_val = {
ARRAY_TO_LXV_ORDER(
/* space .. ',' = 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* '-' = 62 */ VB64(62), /* '.' .. '/' */ 0, 0 ) },

// loop_unrolls needs to be a power of two so that the rounding can be
// done using a mask.
//
// The amount of loop unrolling was determined by running a benchmark
// that decodes a 20k block of Base64 data on a Power9 machine:
// loop_unrolls = 1 :
// (min, avg, max) = (108639.215, 110530.479, 110779.920), stdev = 568.437
// loop_unrolls = 2 :
// (min, avg, max) = (108259.029, 110174.202, 110399.642), stdev = 561.729
// loop_unrolls = 4 :
// (min, avg, max) = (106514.175, 108373.110, 108514.786), stdev = 392.237
// loop_unrolls = 8 :
// (min, avg, max) = (106281.283, 108316.668, 108539.953), stdev = 553.938
// loop_unrolls = 16 :
// (min, avg, max) = (108580.768, 110631.161, 110766.237), stdev = 430.510
//
// Comparing only the max values, there's no reason to go past
// loop_unrolls = 1. Performance at loop_unrolls = 16 is similar but
// has the disadvantage of requiring a larger minimum block of data to
// work with. A value of 1 gives a minimum of (16 + 12) = 28 bytes
// before the intrinsic will decode any data. See the reason for the
// +12 in the following logic.
const unsigned loop_unrolls = 1;
.table_48_63_val = {
ARRAY_TO_LXV_ORDER(
/* '0' .. '9' = 52 .. 61 */ VB64(52), VB64(53), VB64(54), VB64(55), VB64(56), VB64(57), VB64(58), VB64(59), VB64(60), VB64(61),
/* ':' .. '?' = 0 */ 0, 0, 0, 0, 0, 0 ) },

const unsigned vec_size = 16; // size of vector registers in bytes
const unsigned block_size = vec_size * loop_unrolls; // number of bytes to process in each pass through the loop
const unsigned block_size_shift = exact_log2(block_size);
.table_64_79_val = {
ARRAY_TO_LXV_ORDER(
/* '@' = 0 */ 0, /* 'A' .. 'O' = 0 .. 14 */ VB64(0), VB64(1), VB64(2), VB64(3), VB64(4), VB64(5), VB64(6), VB64(7), VB64(8),
VB64(9), VB64(10), VB64(11), VB64(12), VB64(13), VB64(14) ) },

.table_80_95_val = {
ARRAY_TO_LXV_ORDER(/* 'P' .. 'Z' = 15 .. 25 */ VB64(15), VB64(16), VB64(17), VB64(18), VB64(19), VB64(20), VB64(21), VB64(22),
VB64(23), VB64(24), VB64(25), /* '[' .. '_' = 0 */ 0, 0, 0, 0, 0 ) },

.table_80_95_URL_val = {
ARRAY_TO_LXV_ORDER(/* 'P' .. 'Z' = 15 .. 25 */ VB64(15), VB64(16), VB64(17), VB64(18), VB64(19), VB64(20), VB64(21), VB64(22),
VB64(23), VB64(24), VB64(25), /* '[' .. '^' = 0 */ 0, 0, 0, 0, /* '_' = 63 */ VB64(63) ) },

.table_96_111_val = {
ARRAY_TO_LXV_ORDER(/* '`' = 0 */ 0, /* 'a' .. 'o' = 26 .. 40 */ VB64(26), VB64(27), VB64(28), VB64(29), VB64(30), VB64(31),
VB64(32), VB64(33), VB64(34), VB64(35), VB64(36), VB64(37), VB64(38), VB64(39), VB64(40) ) },

.table_112_127_val = {
ARRAY_TO_LXV_ORDER(/* 'p' .. 'z' = 41 .. 51 */ VB64(41), VB64(42), VB64(43), VB64(44), VB64(45), VB64(46), VB64(47), VB64(48),
VB64(49), VB64(50), VB64(51), /* '{' .. DEL = 0 */ 0, 0, 0, 0, 0 ) },

.pack_lshift_val = {
ARRAY_TO_LXV_ORDER(
0, 6, 4, 2, 0, 6, 4, 2, 0, 6, 4, 2, 0, 6, 4, 2 ) },

.pack_rshift_val = {
ARRAY_TO_LXV_ORDER(
0, 2, 4, 0, 0, 2, 4, 0, 0, 2, 4, 0, 0, 2, 4, 0 ) },

// The first 4 index values are "don't care" because
// we only use the first 12 bytes of the vector,
// which are decoded from 16 bytes of Base64 characters.
.pack_permute_val = {
ARRAY_TO_LXV_ORDER(
0, 0, 0, 0,
0, 1, 2,
4, 5, 6,
8, 9, 10,
12, 13, 14 ) }
};

const unsigned block_size = 16; // number of bytes to process in each pass through the loop
const unsigned block_size_shift = 4;

// According to the ELF V2 ABI, registers r3-r12 are volatile and available for use without save/restore
Register s = R3_ARG1; // source starting address of Base64 characters

@@ -3775,8 +3809,6 @@ class StubGenerator: public StubCodeGenerator {
VectorRegister vec_special_case_char = VR3;
VectorRegister pack_rshift = VR4;
VectorRegister pack_lshift = VR5;
// P10+
VectorRegister vec_0x3fs = VR4; // safe to reuse pack_rshift's register

// VSR Constants
VectorSRegister offsetLUT = VSR0;

@@ -3786,26 +3818,40 @@ class StubGenerator: public StubCodeGenerator {
VectorSRegister vec_special_case_offset = VSR4;
VectorSRegister pack_permute = VSR5;

// Variables for lookup
// VR
// P10 (or later) VSR lookup constants
VectorSRegister table_32_47 = VSR0;
VectorSRegister table_48_63 = VSR1;
VectorSRegister table_64_79 = VSR2;
VectorSRegister table_80_95 = VSR3;
VectorSRegister table_96_111 = VSR4;
VectorSRegister table_112_127 = VSR6;

// Data read in and later converted
VectorRegister input = VR6;
// Variable for testing Base64 validity
VectorRegister non_match = VR10;

// P9 VR Variables for lookup
VectorRegister higher_nibble = VR7;
VectorRegister eq_special_case_char = VR8;
VectorRegister offsets = VR9;
VectorRegister non_match = VR10;

// VSR
// P9 VSR lookup variables
VectorSRegister bit = VSR6;
VectorSRegister lower_nibble = VSR7;
VectorSRegister M = VSR8;

// P10 (or later) VSR lookup variables
VectorSRegister xlate_a = VSR7;
VectorSRegister xlate_b = VSR8;

// Variables for pack
// VR
VectorRegister l = VR7; // reuse higher_nibble's register
VectorRegister r = VR8; // reuse eq_special_case_char's register
VectorRegister gathered = VR9; // reuse offsets's register
VectorRegister gathered = VR10; // reuse non_match's register

Label not_URL, calculate_size, unrolled_loop_start, unrolled_loop_exit, return_zero;
Label not_URL, calculate_size, loop_start, loop_exit, return_zero;

// The upper 32 bits of the non-pointer parameter registers are not
// guaranteed to be zero, so mask off those upper bits.

@@ -3824,7 +3870,7 @@ class StubGenerator: public StubCodeGenerator {
__ sub(sl, sl, sp);
__ subi(sl, sl, 12);

// Load CTR with the number of passes through the unrolled loop
// Load CTR with the number of passes through the loop
// = sl >> block_size_shift. After the shift, if sl <= 0, there's too
// little data to be processed by this intrinsic.
__ srawi_(sl, sl, block_size_shift);

@@ -3836,26 +3882,33 @@ class StubGenerator: public StubCodeGenerator {
__ clrldi(dp, dp, 32);

// Load constant vec registers that need to be loaded from memory
__ load_const_optimized(const_ptr, (address)&bitposLUT_val, tmp_reg);
__ lxv(bitposLUT, 0, const_ptr);
if (PowerArchitecturePPC64 >= 10) {
__ load_const_optimized(const_ptr, (address)&p10_pack_permute_val, tmp_reg);
} else {
__ load_const_optimized(const_ptr, (address)&pack_rshift_val, tmp_reg);
__ lxv(pack_rshift->to_vsr(), 0, const_ptr);
__ load_const_optimized(const_ptr, (address)&pack_lshift_val, tmp_reg);
__ lxv(pack_lshift->to_vsr(), 0, const_ptr);
__ load_const_optimized(const_ptr, (address)&pack_permute_val, tmp_reg);
}
__ lxv(pack_permute, 0, const_ptr);
__ load_const_optimized(const_ptr, (address)&const_block, tmp_reg);
__ lxv(bitposLUT, BLK_OFFSETOF(bitposLUT_val), const_ptr);
__ lxv(pack_rshift->to_vsr(), BLK_OFFSETOF(pack_rshift_val), const_ptr);
__ lxv(pack_lshift->to_vsr(), BLK_OFFSETOF(pack_lshift_val), const_ptr);
__ lxv(pack_permute, BLK_OFFSETOF(pack_permute_val), const_ptr);

// Splat the constants that can use xxspltib
__ xxspltib(vec_0s->to_vsr(), 0);
__ xxspltib(vec_4s->to_vsr(), 4);
__ xxspltib(vec_8s->to_vsr(), 8);
__ xxspltib(vec_0xfs, 0xf);
if (PowerArchitecturePPC64 >= 10) {
__ xxspltib(vec_0x3fs->to_vsr(), 0x3f);
// Using VALID_B64 for the offsets effectively strips the upper bit
// of each byte that was selected from the table. Setting the upper
// bit gives us a way to distinguish the 6-bit value of 0 from an
// error code of 0, which will happen if the character is
// outside the range of the lookup, or is an illegal Base64
// character, such as %.
__ xxspltib(offsets->to_vsr(), VALID_B64);

__ lxv(table_48_63, BLK_OFFSETOF(table_48_63_val), const_ptr);
__ lxv(table_64_79, BLK_OFFSETOF(table_64_79_val), const_ptr);
__ lxv(table_80_95, BLK_OFFSETOF(table_80_95_val), const_ptr);
__ lxv(table_96_111, BLK_OFFSETOF(table_96_111_val), const_ptr);
__ lxv(table_112_127, BLK_OFFSETOF(table_112_127_val), const_ptr);
} else {
__ xxspltib(vec_4s->to_vsr(), 4);
__ xxspltib(vec_0xfs, 0xf);
__ lxv(bitposLUT, BLK_OFFSETOF(bitposLUT_val), const_ptr);
}

// The rest of the constants use different values depending on the

@@ -3864,22 +3917,28 @@ class StubGenerator: public StubCodeGenerator {
__ beq(CCR0, not_URL);

// isURL != 0 (true)
__ load_const_optimized(const_ptr, (address)&offsetLUT_URL_val, tmp_reg);
__ lxv(offsetLUT, 0, const_ptr);
__ load_const_optimized(const_ptr, (address)&maskLUT_URL_val, tmp_reg);
__ lxv(maskLUT, 0, const_ptr);
__ xxspltib(vec_special_case_char->to_vsr(), '_');
__ xxspltib(vec_special_case_offset, (unsigned char)US);
if (PowerArchitecturePPC64 >= 10) {
__ lxv(table_32_47, BLK_OFFSETOF(table_32_47_URL_val), const_ptr);
__ lxv(table_80_95, BLK_OFFSETOF(table_80_95_URL_val), const_ptr);
} else {
__ lxv(offsetLUT, BLK_OFFSETOF(offsetLUT_URL_val), const_ptr);
__ lxv(maskLUT, BLK_OFFSETOF(maskLUT_URL_val), const_ptr);
__ xxspltib(vec_special_case_char->to_vsr(), '_');
__ xxspltib(vec_special_case_offset, (unsigned char)US);
}
__ b(calculate_size);

// isURL = 0 (false)
__ bind(not_URL);
__ load_const_optimized(const_ptr, (address)&offsetLUT_val, tmp_reg);
__ lxv(offsetLUT, 0, const_ptr);
__ load_const_optimized(const_ptr, (address)&maskLUT_val, tmp_reg);
__ lxv(maskLUT, 0, const_ptr);
__ xxspltib(vec_special_case_char->to_vsr(), '/');
__ xxspltib(vec_special_case_offset, (unsigned char)SLS);
if (PowerArchitecturePPC64 >= 10) {
__ lxv(table_32_47, BLK_OFFSETOF(table_32_47_val), const_ptr);
__ lxv(table_80_95, BLK_OFFSETOF(table_80_95_val), const_ptr);
} else {
__ lxv(offsetLUT, BLK_OFFSETOF(offsetLUT_val), const_ptr);
__ lxv(maskLUT, BLK_OFFSETOF(maskLUT_val), const_ptr);
__ xxspltib(vec_special_case_char->to_vsr(), '/');
__ xxspltib(vec_special_case_offset, (unsigned char)SLS);
}

__ bind(calculate_size);

@@ -3890,177 +3949,156 @@ class StubGenerator: public StubCodeGenerator {
__ add(in, s, sp);

__ align(32);
__ bind(unrolled_loop_start);
for (unsigned unroll_cnt=0; unroll_cnt < loop_unrolls; unroll_cnt++) {
// We can use a static displacement in the load since it's always a
// multiple of 16, which is a requirement of lxv/stxv. This saves
// an addi instruction.
__ lxv(input->to_vsr(), unroll_cnt * 16, in);
//
// Lookup
//
// Isolate the upper 4 bits of each character by shifting it right 4 bits
__ vsrb(higher_nibble, input, vec_4s);
// Isolate the lower 4 bits by masking
__ xxland(lower_nibble, input->to_vsr(), vec_0xfs);
__ bind(loop_start);
__ lxv(input->to_vsr(), 0, in); // offset=0

// Get the offset (the value to subtract from the byte) by using
// a lookup table indexed by the upper 4 bits of the character
__ xxperm(offsets->to_vsr(), offsetLUT, higher_nibble->to_vsr());
//
// Lookup
//
if (PowerArchitecturePPC64 >= 10) {
// Use xxpermx to do a lookup of each Base64 character in the
// input vector and translate it to a 6-bit value + 0x80.
// Characters which are not valid Base64 characters will result
// in a zero in the corresponding byte.
//
// Note that due to align(32) call above, the xxpermx instructions do
// not require align_prefix() calls, since the final xxpermx
// prefix+opcode is at byte 24.
__ xxpermx(xlate_a, table_32_47, table_48_63, input->to_vsr(), 1); // offset=4
__ xxpermx(xlate_b, table_64_79, table_80_95, input->to_vsr(), 2); // offset=12
__ xxlor(xlate_b, xlate_a, xlate_b); // offset=20
__ xxpermx(xlate_a, table_96_111, table_112_127, input->to_vsr(), 3); // offset=24
__ xxlor(input->to_vsr(), xlate_a, xlate_b);
// Check for non-Base64 characters by comparing each byte to zero.
__ vcmpequb_(non_match, input, vec_0s);
} else {
// Isolate the upper 4 bits of each character by shifting it right 4 bits
__ vsrb(higher_nibble, input, vec_4s);
// Isolate the lower 4 bits by masking
__ xxland(lower_nibble, input->to_vsr(), vec_0xfs);

// Find out which elements are the special case character (isURL ? '/' : '-')
__ vcmpequb(eq_special_case_char, input, vec_special_case_char);
// Get the offset (the value to subtract from the byte) by using
// a lookup table indexed by the upper 4 bits of the character
__ xxperm(offsets->to_vsr(), offsetLUT, higher_nibble->to_vsr());

// For each character in the input which is a special case
// character, replace its offset with one that is special for that
// character.
__ xxsel(offsets->to_vsr(), offsets->to_vsr(), vec_special_case_offset, eq_special_case_char->to_vsr());
// Find out which elements are the special case character (isURL ? '/' : '-')
__ vcmpequb(eq_special_case_char, input, vec_special_case_char);

// Use the lower_nibble to select a mask "M" from the lookup table.
__ xxperm(M, maskLUT, lower_nibble);
// For each character in the input which is a special case
// character, replace its offset with one that is special for that
// character.
__ xxsel(offsets->to_vsr(), offsets->to_vsr(), vec_special_case_offset, eq_special_case_char->to_vsr());

// "bit" is used to isolate which of the bits in M is relevant.
__ xxperm(bit, bitposLUT, higher_nibble->to_vsr());
// Use the lower_nibble to select a mask "M" from the lookup table.
__ xxperm(M, maskLUT, lower_nibble);

// Each element of non_match corresponds to one of the 16 input
// characters. Those elements that become 0x00 after the xxland
// instruction are invalid Base64 characters.
__ xxland(non_match->to_vsr(), M, bit);
// "bit" is used to isolate which of the bits in M is relevant.
__ xxperm(bit, bitposLUT, higher_nibble->to_vsr());

// Compare each element to zero
//
// vcmpequb_ sets the EQ bit of CCR6 if no elements compare equal.
// Any element comparing equal to zero means there is an error in
// that element. Note that the comparison result register
// non_match is not referenced again. Only CCR6-EQ matters.
__ vcmpequb_(non_match, non_match, vec_0s);
__ bne_predict_not_taken(CCR6, unrolled_loop_exit);
// Each element of non_match corresponds to one of the 16 input
// characters. Those elements that become 0x00 after the xxland
// instruction are invalid Base64 characters.
__ xxland(non_match->to_vsr(), M, bit);

// The Base64 characters had no errors, so add the offsets
__ vaddubm(input, input, offsets);

// Pack
//
// In the tables below, b0, b1, .. b15 are the bytes of decoded
// binary data, the first line of each of the cells (except for
// the constants) uses the bit-field nomenclature from the
// above-linked paper, whereas the second line is more specific
// about which exact bits are present, and is constructed using the
// Power ISA 3.x document style, where:
//
// * The specifier after the colon depicts which bits are there.
// * The bit numbering is big endian style (bit 0 is the most
// significant).
// * || is a concatenate operator.
// * Strings of 0's are a field of zeros with the shown length, and
// likewise for strings of 1's.

if (PowerArchitecturePPC64 >= 10) {
// Note that only e8..e15 are shown here because the extract bit
// pattern is the same in e0..e7.
//
// +===============+=============+======================+======================+=============+=============+======================+======================+=============+
// | Vector | e8 | e9 | e10 | e11 | e12 | e13 | e14 | e15 |
// | Element | | | | | | | | |
// +===============+=============+======================+======================+=============+=============+======================+======================+=============+
// | after vaddubm | 00hhhhhh | 00gggggg | 00ffffff | 00eeeeee | 00dddddd | 00cccccc | 00bbbbbb | 00aaaaaa |
// | | 00||b5:2..7 | 00||b4:4..7||b5:0..1 | 00||b3:6..7||b4:0..3 | 00||b3:0..5 | 00||b2:2..7 | 00||b1:4..7||b2:0..1 | 00||b0:6..7||b1:0..3 | 00||b0:0..5 |
// +---------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
// | after xxbrd | 00aaaaaa | 00bbbbbb | 00cccccc | 00dddddd | 00eeeeee | 00ffffff | 00gggggg | 00hhhhhh |
// | | 00||b0:0..5 | 00||b0:6..7||b1:0..3 | 00||b1:4..7||b2:0..1 | 00||b2:2..7 | 00||b3:0..5 | 00||b3:6..7||b4:0..3 | 00||b4:4..7||b5:0..1 | 00||b5:2..7 |
// +---------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
// | vec_0x3fs | 00111111 | 00111111 | 00111111 | 00111111 | 00111111 | 00111111 | 00111111 | 00111111 |
// +---------------+-------------+----------------------+----------------------+-------------+-------------+----------------------+----------------------+-------------+
// | after vpextd | 00000000 | 00000000 | aaaaaabb | bbbbcccc | ccdddddd | eeeeeeff | ffffgggg | gghhhhhh |
// | | 00000000 | 00000000 | b0:0..7 | b1:0..7 | b2:0..7 | b3:0..7 | b4:0..7 | b5:0..7 |
// +===============+=============+======================+======================+=============+=============+======================+======================+=============+

__ xxbrd(input->to_vsr(), input->to_vsr());
__ vpextd(gathered, input, vec_0x3fs);

// Final rearrangement of bytes into their correct positions.
// +==================+====+====+====+====+=====+=====+=====+=====+====+====+=====+=====+=====+=====+=====+=====+
// | Vector | e0 | e1 | e2 | e3 | e4 | e5 | e6 | e7 | e8 | e9 | e10 | e11 | e12 | e13 | e14 | e15 |
// | Elements | | | | | | | | | | | | | | | | |
// +==================+====+====+====+====+=====+=====+=====+=====+====+====+=====+=====+=====+=====+=====+=====+
// | after vpextd | 0 | 0 | b6 | b7 | b8 | b9 | b10 | b11 | 0 | 0 | b0 | b1 | b2 | b3 | b4 | b5 |
// +------------------+----+----+----+----+-----+-----+-----+-----+----+----+-----+-----+-----+-----+-----+-----+
// | p10_pack_permute | 0 | 0 | 0 | 0 | 7 | 6 | 5 | 4 | 3 | 2 | 15 | 14 | 13 | 12 | 11 | 10 |
// +------------------+----+----+----+----+-----+-----+-----+-----+----+----+-----+-----+-----+-----+-----+-----+
// | after xxperm | 0 | 0 | 0 | 0 | b11 | b10 | b9 | b8 | b7 | b6 | b5 | b4 | b3 | b2 | b1 | b0 |
// +==================+====+====+====+====+=====+=====+=====+=====+====+====+=====+=====+=====+=====+=====+=====+

} else {
// Note that only e12..e15 are shown here because the shifting
// and OR'ing pattern replicates for e8..e11, e4..e7, and
// e0..e3.
//
// +======================+=================+======================+======================+=============+
// | Vector | e12 | e13 | e14 | e15 |
// | Element | | | | |
// +======================+=================+======================+======================+=============+
// | after vaddubm | 00dddddd | 00cccccc | 00bbbbbb | 00aaaaaa |
// | | 00||b2:2..7 | 00||b1:4..7||b2:0..1 | 00||b0:6..7||b1:0..3 | 00||b0:0..5 |
// +----------------------+-----------------+----------------------+----------------------+-------------+
// | pack_lshift | | << 6 | << 4 | << 2 |
// +----------------------+-----------------+----------------------+----------------------+-------------+
// | l after vslb | 00dddddd | cc000000 | bbbb0000 | aaaaaa00 |
// | | 00||b2:2..7 | b2:0..1||000000 | b1:0..3||0000 | b0:0..5||00 |
// +----------------------+-----------------+----------------------+----------------------+-------------+
// | l after vslo | cc000000 | bbbb0000 | aaaaaa00 | 00000000 |
// | | b2:0..1||000000 | b1:0..3||0000 | b0:0..5||00 | 00000000 |
// +----------------------+-----------------+----------------------+----------------------+-------------+
// | pack_rshift | | >> 2 | >> 4 | |
// +----------------------+-----------------+----------------------+----------------------+-------------+
// | r after vsrb | 00dddddd | 0000cccc | 000000bb | 00aaaaaa |
// | | 00||b2:2..7 | 0000||b1:4..7 | 000000||b0:6..7 | 00||b0:0..5 |
// +----------------------+-----------------+----------------------+----------------------+-------------+
// | gathered after xxlor | ccdddddd | bbbbcccc | aaaaaabb | 00aaaaaa |
// | | b2:0..7 | b1:0..7 | b0:0..7 | 00||b0:0..5 |
// +======================+=================+======================+======================+=============+
//
// Note: there is a typo in the above-linked paper, which shows the result of the gathering process as:
// [ddddddcc|bbbbcccc|aaaaaabb]
// but should be:
// [ccdddddd|bbbbcccc|aaaaaabb]
//
__ vslb(l, input, pack_lshift);
// vslo of vec_8s shifts the vector by one octet toward lower
// element numbers, discarding element 0. This means it actually
// shifts to the right (not left) according to the order of the
// table above.
__ vslo(l, l, vec_8s);
__ vsrb(r, input, pack_rshift);
__ xxlor(gathered->to_vsr(), l->to_vsr(), r->to_vsr());

// Final rearrangement of bytes into their correct positions.
// +==============+======+======+======+======+=====+=====+====+====+====+====+=====+=====+=====+=====+=====+=====+
// | Vector | e0 | e1 | e2 | e3 | e4 | e5 | e6 | e7 | e8 | e9 | e10 | e11 | e12 | e13 | e14 | e15 |
// | Elements | | | | | | | | | | | | | | | | |
// +==============+======+======+======+======+=====+=====+====+====+====+====+=====+=====+=====+=====+=====+=====+
// | after xxlor | b11 | b10 | b9 | xx | b8 | b7 | b6 | xx | b5 | b4 | b3 | xx | b2 | b1 | b0 | xx |
// +--------------+------+------+------+------+-----+-----+----+----+----+----+-----+-----+-----+-----+-----+-----+
// | pack_permute | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 4 | 5 | 6 | 8 | 9 | 10 | 12 | 13 | 14 |
// +--------------+------+------+------+------+-----+-----+----+----+----+----+-----+-----+-----+-----+-----+-----+
// | after xxperm | b11* | b11* | b11* | b11* | b11 | b10 | b9 | b8 | b7 | b6 | b5 | b4 | b3 | b2 | b1 | b0 |
// +==============+======+======+======+======+=====+=====+====+====+====+====+=====+=====+=====+=====+=====+=====+
// xx bytes are not used to form the final data
// b0..b15 are the decoded and reassembled 8-bit bytes of data
// b11 with asterisk is a "don't care", because these bytes will be
// overwritten on the next iteration.
}
__ xxperm(gathered->to_vsr(), gathered->to_vsr(), pack_permute);

// We cannot use a static displacement on the store, since it's a
// multiple of 12, not 16. Note that this stxv instruction actually
// writes 16 bytes, even though only the first 12 are valid data.
__ stxv(gathered->to_vsr(), 0, out);
__ addi(out, out, 12);
// Compare each element to zero
//
__ vcmpequb_(non_match, non_match, vec_0s);
}
__ addi(in, in, 16 * loop_unrolls);
__ bdnz(unrolled_loop_start);
// vcmpequb_ sets the EQ bit of CCR6 if no elements compare equal.
// Any element comparing equal to zero means there is an error in
// that element. Note that the comparison result register
// non_match is not referenced again. Only CCR6-EQ matters.
__ bne_predict_not_taken(CCR6, loop_exit);

__ bind(unrolled_loop_exit);
// The Base64 characters had no errors, so add the offsets, which in
// the case of Power10 is a constant vector of all 0x80's (see earlier
// comment where the offsets register is loaded).
__ vaddubm(input, input, offsets);

// Pack
//
// In the tables below, b0, b1, .. b15 are the bytes of decoded
// binary data, the first line of each of the cells (except for
// the constants) uses the bit-field nomenclature from the
// above-linked paper, whereas the second line is more specific
// about which exact bits are present, and is constructed using the
// Power ISA 3.x document style, where:
//
// * The specifier after the colon depicts which bits are there.
// * The bit numbering is big endian style (bit 0 is the most
// significant).
// * || is a concatenate operator.
// * Strings of 0's are a field of zeros with the shown length, and
// likewise for strings of 1's.

// Note that only e12..e15 are shown here because the shifting
// and OR'ing pattern replicates for e8..e11, e4..e7, and
// e0..e3.
//
// +======================+=================+======================+======================+=============+
// | Vector | e12 | e13 | e14 | e15 |
// | Element | | | | |
// +======================+=================+======================+======================+=============+
// | after vaddubm | 00dddddd | 00cccccc | 00bbbbbb | 00aaaaaa |
// | | 00||b2:2..7 | 00||b1:4..7||b2:0..1 | 00||b0:6..7||b1:0..3 | 00||b0:0..5 |
// +----------------------+-----------------+----------------------+----------------------+-------------+
// | pack_lshift | | << 6 | << 4 | << 2 |
// +----------------------+-----------------+----------------------+----------------------+-------------+
// | l after vslb | 00dddddd | cc000000 | bbbb0000 | aaaaaa00 |
// | | 00||b2:2..7 | b2:0..1||000000 | b1:0..3||0000 | b0:0..5||00 |
// +----------------------+-----------------+----------------------+----------------------+-------------+
// | l after vslo | cc000000 | bbbb0000 | aaaaaa00 | 00000000 |
// | | b2:0..1||000000 | b1:0..3||0000 | b0:0..5||00 | 00000000 |
// +----------------------+-----------------+----------------------+----------------------+-------------+
// | pack_rshift | | >> 2 | >> 4 | |
// +----------------------+-----------------+----------------------+----------------------+-------------+
// | r after vsrb | 00dddddd | 0000cccc | 000000bb | 00aaaaaa |
// | | 00||b2:2..7 | 0000||b1:4..7 | 000000||b0:6..7 | 00||b0:0..5 |
// +----------------------+-----------------+----------------------+----------------------+-------------+
// | gathered after xxlor | ccdddddd | bbbbcccc | aaaaaabb | 00aaaaaa |
// | | b2:0..7 | b1:0..7 | b0:0..7 | 00||b0:0..5 |
// +======================+=================+======================+======================+=============+
//
// Note: there is a typo in the above-linked paper, which shows the result of the gathering process as:
// [ddddddcc|bbbbcccc|aaaaaabb]
// but should be:
// [ccdddddd|bbbbcccc|aaaaaabb]
//
__ vslb(l, input, pack_lshift);
// vslo of vec_8s shifts the vector by one octet toward lower
// element numbers, discarding element 0. This means it actually
// shifts to the right (not left) according to the order of the
// table above.
__ vslo(l, l, vec_8s);
__ vsrb(r, input, pack_rshift);
__ xxlor(gathered->to_vsr(), l->to_vsr(), r->to_vsr());

// Final rearrangement of bytes into their correct positions.
// +==============+======+======+======+======+=====+=====+====+====+====+====+=====+=====+=====+=====+=====+=====+
// | Vector | e0 | e1 | e2 | e3 | e4 | e5 | e6 | e7 | e8 | e9 | e10 | e11 | e12 | e13 | e14 | e15 |
// | Elements | | | | | | | | | | | | | | | | |
// +==============+======+======+======+======+=====+=====+====+====+====+====+=====+=====+=====+=====+=====+=====+
// | after xxlor | b11 | b10 | b9 | xx | b8 | b7 | b6 | xx | b5 | b4 | b3 | xx | b2 | b1 | b0 | xx |
// +--------------+------+------+------+------+-----+-----+----+----+----+----+-----+-----+-----+-----+-----+-----+
// | pack_permute | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 4 | 5 | 6 | 8 | 9 | 10 | 12 | 13 | 14 |
// +--------------+------+------+------+------+-----+-----+----+----+----+----+-----+-----+-----+-----+-----+-----+
// | after xxperm | b11* | b11* | b11* | b11* | b11 | b10 | b9 | b8 | b7 | b6 | b5 | b4 | b3 | b2 | b1 | b0 |
// +==============+======+======+======+======+=====+=====+====+====+====+====+=====+=====+=====+=====+=====+=====+
// xx bytes are not used to form the final data
// b0..b15 are the decoded and reassembled 8-bit bytes of data
// b11 with asterisk is a "don't care", because these bytes will be
// overwritten on the next iteration.
__ xxperm(gathered->to_vsr(), gathered->to_vsr(), pack_permute);

// We cannot use a static displacement on the store, since it's a
// multiple of 12, not 16. Note that this stxv instruction actually
// writes 16 bytes, even though only the first 12 are valid data.
__ stxv(gathered->to_vsr(), 0, out);
__ addi(out, out, 12);
__ addi(in, in, 16);
__ bdnz(loop_start);
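
For readers following the vector code above, here is a scalar model of what one loop iteration computes (illustrative only, not part of the patch; it assumes a flat 256-entry decode[] table, whereas the stub derives the same values from the nibble-indexed lookups or the xxpermx tables):

// Scalar sketch of one 16-character block: validate, translate, then pack
// four 6-bit values into three bytes, as pack_lshift/pack_rshift/pack_permute do.
static int decode16(const unsigned char* in, unsigned char* out,
                    const signed char decode[256]) {
  unsigned char vals[16];
  for (int i = 0; i < 16; i++) {
    signed char v = decode[in[i]];
    if (v < 0) return 0;          // mirrors the CCR6 early loop exit
    vals[i] = (unsigned char)v;   // 6-bit value, like input after the offsets add
  }
  for (int g = 0; g < 4; g++) {
    const unsigned char* q = &vals[g * 4];
    out[g * 3 + 0] = (unsigned char)((q[0] << 2) | (q[1] >> 4));
    out[g * 3 + 1] = (unsigned char)((q[1] << 4) | (q[2] >> 2));
    out[g * 3 + 2] = (unsigned char)((q[2] << 6) | q[3]);
  }
  return 12;                      // 16 Base64 chars -> 12 output bytes
}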

__ bind(loop_exit);

// Return the number of out bytes produced, which is (out - (d + dp)) == out - d - dp;
__ sub(R3_RET, out, d);

@@ -4188,10 +4226,12 @@ class StubGenerator: public StubCodeGenerator {
// at each location, all values in expanded are compared to 31. Using
// vsel, values higher than 31 use the results from the upper 32 bytes of
// the lookup operation, while values less than or equal to 31 use the
// lower 32 bytes of the lookup operation. Power10 and beyond can save the
// compare instruction, because the comparison is done within xxpermx
// itself. TODO: use xxpermx,xxpermx,vor on P10 when instruction prefixes are
// available in assembler_ppc.*
// lower 32 bytes of the lookup operation.
//
// Note: it's tempting to use a xxpermx,xxpermx,vor sequence here on
// Power10 (or later), but experiments doing so on Power10 yielded a slight
// performance drop, perhaps due to the need for xxpermx instruction
// prefixes.
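
A scalar sketch of the select-between-two-lookups idea described above (illustrative; the actual 64-entry encode table is defined elsewhere in this file):

// Scalar model of the vsel-based encode lookup: values 0..31 come from the
// lower half of the table, values 32..63 from the upper half; the vector
// code performs both xxperm lookups and uses the "> 31" compare as the
// vsel select mask.
static unsigned char encode_one(unsigned v6,
                                const unsigned char lower[32],
                                const unsigned char upper[32]) {
  return (v6 > 31) ? upper[v6 - 32] : lower[v6 & 31];
}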

#define ENCODE_CORE \
__ xxperm(input->to_vsr(), input->to_vsr(), expand_permute); \

@@ -4283,7 +4323,6 @@ class StubGenerator: public StubCodeGenerator {
ARRAY_TO_LXV_ORDER(
'w','x','y','z','0','1','2','3','4','5','6','7','8','9','-','_' ) }
};
#define BLK_OFFSETOF(x) (offsetof(constant_block, x))

// Number of bytes to process in each pass through the main loop.
// 12 of the 16 bytes from each lxv are encoded to 16 Base64 bytes.

@@ -4306,7 +4345,7 @@ class StubGenerator: public StubCodeGenerator {
Register block_modulo = R12; // == block_size (reuse const_ptr)
Register remaining = R12; // bytes remaining to process after the blocks are completed (reuse block_modulo's reg)
Register in = R4; // current input (source) pointer (reuse sp's register)
Register num_blocks = R11; // number of blocks to be processed by the unrolled loop
Register num_blocks = R11; // number of blocks to be processed by the loop
Register out = R8; // current output (destination) pointer (reuse const_ptr's register)
Register three = R9; // constant divisor (reuse size's register)
Register bytes_to_write = R10; // number of bytes to write with the stxvl instr (reused blocked_size's register)