8314124: RISC-V: implement Base64 intrinsic - decoding
Reviewed-by: fyang, rehn, tonyp
This commit is contained in:
parent
daf26178be
commit
44d3a68d8a
@ -1835,8 +1835,10 @@ enum Nf {
|
||||
|
||||
// Vector Unit-Stride Segment Load Instructions
|
||||
INSN(vlseg3e8_v, 0b0000111, 0b000, 0b00000, 0b00, 0b0, g3);
|
||||
INSN(vlseg4e8_v, 0b0000111, 0b000, 0b00000, 0b00, 0b0, g4);
|
||||
|
||||
// Vector Unit-Stride Segment Store Instructions
|
||||
INSN(vsseg3e8_v, 0b0100111, 0b000, 0b00000, 0b00, 0b0, g3);
|
||||
INSN(vsseg4e8_v, 0b0100111, 0b000, 0b00000, 0b00, 0b0, g4);
|
||||
|
||||
#undef INSN
|
||||
|
@ -5322,6 +5322,279 @@ class StubGenerator: public StubCodeGenerator {
|
||||
return (address) start;
|
||||
}
|
||||
|
||||
/**
|
||||
* vector registers:
|
||||
* input VectorRegister's: intputV1-V4, for m2 they could be v2, v4, v6, for m1 they could be v2, v4, v6, v8
|
||||
* index VectorRegister's: idxV1-V3, for m2 they could be v8, v10, v12, v14, for m1 they could be v10, v12, v14, v16
|
||||
* output VectorRegister's: outputV1-V4, for m2 they could be v16, v18, v20, v22, for m1 they could be v18, v20, v22
|
||||
*
|
||||
* NOTE: each field will occupy a single vector register group
|
||||
*/
|
||||
void base64_vector_decode_round(Register src, Register dst, Register codec,
|
||||
Register size, Register stepSrc, Register stepDst, Register failedIdx, Register minusOne,
|
||||
VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3, VectorRegister inputV4,
|
||||
VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4,
|
||||
VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3,
|
||||
Assembler::LMUL lmul) {
|
||||
// set vector register type/len
|
||||
__ vsetvli(x0, size, Assembler::e8, lmul, Assembler::ma, Assembler::ta);
|
||||
|
||||
// segmented load src into v registers: mem(src) => vr(4)
|
||||
__ vlseg4e8_v(inputV1, src);
|
||||
|
||||
// src = src + register_group_len_bytes * 4
|
||||
__ add(src, src, stepSrc);
|
||||
|
||||
// decoding
|
||||
// 1. indexed load: vr(4) => vr(4)
|
||||
__ vluxei8_v(idxV1, codec, inputV1);
|
||||
__ vluxei8_v(idxV2, codec, inputV2);
|
||||
__ vluxei8_v(idxV3, codec, inputV3);
|
||||
__ vluxei8_v(idxV4, codec, inputV4);
|
||||
|
||||
// 2. check wrong data
|
||||
__ vor_vv(outputV1, idxV1, idxV2);
|
||||
__ vor_vv(outputV2, idxV3, idxV4);
|
||||
__ vor_vv(outputV1, outputV1, outputV2);
|
||||
__ vmseq_vi(v0, outputV1, -1);
|
||||
__ vfirst_m(failedIdx, v0);
|
||||
Label NoFailure;
|
||||
__ beq(failedIdx, minusOne, NoFailure);
|
||||
__ vsetvli(x0, failedIdx, Assembler::e8, lmul, Assembler::mu, Assembler::tu);
|
||||
__ slli(stepDst, failedIdx, 1);
|
||||
__ add(stepDst, failedIdx, stepDst);
|
||||
__ BIND(NoFailure);
|
||||
|
||||
// 3. compute the decoded data: vr(4) => vr(3)
|
||||
__ vsll_vi(idxV1, idxV1, 2);
|
||||
__ vsrl_vi(outputV1, idxV2, 4);
|
||||
__ vor_vv(outputV1, outputV1, idxV1);
|
||||
|
||||
__ vsll_vi(idxV2, idxV2, 4);
|
||||
__ vsrl_vi(outputV2, idxV3, 2);
|
||||
__ vor_vv(outputV2, outputV2, idxV2);
|
||||
|
||||
__ vsll_vi(idxV3, idxV3, 6);
|
||||
__ vor_vv(outputV3, idxV4, idxV3);
|
||||
|
||||
// segmented store encoded data in v registers back to dst: vr(3) => mem(dst)
|
||||
__ vsseg3e8_v(outputV1, dst);
|
||||
|
||||
// dst = dst + register_group_len_bytes * 3
|
||||
__ add(dst, dst, stepDst);
|
||||
}
|
||||
|
||||
/**
|
||||
* int j.u.Base64.Decoder.decodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL, boolean isMIME)
|
||||
*
|
||||
* Input arguments:
|
||||
* c_rarg0 - src, source array
|
||||
* c_rarg1 - sp, src start offset
|
||||
* c_rarg2 - sl, src end offset
|
||||
* c_rarg3 - dst, dest array
|
||||
* c_rarg4 - dp, dst start offset
|
||||
* c_rarg5 - isURL, Base64 or URL character set
|
||||
* c_rarg6 - isMIME, Decoding MIME block
|
||||
*/
|
||||
address generate_base64_decodeBlock() {
|
||||
|
||||
static const uint8_t fromBase64[256] = {
|
||||
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
||||
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
||||
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
|
||||
52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
|
||||
255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
|
||||
15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u,
|
||||
255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
|
||||
41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
|
||||
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
||||
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
||||
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
||||
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
||||
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
||||
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
||||
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
||||
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
||||
};
|
||||
|
||||
static const uint8_t fromBase64URL[256] = {
|
||||
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
||||
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
||||
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
|
||||
52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
|
||||
255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
|
||||
15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u,
|
||||
255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
|
||||
41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
|
||||
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
||||
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
||||
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
||||
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
||||
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
||||
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
||||
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
||||
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
|
||||
};
|
||||
|
||||
__ align(CodeEntryAlignment);
|
||||
StubCodeMark mark(this, "StubRoutines", "decodeBlock");
|
||||
address start = __ pc();
|
||||
__ enter();
|
||||
|
||||
Register src = c_rarg0;
|
||||
Register soff = c_rarg1;
|
||||
Register send = c_rarg2;
|
||||
Register dst = c_rarg3;
|
||||
Register doff = c_rarg4;
|
||||
Register isURL = c_rarg5;
|
||||
Register isMIME = c_rarg6;
|
||||
|
||||
Register codec = c_rarg7;
|
||||
Register dstBackup = x31;
|
||||
Register length = x28; // t3, total length of src data in bytes
|
||||
|
||||
Label ProcessData, Exit;
|
||||
Label ProcessScalar, ScalarLoop;
|
||||
|
||||
// passed in length (send - soff) is guaranteed to be > 4,
|
||||
// and in this intrinsic we only process data of length in multiple of 4,
|
||||
// it's not guaranteed to be multiple of 4 by java level, so do it explicitly
|
||||
__ sub(length, send, soff);
|
||||
__ andi(length, length, -4);
|
||||
// real src/dst to process data
|
||||
__ add(src, src, soff);
|
||||
__ add(dst, dst, doff);
|
||||
// backup of dst, used to calculate the return value at exit
|
||||
__ mv(dstBackup, dst);
|
||||
|
||||
// load the codec base address
|
||||
__ la(codec, ExternalAddress((address) fromBase64));
|
||||
__ beqz(isURL, ProcessData);
|
||||
__ la(codec, ExternalAddress((address) fromBase64URL));
|
||||
__ BIND(ProcessData);
|
||||
|
||||
// vector version
|
||||
if (UseRVV) {
|
||||
// for MIME case, it has a default length limit of 76 which could be
|
||||
// different(smaller) from (send - soff), so in MIME case, we go through
|
||||
// the scalar code path directly.
|
||||
__ bnez(isMIME, ScalarLoop);
|
||||
|
||||
Label ProcessM1, ProcessM2;
|
||||
|
||||
Register failedIdx = soff;
|
||||
Register stepSrcM1 = send;
|
||||
Register stepSrcM2 = doff;
|
||||
Register stepDst = isURL;
|
||||
Register size = x29; // t4
|
||||
Register minusOne = x30; // t5
|
||||
|
||||
__ mv(minusOne, -1);
|
||||
__ mv(size, MaxVectorSize * 2);
|
||||
__ mv(stepSrcM1, MaxVectorSize * 4);
|
||||
__ slli(stepSrcM2, stepSrcM1, 1);
|
||||
__ mv(stepDst, MaxVectorSize * 2 * 3);
|
||||
|
||||
__ blt(length, stepSrcM2, ProcessM1);
|
||||
|
||||
|
||||
// Assembler::m2
|
||||
__ BIND(ProcessM2);
|
||||
base64_vector_decode_round(src, dst, codec,
|
||||
size, stepSrcM2, stepDst, failedIdx, minusOne,
|
||||
v2, v4, v6, v8, // inputs
|
||||
v10, v12, v14, v16, // indexes
|
||||
v18, v20, v22, // outputs
|
||||
Assembler::m2);
|
||||
__ sub(length, length, stepSrcM2);
|
||||
|
||||
// error check
|
||||
__ bne(failedIdx, minusOne, Exit);
|
||||
|
||||
__ bge(length, stepSrcM2, ProcessM2);
|
||||
|
||||
|
||||
// Assembler::m1
|
||||
__ BIND(ProcessM1);
|
||||
__ blt(length, stepSrcM1, ProcessScalar);
|
||||
|
||||
__ srli(size, size, 1);
|
||||
__ srli(stepDst, stepDst, 1);
|
||||
base64_vector_decode_round(src, dst, codec,
|
||||
size, stepSrcM1, stepDst, failedIdx, minusOne,
|
||||
v1, v2, v3, v4, // inputs
|
||||
v5, v6, v7, v8, // indexes
|
||||
v9, v10, v11, // outputs
|
||||
Assembler::m1);
|
||||
__ sub(length, length, stepSrcM1);
|
||||
|
||||
// error check
|
||||
__ bne(failedIdx, minusOne, Exit);
|
||||
|
||||
__ BIND(ProcessScalar);
|
||||
__ beqz(length, Exit);
|
||||
}
|
||||
|
||||
// scalar version
|
||||
{
|
||||
Register byte0 = soff, byte1 = send, byte2 = doff, byte3 = isURL;
|
||||
Register combined32Bits = x29; // t5
|
||||
|
||||
// encoded: [byte0[5:0] : byte1[5:0] : byte2[5:0]] : byte3[5:0]] =>
|
||||
// plain: [byte0[5:0]+byte1[5:4] : byte1[3:0]+byte2[5:2] : byte2[1:0]+byte3[5:0]]
|
||||
__ BIND(ScalarLoop);
|
||||
|
||||
// load 4 bytes encoded src data
|
||||
__ lbu(byte0, Address(src, 0));
|
||||
__ lbu(byte1, Address(src, 1));
|
||||
__ lbu(byte2, Address(src, 2));
|
||||
__ lbu(byte3, Address(src, 3));
|
||||
__ addi(src, src, 4);
|
||||
|
||||
// get codec index and decode (ie. load from codec by index)
|
||||
__ add(byte0, codec, byte0);
|
||||
__ add(byte1, codec, byte1);
|
||||
__ lb(byte0, Address(byte0, 0));
|
||||
__ lb(byte1, Address(byte1, 0));
|
||||
__ add(byte2, codec, byte2);
|
||||
__ add(byte3, codec, byte3);
|
||||
__ lb(byte2, Address(byte2, 0));
|
||||
__ lb(byte3, Address(byte3, 0));
|
||||
__ slliw(byte0, byte0, 18);
|
||||
__ slliw(byte1, byte1, 12);
|
||||
__ orr(byte0, byte0, byte1);
|
||||
__ orr(byte0, byte0, byte3);
|
||||
__ slliw(byte2, byte2, 6);
|
||||
// For performance consideration, `combined32Bits` is constructed for 2 purposes at the same time,
|
||||
// 1. error check below
|
||||
// 2. decode below
|
||||
__ orr(combined32Bits, byte0, byte2);
|
||||
|
||||
// error check
|
||||
__ bltz(combined32Bits, Exit);
|
||||
|
||||
// store 3 bytes decoded data
|
||||
__ sraiw(byte0, combined32Bits, 16);
|
||||
__ sraiw(byte1, combined32Bits, 8);
|
||||
__ sb(byte0, Address(dst, 0));
|
||||
__ sb(byte1, Address(dst, 1));
|
||||
__ sb(combined32Bits, Address(dst, 2));
|
||||
|
||||
__ sub(length, length, 4);
|
||||
__ addi(dst, dst, 3);
|
||||
// loop back
|
||||
__ bnez(length, ScalarLoop);
|
||||
}
|
||||
|
||||
__ BIND(Exit);
|
||||
__ sub(c_rarg0, dst, dstBackup);
|
||||
|
||||
__ leave();
|
||||
__ ret();
|
||||
|
||||
return (address) start;
|
||||
}
|
||||
|
||||
void adler32_process_bytes(Register buff, Register s1, Register s2, VectorRegister vtable,
|
||||
VectorRegister vzero, VectorRegister vbytes, VectorRegister vs1acc, VectorRegister vs2acc,
|
||||
Register temp0, Register temp1, Register temp2, Register temp3,
|
||||
@ -5980,6 +6253,7 @@ static const int64_t right_3_bits = right_n_bits(3);
|
||||
|
||||
if (UseBASE64Intrinsics) {
|
||||
StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
|
||||
StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
|
||||
}
|
||||
|
||||
if (UseAdler32Intrinsics) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user