8216060: [PPC64] Vector CRC implementation should be used by interpreter and be faster for short arrays
Reviewed-by: gromero, goetz
This commit is contained in:
parent
019cffb539
commit
d3339d45e1
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012, 2018, SAP SE. All rights reserved.
|
||||
* Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012, 2019, SAP SE. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -3974,7 +3974,7 @@ void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register
|
||||
* Emits code to update CRC-32 with a 4-byte value according to constants in table
|
||||
* Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
|
||||
*/
|
||||
// A not on the lookup table address(es):
|
||||
// A note on the lookup table address(es):
|
||||
// The lookup table consists of two sets of four columns each.
|
||||
// The columns {0..3} are used for little-endian machines.
|
||||
// The columns {4..7} are used for big-endian machines.
|
||||
@ -4147,57 +4147,50 @@ void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len
|
||||
* @param len register containing number of bytes
|
||||
* @param table register pointing to CRC table
|
||||
* @param constants register pointing to CRC table for 128-bit aligned memory
|
||||
* @param barretConstants register pointing to table for barrett reduction
|
||||
* @param t0-t4 temp registers
|
||||
* @param t0-t5 temp registers
|
||||
*/
|
||||
void MacroAssembler::kernel_crc32_1word_vpmsum(Register crc, Register buf, Register len, Register table,
|
||||
Register constants, Register barretConstants,
|
||||
Register t0, Register t1, Register t2, Register t3, Register t4,
|
||||
bool invertCRC) {
|
||||
void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register table,
|
||||
Register constants, Register t0, Register t1, Register t2,
|
||||
Register t3, Register t4, Register t5, bool invertCRC) {
|
||||
assert_different_registers(crc, buf, len, table);
|
||||
|
||||
Label L_alignedHead, L_tail;
|
||||
Label L_tail;
|
||||
|
||||
BLOCK_COMMENT("kernel_crc32_1word_vpmsum {");
|
||||
BLOCK_COMMENT("kernel_crc32_vpmsum {");
|
||||
|
||||
// 1. ~c
|
||||
if (invertCRC) {
|
||||
nand(crc, crc, crc); // 1s complement of crc
|
||||
}
|
||||
|
||||
// 2. use kernel_crc32_1word for short len
|
||||
// Enforce 32 bit.
|
||||
clrldi(len, len, 32);
|
||||
cmpdi(CCR0, len, 512);
|
||||
blt(CCR0, L_tail);
|
||||
|
||||
// 3. calculate from 0 to first aligned address
|
||||
const int alignment = 16;
|
||||
// Align if we have enough bytes for the fast version.
|
||||
const int alignment = 16,
|
||||
threshold = 32;
|
||||
Register prealign = t0;
|
||||
|
||||
andi_(prealign, buf, alignment - 1);
|
||||
beq(CCR0, L_alignedHead);
|
||||
subfic(prealign, prealign, alignment);
|
||||
neg(prealign, buf);
|
||||
addi(t1, len, -threshold);
|
||||
andi(prealign, prealign, alignment - 1);
|
||||
cmpw(CCR0, t1, prealign);
|
||||
blt(CCR0, L_tail); // len - prealign < threshold?
|
||||
|
||||
subf(len, prealign, len);
|
||||
update_byteLoop_crc32(crc, buf, prealign, table, t2, false);
|
||||
|
||||
// 4. calculate from first aligned address as far as possible
|
||||
BIND(L_alignedHead);
|
||||
kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t0, t1, t2, t3, t4);
|
||||
// Calculate from first aligned address as far as possible.
|
||||
kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5);
|
||||
|
||||
// 5. remaining bytes
|
||||
// Remaining bytes.
|
||||
BIND(L_tail);
|
||||
Register tc0 = t4;
|
||||
Register tc1 = constants;
|
||||
Register tc2 = barretConstants;
|
||||
kernel_crc32_1word(crc, buf, len, table, t0, t1, t2, t3, tc0, tc1, tc2, table, false);
|
||||
update_byteLoop_crc32(crc, buf, len, table, t2, false);
|
||||
|
||||
// 6. ~c
|
||||
if (invertCRC) {
|
||||
nand(crc, crc, crc); // 1s complement of crc
|
||||
}
|
||||
|
||||
BLOCK_COMMENT("} kernel_crc32_1word_vpmsum");
|
||||
BLOCK_COMMENT("} kernel_crc32_vpmsum");
|
||||
}
|
||||
|
||||
/**
|
||||
@ -4205,13 +4198,10 @@ void MacroAssembler::kernel_crc32_1word_vpmsum(Register crc, Register buf, Regis
|
||||
* @param buf register pointing to input byte buffer (byte*)
|
||||
* @param len register containing number of bytes (will get updated to remaining bytes)
|
||||
* @param constants register pointing to CRC table for 128-bit aligned memory
|
||||
* @param barretConstants register pointing to table for barrett reduction
|
||||
* @param t0-t4 temp registers
|
||||
* Precondition: len should be >= 512. Otherwise, nothing will be done.
|
||||
* @param t0-t5 temp registers
|
||||
*/
|
||||
void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
|
||||
Register constants, Register barretConstants,
|
||||
Register t0, Register t1, Register t2, Register t3, Register t4) {
|
||||
void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len,
|
||||
Register constants, Register t0, Register t1, Register t2, Register t3, Register t4, Register t5) {
|
||||
|
||||
// Save non-volatile vector registers (frameless).
|
||||
Register offset = t1;
|
||||
@ -4228,7 +4218,6 @@ void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Regi
|
||||
offsetInt -= 8; std(R14, offsetInt, R1_SP);
|
||||
offsetInt -= 8; std(R15, offsetInt, R1_SP);
|
||||
offsetInt -= 8; std(R16, offsetInt, R1_SP);
|
||||
offsetInt -= 8; std(R17, offsetInt, R1_SP);
|
||||
|
||||
// Implementation uses an inner loop which uses between 256 and 16 * unroll_factor
|
||||
// bytes per iteration. The basic scheme is:
|
||||
@ -4239,14 +4228,17 @@ void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Regi
|
||||
// Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
|
||||
|
||||
// Using 16 * unroll_factor / unroll_factor_2 bytes for constants.
|
||||
const int unroll_factor = 2048;
|
||||
const int unroll_factor2 = 8;
|
||||
const int unroll_factor = CRC32_UNROLL_FACTOR,
|
||||
unroll_factor2 = CRC32_UNROLL_FACTOR2;
|
||||
|
||||
const int outer_consts_size = (unroll_factor2 - 1) * 16,
|
||||
inner_consts_size = (unroll_factor / unroll_factor2) * 16;
|
||||
|
||||
// Support registers.
|
||||
Register offs[] = { noreg, t0, t1, t2, t3, t4, crc /* will live in VCRC */, R14 };
|
||||
Register num_bytes = R15,
|
||||
loop_count = R16,
|
||||
cur_const = R17;
|
||||
Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, crc /* will live in VCRC */ };
|
||||
Register num_bytes = R14,
|
||||
loop_count = R15,
|
||||
cur_const = R16;
|
||||
// Constant array for outer loop: unroll_factor2 - 1 registers,
|
||||
// Constant array for inner loop: unroll_factor / unroll_factor2 registers.
|
||||
VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
|
||||
@ -4268,7 +4260,7 @@ void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Regi
|
||||
mtdscr(t0);
|
||||
}
|
||||
|
||||
mtvrwz(VCRC, crc); // crc lives lives in VCRC, now
|
||||
mtvrwz(VCRC, crc); // crc lives in VCRC, now
|
||||
|
||||
for (int i = 1; i < unroll_factor2; ++i) {
|
||||
li(offs[i], 16 * i);
|
||||
@ -4279,10 +4271,8 @@ void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Regi
|
||||
for (int i = 1; i < unroll_factor2 - 1; ++i) {
|
||||
lvx(consts0[i], offs[i], constants);
|
||||
}
|
||||
addi(constants, constants, (unroll_factor2 - 1) * 16);
|
||||
|
||||
load_const_optimized(num_bytes, 16 * unroll_factor);
|
||||
load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
|
||||
|
||||
// Reuse data registers outside of the loop.
|
||||
VectorRegister Vtmp = data1[0];
|
||||
@ -4310,13 +4300,15 @@ void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Regi
|
||||
cmpd(CCR0, len, num_bytes);
|
||||
blt(CCR0, L_last);
|
||||
|
||||
addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop
|
||||
load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
|
||||
|
||||
// ********** Main loop start **********
|
||||
align(32);
|
||||
bind(L_outer_loop);
|
||||
|
||||
// Begin of unrolled first iteration (no xor).
|
||||
lvx(data1[0], buf);
|
||||
mr(cur_const, constants);
|
||||
for (int i = 1; i < unroll_factor2 / 2; ++i) {
|
||||
lvx(data1[i], offs[i], buf);
|
||||
}
|
||||
@ -4369,6 +4361,8 @@ void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Regi
|
||||
}
|
||||
bdnz(L_inner_loop);
|
||||
|
||||
addi(cur_const, constants, outer_consts_size); // Reset
|
||||
|
||||
// Tail of last iteration (no loads).
|
||||
for (int i = 0; i < unroll_factor2 / 2; ++i) {
|
||||
BE_swap_bytes(data1[i + unroll_factor2 / 2]);
|
||||
@ -4397,15 +4391,15 @@ void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Regi
|
||||
// Last chance with lower num_bytes.
|
||||
bind(L_last);
|
||||
srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
|
||||
add_const_optimized(constants, constants, 16 * (unroll_factor / unroll_factor2)); // Point behind last one.
|
||||
// Point behind last const for inner loop.
|
||||
add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
|
||||
sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
|
||||
clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
|
||||
subf(constants, R0, constants); // Point to constant to be used first.
|
||||
subf(cur_const, R0, cur_const); // Point to constant to be used first.
|
||||
|
||||
addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
|
||||
bgt(CCR0, L_outer_loop);
|
||||
// ********** Main loop end **********
|
||||
#undef BE_swap_bytes
|
||||
|
||||
// Restore DSCR pre-fetch value.
|
||||
if (VM_Version::has_mfdscr()) {
|
||||
@ -4413,13 +4407,45 @@ void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Regi
|
||||
mtdscr(t0);
|
||||
}
|
||||
|
||||
// ********** Simple loop for remaining 16 byte blocks **********
|
||||
{
|
||||
Label L_loop, L_done;
|
||||
|
||||
srdi_(t0, len, 4); // 16 bytes per iteration
|
||||
clrldi(len, len, 64-4);
|
||||
beq(CCR0, L_done);
|
||||
|
||||
// Point to const (same as last const for inner loop).
|
||||
add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16);
|
||||
mtctr(t0);
|
||||
lvx(Vtmp2, cur_const);
|
||||
|
||||
align(32);
|
||||
bind(L_loop);
|
||||
|
||||
lvx(Vtmp, buf);
|
||||
addi(buf, buf, 16);
|
||||
vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
|
||||
BE_swap_bytes(Vtmp);
|
||||
vxor(VCRC, VCRC, Vtmp);
|
||||
vpmsumw(VCRC, VCRC, Vtmp2);
|
||||
bdnz(L_loop);
|
||||
|
||||
bind(L_done);
|
||||
}
|
||||
// ********** Simple loop end **********
|
||||
#undef BE_swap_bytes
|
||||
|
||||
// Point to Barrett constants
|
||||
add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
|
||||
|
||||
vspltisb(zeroes, 0);
|
||||
|
||||
// Combine to 64 bit result.
|
||||
vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
|
||||
|
||||
// Reduce to 32 bit CRC: Remainder by multiply-high.
|
||||
lvx(Vtmp, barretConstants);
|
||||
lvx(Vtmp, cur_const);
|
||||
vsldoi(Vtmp2, zeroes, VCRC, 12); // Extract high 32 bit.
|
||||
vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply by inverse long poly.
|
||||
vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
|
||||
@ -4445,7 +4471,20 @@ void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Regi
|
||||
offsetInt -= 8; ld(R14, offsetInt, R1_SP);
|
||||
offsetInt -= 8; ld(R15, offsetInt, R1_SP);
|
||||
offsetInt -= 8; ld(R16, offsetInt, R1_SP);
|
||||
offsetInt -= 8; ld(R17, offsetInt, R1_SP);
|
||||
}
|
||||
|
||||
void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
|
||||
Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {
|
||||
load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr()
|
||||
: StubRoutines::crc_table_addr() , R0);
|
||||
|
||||
if (VM_Version::has_vpmsumb()) {
|
||||
load_const_optimized(t1, is_crc32c ? StubRoutines::ppc64::crc32c_constants()
|
||||
: StubRoutines::ppc64::crc_constants() , R0);
|
||||
kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);
|
||||
} else {
|
||||
kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c);
|
||||
}
|
||||
}
|
||||
|
||||
void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp, bool invertCRC) {
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 2002, 2019, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012, 2018, SAP SE. All rights reserved.
|
||||
* Copyright (c) 2012, 2019, SAP SE. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -842,13 +842,14 @@ class MacroAssembler: public Assembler {
|
||||
void kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
|
||||
Register t0, Register t1, Register t2, Register t3,
|
||||
bool invertCRC);
|
||||
void kernel_crc32_1word_vpmsum(Register crc, Register buf, Register len, Register table,
|
||||
Register constants, Register barretConstants,
|
||||
Register t0, Register t1, Register t2, Register t3, Register t4,
|
||||
void kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register table, Register constants,
|
||||
Register t0, Register t1, Register t2, Register t3, Register t4, Register t5,
|
||||
bool invertCRC);
|
||||
void kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
|
||||
Register constants, Register barretConstants,
|
||||
Register t0, Register t1, Register t2, Register t3, Register t4);
|
||||
void kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants,
|
||||
Register t0, Register t1, Register t2, Register t3, Register t4, Register t5);
|
||||
// Version which internally decides what to use.
|
||||
void crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
|
||||
Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c);
|
||||
|
||||
void kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp,
|
||||
bool invertCRC);
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012, 2018, SAP SE. All rights reserved.
|
||||
* Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012, 2019, SAP SE. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -3186,35 +3186,6 @@ class StubGenerator: public StubCodeGenerator {
|
||||
return start;
|
||||
}
|
||||
|
||||
|
||||
// Compute CRC32/CRC32C function.
|
||||
void generate_CRC_updateBytes(const char* name, Register table, bool invertCRC) {
|
||||
|
||||
// arguments to kernel_crc32:
|
||||
const Register crc = R3_ARG1; // Current checksum, preset by caller or result from previous call.
|
||||
const Register data = R4_ARG2; // source byte array
|
||||
const Register dataLen = R5_ARG3; // #bytes to process
|
||||
|
||||
const Register t0 = R2;
|
||||
const Register t1 = R7;
|
||||
const Register t2 = R8;
|
||||
const Register t3 = R9;
|
||||
const Register tc0 = R10;
|
||||
const Register tc1 = R11;
|
||||
const Register tc2 = R12;
|
||||
|
||||
BLOCK_COMMENT("Stub body {");
|
||||
assert_different_registers(crc, data, dataLen, table);
|
||||
|
||||
__ kernel_crc32_1word(crc, data, dataLen, table, t0, t1, t2, t3, tc0, tc1, tc2, table, invertCRC);
|
||||
|
||||
BLOCK_COMMENT("return");
|
||||
__ mr_if_needed(R3_RET, crc); // Updated crc is function result. No copying required (R3_ARG1 == R3_RET).
|
||||
__ blr();
|
||||
|
||||
BLOCK_COMMENT("} Stub body");
|
||||
}
|
||||
|
||||
/**
|
||||
* Arguments:
|
||||
*
|
||||
@ -3492,111 +3463,15 @@ class StubGenerator: public StubCodeGenerator {
|
||||
* R3_RET - int crc result
|
||||
*/
|
||||
// Compute CRC32 function.
|
||||
address generate_CRC32_updateBytes(const char* name) {
|
||||
address generate_CRC32_updateBytes(bool is_crc32c) {
|
||||
__ align(CodeEntryAlignment);
|
||||
StubCodeMark mark(this, "StubRoutines", name);
|
||||
StubCodeMark mark(this, "StubRoutines", is_crc32c ? "CRC32C_updateBytes" : "CRC32_updateBytes");
|
||||
address start = __ function_entry(); // Remember stub start address (is rtn value).
|
||||
|
||||
const Register table = R6; // crc table address
|
||||
|
||||
// arguments to kernel_crc32:
|
||||
const Register crc = R3_ARG1; // Current checksum, preset by caller or result from previous call.
|
||||
const Register data = R4_ARG2; // source byte array
|
||||
const Register dataLen = R5_ARG3; // #bytes to process
|
||||
|
||||
if (VM_Version::has_vpmsumb()) {
|
||||
const Register constants = R2; // constants address
|
||||
const Register bconstants = R8; // barret table address
|
||||
|
||||
const Register t0 = R9;
|
||||
const Register t1 = R10;
|
||||
const Register t2 = R11;
|
||||
const Register t3 = R12;
|
||||
const Register t4 = R7;
|
||||
|
||||
BLOCK_COMMENT("Stub body {");
|
||||
assert_different_registers(crc, data, dataLen, table);
|
||||
|
||||
StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table);
|
||||
StubRoutines::ppc64::generate_load_crc_constants_addr(_masm, constants);
|
||||
StubRoutines::ppc64::generate_load_crc_barret_constants_addr(_masm, bconstants);
|
||||
|
||||
__ kernel_crc32_1word_vpmsum(crc, data, dataLen, table, constants, bconstants, t0, t1, t2, t3, t4, true);
|
||||
|
||||
BLOCK_COMMENT("return");
|
||||
__ mr_if_needed(R3_RET, crc); // Updated crc is function result. No copying required (R3_ARG1 == R3_RET).
|
||||
__ blr();
|
||||
|
||||
BLOCK_COMMENT("} Stub body");
|
||||
} else {
|
||||
StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table);
|
||||
generate_CRC_updateBytes(name, table, true);
|
||||
}
|
||||
|
||||
__ crc32(R3_ARG1, R4_ARG2, R5_ARG3, R2, R6, R7, R8, R9, R10, R11, R12, is_crc32c);
|
||||
__ blr();
|
||||
return start;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Arguments:
|
||||
*
|
||||
* Inputs:
|
||||
* R3_ARG1 - int crc
|
||||
* R4_ARG2 - byte* buf
|
||||
* R5_ARG3 - int length (of buffer)
|
||||
*
|
||||
* scratch:
|
||||
* R2, R6-R12
|
||||
*
|
||||
* Ouput:
|
||||
* R3_RET - int crc result
|
||||
*/
|
||||
// Compute CRC32C function.
|
||||
address generate_CRC32C_updateBytes(const char* name) {
|
||||
__ align(CodeEntryAlignment);
|
||||
StubCodeMark mark(this, "StubRoutines", name);
|
||||
address start = __ function_entry(); // Remember stub start address (is rtn value).
|
||||
|
||||
const Register table = R6; // crc table address
|
||||
|
||||
// arguments to kernel_crc32:
|
||||
const Register crc = R3_ARG1; // Current checksum, preset by caller or result from previous call.
|
||||
const Register data = R4_ARG2; // source byte array
|
||||
const Register dataLen = R5_ARG3; // #bytes to process
|
||||
|
||||
if (VM_Version::has_vpmsumb()) {
|
||||
const Register constants = R2; // constants address
|
||||
const Register bconstants = R8; // barret table address
|
||||
|
||||
const Register t0 = R9;
|
||||
const Register t1 = R10;
|
||||
const Register t2 = R11;
|
||||
const Register t3 = R12;
|
||||
const Register t4 = R7;
|
||||
|
||||
BLOCK_COMMENT("Stub body {");
|
||||
assert_different_registers(crc, data, dataLen, table);
|
||||
|
||||
StubRoutines::ppc64::generate_load_crc32c_table_addr(_masm, table);
|
||||
StubRoutines::ppc64::generate_load_crc32c_constants_addr(_masm, constants);
|
||||
StubRoutines::ppc64::generate_load_crc32c_barret_constants_addr(_masm, bconstants);
|
||||
|
||||
__ kernel_crc32_1word_vpmsum(crc, data, dataLen, table, constants, bconstants, t0, t1, t2, t3, t4, false);
|
||||
|
||||
BLOCK_COMMENT("return");
|
||||
__ mr_if_needed(R3_RET, crc); // Updated crc is function result. No copying required (R3_ARG1 == R3_RET).
|
||||
__ blr();
|
||||
|
||||
BLOCK_COMMENT("} Stub body");
|
||||
} else {
|
||||
StubRoutines::ppc64::generate_load_crc32c_table_addr(_masm, table);
|
||||
generate_CRC_updateBytes(name, table, false);
|
||||
}
|
||||
|
||||
return start;
|
||||
}
|
||||
|
||||
|
||||
// Initialization
|
||||
void generate_initial() {
|
||||
// Generates all stubs and initializes the entry points
|
||||
@ -3621,14 +3496,20 @@ class StubGenerator: public StubCodeGenerator {
|
||||
|
||||
// CRC32 Intrinsics.
|
||||
if (UseCRC32Intrinsics) {
|
||||
StubRoutines::_crc_table_adr = (address)StubRoutines::ppc64::_crc_table;
|
||||
StubRoutines::_updateBytesCRC32 = generate_CRC32_updateBytes("CRC32_updateBytes");
|
||||
StubRoutines::_crc_table_adr = (address)StubRoutines::ppc64::_crc_table;
|
||||
if (VM_Version::has_vpmsumb()) {
|
||||
StubRoutines::ppc64::_crc_constants = StubRoutines::ppc64::generate_crc_constants(REVERSE_CRC32_POLY);
|
||||
}
|
||||
StubRoutines::_updateBytesCRC32 = generate_CRC32_updateBytes(false);
|
||||
}
|
||||
|
||||
// CRC32C Intrinsics.
|
||||
if (UseCRC32CIntrinsics) {
|
||||
StubRoutines::_crc32c_table_addr = (address)StubRoutines::ppc64::_crc32c_table;
|
||||
StubRoutines::_updateBytesCRC32C = generate_CRC32C_updateBytes("CRC32C_updateBytes");
|
||||
if (VM_Version::has_vpmsumb()) {
|
||||
StubRoutines::ppc64::_crc32c_constants = StubRoutines::ppc64::generate_crc_constants(REVERSE_CRC32C_POLY);
|
||||
}
|
||||
StubRoutines::_updateBytesCRC32C = generate_CRC32_updateBytes(true);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 2002, 2019, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012, 2018, SAP SE. All rights reserved.
|
||||
* Copyright (c) 2012, 2019, SAP SE. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -45,8 +45,14 @@ enum platform_dependent_constants {
|
||||
#else
|
||||
#define CRC32_TABLES 1
|
||||
#endif
|
||||
#define CRC32_CONSTANTS_SIZE 1084
|
||||
#define CRC32_BARRET_CONSTANTS 10
|
||||
|
||||
#define REVERSE_CRC32_POLY 0xEDB88320
|
||||
#define REVERSE_CRC32C_POLY 0x82F63B78
|
||||
#define INVERSE_REVERSE_CRC32_POLY 0x1aab14226ull
|
||||
#define INVERSE_REVERSE_CRC32C_POLY 0x105fd79bdull
|
||||
#define CRC32_UNROLL_FACTOR 2048
|
||||
#define CRC32_UNROLL_FACTOR2 8
|
||||
|
||||
|
||||
class ppc64 {
|
||||
friend class StubGenerator;
|
||||
@ -56,20 +62,15 @@ class ppc64 {
|
||||
// CRC32 Intrinsics.
|
||||
static juint _crc_table[CRC32_TABLES][CRC32_COLUMN_SIZE];
|
||||
static juint _crc32c_table[CRC32_TABLES][CRC32_COLUMN_SIZE];
|
||||
static juint *_crc_constants, *_crc_barret_constants;
|
||||
static juint *_crc32c_constants, *_crc32c_barret_constants;
|
||||
static juint *_crc_constants;
|
||||
static juint *_crc32c_constants;
|
||||
|
||||
public:
|
||||
|
||||
// CRC32 Intrinsics.
|
||||
static void generate_load_crc_table_addr(MacroAssembler* masm, Register table);
|
||||
static void generate_load_crc_constants_addr(MacroAssembler* masm, Register table);
|
||||
static void generate_load_crc_barret_constants_addr(MacroAssembler* masm, Register table);
|
||||
static void generate_load_crc32c_table_addr(MacroAssembler* masm, Register table);
|
||||
static void generate_load_crc32c_constants_addr(MacroAssembler* masm, Register table);
|
||||
static void generate_load_crc32c_barret_constants_addr(MacroAssembler* masm, Register table);
|
||||
static address crc_constants() { return (address)_crc_constants; }
|
||||
static address crc32c_constants() { return (address)_crc32c_constants; }
|
||||
static juint* generate_crc_constants(juint reverse_poly);
|
||||
static juint* generate_crc_barret_constants(juint reverse_poly);
|
||||
};
|
||||
|
||||
#endif // CPU_PPC_STUBROUTINES_PPC_HPP
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 2002, 2018, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012, 2018, SAP SE. All rights reserved.
|
||||
* Copyright (c) 2002, 2019, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012, 2019, SAP SE. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -33,39 +33,7 @@
|
||||
|
||||
#define __ masm->
|
||||
|
||||
// CRC32(C) Intrinsics.
|
||||
void StubRoutines::ppc64::generate_load_crc_table_addr(MacroAssembler* masm, Register table) {
|
||||
__ load_const_optimized(table, StubRoutines::_crc_table_adr, R0);
|
||||
}
|
||||
|
||||
void StubRoutines::ppc64::generate_load_crc_constants_addr(MacroAssembler* masm, Register table) {
|
||||
__ load_const_optimized(table, (address)StubRoutines::ppc64::_crc_constants, R0);
|
||||
}
|
||||
|
||||
void StubRoutines::ppc64::generate_load_crc_barret_constants_addr(MacroAssembler* masm, Register table) {
|
||||
__ load_const_optimized(table, (address)StubRoutines::ppc64::_crc_barret_constants, R0);
|
||||
}
|
||||
|
||||
void StubRoutines::ppc64::generate_load_crc32c_table_addr(MacroAssembler* masm, Register table) {
|
||||
__ load_const_optimized(table, StubRoutines::_crc32c_table_addr, R0);
|
||||
}
|
||||
|
||||
void StubRoutines::ppc64::generate_load_crc32c_constants_addr(MacroAssembler* masm, Register table) {
|
||||
__ load_const_optimized(table, (address)StubRoutines::ppc64::_crc32c_constants, R0);
|
||||
}
|
||||
|
||||
void StubRoutines::ppc64::generate_load_crc32c_barret_constants_addr(MacroAssembler* masm, Register table) {
|
||||
__ load_const_optimized(table, (address)StubRoutines::ppc64::_crc32c_barret_constants, R0);
|
||||
}
|
||||
|
||||
// CRC constants and compute functions
|
||||
#define REVERSE_CRC32_POLY 0xEDB88320
|
||||
#define REVERSE_CRC32C_POLY 0x82F63B78
|
||||
#define INVERSE_REVERSE_CRC32_POLY 0x1aab14226ull
|
||||
#define INVERSE_REVERSE_CRC32C_POLY 0x105fd79bdull
|
||||
#define UNROLL_FACTOR 2048
|
||||
#define UNROLL_FACTOR2 8
|
||||
|
||||
// CRC constant compute functions
|
||||
static juint fold_word(juint w, juint reverse_poly) {
|
||||
for (int i = 0; i < 32; i++) {
|
||||
int poly_if_odd = (-(w & 1)) & reverse_poly;
|
||||
@ -98,13 +66,13 @@ static julong compute_inverse_poly(julong long_poly) {
|
||||
|
||||
// Constants to fold n words as needed by macroAssembler.
|
||||
juint* StubRoutines::ppc64::generate_crc_constants(juint reverse_poly) {
|
||||
juint* ptr = (juint*) malloc(sizeof(juint) * 4 * (UNROLL_FACTOR2 - 1 + UNROLL_FACTOR / UNROLL_FACTOR2));
|
||||
juint* ptr = (juint*) malloc(sizeof(juint) * 4 * (CRC32_UNROLL_FACTOR2 + CRC32_UNROLL_FACTOR / CRC32_UNROLL_FACTOR2));
|
||||
guarantee(((intptr_t)ptr & 0xF) == 0, "16-byte alignment needed");
|
||||
guarantee(ptr != NULL, "allocation error of a crc table");
|
||||
|
||||
// Generate constants for outer loop
|
||||
juint v0, v1, v2, v3 = 1;
|
||||
for (int i = 0; i < UNROLL_FACTOR2 - 1; ++i) {
|
||||
for (int i = 0; i < CRC32_UNROLL_FACTOR2 - 1; ++i) {
|
||||
v0 = fold_word(v3, reverse_poly);
|
||||
v1 = fold_word(v0, reverse_poly);
|
||||
v2 = fold_word(v1, reverse_poly);
|
||||
@ -123,15 +91,15 @@ juint* StubRoutines::ppc64::generate_crc_constants(juint reverse_poly) {
|
||||
}
|
||||
|
||||
// Generate constants for inner loop
|
||||
juint* ptr2 = ptr + 4 * (UNROLL_FACTOR2 - 1);
|
||||
juint* ptr2 = ptr + 4 * (CRC32_UNROLL_FACTOR2 - 1);
|
||||
v3 = 1; // Restart from scratch.
|
||||
for (int i = 0; i < UNROLL_FACTOR; ++i) {
|
||||
for (int i = 0; i < CRC32_UNROLL_FACTOR; ++i) {
|
||||
v0 = fold_word(v3, reverse_poly);
|
||||
v1 = fold_word(v0, reverse_poly);
|
||||
v2 = fold_word(v1, reverse_poly);
|
||||
v3 = fold_word(v2, reverse_poly);
|
||||
if (i % UNROLL_FACTOR2 == 0) {
|
||||
int idx = UNROLL_FACTOR / UNROLL_FACTOR2 - 1 - i / UNROLL_FACTOR2;
|
||||
if (i % CRC32_UNROLL_FACTOR2 == 0) {
|
||||
int idx = CRC32_UNROLL_FACTOR / CRC32_UNROLL_FACTOR2 - 1 - i / CRC32_UNROLL_FACTOR2;
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
#ifdef VM_LITTLE_ENDIAN
|
||||
ptr2[4*idx ] = v3;
|
||||
@ -148,16 +116,9 @@ juint* StubRoutines::ppc64::generate_crc_constants(juint reverse_poly) {
|
||||
}
|
||||
}
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
||||
// Constants to reduce 64 to 32 bit as needed by macroAssembler.
|
||||
juint* StubRoutines::ppc64::generate_crc_barret_constants(juint reverse_poly) {
|
||||
juint* ptr = (juint*) malloc(sizeof(juint) * CRC32_BARRET_CONSTANTS);
|
||||
guarantee(((intptr_t)ptr & 0xF) == 0, "16-byte alignment needed");
|
||||
guarantee(ptr != NULL, "allocation error of a crc table");
|
||||
|
||||
julong* c = (julong*)ptr;
|
||||
// Constants to reduce 64 to 32 bit as needed by macroAssembler.
|
||||
juint* ptr3 = ptr2 + 4 * (CRC32_UNROLL_FACTOR / CRC32_UNROLL_FACTOR2);
|
||||
julong* c = (julong*)ptr3;
|
||||
julong long_poly = (((julong)reverse_poly) << 1) | 1;
|
||||
julong inverse_long_poly = compute_inverse_poly(long_poly);
|
||||
#ifdef VM_LITTLE_ENDIAN
|
||||
@ -177,6 +138,7 @@ juint* StubRoutines::ppc64::generate_crc_barret_constants(juint reverse_poly) {
|
||||
#endif
|
||||
|
||||
//printf("inv poly: 0x%016llx\n", (long long unsigned int)inverse_long_poly);
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
||||
@ -772,8 +734,5 @@ juint StubRoutines::ppc64::_crc32c_table[CRC32_TABLES][CRC32_COLUMN_SIZE] = {
|
||||
#endif
|
||||
};
|
||||
|
||||
juint* StubRoutines::ppc64::_crc_constants = StubRoutines::ppc64::generate_crc_constants(REVERSE_CRC32_POLY);
|
||||
juint* StubRoutines::ppc64::_crc32c_constants = StubRoutines::ppc64::generate_crc_constants(REVERSE_CRC32C_POLY);
|
||||
|
||||
juint* StubRoutines::ppc64::_crc_barret_constants = StubRoutines::ppc64::generate_crc_barret_constants(REVERSE_CRC32_POLY);
|
||||
juint* StubRoutines::ppc64::_crc32c_barret_constants = StubRoutines::ppc64::generate_crc_barret_constants(REVERSE_CRC32C_POLY);
|
||||
juint* StubRoutines::ppc64::_crc_constants = NULL;
|
||||
juint* StubRoutines::ppc64::_crc32c_constants = NULL;
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 2014, 2018, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2015, 2018, SAP SE. All rights reserved.
|
||||
* Copyright (c) 2014, 2019, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2015, 2019, SAP SE. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -1832,7 +1832,7 @@ address TemplateInterpreterGenerator::generate_CRC32_update_entry() {
|
||||
#endif
|
||||
__ lwz(crc, 2*wordSize, argP); // Current crc state, zero extend to 64 bit to have a clean register.
|
||||
|
||||
StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table);
|
||||
__ load_const_optimized(table, StubRoutines::crc_table_addr(), R0);
|
||||
__ kernel_crc32_singleByte(crc, data, dataLen, table, tmp, true);
|
||||
|
||||
// Restore caller sp for c2i case (from compiled) and for resized sender frame (from interpreted).
|
||||
@ -1873,19 +1873,7 @@ address TemplateInterpreterGenerator::generate_CRC32_updateBytes_entry(AbstractI
|
||||
const Register crc = R3_ARG1; // crc value
|
||||
const Register data = R4_ARG2; // address of java byte array
|
||||
const Register dataLen = R5_ARG3; // source data len
|
||||
const Register table = R6_ARG4; // address of crc32 table
|
||||
|
||||
const Register t0 = R9; // scratch registers for crc calculation
|
||||
const Register t1 = R10;
|
||||
const Register t2 = R11;
|
||||
const Register t3 = R12;
|
||||
|
||||
const Register tc0 = R2; // registers to hold pre-calculated column addresses
|
||||
const Register tc1 = R7;
|
||||
const Register tc2 = R8;
|
||||
const Register tc3 = table; // table address is reconstructed at the end of kernel_crc32_* emitters
|
||||
|
||||
const Register tmp = t0; // Only used very locally to calculate byte buffer address.
|
||||
const Register tmp = R11_scratch1;
|
||||
|
||||
// Arguments are reversed on java expression stack.
|
||||
// Calculate address of start element.
|
||||
@ -1916,12 +1904,7 @@ address TemplateInterpreterGenerator::generate_CRC32_updateBytes_entry(AbstractI
|
||||
__ addi(data, data, arrayOopDesc::base_offset_in_bytes(T_BYTE));
|
||||
}
|
||||
|
||||
StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table);
|
||||
|
||||
// Performance measurements show the 1word and 2word variants to be almost equivalent,
|
||||
// with very light advantages for the 1word variant. We chose the 1word variant for
|
||||
// code compactness.
|
||||
__ kernel_crc32_1word(crc, data, dataLen, table, t0, t1, t2, t3, tc0, tc1, tc2, tc3, true);
|
||||
__ crc32(crc, data, dataLen, R2, R6, R7, R8, R9, R10, R11, R12, false);
|
||||
|
||||
// Restore caller sp for c2i case (from compiled) and for resized sender frame (from interpreted).
|
||||
__ resize_frame_absolute(R21_sender_SP, R11_scratch1, R0);
|
||||
@ -1959,19 +1942,7 @@ address TemplateInterpreterGenerator::generate_CRC32C_updateBytes_entry(Abstract
|
||||
const Register crc = R3_ARG1; // crc value
|
||||
const Register data = R4_ARG2; // address of java byte array
|
||||
const Register dataLen = R5_ARG3; // source data len
|
||||
const Register table = R6_ARG4; // address of crc32c table
|
||||
|
||||
const Register t0 = R9; // scratch registers for crc calculation
|
||||
const Register t1 = R10;
|
||||
const Register t2 = R11;
|
||||
const Register t3 = R12;
|
||||
|
||||
const Register tc0 = R2; // registers to hold pre-calculated column addresses
|
||||
const Register tc1 = R7;
|
||||
const Register tc2 = R8;
|
||||
const Register tc3 = table; // table address is reconstructed at the end of kernel_crc32_* emitters
|
||||
|
||||
const Register tmp = t0; // Only used very locally to calculate byte buffer address.
|
||||
const Register tmp = R11_scratch1;
|
||||
|
||||
// Arguments are reversed on java expression stack.
|
||||
// Calculate address of start element.
|
||||
@ -2004,12 +1975,7 @@ address TemplateInterpreterGenerator::generate_CRC32C_updateBytes_entry(Abstract
|
||||
__ addi(data, data, arrayOopDesc::base_offset_in_bytes(T_BYTE));
|
||||
}
|
||||
|
||||
StubRoutines::ppc64::generate_load_crc32c_table_addr(_masm, table);
|
||||
|
||||
// Performance measurements show the 1word and 2word variants to be almost equivalent,
|
||||
// with very light advantages for the 1word variant. We chose the 1word variant for
|
||||
// code compactness.
|
||||
__ kernel_crc32_1word(crc, data, dataLen, table, t0, t1, t2, t3, tc0, tc1, tc2, tc3, false);
|
||||
__ crc32(crc, data, dataLen, R2, R6, R7, R8, R9, R10, R11, R12, true);
|
||||
|
||||
// Restore caller sp for c2i case (from compiled) and for resized sender frame (from interpreted).
|
||||
__ resize_frame_absolute(R21_sender_SP, R11_scratch1, R0);
|
||||
|
Loading…
x
Reference in New Issue
Block a user