8216060: [PPC64] Vector CRC implementation should be used by interpreter and be faster for short arrays

Reviewed-by: gromero, goetz
Martin Doerr 2019-01-21 09:44:27 +01:00
parent 019cffb539
commit d3339d45e1
6 changed files with 149 additions and 302 deletions

@@ -1,6 +1,6 @@
/*
* Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2018, SAP SE. All rights reserved.
* Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2019, SAP SE. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -3974,7 +3974,7 @@ void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register
* Emits code to update CRC-32 with a 4-byte value according to constants in table
* Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
*/
// A not on the lookup table address(es):
// A note on the lookup table address(es):
// The lookup table consists of two sets of four columns each.
// The columns {0..3} are used for little-endian machines.
// The columns {4..7} are used for big-endian machines.
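
For reference, a minimal scalar model of the 1-word lookup scheme the comment describes (zlib's slicing-by-4 style, using the little-endian columns {0..3}); the name and the t[4][256] layout are illustrative, not HotSpot identifiers:

    #include <stdint.h>

    static uint32_t crc32_update_word_le(uint32_t crc, uint32_t word,
                                         const uint32_t t[4][256]) {
      crc ^= word;                         // fold 4 input bytes into the state
      return t[3][ crc        & 0xff] ^    // one table column per byte lane
             t[2][(crc >>  8) & 0xff] ^
             t[1][(crc >> 16) & 0xff] ^
             t[0][ crc >> 24        ];
    }
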
@@ -4147,57 +4147,50 @@ void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len
* @param len register containing number of bytes
* @param table register pointing to CRC table
* @param constants register pointing to CRC table for 128-bit aligned memory
* @param barretConstants register pointing to table for barrett reduction
* @param t0-t4 temp registers
* @param t0-t5 temp registers
*/
void MacroAssembler::kernel_crc32_1word_vpmsum(Register crc, Register buf, Register len, Register table,
Register constants, Register barretConstants,
Register t0, Register t1, Register t2, Register t3, Register t4,
bool invertCRC) {
void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register table,
Register constants, Register t0, Register t1, Register t2,
Register t3, Register t4, Register t5, bool invertCRC) {
assert_different_registers(crc, buf, len, table);
Label L_alignedHead, L_tail;
Label L_tail;
BLOCK_COMMENT("kernel_crc32_1word_vpmsum {");
BLOCK_COMMENT("kernel_crc32_vpmsum {");
// 1. ~c
if (invertCRC) {
nand(crc, crc, crc); // 1s complement of crc
}
// 2. use kernel_crc32_1word for short len
// Enforce 32 bit.
clrldi(len, len, 32);
cmpdi(CCR0, len, 512);
blt(CCR0, L_tail);
// 3. calculate from 0 to first aligned address
const int alignment = 16;
// Align if we have enough bytes for the fast version.
const int alignment = 16,
threshold = 32;
Register prealign = t0;
andi_(prealign, buf, alignment - 1);
beq(CCR0, L_alignedHead);
subfic(prealign, prealign, alignment);
neg(prealign, buf);
addi(t1, len, -threshold);
andi(prealign, prealign, alignment - 1);
cmpw(CCR0, t1, prealign);
blt(CCR0, L_tail); // len - prealign < threshold?
subf(len, prealign, len);
update_byteLoop_crc32(crc, buf, prealign, table, t2, false);
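
A worked example of the prealign computation above (illustrative values): for buf = 0x1009 and len = 600, neg plus andi yield prealign = (-0x1009) & 15 = 7; since len - threshold = 568 >= 7, the fast path is taken: 7 bytes run through update_byteLoop_crc32, and the vector kernel then starts at the 16-byte-aligned address 0x1010 with len = 593.
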
// 4. calculate from first aligned address as far as possible
BIND(L_alignedHead);
kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t0, t1, t2, t3, t4);
// Calculate from first aligned address as far as possible.
kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5);
// 5. remaining bytes
// Remaining bytes.
BIND(L_tail);
Register tc0 = t4;
Register tc1 = constants;
Register tc2 = barretConstants;
kernel_crc32_1word(crc, buf, len, table, t0, t1, t2, t3, tc0, tc1, tc2, table, false);
update_byteLoop_crc32(crc, buf, len, table, t2, false);
// 6. ~c
if (invertCRC) {
nand(crc, crc, crc); // 1s complement of crc
}
BLOCK_COMMENT("} kernel_crc32_1word_vpmsum");
BLOCK_COMMENT("} kernel_crc32_vpmsum");
}
/**
@@ -4205,13 +4198,10 @@ void MacroAssembler::kernel_crc32_1word_vpmsum(Register crc, Register buf, Regis
* @param buf register pointing to input byte buffer (byte*)
* @param len register containing number of bytes (will get updated to remaining bytes)
* @param constants register pointing to CRC table for 128-bit aligned memory
* @param barretConstants register pointing to table for barrett reduction
* @param t0-t4 temp registers
* Precondition: len should be >= 512. Otherwise, nothing will be done.
* @param t0-t5 temp registers
*/
void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
Register constants, Register barretConstants,
Register t0, Register t1, Register t2, Register t3, Register t4) {
void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len,
Register constants, Register t0, Register t1, Register t2, Register t3, Register t4, Register t5) {
// Save non-volatile vector registers (frameless).
Register offset = t1;
@@ -4228,7 +4218,6 @@ void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Regi
offsetInt -= 8; std(R14, offsetInt, R1_SP);
offsetInt -= 8; std(R15, offsetInt, R1_SP);
offsetInt -= 8; std(R16, offsetInt, R1_SP);
offsetInt -= 8; std(R17, offsetInt, R1_SP);
// Implementation uses an inner loop which uses between 256 and 16 * unroll_factor
// bytes per iteration. The basic scheme is:
@@ -4239,14 +4228,17 @@ void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Regi
// Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
// Using 16 * unroll_factor / unroll_factor2 bytes for constants.
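
The split into unroll_factor2 independent streams works because CRC (with zero initial value and no final inversion) is GF(2)-linear: crc(A || B) = crc(A || zeros(len(B))) ^ crc(B), and appending zeros is, in effect, what the fold constants implement via vpmsum. A small self-contained check of that identity, with illustrative names (plain scalar C++, not the vector kernel):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static uint32_t crc32_bitwise(uint32_t crc, const uint8_t* p, size_t n) {
      while (n--) {
        crc ^= *p++;
        for (int i = 0; i < 8; i++)
          crc = (crc >> 1) ^ ((-(crc & 1)) & 0xEDB88320u); // reversed CRC-32 poly
      }
      return crc;
    }

    int main() {
      uint8_t a[8] = {1,2,3,4,5,6,7,8}, b[8] = {9,10,11,12,13,14,15,16};
      uint8_t ab[16], az[16] = {0};
      memcpy(ab, a, 8); memcpy(ab + 8, b, 8);
      memcpy(az, a, 8);                        // A followed by 8 zero bytes
      uint32_t whole = crc32_bitwise(0, ab, 16);
      uint32_t split = crc32_bitwise(0, az, 16) ^ crc32_bitwise(0, b, 8);
      printf("%08x %08x\n", whole, split);     // both values match
      return 0;
    }
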
const int unroll_factor = 2048;
const int unroll_factor2 = 8;
const int unroll_factor = CRC32_UNROLL_FACTOR,
unroll_factor2 = CRC32_UNROLL_FACTOR2;
const int outer_consts_size = (unroll_factor2 - 1) * 16,
inner_consts_size = (unroll_factor / unroll_factor2) * 16;
// Support registers.
Register offs[] = { noreg, t0, t1, t2, t3, t4, crc /* will live in VCRC */, R14 };
Register num_bytes = R15,
loop_count = R16,
cur_const = R17;
Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, crc /* will live in VCRC */ };
Register num_bytes = R14,
loop_count = R15,
cur_const = R16;
// Constant array for outer loop: unroll_factor2 - 1 registers,
// Constant array for inner loop: unroll_factor / unroll_factor2 registers.
VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
@@ -4268,7 +4260,7 @@ void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Regi
mtdscr(t0);
}
mtvrwz(VCRC, crc); // crc lives lives in VCRC, now
mtvrwz(VCRC, crc); // crc lives in VCRC, now
for (int i = 1; i < unroll_factor2; ++i) {
li(offs[i], 16 * i);
@@ -4279,10 +4271,8 @@ void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Regi
for (int i = 1; i < unroll_factor2 - 1; ++i) {
lvx(consts0[i], offs[i], constants);
}
addi(constants, constants, (unroll_factor2 - 1) * 16);
load_const_optimized(num_bytes, 16 * unroll_factor);
load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
// Reuse data registers outside of the loop.
VectorRegister Vtmp = data1[0];
@@ -4310,13 +4300,15 @@ void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Regi
cmpd(CCR0, len, num_bytes);
blt(CCR0, L_last);
addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop
load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
// ********** Main loop start **********
align(32);
bind(L_outer_loop);
// Begin of unrolled first iteration (no xor).
lvx(data1[0], buf);
mr(cur_const, constants);
for (int i = 1; i < unroll_factor2 / 2; ++i) {
lvx(data1[i], offs[i], buf);
}
@@ -4369,6 +4361,8 @@ void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Regi
}
bdnz(L_inner_loop);
addi(cur_const, constants, outer_consts_size); // Reset
// Tail of last iteration (no loads).
for (int i = 0; i < unroll_factor2 / 2; ++i) {
BE_swap_bytes(data1[i + unroll_factor2 / 2]);
@@ -4397,15 +4391,15 @@ void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Regi
// Last chance with lower num_bytes.
bind(L_last);
srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
add_const_optimized(constants, constants, 16 * (unroll_factor / unroll_factor2)); // Point behind last one.
// Point behind last const for inner loop.
add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
subf(constants, R0, constants); // Point to constant to be used first.
subf(cur_const, R0, cur_const); // Point to constant to be used first.
addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
bgt(CCR0, L_outer_loop);
// ********** Main loop end **********
#undef BE_swap_bytes
// Restore DSCR pre-fetch value.
if (VM_Version::has_mfdscr()) {
@@ -4413,13 +4407,45 @@ void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Regi
mtdscr(t0);
}
// ********** Simple loop for remaining 16 byte blocks **********
{
Label L_loop, L_done;
srdi_(t0, len, 4); // 16 bytes per iteration
clrldi(len, len, 64-4);
beq(CCR0, L_done);
// Point to const (same as last const for inner loop).
add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16);
mtctr(t0);
lvx(Vtmp2, cur_const);
align(32);
bind(L_loop);
lvx(Vtmp, buf);
addi(buf, buf, 16);
vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
BE_swap_bytes(Vtmp);
vxor(VCRC, VCRC, Vtmp);
vpmsumw(VCRC, VCRC, Vtmp2);
bdnz(L_loop);
bind(L_done);
}
// ********** Simple loop end **********
#undef BE_swap_bytes
// Point to Barrett constants
add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
vspltisb(zeroes, 0);
// Combine to 64 bit result.
vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
// Reduce to 32 bit CRC: Remainder by multiply-high.
lvx(Vtmp, barretConstants);
lvx(Vtmp, cur_const);
vsldoi(Vtmp2, zeroes, VCRC, 12); // Extract high 32 bit.
vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply by inverse long poly.
vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
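
The reduction above is Barrett's method: approximate the quotient by a carryless multiply-high with a precomputed constant mu = floor(x^64 / P), then subtract q*P. A self-contained scalar sketch in the plain (non-reflected) bit order; the vpmsum code uses a bit-reflected layout with its own constant (INVERSE_REVERSE_CRC32_POLY), so the constants here are computed in the program, not HotSpot's:

    #include <stdint.h>
    #include <stdio.h>

    // Carryless (GF(2)) multiply; callers keep degrees small enough
    // that the product fits in 64 bits.
    static uint64_t clmul(uint64_t a, uint64_t b) {
      uint64_t r = 0;
      for (int i = 0; i < 64; i++)
        if ((b >> i) & 1) r ^= a << i;
      return r;
    }

    // mu = floor(x^64 / P) by GF(2) long division (P includes the x^32 term).
    // Uses the GCC/Clang __int128 extension to hold the 65-bit dividend.
    static uint64_t barrett_mu(uint64_t poly33) {
      unsigned __int128 r = (unsigned __int128)1 << 64;   // x^64
      uint64_t q = 0;
      for (int d = 64; d >= 32; d--)
        if ((r >> d) & 1) {
          q ^= 1ull << (d - 32);
          r ^= (unsigned __int128)poly33 << (d - 32);
        }
      return q;                                           // 33-bit quotient
    }

    // Direct remainder, for verification.
    static uint32_t polymod(uint64_t v, uint64_t poly33) {
      for (int d = 63; d >= 32; d--)
        if ((v >> d) & 1) v ^= poly33 << (d - 32);
      return (uint32_t)v;
    }

    // "Remainder by multiply-high": q approximates v/P, then v ^ q*P.
    static uint32_t barrett(uint64_t v, uint64_t poly33, uint64_t mu) {
      uint64_t q = clmul(v >> 32, mu) >> 32;    // multiply-high gives the quotient
      return (uint32_t)(v ^ clmul(q, poly33));  // high halves cancel; remainder stays
    }

    int main() {
      const uint64_t P = 0x104C11DB7ull;        // CRC-32 polynomial incl. x^32
      uint64_t mu = barrett_mu(P);
      uint64_t v  = 0x123456789ABCDEF0ull;      // arbitrary folded 64-bit value
      printf("%08x %08x\n", barrett(v, P, mu), polymod(v, P)); // both match
      return 0;
    }
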
@@ -4445,7 +4471,20 @@ void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Regi
offsetInt -= 8; ld(R14, offsetInt, R1_SP);
offsetInt -= 8; ld(R15, offsetInt, R1_SP);
offsetInt -= 8; ld(R16, offsetInt, R1_SP);
offsetInt -= 8; ld(R17, offsetInt, R1_SP);
}
void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {
load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr()
: StubRoutines::crc_table_addr() , R0);
if (VM_Version::has_vpmsumb()) {
load_const_optimized(t1, is_crc32c ? StubRoutines::ppc64::crc32c_constants()
: StubRoutines::ppc64::crc_constants() , R0);
kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);
} else {
kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c);
}
}
void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp, bool invertCRC) {

@@ -1,6 +1,6 @@
/*
* Copyright (c) 2002, 2019, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2018, SAP SE. All rights reserved.
* Copyright (c) 2012, 2019, SAP SE. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -842,13 +842,14 @@ class MacroAssembler: public Assembler {
void kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
Register t0, Register t1, Register t2, Register t3,
bool invertCRC);
void kernel_crc32_1word_vpmsum(Register crc, Register buf, Register len, Register table,
Register constants, Register barretConstants,
Register t0, Register t1, Register t2, Register t3, Register t4,
void kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register table, Register constants,
Register t0, Register t1, Register t2, Register t3, Register t4, Register t5,
bool invertCRC);
void kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
Register constants, Register barretConstants,
Register t0, Register t1, Register t2, Register t3, Register t4);
void kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants,
Register t0, Register t1, Register t2, Register t3, Register t4, Register t5);
// Version which internally decides what to use.
void crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c);
void kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp,
bool invertCRC);

@@ -1,6 +1,6 @@
/*
* Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2018, SAP SE. All rights reserved.
* Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2019, SAP SE. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -3186,35 +3186,6 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
// Compute CRC32/CRC32C function.
void generate_CRC_updateBytes(const char* name, Register table, bool invertCRC) {
// arguments to kernel_crc32:
const Register crc = R3_ARG1; // Current checksum, preset by caller or result from previous call.
const Register data = R4_ARG2; // source byte array
const Register dataLen = R5_ARG3; // #bytes to process
const Register t0 = R2;
const Register t1 = R7;
const Register t2 = R8;
const Register t3 = R9;
const Register tc0 = R10;
const Register tc1 = R11;
const Register tc2 = R12;
BLOCK_COMMENT("Stub body {");
assert_different_registers(crc, data, dataLen, table);
__ kernel_crc32_1word(crc, data, dataLen, table, t0, t1, t2, t3, tc0, tc1, tc2, table, invertCRC);
BLOCK_COMMENT("return");
__ mr_if_needed(R3_RET, crc); // Updated crc is function result. No copying required (R3_ARG1 == R3_RET).
__ blr();
BLOCK_COMMENT("} Stub body");
}
/**
* Arguments:
*
@@ -3492,111 +3463,15 @@ class StubGenerator: public StubCodeGenerator {
* R3_RET - int crc result
*/
// Compute CRC32 function.
address generate_CRC32_updateBytes(const char* name) {
address generate_CRC32_updateBytes(bool is_crc32c) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
StubCodeMark mark(this, "StubRoutines", is_crc32c ? "CRC32C_updateBytes" : "CRC32_updateBytes");
address start = __ function_entry(); // Remember stub start address (is rtn value).
const Register table = R6; // crc table address
// arguments to kernel_crc32:
const Register crc = R3_ARG1; // Current checksum, preset by caller or result from previous call.
const Register data = R4_ARG2; // source byte array
const Register dataLen = R5_ARG3; // #bytes to process
if (VM_Version::has_vpmsumb()) {
const Register constants = R2; // constants address
const Register bconstants = R8; // barret table address
const Register t0 = R9;
const Register t1 = R10;
const Register t2 = R11;
const Register t3 = R12;
const Register t4 = R7;
BLOCK_COMMENT("Stub body {");
assert_different_registers(crc, data, dataLen, table);
StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table);
StubRoutines::ppc64::generate_load_crc_constants_addr(_masm, constants);
StubRoutines::ppc64::generate_load_crc_barret_constants_addr(_masm, bconstants);
__ kernel_crc32_1word_vpmsum(crc, data, dataLen, table, constants, bconstants, t0, t1, t2, t3, t4, true);
BLOCK_COMMENT("return");
__ mr_if_needed(R3_RET, crc); // Updated crc is function result. No copying required (R3_ARG1 == R3_RET).
__ blr();
BLOCK_COMMENT("} Stub body");
} else {
StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table);
generate_CRC_updateBytes(name, table, true);
}
__ crc32(R3_ARG1, R4_ARG2, R5_ARG3, R2, R6, R7, R8, R9, R10, R11, R12, is_crc32c);
__ blr();
return start;
}
/**
* Arguments:
*
* Inputs:
* R3_ARG1 - int crc
* R4_ARG2 - byte* buf
* R5_ARG3 - int length (of buffer)
*
* scratch:
* R2, R6-R12
*
* Output:
* R3_RET - int crc result
*/
// Compute CRC32C function.
address generate_CRC32C_updateBytes(const char* name) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
address start = __ function_entry(); // Remember stub start address (is rtn value).
const Register table = R6; // crc table address
// arguments to kernel_crc32:
const Register crc = R3_ARG1; // Current checksum, preset by caller or result from previous call.
const Register data = R4_ARG2; // source byte array
const Register dataLen = R5_ARG3; // #bytes to process
if (VM_Version::has_vpmsumb()) {
const Register constants = R2; // constants address
const Register bconstants = R8; // barret table address
const Register t0 = R9;
const Register t1 = R10;
const Register t2 = R11;
const Register t3 = R12;
const Register t4 = R7;
BLOCK_COMMENT("Stub body {");
assert_different_registers(crc, data, dataLen, table);
StubRoutines::ppc64::generate_load_crc32c_table_addr(_masm, table);
StubRoutines::ppc64::generate_load_crc32c_constants_addr(_masm, constants);
StubRoutines::ppc64::generate_load_crc32c_barret_constants_addr(_masm, bconstants);
__ kernel_crc32_1word_vpmsum(crc, data, dataLen, table, constants, bconstants, t0, t1, t2, t3, t4, false);
BLOCK_COMMENT("return");
__ mr_if_needed(R3_RET, crc); // Updated crc is function result. No copying required (R3_ARG1 == R3_RET).
__ blr();
BLOCK_COMMENT("} Stub body");
} else {
StubRoutines::ppc64::generate_load_crc32c_table_addr(_masm, table);
generate_CRC_updateBytes(name, table, false);
}
return start;
}
// Initialization
void generate_initial() {
// Generates all stubs and initializes the entry points
@@ -3621,14 +3496,20 @@ class StubGenerator: public StubCodeGenerator {
// CRC32 Intrinsics.
if (UseCRC32Intrinsics) {
StubRoutines::_crc_table_adr = (address)StubRoutines::ppc64::_crc_table;
StubRoutines::_updateBytesCRC32 = generate_CRC32_updateBytes("CRC32_updateBytes");
StubRoutines::_crc_table_adr = (address)StubRoutines::ppc64::_crc_table;
if (VM_Version::has_vpmsumb()) {
StubRoutines::ppc64::_crc_constants = StubRoutines::ppc64::generate_crc_constants(REVERSE_CRC32_POLY);
}
StubRoutines::_updateBytesCRC32 = generate_CRC32_updateBytes(false);
}
// CRC32C Intrinsics.
if (UseCRC32CIntrinsics) {
StubRoutines::_crc32c_table_addr = (address)StubRoutines::ppc64::_crc32c_table;
StubRoutines::_updateBytesCRC32C = generate_CRC32C_updateBytes("CRC32C_updateBytes");
if (VM_Version::has_vpmsumb()) {
StubRoutines::ppc64::_crc32c_constants = StubRoutines::ppc64::generate_crc_constants(REVERSE_CRC32C_POLY);
}
StubRoutines::_updateBytesCRC32C = generate_CRC32_updateBytes(true);
}
}

@@ -1,6 +1,6 @@
/*
* Copyright (c) 2002, 2019, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2018, SAP SE. All rights reserved.
* Copyright (c) 2012, 2019, SAP SE. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -45,8 +45,14 @@ enum platform_dependent_constants {
#else
#define CRC32_TABLES 1
#endif
#define CRC32_CONSTANTS_SIZE 1084
#define CRC32_BARRET_CONSTANTS 10
#define REVERSE_CRC32_POLY 0xEDB88320
#define REVERSE_CRC32C_POLY 0x82F63B78
#define INVERSE_REVERSE_CRC32_POLY 0x1aab14226ull
#define INVERSE_REVERSE_CRC32C_POLY 0x105fd79bdull
#define CRC32_UNROLL_FACTOR 2048
#define CRC32_UNROLL_FACTOR2 8
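
The merged constants table keyed to these unroll factors replaces the former fixed-size CRC32_CONSTANTS_SIZE/CRC32_BARRET_CONSTANTS pair; its layout and size follow from the allocation in generate_crc_constants and the offsets used by kernel_crc32_vpmsum_aligned:

    // outer consts  : (CRC32_UNROLL_FACTOR2 - 1) * 16 bytes             =   7 * 16 =  112
    // inner consts  : (CRC32_UNROLL_FACTOR / CRC32_UNROLL_FACTOR2) * 16 = 256 * 16 = 4096
    // Barrett consts: one 16-byte entry                                 =             16
    // total = sizeof(juint) * 4 * (CRC32_UNROLL_FACTOR2 + CRC32_UNROLL_FACTOR / CRC32_UNROLL_FACTOR2)
    //       = 16 * (8 + 256) = 4224 bytes
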
class ppc64 {
friend class StubGenerator;
@@ -56,20 +62,15 @@ class ppc64 {
// CRC32 Intrinsics.
static juint _crc_table[CRC32_TABLES][CRC32_COLUMN_SIZE];
static juint _crc32c_table[CRC32_TABLES][CRC32_COLUMN_SIZE];
static juint *_crc_constants, *_crc_barret_constants;
static juint *_crc32c_constants, *_crc32c_barret_constants;
static juint *_crc_constants;
static juint *_crc32c_constants;
public:
// CRC32 Intrinsics.
static void generate_load_crc_table_addr(MacroAssembler* masm, Register table);
static void generate_load_crc_constants_addr(MacroAssembler* masm, Register table);
static void generate_load_crc_barret_constants_addr(MacroAssembler* masm, Register table);
static void generate_load_crc32c_table_addr(MacroAssembler* masm, Register table);
static void generate_load_crc32c_constants_addr(MacroAssembler* masm, Register table);
static void generate_load_crc32c_barret_constants_addr(MacroAssembler* masm, Register table);
static address crc_constants() { return (address)_crc_constants; }
static address crc32c_constants() { return (address)_crc32c_constants; }
static juint* generate_crc_constants(juint reverse_poly);
static juint* generate_crc_barret_constants(juint reverse_poly);
};
#endif // CPU_PPC_STUBROUTINES_PPC_HPP

@@ -1,6 +1,6 @@
/*
* Copyright (c) 2002, 2018, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2018, SAP SE. All rights reserved.
* Copyright (c) 2002, 2019, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2019, SAP SE. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -33,39 +33,7 @@
#define __ masm->
// CRC32(C) Intrinsics.
void StubRoutines::ppc64::generate_load_crc_table_addr(MacroAssembler* masm, Register table) {
__ load_const_optimized(table, StubRoutines::_crc_table_adr, R0);
}
void StubRoutines::ppc64::generate_load_crc_constants_addr(MacroAssembler* masm, Register table) {
__ load_const_optimized(table, (address)StubRoutines::ppc64::_crc_constants, R0);
}
void StubRoutines::ppc64::generate_load_crc_barret_constants_addr(MacroAssembler* masm, Register table) {
__ load_const_optimized(table, (address)StubRoutines::ppc64::_crc_barret_constants, R0);
}
void StubRoutines::ppc64::generate_load_crc32c_table_addr(MacroAssembler* masm, Register table) {
__ load_const_optimized(table, StubRoutines::_crc32c_table_addr, R0);
}
void StubRoutines::ppc64::generate_load_crc32c_constants_addr(MacroAssembler* masm, Register table) {
__ load_const_optimized(table, (address)StubRoutines::ppc64::_crc32c_constants, R0);
}
void StubRoutines::ppc64::generate_load_crc32c_barret_constants_addr(MacroAssembler* masm, Register table) {
__ load_const_optimized(table, (address)StubRoutines::ppc64::_crc32c_barret_constants, R0);
}
// CRC constants and compute functions
#define REVERSE_CRC32_POLY 0xEDB88320
#define REVERSE_CRC32C_POLY 0x82F63B78
#define INVERSE_REVERSE_CRC32_POLY 0x1aab14226ull
#define INVERSE_REVERSE_CRC32C_POLY 0x105fd79bdull
#define UNROLL_FACTOR 2048
#define UNROLL_FACTOR2 8
// CRC constant compute functions
static juint fold_word(juint w, juint reverse_poly) {
for (int i = 0; i < 32; i++) {
int poly_if_odd = (-(w & 1)) & reverse_poly;
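
The hunk cuts the function off here. Presumably the loop body completes the standard bit-reflected CRC step; a sketch of the full function under that assumption (juint/jint stubbed for self-containment, not the verbatim HotSpot source):

    #include <stdint.h>
    typedef uint32_t juint; typedef int32_t jint;

    static juint fold_word_sketch(juint w, juint reverse_poly) {
      for (int i = 0; i < 32; i++) {
        juint poly_if_odd = (juint)(-(jint)(w & 1)) & reverse_poly;
        w = (w >> 1) ^ poly_if_odd;   // advance the CRC state by one zero bit
      }
      return w;                       // state after folding in one zero word
    }
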
@@ -98,13 +66,13 @@ static julong compute_inverse_poly(julong long_poly) {
// Constants to fold n words as needed by macroAssembler.
juint* StubRoutines::ppc64::generate_crc_constants(juint reverse_poly) {
juint* ptr = (juint*) malloc(sizeof(juint) * 4 * (UNROLL_FACTOR2 - 1 + UNROLL_FACTOR / UNROLL_FACTOR2));
juint* ptr = (juint*) malloc(sizeof(juint) * 4 * (CRC32_UNROLL_FACTOR2 + CRC32_UNROLL_FACTOR / CRC32_UNROLL_FACTOR2));
guarantee(((intptr_t)ptr & 0xF) == 0, "16-byte alignment needed");
guarantee(ptr != NULL, "allocation error of a crc table");
// Generate constants for outer loop
juint v0, v1, v2, v3 = 1;
for (int i = 0; i < UNROLL_FACTOR2 - 1; ++i) {
for (int i = 0; i < CRC32_UNROLL_FACTOR2 - 1; ++i) {
v0 = fold_word(v3, reverse_poly);
v1 = fold_word(v0, reverse_poly);
v2 = fold_word(v1, reverse_poly);
@@ -123,15 +91,15 @@ juint* StubRoutines::ppc64::generate_crc_constants(juint reverse_poly) {
}
// Generate constants for inner loop
juint* ptr2 = ptr + 4 * (UNROLL_FACTOR2 - 1);
juint* ptr2 = ptr + 4 * (CRC32_UNROLL_FACTOR2 - 1);
v3 = 1; // Restart from scratch.
for (int i = 0; i < UNROLL_FACTOR; ++i) {
for (int i = 0; i < CRC32_UNROLL_FACTOR; ++i) {
v0 = fold_word(v3, reverse_poly);
v1 = fold_word(v0, reverse_poly);
v2 = fold_word(v1, reverse_poly);
v3 = fold_word(v2, reverse_poly);
if (i % UNROLL_FACTOR2 == 0) {
int idx = UNROLL_FACTOR / UNROLL_FACTOR2 - 1 - i / UNROLL_FACTOR2;
if (i % CRC32_UNROLL_FACTOR2 == 0) {
int idx = CRC32_UNROLL_FACTOR / CRC32_UNROLL_FACTOR2 - 1 - i / CRC32_UNROLL_FACTOR2;
for (int j = 0; j < 4; ++j) {
#ifdef VM_LITTLE_ENDIAN
ptr2[4*idx ] = v3;
@@ -148,16 +116,9 @@ juint* StubRoutines::ppc64::generate_crc_constants(juint reverse_poly) {
}
}
return ptr;
}
// Constants to reduce 64 to 32 bit as needed by macroAssembler.
juint* StubRoutines::ppc64::generate_crc_barret_constants(juint reverse_poly) {
juint* ptr = (juint*) malloc(sizeof(juint) * CRC32_BARRET_CONSTANTS);
guarantee(((intptr_t)ptr & 0xF) == 0, "16-byte alignment needed");
guarantee(ptr != NULL, "allocation error of a crc table");
julong* c = (julong*)ptr;
// Constants to reduce 64 to 32 bit as needed by macroAssembler.
juint* ptr3 = ptr2 + 4 * (CRC32_UNROLL_FACTOR / CRC32_UNROLL_FACTOR2);
julong* c = (julong*)ptr3;
julong long_poly = (((julong)reverse_poly) << 1) | 1;
julong inverse_long_poly = compute_inverse_poly(long_poly);
#ifdef VM_LITTLE_ENDIAN
@@ -177,6 +138,7 @@ juint* StubRoutines::ppc64::generate_crc_barret_constants(juint reverse_poly) {
#endif
//printf("inv poly: 0x%016llx\n", (long long unsigned int)inverse_long_poly);
return ptr;
}
@@ -772,8 +734,5 @@ juint StubRoutines::ppc64::_crc32c_table[CRC32_TABLES][CRC32_COLUMN_SIZE] = {
#endif
};
juint* StubRoutines::ppc64::_crc_constants = StubRoutines::ppc64::generate_crc_constants(REVERSE_CRC32_POLY);
juint* StubRoutines::ppc64::_crc32c_constants = StubRoutines::ppc64::generate_crc_constants(REVERSE_CRC32C_POLY);
juint* StubRoutines::ppc64::_crc_barret_constants = StubRoutines::ppc64::generate_crc_barret_constants(REVERSE_CRC32_POLY);
juint* StubRoutines::ppc64::_crc32c_barret_constants = StubRoutines::ppc64::generate_crc_barret_constants(REVERSE_CRC32C_POLY);
juint* StubRoutines::ppc64::_crc_constants = NULL;
juint* StubRoutines::ppc64::_crc32c_constants = NULL;

@@ -1,6 +1,6 @@
/*
* Copyright (c) 2014, 2018, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2015, 2018, SAP SE. All rights reserved.
* Copyright (c) 2014, 2019, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2015, 2019, SAP SE. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -1832,7 +1832,7 @@ address TemplateInterpreterGenerator::generate_CRC32_update_entry() {
#endif
__ lwz(crc, 2*wordSize, argP); // Current crc state, zero extend to 64 bit to have a clean register.
StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table);
__ load_const_optimized(table, StubRoutines::crc_table_addr(), R0);
__ kernel_crc32_singleByte(crc, data, dataLen, table, tmp, true);
// Restore caller sp for c2i case (from compiled) and for resized sender frame (from interpreted).
@@ -1873,19 +1873,7 @@ address TemplateInterpreterGenerator::generate_CRC32_updateBytes_entry(AbstractI
const Register crc = R3_ARG1; // crc value
const Register data = R4_ARG2; // address of java byte array
const Register dataLen = R5_ARG3; // source data len
const Register table = R6_ARG4; // address of crc32 table
const Register t0 = R9; // scratch registers for crc calculation
const Register t1 = R10;
const Register t2 = R11;
const Register t3 = R12;
const Register tc0 = R2; // registers to hold pre-calculated column addresses
const Register tc1 = R7;
const Register tc2 = R8;
const Register tc3 = table; // table address is reconstructed at the end of kernel_crc32_* emitters
const Register tmp = t0; // Only used very locally to calculate byte buffer address.
const Register tmp = R11_scratch1;
// Arguments are reversed on java expression stack.
// Calculate address of start element.
@@ -1916,12 +1904,7 @@ address TemplateInterpreterGenerator::generate_CRC32_updateBytes_entry(AbstractI
__ addi(data, data, arrayOopDesc::base_offset_in_bytes(T_BYTE));
}
StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table);
// Performance measurements show the 1word and 2word variants to be almost equivalent,
// with very light advantages for the 1word variant. We chose the 1word variant for
// code compactness.
__ kernel_crc32_1word(crc, data, dataLen, table, t0, t1, t2, t3, tc0, tc1, tc2, tc3, true);
__ crc32(crc, data, dataLen, R2, R6, R7, R8, R9, R10, R11, R12, false);
// Restore caller sp for c2i case (from compiled) and for resized sender frame (from interpreted).
__ resize_frame_absolute(R21_sender_SP, R11_scratch1, R0);
@@ -1959,19 +1942,7 @@ address TemplateInterpreterGenerator::generate_CRC32C_updateBytes_entry(Abstract
const Register crc = R3_ARG1; // crc value
const Register data = R4_ARG2; // address of java byte array
const Register dataLen = R5_ARG3; // source data len
const Register table = R6_ARG4; // address of crc32c table
const Register t0 = R9; // scratch registers for crc calculation
const Register t1 = R10;
const Register t2 = R11;
const Register t3 = R12;
const Register tc0 = R2; // registers to hold pre-calculated column addresses
const Register tc1 = R7;
const Register tc2 = R8;
const Register tc3 = table; // table address is reconstructed at the end of kernel_crc32_* emitters
const Register tmp = t0; // Only used very locally to calculate byte buffer address.
const Register tmp = R11_scratch1;
// Arguments are reversed on java expression stack.
// Calculate address of start element.
@@ -2004,12 +1975,7 @@ address TemplateInterpreterGenerator::generate_CRC32C_updateBytes_entry(Abstract
__ addi(data, data, arrayOopDesc::base_offset_in_bytes(T_BYTE));
}
StubRoutines::ppc64::generate_load_crc32c_table_addr(_masm, table);
// Performance measurements show the 1word and 2word variants to be almost equivalent,
// with very light advantages for the 1word variant. We chose the 1word variant for
// code compactness.
__ kernel_crc32_1word(crc, data, dataLen, table, t0, t1, t2, t3, tc0, tc1, tc2, tc3, false);
__ crc32(crc, data, dataLen, R2, R6, R7, R8, R9, R10, R11, R12, true);
// Restore caller sp for c2i case (from compiled) and for resized sender frame (from interpreted).
__ resize_frame_absolute(R21_sender_SP, R11_scratch1, R0);