8175369: [ppc] Provide intrinsic implementation for CRC32C

Reviewed-by: mdoerr, simonis, kvn
Lutz Schmidt 2017-03-08 17:01:13 -08:00
parent 99a554c5ad
commit 0171aad88e
10 changed files with 963 additions and 547 deletions

View File

@ -1,6 +1,6 @@
/*
* Copyright (c) 2000, 2016, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2016 SAP SE. All rights reserved.
* Copyright (c) 2000, 2017, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2017, SAP SE. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -3177,9 +3177,8 @@ void LIR_Assembler::emit_updatecrc32(LIR_OpUpdateCRC32* op) {
assert_different_registers(val, crc, res);
__ load_const_optimized(res, StubRoutines::crc_table_addr(), R0);
__ nand(crc, crc, crc); // ~crc
__ update_byte_crc32(crc, val, res);
__ nand(res, crc, crc); // ~crc
__ kernel_crc32_singleByteReg(crc, val, res, true);
__ mr(res, crc);
}
#undef __
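
For reference, a minimal standalone C++ sketch of what the new kernel_crc32_singleByteReg(crc, val, table, true) computes in place of the open-coded nand/update/nand sequence above. The table layout and helper are illustrative, not the HotSpot emitters; the point is the pre- and post-inversion around a single table-driven byte step.

#include <cstdint>

// One step of a reflected, table-driven CRC-32 (illustrative; mirrors what
// MacroAssembler::update_byte_crc32 computes for a 256-entry byte table).
static inline uint32_t byte_step(uint32_t crc, uint8_t val, const uint32_t table[256]) {
  return table[(crc ^ val) & 0xffu] ^ (crc >> 8);
}

// Rough equivalent of kernel_crc32_singleByteReg(crc, val, table, /*invertCRC=*/true):
// java.util.zip.CRC32 stores the externally visible value, so the state is
// bit-flipped before and after the update (the two nand instructions).
static inline uint32_t crc32_update_single_byte(uint32_t crc, uint8_t val, const uint32_t table[256]) {
  crc = ~crc;                        // to internal (updating) representation
  crc = byte_step(crc, val, table);
  return ~crc;                       // back to the externally visible representation
}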

View File

@ -1,6 +1,6 @@
/*
* Copyright (c) 2005, 2016, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2015 SAP SE. All rights reserved.
* Copyright (c) 2005, 2017, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2017, SAP SE. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -63,18 +63,6 @@ void LIRItem::load_nonconstant() {
}
inline void load_int_as_long(LIR_List *ll, LIRItem &li, LIR_Opr dst) {
LIR_Opr r = li.value()->operand();
if (r->is_register()) {
LIR_Opr dst_l = FrameMap::as_long_opr(dst->as_register());
ll->convert(Bytecodes::_i2l, li.result(), dst_l); // Convert.
} else {
// Constants or memory get loaded with sign extend on this platform.
ll->move(li.result(), dst);
}
}
//--------------------------------------------------------------
// LIRGenerator
//--------------------------------------------------------------
@ -1419,10 +1407,9 @@ void LIRGenerator::do_update_CRC32(Intrinsic* x) {
arg2 = cc->at(1),
arg3 = cc->at(2);
// CCallingConventionRequiresIntsAsLongs
crc.load_item_force(arg1); // We skip int->long conversion here, because CRC32 stub doesn't care about high bits.
__ leal(LIR_OprFact::address(a), arg2);
load_int_as_long(gen()->lir(), len, arg3);
len.load_item_force(arg3); // We skip int->long conversion here, because CRC32 stub expects int.
__ call_runtime_leaf(StubRoutines::updateBytesCRC32(), LIR_OprFact::illegalOpr, result_reg, cc->args());
__ move(result_reg, result);
@ -1434,6 +1421,66 @@ void LIRGenerator::do_update_CRC32(Intrinsic* x) {
}
}
void LIRGenerator::do_update_CRC32C(Intrinsic* x) {
assert(UseCRC32CIntrinsics, "or should not be here");
LIR_Opr result = rlock_result(x);
switch (x->id()) {
case vmIntrinsics::_updateBytesCRC32C:
case vmIntrinsics::_updateDirectByteBufferCRC32C: {
bool is_updateBytes = (x->id() == vmIntrinsics::_updateBytesCRC32C);
LIRItem crc(x->argument_at(0), this);
LIRItem buf(x->argument_at(1), this);
LIRItem off(x->argument_at(2), this);
LIRItem len(x->argument_at(3), this);
buf.load_item();
off.load_nonconstant();
LIR_Opr index = off.result();
int offset = is_updateBytes ? arrayOopDesc::base_offset_in_bytes(T_BYTE) : 0;
if (off.result()->is_constant()) {
index = LIR_OprFact::illegalOpr;
offset += off.result()->as_jint();
}
LIR_Opr base_op = buf.result();
LIR_Address* a = NULL;
if (index->is_valid()) {
LIR_Opr tmp = new_register(T_LONG);
__ convert(Bytecodes::_i2l, index, tmp);
index = tmp;
__ add(index, LIR_OprFact::intptrConst(offset), index);
a = new LIR_Address(base_op, index, T_BYTE);
} else {
a = new LIR_Address(base_op, offset, T_BYTE);
}
BasicTypeList signature(3);
signature.append(T_INT);
signature.append(T_ADDRESS);
signature.append(T_INT);
CallingConvention* cc = frame_map()->c_calling_convention(&signature);
const LIR_Opr result_reg = result_register_for(x->type());
LIR_Opr arg1 = cc->at(0),
arg2 = cc->at(1),
arg3 = cc->at(2);
crc.load_item_force(arg1); // We skip int->long conversion here, because CRC32 stub doesn't care about high bits.
__ leal(LIR_OprFact::address(a), arg2);
len.load_item_force(arg3); // We skip int->long conversion here, because CRC32 stub expects int.
__ call_runtime_leaf(StubRoutines::updateBytesCRC32C(), LIR_OprFact::illegalOpr, result_reg, cc->args());
__ move(result_reg, result);
break;
}
default: {
ShouldNotReachHere();
}
}
}
void LIRGenerator::do_FmaIntrinsic(Intrinsic* x) {
assert(x->number_of_arguments() == 3, "wrong type");
assert(UseFMA, "Needs FMA instructions support.");
@ -1460,7 +1507,3 @@ void LIRGenerator::do_FmaIntrinsic(Intrinsic* x) {
void LIRGenerator::do_vectorizedMismatch(Intrinsic* x) {
fatal("vectorizedMismatch intrinsic is not implemented on this platform");
}
void LIRGenerator::do_update_CRC32C(Intrinsic* x) {
Unimplemented();
}
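
The Unimplemented() placeholder above is what the new do_update_CRC32C lowering replaces. To make that lowering's address arithmetic concrete, a plain C++ sketch (illustrative only; header stands in for arrayOopDesc::base_offset_in_bytes(T_BYTE) and is 0 for the DirectByteBuffer case, where the buffer argument is already a raw address):

#include <cstdint>
#include <cstddef>

// Effective address passed to the CRC32C stub. Both branches describe the same
// byte; they differ only in whether the offset is known at compile time.
static const uint8_t* crc32c_buf_address(const uint8_t* base, int32_t off,
                                         bool off_is_constant, ptrdiff_t header) {
  if (off_is_constant) {
    return base + header + off;                        // folded into the displacement
  }
  return base + (static_cast<int64_t>(off) + header);  // i2l the index, then add the header
}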

View File

@ -1,6 +1,6 @@
/*
* Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2016 SAP SE. All rights reserved.
* Copyright (c) 1997, 2017, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2017, SAP SE. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -4092,7 +4092,7 @@ void MacroAssembler::update_byte_crc32(Register crc, Register val, Register tabl
* @param table register pointing to CRC table
*/
void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
Register data, bool loopAlignment, bool invertCRC) {
Register data, bool loopAlignment) {
assert_different_registers(crc, buf, len, table, data);
Label L_mainLoop, L_done;
@ -4103,10 +4103,6 @@ void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register
clrldi_(len, len, 32); // Enforce 32 bit. Anything to do?
beq(CCR0, L_done);
if (invertCRC) {
nand(crc, crc, crc); // ~c
}
mtctr(len);
align(mainLoop_alignment);
BIND(L_mainLoop);
@ -4115,10 +4111,6 @@ void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register
update_byte_crc32(crc, data, table);
bdnz(L_mainLoop); // Iterate.
if (invertCRC) {
nand(crc, crc, crc); // ~c
}
bind(L_done);
}
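
With the invertCRC parameter gone, update_byteLoop_crc32 is now a pure byte loop; any required bit-flipping is done by its kernel_crc32_* callers. Conceptually (a sketch, not the emitted PPC code) the loop computes:

#include <cstdint>
#include <cstddef>

// Byte-at-a-time, table-driven CRC update with no inversion inside the loop.
static uint32_t byte_loop(uint32_t crc, const uint8_t* buf, size_t len, const uint32_t table[256]) {
  while (len-- != 0) {
    crc = table[(crc ^ *buf++) & 0xffu] ^ (crc >> 8);
  }
  return crc;
}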
@ -4175,7 +4167,8 @@ void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register tab
*/
void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
Register t0, Register t1, Register t2, Register t3,
Register tc0, Register tc1, Register tc2, Register tc3) {
Register tc0, Register tc1, Register tc2, Register tc3,
bool invertCRC) {
assert_different_registers(crc, buf, len, table);
Label L_mainLoop, L_tail;
@ -4189,14 +4182,16 @@ void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len
const int complexThreshold = 2*mainLoop_stepping;
// Don't test for len <= 0 here. This pathological case should not occur anyway.
// Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
// The situation itself is detected and handled correctly by the conditional branches
// following aghi(len, -stepping) and aghi(len, +stepping).
// Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
// for all well-behaved cases. The situation itself is detected and handled correctly
// within update_byteLoop_crc32.
assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
BLOCK_COMMENT("kernel_crc32_2word {");
nand(crc, crc, crc); // ~c
if (invertCRC) {
nand(crc, crc, crc); // 1s complement of crc
}
// Check for short (<mainLoop_stepping) buffer.
cmpdi(CCR0, len, complexThreshold);
@ -4217,7 +4212,7 @@ void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len
blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing
mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
}
update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
}
srdi(tmp2, len, log_stepping); // #iterations for mainLoop
@ -4253,9 +4248,11 @@ void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len
// Process last few (<complexThreshold) bytes of buffer.
BIND(L_tail);
update_byteLoop_crc32(crc, buf, len, table, data, false, false);
update_byteLoop_crc32(crc, buf, len, table, data, false);
nand(crc, crc, crc); // ~c
if (invertCRC) {
nand(crc, crc, crc); // 1s complement of crc
}
BLOCK_COMMENT("} kernel_crc32_2word");
}
@ -4269,7 +4266,8 @@ void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len
*/
void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
Register t0, Register t1, Register t2, Register t3,
Register tc0, Register tc1, Register tc2, Register tc3) {
Register tc0, Register tc1, Register tc2, Register tc3,
bool invertCRC) {
assert_different_registers(crc, buf, len, table);
Label L_mainLoop, L_tail;
@ -4283,14 +4281,16 @@ void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len
const int complexThreshold = 2*mainLoop_stepping;
// Don't test for len <= 0 here. This pathological case should not occur anyway.
// Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
// The situation itself is detected and handled correctly by the conditional branches
// following aghi(len, -stepping) and aghi(len, +stepping).
// Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
// for all well-behaved cases. The situation itself is detected and handled correctly
// within update_byteLoop_crc32.
assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
BLOCK_COMMENT("kernel_crc32_1word {");
nand(crc, crc, crc); // ~c
if (invertCRC) {
nand(crc, crc, crc); // 1s complement of crc
}
// Check for short (<mainLoop_stepping) buffer.
cmpdi(CCR0, len, complexThreshold);
@ -4311,7 +4311,7 @@ void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len
blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing
mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
}
update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
}
srdi(tmp2, len, log_stepping); // #iterations for mainLoop
@ -4346,9 +4346,11 @@ void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len
// Process last few (<complexThreshold) bytes of buffer.
BIND(L_tail);
update_byteLoop_crc32(crc, buf, len, table, data, false, false);
update_byteLoop_crc32(crc, buf, len, table, data, false);
nand(crc, crc, crc); // ~c
if (invertCRC) {
nand(crc, crc, crc); // 1s complement of crc
}
BLOCK_COMMENT("} kernel_crc32_1word");
}
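
The word-oriented kernels (kernel_crc32_1word/_2word) consume the buffer one or two 32-bit words per main-loop iteration using four pre-computed table columns whose addresses are kept in tc0..tc3. A minimal C++ sketch of one such word step, under the assumptions that the load is little-endian (the VM_LITTLE_ENDIAN path) and that the four tables follow the usual slicing-by-4 construction (t[0] is the ordinary byte table and t[k][i] extends t[k-1][i] by one zero byte); the real column tables and the big-endian variant differ in layout:

#include <cstdint>
#include <cstring>

// One main-loop step of a slicing-by-4 CRC kernel (illustrative).
static uint32_t word_step(uint32_t crc, const uint8_t* buf, const uint32_t t[4][256]) {
  uint32_t w;
  std::memcpy(&w, buf, 4);   // unaligned little-endian load
  crc ^= w;
  return t[3][ crc        & 0xffu] ^
         t[2][(crc >>  8) & 0xffu] ^
         t[1][(crc >> 16) & 0xffu] ^
         t[0][(crc >> 24) & 0xffu];
}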
@ -4361,16 +4363,24 @@ void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len
* Uses R7_ARG5, R8_ARG6 as work registers.
*/
void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
Register t0, Register t1, Register t2, Register t3) {
Register t0, Register t1, Register t2, Register t3,
bool invertCRC) {
assert_different_registers(crc, buf, len, table);
Register data = t0; // Holds the current byte to be folded into crc.
BLOCK_COMMENT("kernel_crc32_1byte {");
// Process all bytes in a single-byte loop.
update_byteLoop_crc32(crc, buf, len, table, data, true, true);
if (invertCRC) {
nand(crc, crc, crc); // 1s complement of crc
}
// Process all bytes in a single-byte loop.
update_byteLoop_crc32(crc, buf, len, table, data, true);
if (invertCRC) {
nand(crc, crc, crc); // 1s complement of crc
}
BLOCK_COMMENT("} kernel_crc32_1byte");
}
@ -4388,7 +4398,8 @@ void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len
*/
void MacroAssembler::kernel_crc32_1word_vpmsumd(Register crc, Register buf, Register len, Register table,
Register constants, Register barretConstants,
Register t0, Register t1, Register t2, Register t3, Register t4) {
Register t0, Register t1, Register t2, Register t3, Register t4,
bool invertCRC) {
assert_different_registers(crc, buf, len, table);
Label L_alignedHead, L_tail, L_alignTail, L_start, L_end;
@ -4406,13 +4417,15 @@ void MacroAssembler::kernel_crc32_1word_vpmsumd(Register crc, Register buf, Regi
Register tc0 = t4;
Register tc1 = constants;
Register tc2 = barretConstants;
kernel_crc32_1word(crc, buf, len, table,t0, t1, t2, t3, tc0, tc1, tc2, table);
kernel_crc32_1word(crc, buf, len, table,t0, t1, t2, t3, tc0, tc1, tc2, table, invertCRC);
b(L_end);
BIND(L_start);
// 2. ~c
nand(crc, crc, crc);
if (invertCRC) {
nand(crc, crc, crc); // 1s complement of crc
}
// 3. calculate from 0 to first 128bit-aligned address
clrldi_(prealign, buf, 57);
@ -4421,7 +4434,7 @@ void MacroAssembler::kernel_crc32_1word_vpmsumd(Register crc, Register buf, Regi
subfic(prealign, prealign, 128);
subf(len, prealign, len);
update_byteLoop_crc32(crc, buf, prealign, table, t2, false, false);
update_byteLoop_crc32(crc, buf, prealign, table, t2, false);
// 4. calculate from first 128bit-aligned address to last 128bit-aligned address
BIND(L_alignedHead);
@ -4436,12 +4449,14 @@ void MacroAssembler::kernel_crc32_1word_vpmsumd(Register crc, Register buf, Regi
cmpdi(CCR0, postalign, 0);
beq(CCR0, L_tail);
update_byteLoop_crc32(crc, buf, postalign, table, t2, false, false);
update_byteLoop_crc32(crc, buf, postalign, table, t2, false);
BIND(L_tail);
// 6. ~c
nand(crc, crc, crc);
if (invertCRC) {
nand(crc, crc, crc); // 1s complement of crc
}
BIND(L_end);
@ -4933,16 +4948,35 @@ void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Regi
offsetInt -= 8; ld(R31, offsetInt, R1_SP);
}
void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp) {
void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp, bool invertCRC) {
assert_different_registers(crc, buf, /* len, not used!! */ table, tmp);
BLOCK_COMMENT("kernel_crc32_singleByte:");
nand(crc, crc, crc); // ~c
if (invertCRC) {
nand(crc, crc, crc); // 1s complement of crc
}
lbz(tmp, 0, buf); // Byte from buffer, zero-extended.
update_byte_crc32(crc, tmp, table);
nand(crc, crc, crc); // ~c
if (invertCRC) {
nand(crc, crc, crc); // 1s complement of crc
}
}
void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
assert_different_registers(crc, val, table);
BLOCK_COMMENT("kernel_crc32_singleByteReg:");
if (invertCRC) {
nand(crc, crc, crc); // 1s complement of crc
}
update_byte_crc32(crc, val, table);
if (invertCRC) {
nand(crc, crc, crc); // 1s complement of crc
}
}
// dest_lo += src1 + src2

View File

@ -1,6 +1,6 @@
/*
* Copyright (c) 2002, 2016, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2016 SAP SE. All rights reserved.
* Copyright (c) 2002, 2017, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2017, SAP SE. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -817,33 +817,47 @@ class MacroAssembler: public Assembler {
Register tmp6, Register tmp7, Register tmp8, Register tmp9, Register tmp10,
Register tmp11, Register tmp12, Register tmp13);
// CRC32 Intrinsics.
// Emitters for CRC32 calculation.
// A note on invertCRC:
// Unfortunately, internal representation of crc differs between CRC32 and CRC32C.
// CRC32 holds its current crc value in the externally visible representation.
// CRC32C holds its current crc value in internal format, ready for updating.
// Thus, the crc value must be bit-flipped before updating it in the CRC32 case.
// In the CRC32C case, it must be bit-flipped when it is given to the outside world (getValue()).
// The bool invertCRC parameter indicates whether bit-flipping is required before updates.
void load_reverse_32(Register dst, Register src);
int crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3);
void fold_byte_crc32(Register crc, Register val, Register table, Register tmp);
void fold_8bit_crc32(Register crc, Register table, Register tmp);
void update_byte_crc32(Register crc, Register val, Register table);
void update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
Register data, bool loopAlignment, bool invertCRC);
Register data, bool loopAlignment);
void update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
Register t0, Register t1, Register t2, Register t3,
Register tc0, Register tc1, Register tc2, Register tc3);
void kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
Register t0, Register t1, Register t2, Register t3,
Register tc0, Register tc1, Register tc2, Register tc3);
Register tc0, Register tc1, Register tc2, Register tc3,
bool invertCRC);
void kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
Register t0, Register t1, Register t2, Register t3,
Register tc0, Register tc1, Register tc2, Register tc3);
Register tc0, Register tc1, Register tc2, Register tc3,
bool invertCRC);
void kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
Register t0, Register t1, Register t2, Register t3);
Register t0, Register t1, Register t2, Register t3,
bool invertCRC);
void kernel_crc32_1word_vpmsumd(Register crc, Register buf, Register len, Register table,
Register constants, Register barretConstants,
Register t0, Register t1, Register t2, Register t3, Register t4);
Register t0, Register t1, Register t2, Register t3, Register t4,
bool invertCRC);
void kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
Register constants, Register barretConstants,
Register t0, Register t1, Register t2);
void kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp);
void kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp,
bool invertCRC);
void kernel_crc32_singleByteReg(Register crc, Register val, Register table,
bool invertCRC);
//
// Debugging
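
To make the invertCRC note above concrete, a small C++ sketch of the two conventions, assuming a raw kernel that works on the inverted (internal) state; the names are illustrative, not JDK APIs:

#include <cstdint>
#include <cstddef>

// Raw kernel: operates on the internal (inverted) state.
static uint32_t kernel(uint32_t state, const uint8_t* buf, size_t len, const uint32_t table[256]) {
  while (len-- != 0) state = table[(state ^ *buf++) & 0xffu] ^ (state >> 8);
  return state;
}

// CRC32 convention: the stored value is the externally visible one,
// so it is flipped on the way in and out (invertCRC == true).
static uint32_t crc32_update(uint32_t visible, const uint8_t* buf, size_t len, const uint32_t table[256]) {
  return ~kernel(~visible, buf, len, table);
}

// CRC32C convention: the stored value is already the internal state,
// so the kernel runs as-is (invertCRC == false) and only getValue() flips it.
static uint32_t crc32c_update(uint32_t state, const uint8_t* buf, size_t len, const uint32_t table[256]) {
  return kernel(state, buf, len, table);
}
static uint32_t crc32c_get_value(uint32_t state) { return ~state; }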

View File

@ -1,6 +1,6 @@
/*
* Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2016 SAP SE. All rights reserved.
* Copyright (c) 1997, 2017, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2017, SAP SE. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -3276,6 +3276,36 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
// Compute CRC32/CRC32C function.
void generate_CRC_updateBytes(const char* name, Register table, bool invertCRC) {
// arguments to kernel_crc32:
const Register crc = R3_ARG1; // Current checksum, preset by caller or result from previous call.
const Register data = R4_ARG2; // source byte array
const Register dataLen = R5_ARG3; // #bytes to process
const Register t0 = R2;
const Register t1 = R7;
const Register t2 = R8;
const Register t3 = R9;
const Register tc0 = R10;
const Register tc1 = R11;
const Register tc2 = R12;
BLOCK_COMMENT("Stub body {");
assert_different_registers(crc, data, dataLen, table);
__ kernel_crc32_1word(crc, data, dataLen, table, t0, t1, t2, t3, tc0, tc1, tc2, table, invertCRC);
BLOCK_COMMENT("return");
__ mr_if_needed(R3_RET, crc); // Updated crc is function result. No copying required (R3_ARG1 == R3_RET).
__ blr();
BLOCK_COMMENT("} Stub body");
}
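
The new generate_CRC_updateBytes emitter is shared by CRC32 and CRC32C: the code path is identical, and only the table (that is, the polynomial) and the inversion convention differ. A self-contained C++ sketch of that sharing, using the standard reflected polynomials and the well-known check values for the input "123456789"; the single 256-entry table is a simplification of the multi-column tables the stub actually loads:

#include <cassert>
#include <cstdint>
#include <cstddef>

// Build the ordinary 256-entry byte table for a reflected polynomial.
static void make_table(uint32_t poly, uint32_t table[256]) {
  for (uint32_t i = 0; i < 256; i++) {
    uint32_t c = i;
    for (int k = 0; k < 8; k++) c = (c & 1) ? (poly ^ (c >> 1)) : (c >> 1);
    table[i] = c;
  }
}

// Shared kernel, analogous to generate_CRC_updateBytes(name, table, invertCRC):
// one code path, parameterized only by the table and the inversion convention.
static uint32_t crc_update_bytes(uint32_t crc, const uint8_t* buf, size_t len,
                                 const uint32_t table[256], bool invertCRC) {
  if (invertCRC) crc = ~crc;
  while (len-- != 0) crc = table[(crc ^ *buf++) & 0xffu] ^ (crc >> 8);
  if (invertCRC) crc = ~crc;
  return crc;
}

int main() {
  uint32_t crc32_table[256], crc32c_table[256];
  make_table(0xEDB88320u, crc32_table);   // CRC-32 (zlib polynomial, reflected)
  make_table(0x82F63B78u, crc32c_table);  // CRC-32C (Castagnoli polynomial, reflected)
  const uint8_t* msg = reinterpret_cast<const uint8_t*>("123456789");

  // CRC32 convention: the state is the externally visible value, initial value 0.
  assert(crc_update_bytes(0, msg, 9, crc32_table, /*invertCRC=*/true) == 0xCBF43926u);

  // CRC32C convention: the state is the internal value, initial value ~0;
  // getValue() flips it when handing it to the outside world.
  uint32_t state = ~0u;
  state = crc_update_bytes(state, msg, 9, crc32c_table, /*invertCRC=*/false);
  assert(~state == 0xE3069283u);
  return 0;
}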
/**
* Arguments:
*
@ -3296,14 +3326,14 @@ class StubGenerator: public StubCodeGenerator {
StubCodeMark mark(this, "StubRoutines", name);
address start = __ function_entry(); // Remember stub start address (is rtn value).
const Register table = R6; // crc table address
#ifdef VM_LITTLE_ENDIAN
// arguments to kernel_crc32:
const Register crc = R3_ARG1; // Current checksum, preset by caller or result from previous call.
const Register data = R4_ARG2; // source byte array
const Register dataLen = R5_ARG3; // #bytes to process
const Register table = R6; // crc table address
#ifdef VM_LITTLE_ENDIAN
if (VM_Version::has_vpmsumb()) {
const Register constants = R2; // constants address
const Register bconstants = R8; // barret table address
@ -3321,7 +3351,7 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::ppc64::generate_load_crc_constants_addr(_masm, constants);
StubRoutines::ppc64::generate_load_crc_barret_constants_addr(_masm, bconstants);
__ kernel_crc32_1word_vpmsumd(crc, data, dataLen, table, constants, bconstants, t0, t1, t2, t3, t4);
__ kernel_crc32_1word_vpmsumd(crc, data, dataLen, table, constants, bconstants, t0, t1, t2, t3, t4, true);
BLOCK_COMMENT("return");
__ mr_if_needed(R3_RET, crc); // Updated crc is function result. No copying required (R3_ARG1 == R3_RET).
@ -3331,31 +3361,79 @@ class StubGenerator: public StubCodeGenerator {
} else
#endif
{
const Register t0 = R2;
const Register t1 = R7;
const Register t2 = R8;
const Register t3 = R9;
const Register tc0 = R10;
const Register tc1 = R11;
const Register tc2 = R12;
StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table);
generate_CRC_updateBytes(name, table, true);
}
return start;
}
/**
* Arguments:
*
* Inputs:
* R3_ARG1 - int crc
* R4_ARG2 - byte* buf
* R5_ARG3 - int length (of buffer)
*
* scratch:
* R2, R6-R12
*
* Output:
* R3_RET - int crc result
*/
// Compute CRC32C function.
address generate_CRC32C_updateBytes(const char* name) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
address start = __ function_entry(); // Remember stub start address (is rtn value).
const Register table = R6; // crc table address
#if 0 // no vector support yet for CRC32C
#ifdef VM_LITTLE_ENDIAN
// arguments to kernel_crc32:
const Register crc = R3_ARG1; // Current checksum, preset by caller or result from previous call.
const Register data = R4_ARG2; // source byte array
const Register dataLen = R5_ARG3; // #bytes to process
if (VM_Version::has_vpmsumb()) {
const Register constants = R2; // constants address
const Register bconstants = R8; // barret table address
const Register t0 = R9;
const Register t1 = R10;
const Register t2 = R11;
const Register t3 = R12;
const Register t4 = R7;
BLOCK_COMMENT("Stub body {");
assert_different_registers(crc, data, dataLen, table);
StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table);
StubRoutines::ppc64::generate_load_crc32c_table_addr(_masm, table);
StubRoutines::ppc64::generate_load_crc32c_constants_addr(_masm, constants);
StubRoutines::ppc64::generate_load_crc32c_barret_constants_addr(_masm, bconstants);
__ kernel_crc32_1word(crc, data, dataLen, table, t0, t1, t2, t3, tc0, tc1, tc2, table);
__ kernel_crc32_1word_vpmsumd(crc, data, dataLen, table, constants, bconstants, t0, t1, t2, t3, t4, false);
BLOCK_COMMENT("return");
__ mr_if_needed(R3_RET, crc); // Updated crc is function result. No copying required (R3_ARG1 == R3_RET).
__ blr();
BLOCK_COMMENT("} Stub body");
} else
#endif
#endif
{
StubRoutines::ppc64::generate_load_crc32c_table_addr(_masm, table);
generate_CRC_updateBytes(name, table, false);
}
return start;
}
// Initialization
void generate_initial() {
// Generates all stubs and initializes the entry points
@ -3383,6 +3461,12 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::_crc_table_adr = (address)StubRoutines::ppc64::_crc_table;
StubRoutines::_updateBytesCRC32 = generate_CRC32_updateBytes("CRC32_updateBytes");
}
// CRC32C Intrinsics.
if (UseCRC32CIntrinsics) {
StubRoutines::_crc32c_table_addr = (address)StubRoutines::ppc64::_crc32c_table;
StubRoutines::_updateBytesCRC32C = generate_CRC32C_updateBytes("CRC32C_updateBytes");
}
}
void generate_all() {

View File

@ -1,6 +1,6 @@
/*
* Copyright (c) 2002, 2016, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2016 SAP SE. All rights reserved.
* Copyright (c) 2002, 2017, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2017, SAP SE. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -55,13 +55,16 @@ class ppc64 {
// CRC32 Intrinsics.
static juint _crc_table[CRC32_TABLES][CRC32_COLUMN_SIZE];
static juint _crc32c_table[CRC32_TABLES][CRC32_COLUMN_SIZE];
static juint* _constants;
static juint* _barret_constants;
public:
// CRC32 Intrinsics.
static void generate_load_table_addr(MacroAssembler* masm, Register table, address table_addr, uint64_t table_contents);
static void generate_load_crc_table_addr(MacroAssembler* masm, Register table);
static void generate_load_crc32c_table_addr(MacroAssembler* masm, Register table);
static void generate_load_crc_constants_addr(MacroAssembler* masm, Register table);
static void generate_load_crc_barret_constants_addr(MacroAssembler* masm, Register table);
static juint* generate_crc_constants();

File diff suppressed because it is too large.

View File

@ -1,6 +1,6 @@
/*
* Copyright (c) 2014, 2017, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2015, 2016 SAP SE. All rights reserved.
* Copyright (c) 2015, 2017, SAP SE. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -1895,7 +1895,7 @@ address TemplateInterpreterGenerator::generate_CRC32_update_entry() {
__ lwz(crc, 2*wordSize, argP); // Current crc state, zero extend to 64 bit to have a clean register.
StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table);
__ kernel_crc32_singleByte(crc, data, dataLen, table, tmp);
__ kernel_crc32_singleByte(crc, data, dataLen, table, tmp, true);
// Restore caller sp for c2i case and return.
__ mr(R1_SP, R21_sender_SP); // Cut the stack back to where the caller started.
@ -1911,6 +1911,10 @@ address TemplateInterpreterGenerator::generate_CRC32_update_entry() {
return NULL;
}
// TODO: generate_CRC32_updateBytes_entry and generate_CRC32C_updateBytes_entry are identical
// except for using different crc tables and some block comment strings.
// We should provide a common implementation.
// CRC32 Intrinsics.
/**
* Method entry for static native methods:
@ -1987,7 +1991,7 @@ address TemplateInterpreterGenerator::generate_CRC32_updateBytes_entry(AbstractI
// Performance measurements show the 1word and 2word variants to be almost equivalent,
// with very light advantages for the 1word variant. We chose the 1word variant for
// code compactness.
__ kernel_crc32_1word(crc, data, dataLen, table, t0, t1, t2, t3, tc0, tc1, tc2, tc3);
__ kernel_crc32_1word(crc, data, dataLen, table, t0, t1, t2, t3, tc0, tc1, tc2, tc3, true);
// Restore caller sp for c2i case and return.
__ mr(R1_SP, R21_sender_SP); // Cut the stack back to where the caller started.
@ -2003,8 +2007,84 @@ address TemplateInterpreterGenerator::generate_CRC32_updateBytes_entry(AbstractI
return NULL;
}
// Not supported
// CRC32C Intrinsics.
/**
* Method entry for static native methods:
* int java.util.zip.CRC32C.updateBytes( int crc, byte[] b, int off, int len)
* int java.util.zip.CRC32C.updateDirectByteBuffer(int crc, long* buf, int off, int len)
**/
address TemplateInterpreterGenerator::generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) {
if (UseCRC32CIntrinsics) {
address start = __ pc(); // Remember stub start address (is rtn value).
// We don't generate local frame and don't align stack because
// we do not even call stub code (we generate the code inline)
// and there is no safepoint on this path.
// Load parameters.
// R15_esp is the caller's operand stack pointer, i.e. it points to the parameters.
const Register argP = R15_esp;
const Register crc = R3_ARG1; // crc value
const Register data = R4_ARG2; // address of java byte array
const Register dataLen = R5_ARG3; // source data len
const Register table = R6_ARG4; // address of crc32c table
const Register t0 = R9; // scratch registers for crc calculation
const Register t1 = R10;
const Register t2 = R11;
const Register t3 = R12;
const Register tc0 = R2; // registers to hold pre-calculated column addresses
const Register tc1 = R7;
const Register tc2 = R8;
const Register tc3 = table; // table address is reconstructed at the end of kernel_crc32_* emitters
const Register tmp = t0; // Only used very locally to calculate byte buffer address.
// Arguments are reversed on java expression stack.
// Calculate address of start element.
if (kind == Interpreter::java_util_zip_CRC32C_updateDirectByteBuffer) { // Used for "updateDirectByteBuffer".
BLOCK_COMMENT("CRC32C_updateDirectByteBuffer {");
// crc @ (SP + 5W) (32bit)
// buf @ (SP + 3W) (64bit ptr to long array)
// off @ (SP + 2W) (32bit)
// dataLen @ (SP + 1W) (32bit)
// data = buf + off
__ ld( data, 3*wordSize, argP); // start of byte buffer
__ lwa( tmp, 2*wordSize, argP); // byte buffer offset
__ lwa( dataLen, 1*wordSize, argP); // #bytes to process
__ lwz( crc, 5*wordSize, argP); // current crc state
__ add( data, data, tmp); // Add byte buffer offset.
} else { // Used for "updateBytes".
BLOCK_COMMENT("CRC32C_updateBytes {");
// crc @ (SP + 4W) (32bit)
// buf @ (SP + 3W) (64bit ptr to byte array)
// off @ (SP + 2W) (32bit)
// dataLen @ (SP + 1W) (32bit)
// data = buf + off + base_offset
__ ld( data, 3*wordSize, argP); // start of byte buffer
__ lwa( tmp, 2*wordSize, argP); // byte buffer offset
__ lwa( dataLen, 1*wordSize, argP); // #bytes to process
__ add( data, data, tmp); // add byte buffer offset
__ lwz( crc, 4*wordSize, argP); // current crc state
__ addi(data, data, arrayOopDesc::base_offset_in_bytes(T_BYTE));
}
StubRoutines::ppc64::generate_load_crc32c_table_addr(_masm, table);
// Performance measurements show the 1word and 2word variants to be almost equivalent,
// with very light advantages for the 1word variant. We chose the 1word variant for
// code compactness.
__ kernel_crc32_1word(crc, data, dataLen, table, t0, t1, t2, t3, tc0, tc1, tc2, tc3, false);
// Restore caller sp for c2i case and return.
__ mr(R1_SP, R21_sender_SP); // Cut the stack back to where the caller started.
__ blr();
BLOCK_COMMENT("} CRC32C_update{Bytes|DirectByteBuffer}");
return start;
}
return NULL;
}
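
The interpreter entry reads its arguments directly off the Java expression stack (argP == R15_esp). Arguments were pushed left to right, so they sit in reverse order, and the raw buffer address of the DirectByteBuffer variant is a long occupying two slots, which is why crc moves from 4*wordSize up to 5*wordSize there. A C++ sketch of that slot map (illustrative only; slot indices mirror the block comments above):

#include <cstdint>

struct Crc32cEntryArgs {
  uint32_t       crc;
  const uint8_t* buf;
  int32_t        off;
  int32_t        len;
};

// argP points at word-sized expression-stack slots, topmost argument at index 1.
static Crc32cEntryArgs read_args(const uintptr_t* argP, bool direct_byte_buffer) {
  Crc32cEntryArgs a;
  a.len = static_cast<int32_t>(argP[1]);                            // dataLen @ SP + 1W
  a.off = static_cast<int32_t>(argP[2]);                            // off     @ SP + 2W
  a.buf = reinterpret_cast<const uint8_t*>(argP[3]);                // buf     @ SP + 3W
  a.crc = static_cast<uint32_t>(argP[direct_byte_buffer ? 5 : 4]);  // crc     @ SP + 5W or 4W
  return a;
}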

View File

@ -1,6 +1,6 @@
/*
* Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2016 SAP SE. All rights reserved.
* Copyright (c) 1997, 2017, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2017, SAP SE. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -172,18 +172,27 @@ void VM_Version::initialize() {
assert(AllocatePrefetchStyle >= 0, "AllocatePrefetchStyle should be positive");
// Implementation does not use any of the vector instructions
// available with Power8. Their exploitation is still pending.
// If defined(VM_LITTLE_ENDIAN) and running on Power8 or newer hardware,
// the implementation uses the vector instructions available with Power8.
// In all other cases, the implementation uses only generally available instructions.
if (!UseCRC32Intrinsics) {
if (FLAG_IS_DEFAULT(UseCRC32Intrinsics)) {
FLAG_SET_DEFAULT(UseCRC32Intrinsics, true);
}
}
if (UseCRC32CIntrinsics) {
if (!FLAG_IS_DEFAULT(UseCRC32CIntrinsics))
warning("CRC32C intrinsics are not available on this CPU");
FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
// Implementation does not use any of the vector instructions available with Power8.
// Their exploitation is still pending (aka "work in progress").
if (!UseCRC32CIntrinsics) {
if (FLAG_IS_DEFAULT(UseCRC32CIntrinsics)) {
FLAG_SET_DEFAULT(UseCRC32CIntrinsics, true);
}
}
// TODO: Provide implementation.
if (UseAdler32Intrinsics) {
warning("Adler32Intrinsics not available on this CPU.");
FLAG_SET_DEFAULT(UseAdler32Intrinsics, false);
}
// The AES intrinsic stubs require AES instruction support.
@ -245,11 +254,6 @@ void VM_Version::initialize() {
FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
}
if (UseAdler32Intrinsics) {
warning("Adler32Intrinsics not available on this CPU.");
FLAG_SET_DEFAULT(UseAdler32Intrinsics, false);
}
if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
UseMultiplyToLenIntrinsic = true;
}

View File

@ -212,7 +212,7 @@ bool Compiler::is_intrinsic_supported(const methodHandle& method) {
case vmIntrinsics::_updateCRC32:
case vmIntrinsics::_updateBytesCRC32:
case vmIntrinsics::_updateByteBufferCRC32:
#if defined(SPARC) || defined(S390)
#if defined(SPARC) || defined(S390) || defined(PPC64)
case vmIntrinsics::_updateBytesCRC32C:
case vmIntrinsics::_updateDirectByteBufferCRC32C:
#endif