8143012: CRC32 Intrinsics support on SPARC

Reviewed-by: kvn, roland
This commit is contained in:
Ahmed Khawaja 2015-11-20 08:29:10 -08:00 committed by Vladimir Kozlov
parent ea052022fa
commit e28d9ba105
11 changed files with 822 additions and 7 deletions

View File

@ -2812,7 +2812,23 @@ void LIR_Assembler::monitor_address(int monitor_no, LIR_Opr dst_opr) {
} }
// C1 code emission for the single-byte CRC32 update LIR op.
// All three operands (crc, val, result) must be single CPU registers.
// NOTE(review): this listing is a side-by-side diff. The function header, the
// first statement line and the closing line each carry the removed text (the
// old fatal(...) stub) followed by the added text.
void LIR_Assembler::emit_updatecrc32(LIR_OpUpdateCRC32* op) { void LIR_Assembler::emit_updatecrc32(LIR_OpUpdateCRC32* op) {
fatal("CRC32 intrinsic is not implemented on this platform"); assert(op->crc()->is_single_cpu(), "crc must be register");
assert(op->val()->is_single_cpu(), "byte value must be register");
assert(op->result_opr()->is_single_cpu(), "result must be register");
Register crc = op->crc()->as_register();
Register val = op->val()->as_register();
// table and res are both the result operand's register: it holds the CRC
// table address during the update and is overwritten with the CRC at the end.
Register table = op->result_opr()->as_register();
Register res = op->result_opr()->as_register();
assert_different_registers(val, crc, table);
__ set(ExternalAddress(StubRoutines::crc_table_addr()), table);
// The crc register is modified in place: complemented, cleared by clruwu,
// updated with the byte, then complemented again.
// NOTE(review): LIRGenerator::do_update_CRC32 marks only val with
// set_destroys_register(); confirm that clobbering crc here is acceptable.
__ not1(crc);
__ clruwu(crc);
__ update_byte_crc32(crc, val, table);
__ not1(crc);
// Publish the final CRC; this overwrites the table address held in res.
__ mov(crc, res);
} }
void LIR_Assembler::emit_lock(LIR_OpLock* op) { void LIR_Assembler::emit_lock(LIR_OpLock* op) {

View File

@ -786,7 +786,86 @@ void LIRGenerator::do_ArrayCopy(Intrinsic* x) {
} }
// C1 LIR generation for the three CRC32 intrinsics.
//  - _updateCRC32: emits the inline single-byte update_crc32 LIR op.
//  - _updateBytesCRC32 / _updateByteBufferCRC32: computes the address of the
//    first byte and calls the StubRoutines::updateBytesCRC32() leaf stub with
//    a C calling convention of (int crc, address buf, int len).
// NOTE(review): this listing is a side-by-side diff. The function header, the
// first body line and the closing line each carry the removed text (the old
// fatal(...) stub) followed by the added text.
void LIRGenerator::do_update_CRC32(Intrinsic* x) { void LIRGenerator::do_update_CRC32(Intrinsic* x) {
fatal("CRC32 intrinsic is not implemented on this platform"); // Make all state_for calls early since they can emit code
LIR_Opr result = rlock_result(x);
// NOTE(review): flags is never read in this function.
int flags = 0;
switch (x->id()) {
case vmIntrinsics::_updateCRC32: {
LIRItem crc(x->argument_at(0), this);
LIRItem val(x->argument_at(1), this);
// val is destroyed by update_crc32
val.set_destroys_register();
crc.load_item();
val.load_item();
__ update_crc32(crc.result(), val.result(), result);
break;
}
case vmIntrinsics::_updateBytesCRC32:
case vmIntrinsics::_updateByteBufferCRC32: {
// Arguments: 0 = crc, 1 = byte[] or raw buffer address, 2 = offset, 3 = length.
bool is_updateBytes = (x->id() == vmIntrinsics::_updateBytesCRC32);
LIRItem crc(x->argument_at(0), this);
LIRItem buf(x->argument_at(1), this);
LIRItem off(x->argument_at(2), this);
LIRItem len(x->argument_at(3), this);
buf.load_item();
off.load_nonconstant();
LIR_Opr index = off.result();
// For the byte[] variant the array header must be skipped; the ByteBuffer
// variant starts at displacement 0.
int offset = is_updateBytes ? arrayOopDesc::base_offset_in_bytes(T_BYTE) : 0;
// A constant offset is folded into the displacement instead of using an index register.
if(off.result()->is_constant()) {
index = LIR_OprFact::illegalOpr;
offset += off.result()->as_jint();
}
LIR_Opr base_op = buf.result();
if (index->is_valid()) {
// Widen the int offset to 64 bits before adding it to the base.
LIR_Opr tmp = new_register(T_LONG);
__ convert(Bytecodes::_i2l, index, tmp);
index = tmp;
// NOTE(review): index was just replaced by tmp from new_register(T_LONG), so
// this is_constant() branch is presumably never taken - confirm.
if (index->is_constant()) {
offset += index->as_constant_ptr()->as_jint();
index = LIR_OprFact::illegalOpr;
} else if (index->is_register()) {
// base_op = buf + index, computed in fresh T_LONG registers.
LIR_Opr tmp2 = new_register(T_LONG);
LIR_Opr tmp3 = new_register(T_LONG);
__ move(base_op, tmp2);
__ move(index, tmp3);
__ add(tmp2, tmp3, tmp2);
base_op = tmp2;
} else {
ShouldNotReachHere();
}
}
// Address of the first byte to checksum: base_op + offset.
LIR_Address* a = new LIR_Address(base_op, offset, T_BYTE);
// Stub signature: (int crc, address buf, int len).
BasicTypeList signature(3);
signature.append(T_INT);
signature.append(T_ADDRESS);
signature.append(T_INT);
CallingConvention* cc = frame_map()->c_calling_convention(&signature);
const LIR_Opr result_reg = result_register_for(x->type());
LIR_Opr addr = new_pointer_register();
__ leal(LIR_OprFact::address(a), addr);
// Place the three arguments in the locations the calling convention assigned.
crc.load_item_force(cc->at(0));
__ move(addr, cc->at(1));
len.load_item_force(cc->at(2));
__ call_runtime_leaf(StubRoutines::updateBytesCRC32(), getThreadTemp(), result_reg, cc->args());
__ move(result_reg, result);
break;
}
default: {
ShouldNotReachHere();
}
}
} }
// _i2l, _i2f, _i2d, _l2i, _l2f, _l2d, _f2i, _f2l, _f2d, _d2i, _d2l, _d2f // _i2l, _i2f, _i2d, _l2i, _l2f, _l2d, _f2i, _f2l, _f2d, _d2i, _d2l, _d2f

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
* *
* This code is free software; you can redistribute it and/or modify it * This code is free software; you can redistribute it and/or modify it
@ -43,8 +43,9 @@
void generate_counter_incr(Label* overflow, Label* profile_method, Label* profile_method_continue); void generate_counter_incr(Label* overflow, Label* profile_method, Label* profile_method_continue);
void generate_counter_overflow(Label& Lcontinue); void generate_counter_overflow(Label& Lcontinue);
address generate_CRC32_update_entry();
address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind);
// Not supported // Not supported
address generate_CRC32_update_entry() { return NULL; }
address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; }
address generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; } address generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; }
#endif // CPU_SPARC_VM_INTERPRETERGENERATOR_SPARC_HPP #endif // CPU_SPARC_VM_INTERPRETERGENERATOR_SPARC_HPP

View File

@ -4771,3 +4771,243 @@ void MacroAssembler::movftoi_revbytes(FloatRegister src, Register dst, Register
movdtox(src, tmp1); movdtox(src, tmp1);
reverse_bytes_32(tmp1, dst, tmp2); reverse_bytes_32(tmp1, dst, tmp2);
} }
// Emits one 128-bit fold step whose new data comes from memory.
// The running 128-bit value in xcrc_hi:xcrc_lo is multiplied by the constants
// xK_hi / xK_lo (xmulx / xmulxhi; presumably the low and high halves of a
// carry-less 64x64 multiply - confirm against the assembler), the partial
// products are XORed together, and the next 16 bytes at buf are XORed in.
// xtmp_hi and xtmp_lo are scratch. buf is advanced by 16 bytes as a side effect.
// NOTE(review): the offset parameter is never read; the load position is
// determined solely by the running value of buf.
void MacroAssembler::fold_128bit_crc32(Register xcrc_hi, Register xcrc_lo, Register xK_hi, Register xK_lo, Register xtmp_hi, Register xtmp_lo, Register buf, int offset) {
// Products of the high half with xK_hi go to the temporaries ...
xmulx(xcrc_hi, xK_hi, xtmp_lo);
xmulxhi(xcrc_hi, xK_hi, xtmp_hi);
// ... products of the low half with xK_lo replace xcrc_hi:xcrc_lo.
// xcrc_lo is still an input of the second multiply, so xcrc_hi is written first.
xmulxhi(xcrc_lo, xK_lo, xcrc_hi);
xmulx(xcrc_lo, xK_lo, xcrc_lo);
xor3(xcrc_lo, xtmp_lo, xcrc_lo);
xor3(xcrc_hi, xtmp_hi, xcrc_hi);
// Load the next two 8-byte words with little-endian loads, reusing the temporaries.
ldxl(buf, G0, xtmp_lo);
inc(buf, 8);
ldxl(buf, G0, xtmp_hi);
inc(buf, 8);
xor3(xcrc_lo, xtmp_lo, xcrc_lo);
xor3(xcrc_hi, xtmp_hi, xcrc_hi);
}
// Emits one 128-bit fold step whose new data is already in registers.
// Same arithmetic as the buffer overload, but the 128 bits to fold in are
// taken from xbuf_hi:xbuf_lo instead of being loaded from memory.
// xtmp_hi and xtmp_lo are scratch; xcrc_hi:xcrc_lo receive the result.
void MacroAssembler::fold_128bit_crc32(Register xcrc_hi, Register xcrc_lo, Register xK_hi, Register xK_lo, Register xtmp_hi, Register xtmp_lo, Register xbuf_hi, Register xbuf_lo) {
// NOTE(review): xtmp_lo is overwritten by the xmulx two lines below before it
// is read, so this first mov looks redundant.
mov(xcrc_lo, xtmp_lo);
mov(xcrc_hi, xtmp_hi);
// Products of the (copied) high half with xK_hi; xtmp_hi is read by both
// multiplies and overwritten only by the second one.
xmulx(xtmp_hi, xK_hi, xtmp_lo);
xmulxhi(xtmp_hi, xK_hi, xtmp_hi);
// Products of the low half with xK_lo replace xcrc_hi:xcrc_lo.
xmulxhi(xcrc_lo, xK_lo, xcrc_hi);
xmulx(xcrc_lo, xK_lo, xcrc_lo);
// XOR in the new data, then the partial products held in the temporaries.
xor3(xcrc_lo, xbuf_lo, xcrc_lo);
xor3(xcrc_hi, xbuf_hi, xcrc_hi);
xor3(xcrc_lo, xtmp_lo, xcrc_lo);
xor3(xcrc_hi, xtmp_hi, xcrc_hi);
}
// Emits one table-driven step that consumes the low byte of xcrc:
//   xcrc = (xcrc >> 8) ^ table[xcrc & 0xFF]
// table is the base address of an array of 4-byte entries (the index is
// scaled by 4 and the entry is read with a 32-bit unsigned load).
// tmp receives the scaled index and xtmp the loaded entry; both are clobbered.
void MacroAssembler::fold_8bit_crc32(Register xcrc, Register table, Register xtmp, Register tmp) {
and3(xcrc, 0xFF, tmp);
sllx(tmp, 2, tmp);
lduw(table, tmp, xtmp);
srlx(xcrc, 8, xcrc);
xor3(xtmp, xcrc, xcrc);
}
// Single-temporary variant of the table-driven step:
//   crc = (crc >> 8) ^ table[crc & 0xFF]
// tmp holds first the scaled index and then the loaded 4-byte table entry.
void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
and3(crc, 0xFF, tmp);
srlx(crc, 8, crc);
sllx(tmp, 2, tmp);
lduw(table, tmp, tmp);
xor3(tmp, crc, crc);
}
// Number of integer registers kernel_crc32 parks in floating-point registers
// while it runs the folding code (size of its tmp[] array).
#define CRC32_TMP_REG_NUM 18

// Folding constants. Each value is the corresponding 32-bit entry of
// StubRoutines::Sparc::_crc_by128_masks shifted left by one bit:
//   CRC32_CONST_64  = 0xb1e6b092 << 1   (high of K_M_64)
//   CRC32_CONST_96  = 0x6655004f << 1   (high of K_160_96)
//   CRC32_CONST_160 = 0xba8ccbe8 << 1   (low  of K_160_96)
//   CRC32_CONST_480 = 0xe3720acb << 1   (high of K_544_480)
//   CRC32_CONST_544 = 0xaa2215ea << 1   (low  of K_544_480)
#define CRC32_CONST_64  0x163cd6124
#define CRC32_CONST_96  0x0ccaa009e
#define CRC32_CONST_160 0x1751997d0
#define CRC32_CONST_480 0x1c6e41596
#define CRC32_CONST_544 0x154442bd4

/**
 * Emits the CRC32 kernel shared by the updateBytesCRC32 stub and the
 * interpreter entry.
 *
 *   crc   - in/out: running CRC. It is complemented and has its upper 32 bits
 *           cleared on entry, and is complemented again before the code
 *           falls through at the end.
 *   buf   - in: address of the first byte; advanced as bytes are consumed.
 *   len   - in: number of bytes; counted down, all comparisons are unsigned.
 *   table - scratch: overwritten with StubRoutines::crc_table_addr(); any
 *           value passed by the caller is ignored.
 *
 * Structure of the emitted code:
 *   1. len <= 31: byte-at-a-time table lookup only (cleanup loop).
 *   2. Otherwise up to 7 bytes are consumed with table lookups until buf is
 *      8-byte aligned; if 31 or fewer bytes remain, go to the cleanup loop.
 *   3. Otherwise the registers in tmp[] are saved to FP registers and the data
 *      is folded 128 bits at a time (four parallel 128-bit streams while at
 *      least 128 bytes remain), reduced to 64 bits, and finished with eight
 *      table-lookup steps. The saved registers are then restored and the
 *      remaining bytes go through the cleanup loop.
 *
 * Register effects visible in this function: G4, O4 and O5 are used as
 * scratch without preserving their entry values (O4/O5 are written by the
 * alignment loop before tmp[] is saved). On the folding path the even FP
 * registers as_FloatRegister(0) .. as_FloatRegister(34) are overwritten.
 */
void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table) {

  Label L_cleanup_loop, L_cleanup_check, L_align_loop, L_align_check;
  Label L_main_loop_prologue;
  Label L_fold_512b, L_fold_512b_loop, L_fold_128b;
  Label L_fold_tail, L_fold_tail_loop;
  Label L_8byte_fold_check;

  const Register tmp[CRC32_TMP_REG_NUM] = {L0, L1, L2, L3, L4, L5, L6, G1, I0, I1, I2, I3, I4, I5, I7, O4, O5, G3};

  // The constants share the last two tmp[] slots; only one pair is live at a time.
  Register const_64  = tmp[CRC32_TMP_REG_NUM-1];
  Register const_96  = tmp[CRC32_TMP_REG_NUM-1];
  Register const_160 = tmp[CRC32_TMP_REG_NUM-2];
  Register const_480 = tmp[CRC32_TMP_REG_NUM-1];
  Register const_544 = tmp[CRC32_TMP_REG_NUM-2];

  set(ExternalAddress(StubRoutines::crc_table_addr()), table);

  not1(crc); // ~c
  clruwu(crc); // clear upper 32 bits of crc

  // Check if below cutoff, proceed directly to cleanup code
  mov(31, G4);
  cmp_and_br_short(len, G4, Assembler::lessEqualUnsigned, Assembler::pt, L_cleanup_check);

  // Align buffer to 8 byte boundary: O5 = (8 - (buf & 7)) & 7 bytes to consume
  mov(8, O5);
  and3(buf, 0x7, O4);
  sub(O5, O4, O5);
  and3(O5, 0x7, O5);
  sub(len, O5, len);
  ba(L_align_check);
  delayed()->nop();

  // Alignment loop, table look up method for up to 7 bytes
  bind(L_align_loop);
  ldub(buf, 0, O4);
  inc(buf);
  dec(O5);
  xor3(O4, crc, O4);
  and3(O4, 0xFF, O4);
  sllx(O4, 2, O4);
  lduw(table, O4, O4);
  srlx(crc, 8, crc);
  xor3(O4, crc, crc);
  bind(L_align_check);
  nop();
  cmp_and_br_short(O5, 0, Assembler::notEqual, Assembler::pt, L_align_loop);

  // Aligned on 64-bit (8-byte) boundary at this point
  // Check if still above cutoff (31-bytes)
  mov(31, G4);
  cmp_and_br_short(len, G4, Assembler::lessEqualUnsigned, Assembler::pt, L_cleanup_check);
  // At least 32 bytes left to process

  // Free up registers by storing them to FP registers
  for (int i = 0; i < CRC32_TMP_REG_NUM; i++) {
    movxtod(tmp[i], as_FloatRegister(2*i));
  }

  // Determine which loop to enter
  // Shared prologue: load the first 16 bytes (len is adjusted for them later,
  // by the two dec(len, 8) after the 128-to-64-bit reduction)
  ldxl(buf, G0, tmp[0]);
  inc(buf, 8);
  ldxl(buf, G0, tmp[1]);
  inc(buf, 8);
  xor3(tmp[0], crc, tmp[0]); // Fold CRC into first few bytes
  and3(crc, 0, crc); // Clear out the crc register
  // Main loop needs 128-bytes at least
  mov(128, G4);
  mov(64, tmp[2]);
  cmp_and_br_short(len, G4, Assembler::greaterEqualUnsigned, Assembler::pt, L_main_loop_prologue);
  // Less than 64 bytes
  nop();
  cmp_and_br_short(len, tmp[2], Assembler::lessUnsigned, Assembler::pt, L_fold_tail);
  // Between 64 and 127 bytes
  set64(CRC32_CONST_96,  const_96,  tmp[8]);
  set64(CRC32_CONST_160, const_160, tmp[9]);
  fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[2], tmp[3], buf, 0);
  fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[4], tmp[5], buf, 16);
  fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[6], tmp[7], buf, 32);
  dec(len, 48);
  ba(L_fold_tail);
  delayed()->nop();

  // Load 48 more bytes so that tmp[0..7] hold four 128-bit streams
  bind(L_main_loop_prologue);
  for (int i = 2; i < 8; i++) {
    ldxl(buf, G0, tmp[i]);
    inc(buf, 8);
  }

  // Fold total 512 bits of polynomial on each iteration,
  // 128 bits per each of 4 parallel streams
  set64(CRC32_CONST_480, const_480, tmp[8]);
  set64(CRC32_CONST_544, const_544, tmp[9]);

  mov(128, G4);
  bind(L_fold_512b_loop);
  // The trailing offset argument is informational only: the buffer overload of
  // fold_128bit_crc32 never reads it and advances buf by 16 on every call.
  fold_128bit_crc32(tmp[1], tmp[0], const_480, const_544, tmp[9],  tmp[8],  buf,  0);
  fold_128bit_crc32(tmp[3], tmp[2], const_480, const_544, tmp[11], tmp[10], buf, 16);
  fold_128bit_crc32(tmp[5], tmp[4], const_480, const_544, tmp[13], tmp[12], buf, 32);
  fold_128bit_crc32(tmp[7], tmp[6], const_480, const_544, tmp[15], tmp[14], buf, 48);
  dec(len, 64);
  cmp_and_br_short(len, G4, Assembler::greaterEqualUnsigned, Assembler::pt, L_fold_512b_loop);

  // Fold 512 bits to 128 bits
  bind(L_fold_512b);
  set64(CRC32_CONST_96,  const_96,  tmp[8]);
  set64(CRC32_CONST_160, const_160, tmp[9]);

  fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[8], tmp[9], tmp[3], tmp[2]);
  fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[8], tmp[9], tmp[5], tmp[4]);
  fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[8], tmp[9], tmp[7], tmp[6]);
  dec(len, 48); // accounts for the 48 bytes loaded in the main-loop prologue

  // Fold the rest of 128 bits data chunks
  bind(L_fold_tail);
  mov(32, G4);
  cmp_and_br_short(len, G4, Assembler::lessEqualUnsigned, Assembler::pt, L_fold_128b);

  set64(CRC32_CONST_96,  const_96,  tmp[8]);
  set64(CRC32_CONST_160, const_160, tmp[9]);

  bind(L_fold_tail_loop);
  fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[2], tmp[3], buf, 0);
  sub(len, 16, len);
  cmp_and_br_short(len, G4, Assembler::greaterEqualUnsigned, Assembler::pt, L_fold_tail_loop);

  // Fold the 128 bits in tmps 0 - 1 into tmp 1
  bind(L_fold_128b);

  set64(CRC32_CONST_64, const_64, tmp[4]);

  xmulx(const_64, tmp[0], tmp[2]);
  xmulxhi(const_64, tmp[0], tmp[3]);

  srl(tmp[2], G0, tmp[4]);
  xmulx(const_64, tmp[4], tmp[4]);

  srlx(tmp[2], 32, tmp[2]);
  sllx(tmp[3], 32, tmp[3]);
  or3(tmp[2], tmp[3], tmp[2]);

  xor3(tmp[4], tmp[1], tmp[4]);
  xor3(tmp[4], tmp[2], tmp[1]);
  dec(len, 8);

  // Use table lookup for the 8 bytes left in tmp[1]
  dec(len, 8);

  // Eight 8-bit folds to compute the 32-bit CRC: four on the 64-bit value ...
  for (int j = 0; j < 4; j++) {
    fold_8bit_crc32(tmp[1], table, tmp[2], tmp[3]);
  }
  srl(tmp[1], G0, crc); // move 32 bits to general register
  // ... and four on the remaining 32 bits, now held in crc.
  for (int j = 0; j < 4; j++) {
    fold_8bit_crc32(crc, table, tmp[3]);
  }

  bind(L_8byte_fold_check);

  // Restore int registers saved in FP registers
  for (int i = 0; i < CRC32_TMP_REG_NUM; i++) {
    movdtox(as_FloatRegister(2*i), tmp[i]);
  }

  ba(L_cleanup_check);
  delayed()->nop();

  // Table look-up method for the remaining few bytes
  bind(L_cleanup_loop);
  ldub(buf, 0, O4);
  inc(buf);
  dec(len);
  xor3(O4, crc, O4);
  and3(O4, 0xFF, O4);
  sllx(O4, 2, O4);
  lduw(table, O4, O4);
  srlx(crc, 8, crc);
  xor3(O4, crc, crc);
  bind(L_cleanup_check);
  nop();
  cmp_and_br_short(len, 0, Assembler::greaterUnsigned, Assembler::pt, L_cleanup_loop);

  not1(crc);
}

View File

@ -904,7 +904,9 @@ public:
inline void ldf(FloatRegisterImpl::Width w, const Address& a, FloatRegister d, int offset = 0); inline void ldf(FloatRegisterImpl::Width w, const Address& a, FloatRegister d, int offset = 0);
// little-endian // little-endian
inline void ldxl(Register s1, Register s2, Register d) { ldxa(s1, s2, ASI_PRIMARY_LITTLE, d); } inline void lduwl(Register s1, Register s2, Register d) { lduwa(s1, s2, ASI_PRIMARY_LITTLE, d); }
inline void ldswl(Register s1, Register s2, Register d) { ldswa(s1, s2, ASI_PRIMARY_LITTLE, d);}
inline void ldxl( Register s1, Register s2, Register d) { ldxa(s1, s2, ASI_PRIMARY_LITTLE, d); }
inline void ldfl(FloatRegisterImpl::Width w, Register s1, Register s2, FloatRegister d) { ldfa(w, s1, s2, ASI_PRIMARY_LITTLE, d); } inline void ldfl(FloatRegisterImpl::Width w, Register s1, Register s2, FloatRegister d) { ldfa(w, s1, s2, ASI_PRIMARY_LITTLE, d); }
// membar psuedo instruction. takes into account target memory model. // membar psuedo instruction. takes into account target memory model.
@ -1469,6 +1471,15 @@ public:
void movitof_revbytes(Register src, FloatRegister dst, Register tmp1, Register tmp2); void movitof_revbytes(Register src, FloatRegister dst, Register tmp1, Register tmp2);
void movftoi_revbytes(FloatRegister src, Register dst, Register tmp1, Register tmp2); void movftoi_revbytes(FloatRegister src, Register dst, Register tmp1, Register tmp2);
// CRC32 code for java.util.zip.CRC32::updateBytes0() intrinsic.
void kernel_crc32(Register crc, Register buf, Register len, Register table);
// Fold 128-bit data chunk
void fold_128bit_crc32(Register xcrc_hi, Register xcrc_lo, Register xK_hi, Register xK_lo, Register xtmp_hi, Register xtmp_lo, Register buf, int offset);
void fold_128bit_crc32(Register xcrc_hi, Register xcrc_lo, Register xK_hi, Register xK_lo, Register xtmp_hi, Register xtmp_lo, Register xbuf_hi, Register xbuf_lo);
// Fold 8-bit data
void fold_8bit_crc32(Register xcrc, Register table, Register xtmp, Register tmp);
void fold_8bit_crc32(Register crc, Register table, Register tmp);
#undef VIRTUAL #undef VIRTUAL
}; };

View File

@ -5292,6 +5292,38 @@ class StubGenerator: public StubCodeGenerator {
return start; return start;
} }
/**
 * Generates the leaf stub installed as StubRoutines::_updateBytesCRC32.
 *
 * Register contract of the generated code:
 *   O0 - in:  int crc          out: int crc result
 *   O1 - in:  byte* buf        (address of the first byte to checksum)
 *   O2 - in:  int len
 *   O3 - scratch: kernel_crc32 loads the CRC table address into it itself,
 *        so callers do not need to pass a table pointer.
 *
 * Returns the address of the first instruction of the stub.
 */
address generate_updateBytesCRC32() {
  assert(UseCRC32Intrinsics, "need VIS3 instructions");

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
  address stub_entry = __ pc();

  // crc = O0, buf = O1, len = O2, table scratch = O3
  __ kernel_crc32(O0, O1, O2, O3);

  // Leaf return; the CRC is left in O0.
  __ retl();
  __ delayed()->nop();

  return stub_entry;
}
void generate_initial() { void generate_initial() {
// Generates all stubs and initializes the entry points // Generates all stubs and initializes the entry points
@ -5324,6 +5356,12 @@ class StubGenerator: public StubCodeGenerator {
// Build this early so it's available for the interpreter. // Build this early so it's available for the interpreter.
StubRoutines::_throw_StackOverflowError_entry = generate_throw_exception("StackOverflowError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError)); StubRoutines::_throw_StackOverflowError_entry = generate_throw_exception("StackOverflowError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
if (UseCRC32Intrinsics) {
// set table address before stub generation which use it
StubRoutines::_crc_table_adr = (address)StubRoutines::Sparc::_crc_table;
StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
}
} }

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
* *
* This code is free software; you can redistribute it and/or modify it * This code is free software; you can redistribute it and/or modify it
@ -52,3 +52,98 @@ address StubRoutines::Sparc::_stop_subroutine_entry = NULL;
address StubRoutines::Sparc::_flush_callers_register_windows_entry = CAST_FROM_FN_PTR(address, bootstrap_flush_windows); address StubRoutines::Sparc::_flush_callers_register_windows_entry = CAST_FROM_FN_PTR(address, bootstrap_flush_windows);
address StubRoutines::Sparc::_partial_subtype_check = NULL; address StubRoutines::Sparc::_partial_subtype_check = NULL;
// Folding constants for the CRC32 polynomial, exposed through
// crc_by128_masks_addr(). The same values, already shifted left by one, are
// also defined as the CRC32_CONST_* immediates used by
// MacroAssembler::kernel_crc32.
uint64_t StubRoutines::Sparc::_crc_by128_masks[] =
{
/* The fields in this structure are arranged so that they can be
* picked up two at a time with 128-bit loads.
*
* Because of the flipped bit order for this CRC polynomial,
* the constant for X**N is left-shifted by 1. This is because
* a 64 x 64 polynomial multiply produces a 127-bit result
* but the highest term is always aligned to bit 0 in the container.
* Pre-shifting by one fixes this, at the cost of potentially making
* the 32-bit constant no longer fit in a 32-bit container (thus the
* use of uint64_t, though this is also the size used by the carry-
* less multiply instruction).
*
* In addition, the flipped bit order and highest-term-at-least-bit
* multiply changes the constants used. The 96-bit result will be
* aligned to the high-term end of the target 128-bit container,
* not the low-term end; that is, instead of a 512-bit or 576-bit fold,
* it is a 480 (=512-32) or 544 (=512+64-32) bit fold.
*
* This causes additional problems in the 128-to-64-bit reduction; see the
* code for details. By storing a mask in the otherwise unused half of
* a 128-bit constant, bits can be cleared before multiplication without
* storing and reloading. Note that staying on a 128-bit datapath means
* that some data is uselessly stored and some unused data is intersected
* with an irrelevant constant.
*/
((uint64_t) 0xffffffffUL), /* low of K_M_64 */
((uint64_t) 0xb1e6b092U << 1), /* high of K_M_64 */
((uint64_t) 0xba8ccbe8U << 1), /* low of K_160_96 */
((uint64_t) 0x6655004fU << 1), /* high of K_160_96 */
((uint64_t) 0xaa2215eaU << 1), /* low of K_544_480 */
((uint64_t) 0xe3720acbU << 1) /* high of K_544_480 */
};
/**
* crc_table[] from jdk/src/java.base/share/native/libzip/zlib-1.2.8/crc32.h
*
* 256 entries of 32 bits each, one per possible byte value. The generated
* code indexes it with (crc & 0xFF) scaled by 4 and reads each entry with a
* 32-bit unsigned load. StubRoutines::_crc_table_adr is pointed at this
* array before the CRC32 stub is generated.
*/
juint StubRoutines::Sparc::_crc_table[] =
{
0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL,
0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL,
0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL,
0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL,
0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL,
0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL,
0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL,
0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL,
0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL,
0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL,
0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL,
0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL,
0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL,
0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL,
0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL,
0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL,
0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL,
0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL,
0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL,
0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL,
0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL,
0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL,
0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL,
0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL,
0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL,
0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL,
0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL,
0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL,
0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL,
0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL,
0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL,
0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL,
0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL,
0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL,
0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL,
0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL,
0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL,
0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL,
0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL,
0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL,
0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL,
0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL,
0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL,
0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL,
0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL,
0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL,
0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL,
0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL,
0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL,
0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL,
0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL,
0x2d02ef8dUL
};

View File

@ -53,6 +53,9 @@ class Sparc {
static address _flush_callers_register_windows_entry; static address _flush_callers_register_windows_entry;
static address _partial_subtype_check; static address _partial_subtype_check;
// masks and table for CRC32
static uint64_t _crc_by128_masks[];
static juint _crc_table[];
public: public:
// test assembler stop routine by setting registers // test assembler stop routine by setting registers
@ -65,6 +68,8 @@ class Sparc {
static intptr_t* (*flush_callers_register_windows_func())() { return CAST_TO_FN_PTR(intptr_t* (*)(void), _flush_callers_register_windows_entry); } static intptr_t* (*flush_callers_register_windows_func())() { return CAST_TO_FN_PTR(intptr_t* (*)(void), _flush_callers_register_windows_entry); }
static address partial_subtype_check() { return _partial_subtype_check; } static address partial_subtype_check() { return _partial_subtype_check; }
static address crc_by128_masks_addr() { return (address)_crc_by128_masks; }
}; };
#endif // CPU_SPARC_VM_STUBROUTINES_SPARC_HPP #endif // CPU_SPARC_VM_STUBROUTINES_SPARC_HPP

View File

@ -803,6 +803,106 @@ address InterpreterGenerator::generate_Reference_get_entry(void) {
return NULL; return NULL;
} }
/**
* Method entry for static native methods:
* int java.util.zip.CRC32.update(int crc, int b)
*
* Returns the address of the generated entry, or NULL when
* UseCRC32Intrinsics is off. The generated code either computes the result
* inline and returns it in O0, or jumps to the ordinary native method entry
* (slow path).
*/
address InterpreterGenerator::generate_CRC32_update_entry() {
if (UseCRC32Intrinsics) {
address entry = __ pc();
Label L_slow_path;
// If we need a safepoint check, generate full interpreter entry.
// NOTE(review): the local `state` is never used below.
ExternalAddress state(SafepointSynchronize::address_of_state());
// NOTE(review): set(ExternalAddress(...), reg) materialises an address (a few
// lines below the same form yields the CRC table base). O2 therefore appears
// to hold the *address* of the safepoint state, not its value, and the compare
// against _not_synchronized would always take the slow path. Confirm, and if
// so load the state through O2 before comparing. The fast path (argument
// offsets, return sequence) should be re-validated before it is enabled.
__ set(ExternalAddress(SafepointSynchronize::address_of_state()), O2);
__ set(SafepointSynchronize::_not_synchronized, O3);
__ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pt, L_slow_path);
// Load parameters
const Register crc = O0; // initial crc
const Register val = O1; // byte to update with
const Register table = O2; // address of 256-entry lookup table
// Arguments are read relative to Gargs: the single byte at offset 3
// (presumably the low-order byte of the int `b` - confirm slot layout) and
// the 32-bit crc at offset 8.
__ ldub(Gargs, 3, val);
__ lduw(Gargs, 8, crc);
__ set(ExternalAddress(StubRoutines::crc_table_addr()), table);
__ not1(crc); // ~crc
__ clruwu(crc);
__ update_byte_crc32(crc, val, table);
__ not1(crc); // ~crc
// result in O0
__ retl();
__ delayed()->nop();
// generate a vanilla native entry as the slow path
__ bind(L_slow_path);
__ jump_to_entry(Interpreter::entry_for_kind(Interpreter::native));
return entry;
}
return NULL;
}
/**
* Method entry for static native methods:
* int java.util.zip.CRC32.updateBytes(int crc, byte[] b, int off, int len)
* int java.util.zip.CRC32.updateByteBuffer(int crc, long buf, int off, int len)
*
* `kind` selects which of the two argument layouts is decoded. Returns the
* address of the generated entry, or NULL when UseCRC32Intrinsics is off.
* The generated code either runs the CRC32 kernel and returns the result in
* O0, or jumps to the ordinary native method entry (slow path).
*/
address InterpreterGenerator::generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind) {
if (UseCRC32Intrinsics) {
address entry = __ pc();
Label L_slow_path;
// If we need a safepoint check, generate full interpreter entry.
// NOTE(review): the local `state` is never used below.
ExternalAddress state(SafepointSynchronize::address_of_state());
// NOTE(review): set(ExternalAddress(...), reg) appears to materialise the
// address itself, so O2 would hold the *address* of the safepoint state, not
// its value, and the compare against _not_synchronized would always take the
// slow path. Confirm, and if so load the state through O2 before comparing.
// The fast path should be re-validated before it is enabled.
__ set(ExternalAddress(SafepointSynchronize::address_of_state()), O2);
__ set(SafepointSynchronize::_not_synchronized, O3);
__ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pt, L_slow_path);
// Load parameters from the stack
const Register crc = O0; // initial crc
const Register buf = O1; // source java byte array address
const Register len = O2; // len
const Register offset = O3; // offset
// Arguments are reversed on java expression stack
// Calculate address of start element
if (kind == Interpreter::java_util_zip_CRC32_updateByteBuffer) {
// ByteBuffer variant: buf is a raw address, so only the offset is added.
// crc is read from offset 32 here versus 24 in the array variant
// (presumably because the long argument occupies two slots - confirm).
__ lduw(Gargs, 0, len);
__ lduw(Gargs, 8, offset);
__ ldx( Gargs, 16, buf);
__ lduw(Gargs, 32, crc);
__ add(buf, offset, buf);
} else {
// byte[] variant: skip the array header, then add the offset.
__ lduw(Gargs, 0, len);
__ lduw(Gargs, 8, offset);
__ ldx( Gargs, 16, buf);
__ lduw(Gargs, 24, crc);
__ add(buf, arrayOopDesc::base_offset_in_bytes(T_BYTE), buf); // account for the header size
__ add(buf ,offset, buf);
}
// Call the crc32 kernel
// O3 (`offset`) has already been folded into buf, so it is handed to the
// kernel as its table register.
__ MacroAssembler::save_thread(L7_thread_cache);
__ kernel_crc32(crc, buf, len, O3);
__ MacroAssembler::restore_thread(L7_thread_cache);
// result in O0
__ retl();
__ delayed()->nop();
// generate a vanilla native entry as the slow path
__ bind(L_slow_path);
__ jump_to_entry(Interpreter::entry_for_kind(Interpreter::native));
return entry;
}
return NULL;
}
// //
// Interpreter stub for calling a native method. (asm interpreter) // Interpreter stub for calling a native method. (asm interpreter)
// This sets up a somewhat different looking stack for calling the native method // This sets up a somewhat different looking stack for calling the native method

View File

@ -347,6 +347,15 @@ void VM_Version::initialize() {
FLAG_SET_DEFAULT(UseAdler32Intrinsics, false); FLAG_SET_DEFAULT(UseAdler32Intrinsics, false);
} }
if (UseVIS > 2) {
if (FLAG_IS_DEFAULT(UseCRC32Intrinsics)) {
FLAG_SET_DEFAULT(UseCRC32Intrinsics, true);
}
} else if (UseCRC32Intrinsics) {
warning("SPARC CRC32 intrinsics require VIS3 insructions support. Intriniscs will be disabled");
FLAG_SET_DEFAULT(UseCRC32Intrinsics, false);
}
if (FLAG_IS_DEFAULT(ContendedPaddingWidth) && if (FLAG_IS_DEFAULT(ContendedPaddingWidth) &&
(cache_line_size > ContendedPaddingWidth)) (cache_line_size > ContendedPaddingWidth))
ContendedPaddingWidth = cache_line_size; ContendedPaddingWidth = cache_line_size;

View File

@ -0,0 +1,221 @@
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
* @test
* @bug 8143012
* @summary CRC32 Intrinsics support on SPARC
*
* @run main/othervm/timeout=720 -Xbatch TestCRC32 -m
*/
import java.nio.ByteBuffer;
import java.util.zip.Checksum;
import java.util.zip.CRC32;
/**
 * Correctness check and micro-benchmark driver for java.util.zip.CRC32.
 *
 * Usage:
 *   TestCRC32 -m [warmupIters]     run the multi-offset/multi-size check
 *   TestCRC32 [iters [warmupIters]] time and check a single message; the
 *                                   message is configured with the system
 *                                   properties "offset" and "msgSize"
 *
 * Any checksum mismatch is reported on stderr/stdout and then makes the
 * run fail with a RuntimeException, so a broken intrinsic cannot pass
 * silently.
 */
public class TestCRC32 {
    public static void main(String[] args) {
        int offset = Integer.getInteger("offset", 0);
        int msgSize = Integer.getInteger("msgSize", 512);
        boolean multi = false;
        int iters = 20000;
        int warmupIters = 20000;

        if (args.length > 0) {
            if (args[0].equals("-m")) {
                multi = true;
            } else {
                iters = Integer.valueOf(args[0]);
            }
            if (args.length > 1) {
                warmupIters = Integer.valueOf(args[1]);
            }
        }

        if (multi) {
            test_multi(warmupIters);
            return;
        }

        System.out.println(" offset = " + offset);
        System.out.println("msgSize = " + msgSize + " bytes");
        System.out.println(" iters = " + iters);

        byte[] b = initializedBytes(msgSize, offset);

        CRC32 crc0 = new CRC32();
        CRC32 crc1 = new CRC32();
        CRC32 crc2 = new CRC32();

        // Reference value, computed once before any warm-up loop runs.
        crc0.update(b, offset, msgSize);

        // Cleared on the first mismatch; checked once both variants have run.
        boolean passed = true;

        System.out.println("-------------------------------------------------------");

        /* warm up */
        for (int i = 0; i < warmupIters; i++) {
            crc1.reset();
            crc1.update(b, offset, msgSize);
        }

        /* measure performance */
        long start = System.nanoTime();
        for (int i = 0; i < iters; i++) {
            crc1.reset();
            crc1.update(b, offset, msgSize);
        }
        long end = System.nanoTime();
        double total = (double)(end - start)/1e9; // in seconds
        double thruput = (double)msgSize*iters/1e6/total; // in MB/s
        System.out.println("CRC32.update(byte[]) runtime = " + total + " seconds");
        System.out.println("CRC32.update(byte[]) throughput = " + thruput + " MB/s");

        /* check correctness */
        for (int i = 0; i < iters; i++) {
            crc1.reset();
            crc1.update(b, offset, msgSize);
            if (!check(crc0, crc1)) {
                passed = false;
                break;
            }
        }
        report("CRCs", crc0, crc1);

        System.out.println("-------------------------------------------------------");

        ByteBuffer buf = ByteBuffer.allocateDirect(msgSize);
        buf.put(b, offset, msgSize);
        buf.flip();

        /* warm up */
        for (int i = 0; i < warmupIters; i++) {
            crc2.reset();
            crc2.update(buf);
            buf.rewind();
        }

        /* measure performance */
        start = System.nanoTime();
        for (int i = 0; i < iters; i++) {
            crc2.reset();
            crc2.update(buf);
            buf.rewind();
        }
        end = System.nanoTime();
        total = (double)(end - start)/1e9; // in seconds
        thruput = (double)msgSize*iters/1e6/total; // in MB/s
        System.out.println("CRC32.update(ByteBuffer) runtime = " + total + " seconds");
        System.out.println("CRC32.update(ByteBuffer) throughput = " + thruput + " MB/s");

        /* check correctness */
        for (int i = 0; i < iters; i++) {
            crc2.reset();
            crc2.update(buf);
            buf.rewind();
            if (!check(crc0, crc2)) {
                passed = false;
                break;
            }
        }
        report("CRCs", crc0, crc2);

        System.out.println("-------------------------------------------------------");

        if (!passed) {
            throw new RuntimeException("CRC32 result mismatch, see ERROR output above");
        }
    }

    /** Prints both checksum values on one line, prefixed with {@code s}. */
    private static void report(String s, Checksum crc0, Checksum crc1) {
        System.out.printf("%s: crc0 = %08x, crc1 = %08x\n",
                          s, crc0.getValue(), crc1.getValue());
    }

    /**
     * Compares two checksums; on mismatch prints both values to stderr.
     *
     * @return true when the values are equal
     */
    private static boolean check(Checksum crc0, Checksum crc1) {
        if (crc0.getValue() != crc1.getValue()) {
            System.err.printf("ERROR: crc0 = %08x, crc1 = %08x\n",
                              crc0.getValue(), crc1.getValue());
            return false;
        }
        return true;
    }

    /**
     * Builds an array of {@code M + offset} bytes: the first {@code offset}
     * bytes hold their own index, the rest hold their index relative to
     * {@code offset} (both truncated to a byte).
     */
    private static byte[] initializedBytes(int M, int offset) {
        byte[] bytes = new byte[M + offset];
        for (int i = 0; i < offset; i++) {
            bytes[i] = (byte) i;
        }
        for (int i = offset; i < bytes.length; i++) {
            bytes[i] = (byte) (i - offset);
        }
        return bytes;
    }

    /**
     * Checks CRC32.update(byte[], off, len) for every combination of the
     * offsets and sizes below. A reference value is computed once per case,
     * the same cases are then repeated {@code iters} times, and the last
     * result of each case is compared with its reference.
     *
     * @throws RuntimeException if at least one case mismatches
     */
    private static void test_multi(int iters) {
        int len1 = 8;    // the  8B/iteration loop
        int len2 = 32;   // the 32B/iteration loop
        int len3 = 4096; // the 4KB/iteration loop

        byte[] b = initializedBytes(len3*16, 0);
        int[] offsets = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 64, 128, 256, 512 };
        int[] sizes = { 0, 1, 2, 3, 4, 5, 6, 7,
                        len1, len1+1, len1+2, len1+3, len1+4, len1+5, len1+6, len1+7,
                        len1*2, len1*2+1, len1*2+3, len1*2+5, len1*2+7,
                        len2, len2+1, len2+3, len2+5, len2+7,
                        len2*2, len2*4, len2*8, len2*16, len2*32, len2*64,
                        len3, len3+1, len3+3, len3+5, len3+7,
                        len3*2, len3*4, len3*8,
                        len1+len2, len1+len2+1, len1+len2+3, len1+len2+5, len1+len2+7,
                        len1+len3, len1+len3+1, len1+len3+3, len1+len3+5, len1+len3+7,
                        len2+len3, len2+len3+1, len2+len3+3, len2+len3+5, len2+len3+7,
                        len1+len2+len3, len1+len2+len3+1, len1+len2+len3+3,
                        len1+len2+len3+5, len1+len2+len3+7,
                        (len1+len2+len3)*2, (len1+len2+len3)*2+1, (len1+len2+len3)*2+3,
                        (len1+len2+len3)*2+5, (len1+len2+len3)*2+7,
                        (len1+len2+len3)*3, (len1+len2+len3)*3-1, (len1+len2+len3)*3-3,
                        (len1+len2+len3)*3-5, (len1+len2+len3)*3-7 };
        CRC32[] crc0 = new CRC32[offsets.length*sizes.length];
        CRC32[] crc1 = new CRC32[offsets.length*sizes.length];
        int i, j, k;

        System.out.printf("testing %d cases ...\n", offsets.length*sizes.length);

        /* set the result from interpreter as reference */
        for (i = 0; i < offsets.length; i++) {
            for (j = 0; j < sizes.length; j++) {
                crc0[i*sizes.length + j] = new CRC32();
                crc1[i*sizes.length + j] = new CRC32();
                crc0[i*sizes.length + j].update(b, offsets[i], sizes[j]);
            }
        }

        /* warm up the JIT compiler and get result */
        for (k = 0; k < iters; k++) {
            for (i = 0; i < offsets.length; i++) {
                for (j = 0; j < sizes.length; j++) {
                    crc1[i*sizes.length + j].reset();
                    crc1[i*sizes.length + j].update(b, offsets[i], sizes[j]);
                }
            }
        }

        /* check correctness: report every failing case, then fail the run */
        int failures = 0;
        for (i = 0; i < offsets.length; i++) {
            for (j = 0; j < sizes.length; j++) {
                if (!check(crc0[i*sizes.length + j], crc1[i*sizes.length + j])) {
                    System.out.printf("offsets[%d] = %d", i, offsets[i]);
                    System.out.printf("\tsizes[%d] = %d\n", j, sizes[j]);
                    failures++;
                }
            }
        }
        if (failures > 0) {
            throw new RuntimeException(failures + " of " + (offsets.length*sizes.length)
                                       + " CRC32 cases produced a mismatching checksum");
        }
    }
}