8073583: C2 support for CRC32C on SPARC

Reviewed-by: jrose, kvn
This commit is contained in:
James Cheng 2015-06-29 00:10:01 -07:00 committed by Vladimir Kozlov
parent 4a826139e3
commit e2533553f6
21 changed files with 746 additions and 19 deletions

View File

@ -199,6 +199,12 @@ void VM_Version::get_processor_features() {
UseCRC32Intrinsics = true;
}
if (UseCRC32CIntrinsics) {
if (!FLAG_IS_DEFAULT(UseCRC32CIntrinsics))
warning("CRC32C intrinsics are not available on this CPU");
FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
}
if (auxv & (HWCAP_SHA1 | HWCAP_SHA2)) {
if (FLAG_IS_DEFAULT(UseSHA)) {
FLAG_SET_DEFAULT(UseSHA, true);

View File

@ -191,6 +191,13 @@ void VM_Version::initialize() {
FLAG_SET_DEFAULT(UseSHA256Intrinsics, false);
FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
}
if (UseCRC32CIntrinsics) {
if (!FLAG_IS_DEFAULT(UseCRC32CIntrinsics))
warning("CRC32C intrinsics are not available on this CPU");
FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
}
// Adjust RTM (Restricted Transactional Memory) flags.
if (!has_tcheck() && UseRTMLocking) {
// Can't continue because UseRTMLocking affects UseBiasedLocking flag

View File

@ -128,8 +128,11 @@ class Assembler : public AbstractAssembler {
faligndata_op3 = 0x36,
flog3_op3 = 0x36,
edge_op3 = 0x36,
fzero_op3 = 0x36,
fsrc_op3 = 0x36,
fnot_op3 = 0x36,
xmulx_op3 = 0x36,
crc32c_op3 = 0x36,
impdep2_op3 = 0x37,
stpartialf_op3 = 0x37,
jmpl_op3 = 0x38,
@ -231,7 +234,9 @@ class Assembler : public AbstractAssembler {
sha1_opf = 0x141,
sha256_opf = 0x142,
sha512_opf = 0x143
sha512_opf = 0x143,
crc32c_opf = 0x147
};
enum op5s {
@ -600,6 +605,11 @@ class Assembler : public AbstractAssembler {
return x & ((1 << 10) - 1);
}
// create a low12 __value__ (not a field) for a given a 32-bit constant
static int low12( int x ) {
return x & ((1 << 12) - 1);
}
// AES crypto instructions supported only on certain processors
static void aes_only() { assert( VM_Version::has_aes(), "This instruction only works on SPARC with AES instructions support"); }
@ -608,6 +618,9 @@ class Assembler : public AbstractAssembler {
static void sha256_only() { assert( VM_Version::has_sha256(), "This instruction only works on SPARC with SHA256"); }
static void sha512_only() { assert( VM_Version::has_sha512(), "This instruction only works on SPARC with SHA512"); }
// CRC32C instruction supported only on certain processors
static void crc32c_only() { assert( VM_Version::has_crc32c(), "This instruction only works on SPARC with CRC32C"); }
// instruction only in VIS1
static void vis1_only() { assert( VM_Version::has_vis1(), "This instruction only works on SPARC with VIS1"); }
@ -1022,6 +1035,7 @@ public:
void nop() { emit_int32( op(branch_op) | op2(sethi_op2) ); }
void sw_count() { emit_int32( op(branch_op) | op2(sethi_op2) | 0x3f0 ); }
// pp 202
@ -1198,8 +1212,14 @@ public:
void faligndata( FloatRegister s1, FloatRegister s2, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(faligndata_op3) | fs1(s1, FloatRegisterImpl::D) | opf(faligndata_opf) | fs2(s2, FloatRegisterImpl::D)); }
void fzero( FloatRegisterImpl::Width w, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, w) | op3(fzero_op3) | opf(0x62 - w)); }
void fsrc2( FloatRegisterImpl::Width w, FloatRegister s2, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, w) | op3(fsrc_op3) | opf(0x7A - w) | fs2(s2, w)); }
void fnot1( FloatRegisterImpl::Width w, FloatRegister s1, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, w) | op3(fnot_op3) | fs1(s1, w) | opf(0x6C - w)); }
void fpmerge( FloatRegister s1, FloatRegister s2, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(0x36) | fs1(s1, FloatRegisterImpl::S) | opf(0x4b) | fs2(s2, FloatRegisterImpl::S)); }
void stpartialf( Register s1, Register s2, FloatRegister d, int ia = -1 ) { vis1_only(); emit_int32( op(ldst_op) | fd(d, FloatRegisterImpl::D) | op3(stpartialf_op3) | rs1(s1) | imm_asi(ia) | rs2(s2)); }
// VIS2 instructions
@ -1224,6 +1244,10 @@ public:
void sha256() { sha256_only(); emit_int32( op(arith_op) | op3(sha_op3) | opf(sha256_opf)); }
void sha512() { sha512_only(); emit_int32( op(arith_op) | op3(sha_op3) | opf(sha512_opf)); }
// CRC32C instruction
void crc32c( FloatRegister s1, FloatRegister s2, FloatRegister d ) { crc32c_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(crc32c_op3) | fs1(s1, FloatRegisterImpl::D) | opf(crc32c_opf) | fs2(s2, FloatRegisterImpl::D)); }
// Creation
Assembler(CodeBuffer* code) : AbstractAssembler(code) {
#ifdef CHECK_DELAY

View File

@ -956,6 +956,7 @@ void MacroAssembler::set64(jlong value, Register d, Register tmp) {
int hi = (int)(value >> 32);
int lo = (int)(value & ~0);
int bits_33to2 = (int)((value >> 2) & ~0);
// (Matcher::isSimpleConstant64 knows about the following optimizations.)
if (Assembler::is_simm13(lo) && value == lo) {
or3(G0, lo, d);
@ -964,6 +965,12 @@ void MacroAssembler::set64(jlong value, Register d, Register tmp) {
if (low10(lo) != 0)
or3(d, low10(lo), d);
}
else if ((hi >> 2) == 0) {
Assembler::sethi(bits_33to2, d); // hardware version zero-extends to upper 32
sllx(d, 2, d);
if (low12(lo) != 0)
or3(d, low12(lo), d);
}
else if (hi == -1) {
Assembler::sethi(~lo, d); // hardware version zero-extends to upper 32
xor3(d, low10(lo) ^ ~low10(~0), d);
@ -4351,3 +4358,52 @@ void MacroAssembler::bis_zeroing(Register to, Register count, Register temp, Lab
cmp_and_brx_short(to, end, Assembler::lessUnsigned, Assembler::pt, small_loop);
nop(); // Separate short branches
}
/**
* Update CRC-32[C] with a byte value according to constants in table
*
* @param [in,out]crc Register containing the crc.
* @param [in]val Register containing the byte to fold into the CRC.
* @param [in]table Register containing the table of crc constants.
*
* uint32_t crc;
* val = crc_table[(val ^ crc) & 0xFF];
* crc = val ^ (crc >> 8);
*/
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
xor3(val, crc, val);
and3(val, 0xFF, val);
sllx(val, 2, val);
lduw(table, val, val);
srlx(crc, 8, crc);
xor3(val, crc, crc);
}
// Reverse byte order of lower 32 bits, assuming upper 32 bits all zeros
void MacroAssembler::reverse_bytes_32(Register src, Register dst, Register tmp) {
srlx(src, 24, dst);
sllx(src, 32+8, tmp);
srlx(tmp, 32+24, tmp);
sllx(tmp, 8, tmp);
or3(dst, tmp, dst);
sllx(src, 32+16, tmp);
srlx(tmp, 32+24, tmp);
sllx(tmp, 16, tmp);
or3(dst, tmp, dst);
sllx(src, 32+24, tmp);
srlx(tmp, 32, tmp);
or3(dst, tmp, dst);
}
void MacroAssembler::movitof_revbytes(Register src, FloatRegister dst, Register tmp1, Register tmp2) {
reverse_bytes_32(src, tmp1, tmp2);
movxtod(tmp1, dst);
}
void MacroAssembler::movftoi_revbytes(FloatRegister src, Register dst, Register tmp1, Register tmp2) {
movdtox(src, tmp1);
reverse_bytes_32(tmp1, dst, tmp2);
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -903,6 +903,10 @@ public:
inline void ldf(FloatRegisterImpl::Width w, Register s1, RegisterOrConstant s2, FloatRegister d);
inline void ldf(FloatRegisterImpl::Width w, const Address& a, FloatRegister d, int offset = 0);
// little-endian
inline void ldxl(Register s1, Register s2, Register d) { ldxa(s1, s2, ASI_PRIMARY_LITTLE, d); }
inline void ldfl(FloatRegisterImpl::Width w, Register s1, Register s2, FloatRegister d) { ldfa(w, s1, s2, ASI_PRIMARY_LITTLE, d); }
// membar psuedo instruction. takes into account target memory model.
inline void membar( Assembler::Membar_mask_bits const7a );
@ -1436,6 +1440,14 @@ public:
// Use BIS for zeroing
void bis_zeroing(Register to, Register count, Register temp, Label& Ldone);
// Update CRC-32[C] with a byte value according to constants in table
void update_byte_crc32(Register crc, Register val, Register table);
// Reverse byte order of lower 32 bits, assuming upper 32 bits all zeros
void reverse_bytes_32(Register src, Register dst, Register tmp);
void movitof_revbytes(Register src, FloatRegister dst, Register tmp1, Register tmp2);
void movftoi_revbytes(FloatRegister src, Register dst, Register tmp1, Register tmp2);
#undef VIRTUAL
};

View File

@ -4910,6 +4910,206 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
#define CHUNK_LEN 128 /* 128 x 8B = 1KB */
#define CHUNK_K1 0x1307a0206 /* reverseBits(pow(x, CHUNK_LEN*8*8*3 - 32) mod P(x)) << 1 */
#define CHUNK_K2 0x1a0f717c4 /* reverseBits(pow(x, CHUNK_LEN*8*8*2 - 32) mod P(x)) << 1 */
#define CHUNK_K3 0x0170076fa /* reverseBits(pow(x, CHUNK_LEN*8*8*1 - 32) mod P(x)) << 1 */
/**
* Arguments:
*
* Inputs:
* O0 - int crc
* O1 - byte* buf
* O2 - int len
* O3 - int* table
*
* Output:
* O0 - int crc result
*/
address generate_updateBytesCRC32C() {
assert(UseCRC32CIntrinsics, "need CRC32C instruction");
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
address start = __ pc();
const Register crc = O0; // crc
const Register buf = O1; // source java byte array address
const Register len = O2; // number of bytes
const Register table = O3; // byteTable
Label L_crc32c_head, L_crc32c_aligned;
Label L_crc32c_parallel, L_crc32c_parallel_loop;
Label L_crc32c_serial, L_crc32c_x32_loop, L_crc32c_x8, L_crc32c_x8_loop;
Label L_crc32c_done, L_crc32c_tail, L_crc32c_return;
__ cmp_and_br_short(len, 0, Assembler::lessEqual, Assembler::pn, L_crc32c_return);
// clear upper 32 bits of crc
__ clruwu(crc);
__ and3(buf, 7, G4);
__ cmp_and_brx_short(G4, 0, Assembler::equal, Assembler::pt, L_crc32c_aligned);
__ mov(8, G1);
__ sub(G1, G4, G4);
// ------ process the misaligned head (7 bytes or less) ------
__ BIND(L_crc32c_head);
// crc = (crc >>> 8) ^ byteTable[(crc ^ b) & 0xFF];
__ ldub(buf, 0, G1);
__ update_byte_crc32(crc, G1, table);
__ inc(buf);
__ dec(len);
__ cmp_and_br_short(len, 0, Assembler::equal, Assembler::pn, L_crc32c_return);
__ dec(G4);
__ cmp_and_br_short(G4, 0, Assembler::greater, Assembler::pt, L_crc32c_head);
// ------ process the 8-byte-aligned body ------
__ BIND(L_crc32c_aligned);
__ nop();
__ cmp_and_br_short(len, 8, Assembler::less, Assembler::pn, L_crc32c_tail);
// reverse the byte order of lower 32 bits to big endian, and move to FP side
__ movitof_revbytes(crc, F0, G1, G3);
__ set(CHUNK_LEN*8*4, G4);
__ cmp_and_br_short(len, G4, Assembler::less, Assembler::pt, L_crc32c_serial);
// ------ process four 1KB chunks in parallel ------
__ BIND(L_crc32c_parallel);
__ fzero(FloatRegisterImpl::D, F2);
__ fzero(FloatRegisterImpl::D, F4);
__ fzero(FloatRegisterImpl::D, F6);
__ mov(CHUNK_LEN - 1, G4);
__ BIND(L_crc32c_parallel_loop);
// schedule ldf's ahead of crc32c's to hide the load-use latency
__ ldf(FloatRegisterImpl::D, buf, 0, F8);
__ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*8, F10);
__ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*16, F12);
__ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*24, F14);
__ crc32c(F0, F8, F0);
__ crc32c(F2, F10, F2);
__ crc32c(F4, F12, F4);
__ crc32c(F6, F14, F6);
__ inc(buf, 8);
__ dec(G4);
__ cmp_and_br_short(G4, 0, Assembler::greater, Assembler::pt, L_crc32c_parallel_loop);
__ ldf(FloatRegisterImpl::D, buf, 0, F8);
__ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*8, F10);
__ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*16, F12);
__ crc32c(F0, F8, F0);
__ crc32c(F2, F10, F2);
__ crc32c(F4, F12, F4);
__ inc(buf, CHUNK_LEN*24);
__ ldfl(FloatRegisterImpl::D, buf, G0, F14); // load in little endian
__ inc(buf, 8);
__ prefetch(buf, 0, Assembler::severalReads);
__ prefetch(buf, CHUNK_LEN*8, Assembler::severalReads);
__ prefetch(buf, CHUNK_LEN*16, Assembler::severalReads);
__ prefetch(buf, CHUNK_LEN*24, Assembler::severalReads);
// move to INT side, and reverse the byte order of lower 32 bits to little endian
__ movftoi_revbytes(F0, O4, G1, G4);
__ movftoi_revbytes(F2, O5, G1, G4);
__ movftoi_revbytes(F4, G5, G1, G4);
// combine the results of 4 chunks
__ set64(CHUNK_K1, G3, G1);
__ xmulx(O4, G3, O4);
__ set64(CHUNK_K2, G3, G1);
__ xmulx(O5, G3, O5);
__ set64(CHUNK_K3, G3, G1);
__ xmulx(G5, G3, G5);
__ movdtox(F14, G4);
__ xor3(O4, O5, O5);
__ xor3(G5, O5, O5);
__ xor3(G4, O5, O5);
// reverse the byte order to big endian, via stack, and move to FP side
__ add(SP, -8, G1);
__ srlx(G1, 3, G1);
__ sllx(G1, 3, G1);
__ stx(O5, G1, G0);
__ ldfl(FloatRegisterImpl::D, G1, G0, F2); // load in little endian
__ crc32c(F6, F2, F0);
__ set(CHUNK_LEN*8*4, G4);
__ sub(len, G4, len);
__ cmp_and_br_short(len, G4, Assembler::greaterEqual, Assembler::pt, L_crc32c_parallel);
__ nop();
__ cmp_and_br_short(len, 0, Assembler::equal, Assembler::pt, L_crc32c_done);
__ BIND(L_crc32c_serial);
__ mov(32, G4);
__ cmp_and_br_short(len, G4, Assembler::less, Assembler::pn, L_crc32c_x8);
// ------ process 32B chunks ------
__ BIND(L_crc32c_x32_loop);
__ ldf(FloatRegisterImpl::D, buf, 0, F2);
__ inc(buf, 8);
__ crc32c(F0, F2, F0);
__ ldf(FloatRegisterImpl::D, buf, 0, F2);
__ inc(buf, 8);
__ crc32c(F0, F2, F0);
__ ldf(FloatRegisterImpl::D, buf, 0, F2);
__ inc(buf, 8);
__ crc32c(F0, F2, F0);
__ ldf(FloatRegisterImpl::D, buf, 0, F2);
__ inc(buf, 8);
__ crc32c(F0, F2, F0);
__ dec(len, 32);
__ cmp_and_br_short(len, G4, Assembler::greaterEqual, Assembler::pt, L_crc32c_x32_loop);
__ BIND(L_crc32c_x8);
__ nop();
__ cmp_and_br_short(len, 8, Assembler::less, Assembler::pt, L_crc32c_done);
// ------ process 8B chunks ------
__ BIND(L_crc32c_x8_loop);
__ ldf(FloatRegisterImpl::D, buf, 0, F2);
__ inc(buf, 8);
__ crc32c(F0, F2, F0);
__ dec(len, 8);
__ cmp_and_br_short(len, 8, Assembler::greaterEqual, Assembler::pt, L_crc32c_x8_loop);
__ BIND(L_crc32c_done);
// move to INT side, and reverse the byte order of lower 32 bits to little endian
__ movftoi_revbytes(F0, crc, G1, G3);
__ cmp_and_br_short(len, 0, Assembler::equal, Assembler::pt, L_crc32c_return);
// ------ process the misaligned tail (7 bytes or less) ------
__ BIND(L_crc32c_tail);
// crc = (crc >>> 8) ^ byteTable[(crc ^ b) & 0xFF];
__ ldub(buf, 0, G1);
__ update_byte_crc32(crc, G1, table);
__ inc(buf);
__ dec(len);
__ cmp_and_br_short(len, 0, Assembler::greater, Assembler::pt, L_crc32c_tail);
__ BIND(L_crc32c_return);
__ nop();
__ retl();
__ delayed()->nop();
return start;
}
void generate_initial() {
// Generates all stubs and initializes the entry points
@ -5001,6 +5201,11 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress");
StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
}
// generate CRC32C intrinsic code
if (UseCRC32CIntrinsics) {
StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
}
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -41,7 +41,7 @@ static bool returns_to_call_stub(address return_pc) {
enum /* platform_dependent_constants */ {
// %%%%%%%% May be able to shrink this a lot
code_size1 = 20000, // simply increase if too small (assembler will crash if too small)
code_size2 = 23000 // simply increase if too small (assembler will crash if too small)
code_size2 = 24000 // simply increase if too small (assembler will crash if too small)
};
class Sparc {

View File

@ -230,7 +230,7 @@ void VM_Version::initialize() {
assert((OptoLoopAlignment % relocInfo::addr_unit()) == 0, "alignment is not a multiple of NOP size");
char buf[512];
jio_snprintf(buf, sizeof(buf), "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
jio_snprintf(buf, sizeof(buf), "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
(has_v9() ? ", v9" : (has_v8() ? ", v8" : "")),
(has_hardware_popc() ? ", popc" : ""),
(has_vis1() ? ", vis1" : ""),
@ -242,6 +242,7 @@ void VM_Version::initialize() {
(has_sha1() ? ", sha1" : ""),
(has_sha256() ? ", sha256" : ""),
(has_sha512() ? ", sha512" : ""),
(has_crc32c() ? ", crc32c" : ""),
(is_ultra3() ? ", ultra3" : ""),
(is_sun4v() ? ", sun4v" : ""),
(is_niagara_plus() ? ", niagara_plus" : (is_niagara() ? ", niagara" : "")),
@ -363,6 +364,23 @@ void VM_Version::initialize() {
}
}
// SPARC T4 and above should have support for CRC32C instruction
if (has_crc32c()) {
if (UseVIS > 2) { // CRC32C intrinsics use VIS3 instructions
if (FLAG_IS_DEFAULT(UseCRC32CIntrinsics)) {
FLAG_SET_DEFAULT(UseCRC32CIntrinsics, true);
}
} else {
if (UseCRC32CIntrinsics) {
warning("SPARC CRC32C intrinsics require VIS3 instruction support. Intrinsics will be disabled.");
FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
}
}
} else if (UseCRC32CIntrinsics) {
warning("CRC32C instruction is not available on this CPU");
FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
}
if (FLAG_IS_DEFAULT(ContendedPaddingWidth) &&
(cache_line_size > ContendedPaddingWidth))
ContendedPaddingWidth = cache_line_size;

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -53,7 +53,8 @@ protected:
aes_instructions = 19,
sha1_instruction = 20,
sha256_instruction = 21,
sha512_instruction = 22
sha512_instruction = 22,
crc32c_instruction = 23
};
enum Feature_Flag_Set {
@ -83,6 +84,7 @@ protected:
sha1_instruction_m = 1 << sha1_instruction,
sha256_instruction_m = 1 << sha256_instruction,
sha512_instruction_m = 1 << sha512_instruction,
crc32c_instruction_m = 1 << crc32c_instruction,
generic_v8_m = v8_instructions_m | hardware_mul32_m | hardware_div32_m | hardware_fsmuld_m,
generic_v9_m = generic_v8_m | v9_instructions_m,
@ -141,6 +143,7 @@ public:
static bool has_sha1() { return (_features & sha1_instruction_m) != 0; }
static bool has_sha256() { return (_features & sha256_instruction_m) != 0; }
static bool has_sha512() { return (_features & sha512_instruction_m) != 0; }
static bool has_crc32c() { return (_features & crc32c_instruction_m) != 0; }
static bool supports_compare_and_exchange()
{ return has_v9(); }

View File

@ -699,6 +699,12 @@ void VM_Version::get_processor_features() {
FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
}
if (UseCRC32CIntrinsics) {
if (!FLAG_IS_DEFAULT(UseCRC32CIntrinsics))
warning("CRC32C intrinsics are not available on this CPU");
FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
}
// Adjust RTM (Restricted Transactional Memory) flags
if (!supports_rtm() && UseRTMLocking) {
// Can't continue because UseRTMLocking affects UseBiasedLocking flag

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2006, 2014, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2006, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -363,6 +363,11 @@ int VM_Version::platform_features(int features) {
#endif
if (av & AV_SPARC_CBCOND) features |= cbcond_instructions_m;
#ifndef AV_SPARC_CRC32C
#define AV_SPARC_CRC32C 0x20000000 /* crc32c instruction supported */
#endif
if (av & AV_SPARC_CRC32C) features |= crc32c_instruction_m;
#ifndef AV_SPARC_AES
#define AV_SPARC_AES 0x00020000 /* aes instrs supported */
#endif

View File

@ -863,6 +863,12 @@
do_name( updateByteBuffer_name, "updateByteBuffer") \
do_signature(updateByteBuffer_signature, "(IJII)I") \
\
/* support for java.util.zip.CRC32C */ \
do_class(java_util_zip_CRC32C, "java/util/zip/CRC32C") \
do_intrinsic(_updateBytesCRC32C, java_util_zip_CRC32C, updateBytes_name, updateBytes_signature, F_S) \
do_intrinsic(_updateDirectByteBufferCRC32C, java_util_zip_CRC32C, updateDirectByteBuffer_name, updateByteBuffer_signature, F_S) \
do_name( updateDirectByteBuffer_name, "updateDirectByteBuffer") \
\
/* support for sun.misc.Unsafe */ \
do_class(sun_misc_Unsafe, "sun/misc/Unsafe") \
\

View File

@ -962,6 +962,7 @@ void ConnectionGraph::process_call_arguments(CallNode *call) {
(strcmp(call->as_CallLeaf()->_name, "g1_wb_pre") == 0 ||
strcmp(call->as_CallLeaf()->_name, "g1_wb_post") == 0 ||
strcmp(call->as_CallLeaf()->_name, "updateBytesCRC32") == 0 ||
strcmp(call->as_CallLeaf()->_name, "updateBytesCRC32C") == 0 ||
strcmp(call->as_CallLeaf()->_name, "aescrypt_encryptBlock") == 0 ||
strcmp(call->as_CallLeaf()->_name, "aescrypt_decryptBlock") == 0 ||
strcmp(call->as_CallLeaf()->_name, "cipherBlockChaining_encryptAESCrypt") == 0 ||

View File

@ -197,7 +197,7 @@ class LibraryCallKit : public GraphKit {
CallJavaNode* generate_method_call_virtual(vmIntrinsics::ID method_id) {
return generate_method_call(method_id, true, false);
}
Node * load_field_from_object(Node * fromObj, const char * fieldName, const char * fieldTypeString, bool is_exact, bool is_static);
Node * load_field_from_object(Node * fromObj, const char * fieldName, const char * fieldTypeString, bool is_exact, bool is_static, ciInstanceKlass * fromKls);
Node* make_string_method_node(int opcode, Node* str1_start, Node* cnt1, Node* str2_start, Node* cnt2);
Node* make_string_method_node(int opcode, Node* str1, Node* str2);
@ -291,6 +291,9 @@ class LibraryCallKit : public GraphKit {
bool inline_updateCRC32();
bool inline_updateBytesCRC32();
bool inline_updateByteBufferCRC32();
Node* get_table_from_crc32c_class(ciInstanceKlass *crc32c_class);
bool inline_updateBytesCRC32C();
bool inline_updateDirectByteBufferCRC32C();
bool inline_multiplyToLen();
bool inline_squareToLen();
bool inline_mulAdd();
@ -539,6 +542,11 @@ CallGenerator* Compile::make_vm_intrinsic(ciMethod* m, bool is_virtual) {
if (!UseCRC32Intrinsics) return NULL;
break;
case vmIntrinsics::_updateBytesCRC32C:
case vmIntrinsics::_updateDirectByteBufferCRC32C:
if (!UseCRC32CIntrinsics) return NULL;
break;
case vmIntrinsics::_incrementExactI:
case vmIntrinsics::_addExactI:
if (!Matcher::match_rule_supported(Op_OverflowAddI) || !UseMathExactIntrinsics) return NULL;
@ -947,6 +955,11 @@ bool LibraryCallKit::try_to_inline(int predicate) {
case vmIntrinsics::_updateByteBufferCRC32:
return inline_updateByteBufferCRC32();
case vmIntrinsics::_updateBytesCRC32C:
return inline_updateBytesCRC32C();
case vmIntrinsics::_updateDirectByteBufferCRC32C:
return inline_updateDirectByteBufferCRC32C();
case vmIntrinsics::_profileBoolean:
return inline_profileBoolean();
case vmIntrinsics::_isCompileConstant:
@ -5536,6 +5549,106 @@ bool LibraryCallKit::inline_updateByteBufferCRC32() {
return true;
}
//------------------------------get_table_from_crc32c_class-----------------------
Node * LibraryCallKit::get_table_from_crc32c_class(ciInstanceKlass *crc32c_class) {
Node* table = load_field_from_object(NULL, "byteTable", "[I", /*is_exact*/ false, /*is_static*/ true, crc32c_class);
assert (table != NULL, "wrong version of java.util.zip.CRC32C");
return table;
}
//------------------------------inline_updateBytesCRC32C-----------------------
//
// Calculate CRC32C for byte[] array.
// int java.util.zip.CRC32C.updateBytes(int crc, byte[] buf, int off, int end)
//
bool LibraryCallKit::inline_updateBytesCRC32C() {
assert(UseCRC32CIntrinsics, "need CRC32C instruction support");
assert(callee()->signature()->size() == 4, "updateBytes has 4 parameters");
assert(callee()->holder()->is_loaded(), "CRC32C class must be loaded");
// no receiver since it is a static method
Node* crc = argument(0); // type: int
Node* src = argument(1); // type: oop
Node* offset = argument(2); // type: int
Node* end = argument(3); // type: int
Node* length = _gvn.transform(new SubINode(end, offset));
const Type* src_type = src->Value(&_gvn);
const TypeAryPtr* top_src = src_type->isa_aryptr();
if (top_src == NULL || top_src->klass() == NULL) {
// failed array check
return false;
}
// Figure out the size and type of the elements we will be copying.
BasicType src_elem = src_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
if (src_elem != T_BYTE) {
return false;
}
// 'src_start' points to src array + scaled offset
Node* src_start = array_element_address(src, offset, src_elem);
// static final int[] byteTable in class CRC32C
Node* table = get_table_from_crc32c_class(callee()->holder());
Node* table_start = array_element_address(table, intcon(0), T_INT);
// We assume that range check is done by caller.
// TODO: generate range check (offset+length < src.length) in debug VM.
// Call the stub.
address stubAddr = StubRoutines::updateBytesCRC32C();
const char *stubName = "updateBytesCRC32C";
Node* call = make_runtime_call(RC_LEAF, OptoRuntime::updateBytesCRC32C_Type(),
stubAddr, stubName, TypePtr::BOTTOM,
crc, src_start, length, table_start);
Node* result = _gvn.transform(new ProjNode(call, TypeFunc::Parms));
set_result(result);
return true;
}
//------------------------------inline_updateDirectByteBufferCRC32C-----------------------
//
// Calculate CRC32C for DirectByteBuffer.
// int java.util.zip.CRC32C.updateDirectByteBuffer(int crc, long buf, int off, int end)
//
bool LibraryCallKit::inline_updateDirectByteBufferCRC32C() {
assert(UseCRC32CIntrinsics, "need CRC32C instruction support");
assert(callee()->signature()->size() == 5, "updateDirectByteBuffer has 4 parameters and one is long");
assert(callee()->holder()->is_loaded(), "CRC32C class must be loaded");
// no receiver since it is a static method
Node* crc = argument(0); // type: int
Node* src = argument(1); // type: long
Node* offset = argument(3); // type: int
Node* end = argument(4); // type: int
Node* length = _gvn.transform(new SubINode(end, offset));
src = ConvL2X(src); // adjust Java long to machine word
Node* base = _gvn.transform(new CastX2PNode(src));
offset = ConvI2X(offset);
// 'src_start' points to src array + scaled offset
Node* src_start = basic_plus_adr(top(), base, offset);
// static final int[] byteTable in class CRC32C
Node* table = get_table_from_crc32c_class(callee()->holder());
Node* table_start = array_element_address(table, intcon(0), T_INT);
// Call the stub.
address stubAddr = StubRoutines::updateBytesCRC32C();
const char *stubName = "updateBytesCRC32C";
Node* call = make_runtime_call(RC_LEAF, OptoRuntime::updateBytesCRC32C_Type(),
stubAddr, stubName, TypePtr::BOTTOM,
crc, src_start, length, table_start);
Node* result = _gvn.transform(new ProjNode(call, TypeFunc::Parms));
set_result(result);
return true;
}
//----------------------------inline_reference_get----------------------------
// public T java.lang.ref.Reference.get();
bool LibraryCallKit::inline_reference_get() {
@ -5571,18 +5684,28 @@ bool LibraryCallKit::inline_reference_get() {
Node * LibraryCallKit::load_field_from_object(Node * fromObj, const char * fieldName, const char * fieldTypeString,
bool is_exact=true, bool is_static=false) {
bool is_exact=true, bool is_static=false,
ciInstanceKlass * fromKls=NULL) {
if (fromKls == NULL) {
const TypeInstPtr* tinst = _gvn.type(fromObj)->isa_instptr();
assert(tinst != NULL, "obj is null");
assert(tinst->klass()->is_loaded(), "obj is not loaded");
assert(!is_exact || tinst->klass_is_exact(), "klass not exact");
fromKls = tinst->klass()->as_instance_klass();
} else {
assert(is_static, "only for static field access");
}
ciField* field = fromKls->get_field_by_name(ciSymbol::make(fieldName),
ciSymbol::make(fieldTypeString),
is_static);
const TypeInstPtr* tinst = _gvn.type(fromObj)->isa_instptr();
assert(tinst != NULL, "obj is null");
assert(tinst->klass()->is_loaded(), "obj is not loaded");
assert(!is_exact || tinst->klass_is_exact(), "klass not exact");
ciField* field = tinst->klass()->as_instance_klass()->get_field_by_name(ciSymbol::make(fieldName),
ciSymbol::make(fieldTypeString),
is_static);
if (field == NULL) return (Node *) NULL;
assert (field != NULL, "undefined field");
if (field == NULL) return (Node *) NULL;
if (is_static) {
const TypeInstPtr* tip = TypeInstPtr::make(fromKls->java_mirror());
fromObj = makecon(tip);
}
// Next code copied from Parse::do_get_xxx():

View File

@ -851,6 +851,29 @@ const TypeFunc* OptoRuntime::updateBytesCRC32_Type() {
return TypeFunc::make(domain, range);
}
/**
* int updateBytesCRC32C(int crc, byte* buf, int len, int* table)
*/
const TypeFunc* OptoRuntime::updateBytesCRC32C_Type() {
// create input type (domain)
int num_args = 4;
int argcnt = num_args;
const Type** fields = TypeTuple::fields(argcnt);
int argp = TypeFunc::Parms;
fields[argp++] = TypeInt::INT; // crc
fields[argp++] = TypePtr::NOTNULL; // buf
fields[argp++] = TypeInt::INT; // len
fields[argp++] = TypePtr::NOTNULL; // table
assert(argp == TypeFunc::Parms+argcnt, "correct decoding");
const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields);
// result type needed
fields = TypeTuple::fields(1);
fields[TypeFunc::Parms+0] = TypeInt::INT; // crc result
const TypeTuple* range = TypeTuple::make(TypeFunc::Parms+1, fields);
return TypeFunc::make(domain, range);
}
// for cipherBlockChaining calls of aescrypt encrypt/decrypt, four pointers and a length, returning int
const TypeFunc* OptoRuntime::cipherBlockChaining_aescrypt_Type() {
// create input type (domain)

View File

@ -319,6 +319,7 @@ private:
static const TypeFunc* ghash_processBlocks_Type();
static const TypeFunc* updateBytesCRC32_Type();
static const TypeFunc* updateBytesCRC32C_Type();
// leaf on stack replacement interpreter accessor types
static const TypeFunc* osr_end_Type();

View File

@ -848,6 +848,9 @@ public:
product(bool, UseCRC32Intrinsics, false, \
"use intrinsics for java.util.zip.CRC32") \
\
product(bool, UseCRC32CIntrinsics, false, \
"use intrinsics for java.util.zip.CRC32C") \
\
develop(bool, TraceCallFixup, false, \
"Trace all call fixups") \
\

View File

@ -137,6 +137,8 @@ address StubRoutines::_sha512_implCompressMB = NULL;
address StubRoutines::_updateBytesCRC32 = NULL;
address StubRoutines::_crc_table_adr = NULL;
address StubRoutines::_updateBytesCRC32C = NULL;
address StubRoutines::_multiplyToLen = NULL;
address StubRoutines::_squareToLen = NULL;
address StubRoutines::_mulAdd = NULL;

View File

@ -197,6 +197,8 @@ class StubRoutines: AllStatic {
static address _updateBytesCRC32;
static address _crc_table_adr;
static address _updateBytesCRC32C;
static address _multiplyToLen;
static address _squareToLen;
static address _mulAdd;
@ -359,6 +361,8 @@ class StubRoutines: AllStatic {
static address updateBytesCRC32() { return _updateBytesCRC32; }
static address crc_table_addr() { return _crc_table_adr; }
static address updateBytesCRC32C() { return _updateBytesCRC32C; }
static address multiplyToLen() {return _multiplyToLen; }
static address squareToLen() {return _squareToLen; }
static address mulAdd() {return _mulAdd; }

View File

@ -830,6 +830,7 @@ typedef CompactHashtable<Symbol*, char> SymbolCompactHashTable;
static_field(StubRoutines, _ghash_processBlocks, address) \
static_field(StubRoutines, _updateBytesCRC32, address) \
static_field(StubRoutines, _crc_table_adr, address) \
static_field(StubRoutines, _updateBytesCRC32C, address) \
static_field(StubRoutines, _multiplyToLen, address) \
static_field(StubRoutines, _squareToLen, address) \
static_field(StubRoutines, _mulAdd, address) \

View File

@ -0,0 +1,221 @@
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
* @test
* @bug 8073583
* @summary C2 support for CRC32C on SPARC
*
* @run main/othervm/timeout=600 -Xbatch TestCRC32C -m
*/
import java.nio.ByteBuffer;
import java.util.zip.Checksum;
import java.util.zip.CRC32C;
public class TestCRC32C {
public static void main(String[] args) {
int offset = Integer.getInteger("offset", 0);
int msgSize = Integer.getInteger("msgSize", 512);
boolean multi = false;
int iters = 20000;
int warmupIters = 20000;
if (args.length > 0) {
if (args[0].equals("-m")) {
multi = true;
} else {
iters = Integer.valueOf(args[0]);
}
if (args.length > 1) {
warmupIters = Integer.valueOf(args[1]);
}
}
if (multi) {
test_multi(warmupIters);
return;
}
System.out.println(" offset = " + offset);
System.out.println("msgSize = " + msgSize + " bytes");
System.out.println(" iters = " + iters);
byte[] b = initializedBytes(msgSize, offset);
CRC32C crc0 = new CRC32C();
CRC32C crc1 = new CRC32C();
CRC32C crc2 = new CRC32C();
crc0.update(b, offset, msgSize);
System.out.println("-------------------------------------------------------");
/* warm up */
for (int i = 0; i < warmupIters; i++) {
crc1.reset();
crc1.update(b, offset, msgSize);
}
/* measure performance */
long start = System.nanoTime();
for (int i = 0; i < iters; i++) {
crc1.reset();
crc1.update(b, offset, msgSize);
}
long end = System.nanoTime();
double total = (double)(end - start)/1e9; // in seconds
double thruput = (double)msgSize*iters/1e6/total; // in MB/s
System.out.println("CRC32C.update(byte[]) runtime = " + total + " seconds");
System.out.println("CRC32C.update(byte[]) throughput = " + thruput + " MB/s");
/* check correctness */
for (int i = 0; i < iters; i++) {
crc1.reset();
crc1.update(b, offset, msgSize);
if (!check(crc0, crc1)) break;
}
report("CRCs", crc0, crc1);
System.out.println("-------------------------------------------------------");
ByteBuffer buf = ByteBuffer.allocateDirect(msgSize);
buf.put(b, offset, msgSize);
buf.flip();
/* warm up */
for (int i = 0; i < warmupIters; i++) {
crc2.reset();
crc2.update(buf);
buf.rewind();
}
/* measure performance */
start = System.nanoTime();
for (int i = 0; i < iters; i++) {
crc2.reset();
crc2.update(buf);
buf.rewind();
}
end = System.nanoTime();
total = (double)(end - start)/1e9; // in seconds
thruput = (double)msgSize*iters/1e6/total; // in MB/s
System.out.println("CRC32C.update(ByteBuffer) runtime = " + total + " seconds");
System.out.println("CRC32C.update(ByteBuffer) throughput = " + thruput + " MB/s");
/* check correctness */
for (int i = 0; i < iters; i++) {
crc2.reset();
crc2.update(buf);
buf.rewind();
if (!check(crc0, crc2)) break;
}
report("CRCs", crc0, crc2);
System.out.println("-------------------------------------------------------");
}
private static void report(String s, Checksum crc0, Checksum crc1) {
System.out.printf("%s: crc0 = %08x, crc1 = %08x\n",
s, crc0.getValue(), crc1.getValue());
}
private static boolean check(Checksum crc0, Checksum crc1) {
if (crc0.getValue() != crc1.getValue()) {
System.err.printf("ERROR: crc0 = %08x, crc1 = %08x\n",
crc0.getValue(), crc1.getValue());
return false;
}
return true;
}
private static byte[] initializedBytes(int M, int offset) {
byte[] bytes = new byte[M + offset];
for (int i = 0; i < offset; i++) {
bytes[i] = (byte) i;
}
for (int i = offset; i < bytes.length; i++) {
bytes[i] = (byte) (i - offset);
}
return bytes;
}
private static void test_multi(int iters) {
int len1 = 8; // the 8B/iteration loop
int len2 = 32; // the 32B/iteration loop
int len3 = 4096; // the 4KB/iteration loop
byte[] b = initializedBytes(len3*16, 0);
int[] offsets = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 64, 128, 256, 512 };
int[] sizes = { 0, 1, 2, 3, 4, 5, 6, 7,
len1, len1+1, len1+2, len1+3, len1+4, len1+5, len1+6, len1+7,
len1*2, len1*2+1, len1*2+3, len1*2+5, len1*2+7,
len2, len2+1, len2+3, len2+5, len2+7,
len2*2, len2*4, len2*8, len2*16, len2*32, len2*64,
len3, len3+1, len3+3, len3+5, len3+7,
len3*2, len3*4, len3*8,
len1+len2, len1+len2+1, len1+len2+3, len1+len2+5, len1+len2+7,
len1+len3, len1+len3+1, len1+len3+3, len1+len3+5, len1+len3+7,
len2+len3, len2+len3+1, len2+len3+3, len2+len3+5, len2+len3+7,
len1+len2+len3, len1+len2+len3+1, len1+len2+len3+3,
len1+len2+len3+5, len1+len2+len3+7,
(len1+len2+len3)*2, (len1+len2+len3)*2+1, (len1+len2+len3)*2+3,
(len1+len2+len3)*2+5, (len1+len2+len3)*2+7,
(len1+len2+len3)*3, (len1+len2+len3)*3-1, (len1+len2+len3)*3-3,
(len1+len2+len3)*3-5, (len1+len2+len3)*3-7 };
CRC32C[] crc0 = new CRC32C[offsets.length*sizes.length];
CRC32C[] crc1 = new CRC32C[offsets.length*sizes.length];
int i, j, k;
System.out.printf("testing %d cases ...\n", offsets.length*sizes.length);
/* set the result from interpreter as reference */
for (i = 0; i < offsets.length; i++) {
for (j = 0; j < sizes.length; j++) {
crc0[i*sizes.length + j] = new CRC32C();
crc1[i*sizes.length + j] = new CRC32C();
crc0[i*sizes.length + j].update(b, offsets[i], sizes[j]);
}
}
/* warm up the JIT compiler and get result */
for (k = 0; k < iters; k++) {
for (i = 0; i < offsets.length; i++) {
for (j = 0; j < sizes.length; j++) {
crc1[i*sizes.length + j].reset();
crc1[i*sizes.length + j].update(b, offsets[i], sizes[j]);
}
}
}
/* check correctness */
for (i = 0; i < offsets.length; i++) {
for (j = 0; j < sizes.length; j++) {
if (!check(crc0[i*sizes.length + j], crc1[i*sizes.length + j])) {
System.out.printf("offsets[%d] = %d", i, offsets[i]);
System.out.printf("\tsizes[%d] = %d\n", j, sizes[j]);
}
}
}
}
}