8134553: CRC32C implementations for x86/x64 targets

Reviewed-by: kvn
Tomasz Wojtowicz 2015-09-16 15:54:32 -07:00 committed by Vladimir Kozlov
parent d49d1ea740
commit 61b77b8590
24 changed files with 1092 additions and 19 deletions

View File

@ -48,6 +48,7 @@ private:
address generate_Reference_get_entry();
address generate_CRC32_update_entry();
address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind);
address generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; }
void lock_method(void);
void generate_stack_overflow_check(void);

View File

@ -38,5 +38,6 @@
address generate_CRC32_update_entry();
address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind);
address generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; }
#endif // CPU_PPC_VM_INTERPRETERGENERATOR_PPC_HPP

View File

@ -48,4 +48,5 @@
// Not supported
address generate_CRC32_update_entry() { return NULL; }
address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; }
address generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; }
#endif // CPU_SPARC_VM_INTERPRETERGENERATOR_SPARC_HPP

View File

@ -1604,6 +1604,85 @@ void Assembler::cpuid() {
emit_int8((unsigned char)0xA2);
}
// Opcode / Instruction Op / En 64 - Bit Mode Compat / Leg Mode Description Implemented
// F2 0F 38 F0 / r CRC32 r32, r / m8 RM Valid Valid Accumulate CRC32 on r / m8. v
// F2 REX 0F 38 F0 / r CRC32 r32, r / m8* RM Valid N.E. Accumulate CRC32 on r / m8. -
// F2 REX.W 0F 38 F0 / r CRC32 r64, r / m8 RM Valid N.E. Accumulate CRC32 on r / m8. -
//
// F2 0F 38 F1 / r CRC32 r32, r / m16 RM Valid Valid Accumulate CRC32 on r / m16. v
//
// F2 0F 38 F1 / r CRC32 r32, r / m32 RM Valid Valid Accumulate CRC32 on r / m32. v
//
// F2 REX.W 0F 38 F1 / r CRC32 r64, r / m64 RM Valid N.E. Accumulate CRC32 on r / m64. v
void Assembler::crc32(Register crc, Register v, int8_t sizeInBytes) {
assert(VM_Version::supports_sse4_2(), "");
int8_t w = 0x01;
Prefix p = Prefix_EMPTY;
emit_int8((int8_t)0xF2);
switch (sizeInBytes) {
case 1:
w = 0;
break;
case 2:
case 4:
break;
LP64_ONLY(case 8:)
// This instruction is not valid in 32-bit mode
// Note:
// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
//
// Page B - 72 Vol. 2C says
// qwreg2 to qwreg 1111 0010 : 0100 1R0B : 0000 1111 : 0011 1000 : 1111 0000 : 11 qwreg1 qwreg2
// mem64 to qwreg 1111 0010 : 0100 1R0B : 0000 1111 : 0011 1000 : 1111 0000 : mod qwreg r / m
// F0!!!
// while 3 - 208 Vol. 2A says
// F2 REX.W 0F 38 F1 / r CRC32 r64, r / m64 RM Valid N.E. Accumulate CRC32 on r / m64.
//
// the 0 on the last bit is reserved for a different flavor of this instruction:
// F2 REX.W 0F 38 F0 / r CRC32 r64, r / m8 RM Valid N.E. Accumulate CRC32 on r / m8.
p = REX_W;
break;
default:
assert(0, "Unsupported value for a sizeInBytes argument");
break;
}
LP64_ONLY(prefix(crc, v, p);)
emit_int8((int8_t)0x0F);
emit_int8(0x38);
emit_int8((int8_t)(0xF0 | w));
emit_int8(0xC0 | ((crc->encoding() & 0x7) << 3) | (v->encoding() & 7));
}
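As a quick worked example of the encoding above (editorial note, not part of the patch): crc32(rax, rcx, 4) on a 32-bit target emits no prefix byte and produces F2 0F 38 F1 C1, i.e. the F2 prefix, the 0F 38 F1 opcode (w == 1 selects the r/m32 form; w == 0 would give F0, the r/m8 form), and ModRM C1 (mod = 11, reg = rax = 000, r/m = rcx = 001).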
void Assembler::crc32(Register crc, Address adr, int8_t sizeInBytes) {
assert(VM_Version::supports_sse4_2(), "");
InstructionMark im(this);
int8_t w = 0x01;
Prefix p = Prefix_EMPTY;
emit_int8((int8_t)0xF2);
switch (sizeInBytes) {
case 1:
w = 0;
break;
case 2:
case 4:
break;
LP64_ONLY(case 8:)
// This instruction is not valid in 32-bit mode
p = REX_W;
break;
default:
assert(0, "Unsupported value for a sizeInBytes argument");
break;
}
LP64_ONLY(prefix(crc, adr, p);)
emit_int8((int8_t)0x0F);
emit_int8(0x38);
emit_int8((int8_t)(0xF0 | w));
emit_operand(crc, adr);
}
void Assembler::cvtdq2pd(XMMRegister dst, XMMRegister src) {
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
emit_simd_arith_nonds(0xE6, dst, src, VEX_SIMD_F3, /* no_mask_reg */ false, /* legacy_mode */ true);
@ -6223,6 +6302,14 @@ void Assembler::shldl(Register dst, Register src) {
emit_int8((unsigned char)(0xC0 | src->encoding() << 3 | dst->encoding()));
}
// 0F A4 / r ib
void Assembler::shldl(Register dst, Register src, int8_t imm8) {
emit_int8(0x0F);
emit_int8((unsigned char)0xA4);
emit_int8((unsigned char)(0xC0 | src->encoding() << 3 | dst->encoding()));
emit_int8(imm8);
}
void Assembler::shrdl(Register dst, Register src) {
emit_int8(0x0F);
emit_int8((unsigned char)0xAD);
@ -6408,6 +6495,40 @@ void Assembler::prefix(Register reg) {
}
}
void Assembler::prefix(Register dst, Register src, Prefix p) {
if (src->encoding() >= 8) {
p = (Prefix)(p | REX_B);
}
if (dst->encoding() >= 8) {
p = (Prefix)( p | REX_R);
}
if (p != Prefix_EMPTY) {
// do not generate an empty prefix
prefix(p);
}
}
void Assembler::prefix(Register dst, Address adr, Prefix p) {
if (adr.base_needs_rex()) {
if (adr.index_needs_rex()) {
assert(false, "prefix(Register dst, Address adr, Prefix p) does not support handling of an X");
} else {
prefix(REX_B);
}
} else {
if (adr.index_needs_rex()) {
assert(false, "prefix(Register dst, Address adr, Prefix p) does not support handling of an X");
}
}
if (dst->encoding() >= 8) {
p = (Prefix)(p | REX_R);
}
if (p != Prefix_EMPTY) {
// do not generate an empty prefix
prefix(p);
}
}
void Assembler::prefix(Address adr) {
if (adr.base_needs_rex()) {
if (adr.index_needs_rex()) {

View File

@ -506,7 +506,8 @@ class Assembler : public AbstractAssembler {
VEX_3bytes = 0xC4,
VEX_2bytes = 0xC5,
EVEX_4bytes = 0x62,
Prefix_EMPTY = 0x0
};
enum VexPrefix {
@ -615,6 +616,8 @@ private:
int prefixq_and_encode(int dst_enc, int src_enc);
void prefix(Register reg);
void prefix(Register dst, Register src, Prefix p);
void prefix(Register dst, Address adr, Prefix p);
void prefix(Address adr);
void prefixq(Address adr);
@ -1177,6 +1180,10 @@ private:
// Identify processor type and features
void cpuid();
// CRC32C
void crc32(Register crc, Register v, int8_t sizeInBytes);
void crc32(Register crc, Address adr, int8_t sizeInBytes);
// Convert Scalar Double-Precision Floating-Point Value to Scalar Single-Precision Floating-Point Value
void cvtsd2ss(XMMRegister dst, XMMRegister src);
void cvtsd2ss(XMMRegister dst, Address src);
@ -1783,6 +1790,7 @@ private:
void setb(Condition cc, Register dst);
void shldl(Register dst, Register src);
void shldl(Register dst, Register src, int8_t imm8);
void shll(Register dst, int imm8);
void shll(Register dst);

View File

@ -37,6 +37,8 @@ inline int Assembler::prefix_and_encode(int dst_enc, int src_enc, bool byteinst)
inline int Assembler::prefixq_and_encode(int dst_enc, int src_enc) { return dst_enc << 3 | src_enc; }
inline void Assembler::prefix(Register reg) {}
inline void Assembler::prefix(Register dst, Register src, Prefix p) {}
inline void Assembler::prefix(Register dst, Address adr, Prefix p) {}
inline void Assembler::prefix(Address adr) {}
inline void Assembler::prefixq(Address adr) {}

View File

@ -0,0 +1,66 @@
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
enum {
// S. Gueron / Information Processing Letters 112 (2012) 184
// shows that anything above 6K and below 32K is a good choice
// 32K does not deliver any further performance gains
// 6K=8*256 (*3 as we compute 3 blocks together)
//
// Thus we select the smallest such value, so that it applies to the largest number
// of buffer sizes.
CRC32C_HIGH = 8 * 256,
// empirical
// based on ubench study using methodology described in
// V. Gopal et al. / Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction April 2011 8
//
// arbitrary value between 27 and 256
CRC32C_MIDDLE = 8 * 86,
// V. Gopal et al. / Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction April 2011 9
// shows that 240 and 1024 are equally good choices as 216 == 8*27
//
// We select the smallest value which resulted in a significant performance improvement over
// the sequential version
CRC32C_LOW = 8 * 27,
CRC32C_NUM_ChunkSizeInBytes = 3,
// We need to compute the constants x^(64N) and x^(128N) for each "chunk" size
CRC32C_NUM_PRECOMPUTED_CONSTANTS = ( 2 * CRC32C_NUM_ChunkSizeInBytes )
};
// Notes:
// 1. Why do we need a "chunk" approach?
// The overhead of computing the required constants (powers of x) for an arbitrary buffer of size N is significant
// (the implementation approaches plain library-call performance).
// 2. Why only 3 "chunks"?
// Performance experiments showed that HIGH+LOW alone did not deliver a stable speedup
// curve.
//
// Disclaimer:
// If you ever decide to increase/decrease the number of "chunks", be sure to modify
// a) constants table generation (hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp)
// b) constant fetch from that table (macroAssembler_x86.cpp)
// c) unrolled for loop (macroAssembler_x86.cpp)
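A minimal scalar sketch (editorial, not part of the patch) of how these chunk sizes drive the partitioning; process_three_blocks is a hypothetical stand-in for the three-stream CRC plus recombination implemented in macroAssembler_x86.cpp, while crc32c_u8 is a plain software byte update using the reflected Castagnoli polynomial:

#include <stddef.h>
#include <stdint.h>

// Hypothetical helper: three interleaved CRC streams over A|B|C plus recombination.
uint32_t process_three_blocks(uint32_t crc, const uint8_t* p, size_t block);

// Software byte-at-a-time CRC32C update (reflected polynomial 0x82F63B78).
static uint32_t crc32c_u8(uint32_t crc, uint8_t b) {
  crc ^= b;
  for (int i = 0; i < 8; i++)
    crc = (crc >> 1) ^ (0x82F63B78u & (0u - (crc & 1u)));
  return crc;
}

static uint32_t crc32c_chunked(uint32_t crc, const uint8_t* buf, size_t len) {
  const size_t chunk[] = { 8 * 256, 8 * 86, 8 * 27 };   // CRC32C_HIGH, _MIDDLE, _LOW
  for (int c = 0; c < 3; c++) {
    while (len >= 3 * chunk[c]) {                       // consume A|B|C, chunk[c] bytes each
      crc  = process_three_blocks(crc, buf, chunk[c]);
      buf += 3 * chunk[c];
      len -= 3 * chunk[c];
    }
  }
  while (len--) crc = crc32c_u8(crc, *buf++);           // remaining tail, byte by byte
  return crc;
}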

View File

@ -42,6 +42,7 @@
address generate_Reference_get_entry();
address generate_CRC32_update_entry();
address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind);
address generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind);
#ifndef _LP64
address generate_Float_intBitsToFloat_entry();
address generate_Float_floatToRawIntBits_entry();

View File

@ -45,6 +45,7 @@
#include "gc/g1/g1SATBCardTableModRefBS.hpp"
#include "gc/g1/heapRegion.hpp"
#endif // INCLUDE_ALL_GCS
#include "crc32c.h"
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
@ -8636,6 +8637,471 @@ void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Regi
notl(crc); // ~c
}
#ifdef _LP64
// S. Gueron / Information Processing Letters 112 (2012) 184
// Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
// Input: A 32 bit value B = [byte3, byte2, byte1, byte0].
// Output: the 64-bit carry-less product of B * CONST
void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
Register tmp1, Register tmp2, Register tmp3) {
lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
if (n > 0) {
addq(tmp3, n * 256 * 8);
}
// Q1 = TABLEExt[n][B & 0xFF];
movl(tmp1, in);
andl(tmp1, 0x000000FF);
shll(tmp1, 3);
addq(tmp1, tmp3);
movq(tmp1, Address(tmp1, 0));
// Q2 = TABLEExt[n][B >> 8 & 0xFF];
movl(tmp2, in);
shrl(tmp2, 8);
andl(tmp2, 0x000000FF);
shll(tmp2, 3);
addq(tmp2, tmp3);
movq(tmp2, Address(tmp2, 0));
shlq(tmp2, 8);
xorq(tmp1, tmp2);
// Q3 = TABLEExt[n][B >> 16 & 0xFF];
movl(tmp2, in);
shrl(tmp2, 16);
andl(tmp2, 0x000000FF);
shll(tmp2, 3);
addq(tmp2, tmp3);
movq(tmp2, Address(tmp2, 0));
shlq(tmp2, 16);
xorq(tmp1, tmp2);
// Q4 = TABLEExt[n][B >> 24 & 0xFF];
shrl(in, 24);
andl(in, 0x000000FF);
shll(in, 3);
addq(in, tmp3);
movq(in, Address(in, 0));
shlq(in, 24);
xorq(in, tmp1);
// return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
}
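The lookup sequence above boils down to the following scalar form (editorial sketch; the assumed table layout is 256 64-bit entries per level n, i.e. base + n * 256 * 8, matching the addressing in the code):

#include <stdint.h>

// Carry-less multiply of a 32-bit value by a constant via a 256-entry, 64-bit table
// (Gueron, Algorithm 4); `table` already points at the level selected by n.
static uint64_t crc32c_clmul_by_table(uint32_t b, const uint64_t* table) {
  uint64_t q1 = table[ b        & 0xFF];
  uint64_t q2 = table[(b >>  8) & 0xFF] <<  8;
  uint64_t q3 = table[(b >> 16) & 0xFF] << 16;
  uint64_t q4 = table[(b >> 24) & 0xFF] << 24;
  return q1 ^ q2 ^ q3 ^ q4;   // Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24
}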
void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
Register in_out,
uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
XMMRegister w_xtmp2,
Register tmp1,
Register n_tmp2, Register n_tmp3) {
if (is_pclmulqdq_supported) {
movdl(w_xtmp1, in_out); // modified blindly
movl(tmp1, const_or_pre_comp_const_index);
movdl(w_xtmp2, tmp1);
pclmulqdq(w_xtmp1, w_xtmp2, 0);
movdq(in_out, w_xtmp1);
} else {
crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
}
}
// Recombination Alternative 2: No bit-reflections
// T1 = (CRC_A * U1) << 1
// T2 = (CRC_B * U2) << 1
// C1 = T1 >> 32
// C2 = T2 >> 32
// T1 = T1 & 0xFFFFFFFF
// T2 = T2 & 0xFFFFFFFF
// T1 = CRC32(0, T1)
// T2 = CRC32(0, T2)
// C1 = C1 ^ T1
// C2 = C2 ^ T2
// CRC = C1 ^ C2 ^ CRC_C
void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
Register tmp1, Register tmp2,
Register n_tmp3) {
crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
shlq(in_out, 1);
movl(tmp1, in_out);
shrq(in_out, 32);
xorl(tmp2, tmp2);
crc32(tmp2, tmp1, 4);
xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
shlq(in1, 1);
movl(tmp1, in1);
shrq(in1, 32);
xorl(tmp2, tmp2);
crc32(tmp2, tmp1, 4);
xorl(in1, tmp2);
xorl(in_out, in1);
xorl(in_out, in2);
}
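The same recombination written out in scalar form (editorial sketch; crc32c_u32 is a hypothetical wrapper around the 32-bit CRC32 instruction, and t1/t2 are the carry-less products produced by crc32c_pclmulqdq above):

#include <stdint.h>

uint32_t crc32c_u32(uint32_t crc, uint32_t val);   // hypothetical CRC32 r32, r/m32 wrapper

static uint32_t crc32c_recombine(uint64_t t1, uint64_t t2, uint32_t crc_c) {
  t1 <<= 1;                                        // T1 = (CRC_A * U1) << 1
  t2 <<= 1;                                        // T2 = (CRC_B * U2) << 1
  uint32_t c1 = (uint32_t)(t1 >> 32) ^ crc32c_u32(0, (uint32_t)t1);
  uint32_t c2 = (uint32_t)(t2 >> 32) ^ crc32c_u32(0, (uint32_t)t2);
  return c1 ^ c2 ^ crc_c;                          // CRC = C1 ^ C2 ^ CRC_C
}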
// Set N to a predefined value
// Subtract it from the length of the buffer
// execute in a loop:
// CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
// for i = 1 to N do
// CRC_A = CRC32(CRC_A, A[i])
// CRC_B = CRC32(CRC_B, B[i])
// CRC_C = CRC32(CRC_C, C[i])
// end for
// Recombine
void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
Register in_out1, Register in_out2, Register in_out3,
Register tmp1, Register tmp2, Register tmp3,
XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
Register tmp4, Register tmp5,
Register n_tmp6) {
Label L_processPartitions;
Label L_processPartition;
Label L_exit;
bind(L_processPartitions);
cmpl(in_out1, 3 * size);
jcc(Assembler::less, L_exit);
xorl(tmp1, tmp1);
xorl(tmp2, tmp2);
movq(tmp3, in_out2);
addq(tmp3, size);
bind(L_processPartition);
crc32(in_out3, Address(in_out2, 0), 8);
crc32(tmp1, Address(in_out2, size), 8);
crc32(tmp2, Address(in_out2, size * 2), 8);
addq(in_out2, 8);
cmpq(in_out2, tmp3);
jcc(Assembler::less, L_processPartition);
crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
w_xtmp1, w_xtmp2, w_xtmp3,
tmp4, tmp5,
n_tmp6);
addq(in_out2, 2 * size);
subl(in_out1, 3 * size);
jmp(L_processPartitions);
bind(L_exit);
}
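For reference, the inner partition loop above in scalar form (editorial sketch; crc32c_u64 is a hypothetical wrapper around the 64-bit CRC32 instruction):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

uint32_t crc32c_u64(uint32_t crc, uint64_t val);   // hypothetical CRC32 r64, r/m64 wrapper

static uint64_t load_u64(const uint8_t* p) {       // unaligned 8-byte load
  uint64_t v;
  memcpy(&v, p, sizeof(v));
  return v;
}

// One partition pass: three interleaved CRC streams over blocks A, B, C of
// `size` bytes each, located at p, p + size and p + 2 * size.
static void crc32c_partition(uint32_t& crc_a, uint32_t& crc_b, uint32_t& crc_c,
                             const uint8_t* p, size_t size) {
  for (size_t i = 0; i < size; i += 8) {
    crc_a = crc32c_u64(crc_a, load_u64(p + i));
    crc_b = crc32c_u64(crc_b, load_u64(p + size + i));
    crc_c = crc32c_u64(crc_c, load_u64(p + 2 * size + i));
  }
}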
#else
void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n,
Register tmp1, Register tmp2, Register tmp3,
XMMRegister xtmp1, XMMRegister xtmp2) {
lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
if (n > 0) {
addl(tmp3, n * 256 * 8);
}
// Q1 = TABLEExt[n][B & 0xFF];
movl(tmp1, in_out);
andl(tmp1, 0x000000FF);
shll(tmp1, 3);
addl(tmp1, tmp3);
movq(xtmp1, Address(tmp1, 0));
// Q2 = TABLEExt[n][B >> 8 & 0xFF];
movl(tmp2, in_out);
shrl(tmp2, 8);
andl(tmp2, 0x000000FF);
shll(tmp2, 3);
addl(tmp2, tmp3);
movq(xtmp2, Address(tmp2, 0));
psllq(xtmp2, 8);
pxor(xtmp1, xtmp2);
// Q3 = TABLEExt[n][B >> 16 & 0xFF];
movl(tmp2, in_out);
shrl(tmp2, 16);
andl(tmp2, 0x000000FF);
shll(tmp2, 3);
addl(tmp2, tmp3);
movq(xtmp2, Address(tmp2, 0));
psllq(xtmp2, 16);
pxor(xtmp1, xtmp2);
// Q4 = TABLEExt[n][B >> 24 & 0xFF];
shrl(in_out, 24);
andl(in_out, 0x000000FF);
shll(in_out, 3);
addl(in_out, tmp3);
movq(xtmp2, Address(in_out, 0));
psllq(xtmp2, 24);
pxor(xtmp1, xtmp2); // Result in CXMM
// return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
}
void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
Register in_out,
uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
XMMRegister w_xtmp2,
Register tmp1,
Register n_tmp2, Register n_tmp3) {
if (is_pclmulqdq_supported) {
movdl(w_xtmp1, in_out);
movl(tmp1, const_or_pre_comp_const_index);
movdl(w_xtmp2, tmp1);
pclmulqdq(w_xtmp1, w_xtmp2, 0);
// Keep result in XMM since GPR is 32 bit in length
} else {
crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2);
}
}
void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
Register tmp1, Register tmp2,
Register n_tmp3) {
crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
psllq(w_xtmp1, 1);
movdl(tmp1, w_xtmp1);
psrlq(w_xtmp1, 32);
movdl(in_out, w_xtmp1);
xorl(tmp2, tmp2);
crc32(tmp2, tmp1, 4);
xorl(in_out, tmp2);
psllq(w_xtmp2, 1);
movdl(tmp1, w_xtmp2);
psrlq(w_xtmp2, 32);
movdl(in1, w_xtmp2);
xorl(tmp2, tmp2);
crc32(tmp2, tmp1, 4);
xorl(in1, tmp2);
xorl(in_out, in1);
xorl(in_out, in2);
}
void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
Register in_out1, Register in_out2, Register in_out3,
Register tmp1, Register tmp2, Register tmp3,
XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
Register tmp4, Register tmp5,
Register n_tmp6) {
Label L_processPartitions;
Label L_processPartition;
Label L_exit;
bind(L_processPartitions);
cmpl(in_out1, 3 * size);
jcc(Assembler::less, L_exit);
xorl(tmp1, tmp1);
xorl(tmp2, tmp2);
movl(tmp3, in_out2);
addl(tmp3, size);
bind(L_processPartition);
crc32(in_out3, Address(in_out2, 0), 4);
crc32(tmp1, Address(in_out2, size), 4);
crc32(tmp2, Address(in_out2, size*2), 4);
crc32(in_out3, Address(in_out2, 0+4), 4);
crc32(tmp1, Address(in_out2, size+4), 4);
crc32(tmp2, Address(in_out2, size*2+4), 4);
addl(in_out2, 8);
cmpl(in_out2, tmp3);
jcc(Assembler::less, L_processPartition);
push(tmp3);
push(in_out1);
push(in_out2);
tmp4 = tmp3;
tmp5 = in_out1;
n_tmp6 = in_out2;
crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
w_xtmp1, w_xtmp2, w_xtmp3,
tmp4, tmp5,
n_tmp6);
pop(in_out2);
pop(in_out1);
pop(tmp3);
addl(in_out2, 2 * size);
subl(in_out1, 3 * size);
jmp(L_processPartitions);
bind(L_exit);
}
#endif //LP64
#ifdef _LP64
// Algorithm 2: Pipelined usage of the CRC32 instruction.
// Input: A buffer I of L bytes.
// Output: the CRC32C value of the buffer.
// Notations:
// Write L = 24N + r, with N = floor (L/24).
// r = L mod 24 (0 <= r < 24).
// Consider I as the concatenation of A|B|C|R, where A, B and C each consist of
// N quadwords, and R consists of r bytes.
// A[j] = I [8j+7:8j], j= 0, 1, ..., N-1
// B[j] = I [N + 8j+7:N + 8j], j= 0, 1, ..., N-1
// C[j] = I [2N + 8j+7:2N + 8j], j= 0, 1, ..., N-1
// if r > 0 R[j] = I [3N +j], j= 0, 1, ...,r-1
void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
Register tmp1, Register tmp2, Register tmp3,
Register tmp4, Register tmp5, Register tmp6,
XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
bool is_pclmulqdq_supported) {
uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
Label L_wordByWord;
Label L_byteByByteProlog;
Label L_byteByByte;
Label L_exit;
if (is_pclmulqdq_supported ) {
const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1);
const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
} else {
const_or_pre_comp_const_index[0] = 1;
const_or_pre_comp_const_index[1] = 0;
const_or_pre_comp_const_index[2] = 3;
const_or_pre_comp_const_index[3] = 2;
const_or_pre_comp_const_index[4] = 5;
const_or_pre_comp_const_index[5] = 4;
}
crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
in2, in1, in_out,
tmp1, tmp2, tmp3,
w_xtmp1, w_xtmp2, w_xtmp3,
tmp4, tmp5,
tmp6);
crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
in2, in1, in_out,
tmp1, tmp2, tmp3,
w_xtmp1, w_xtmp2, w_xtmp3,
tmp4, tmp5,
tmp6);
crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
in2, in1, in_out,
tmp1, tmp2, tmp3,
w_xtmp1, w_xtmp2, w_xtmp3,
tmp4, tmp5,
tmp6);
movl(tmp1, in2);
andl(tmp1, 0x00000007);
negl(tmp1);
addl(tmp1, in2);
addq(tmp1, in1);
BIND(L_wordByWord);
cmpq(in1, tmp1);
jcc(Assembler::greaterEqual, L_byteByByteProlog);
crc32(in_out, Address(in1, 0), 4);
addq(in1, 4);
jmp(L_wordByWord);
BIND(L_byteByByteProlog);
andl(in2, 0x00000007);
movl(tmp2, 1);
BIND(L_byteByByte);
cmpl(tmp2, in2);
jccb(Assembler::greater, L_exit);
crc32(in_out, Address(in1, 0), 1);
incq(in1);
incl(tmp2);
jmp(L_byteByByte);
BIND(L_exit);
}
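A worked example of the flow above (editorial, using the chunk sizes from crc32c.h): for a 6200-byte buffer, one CRC32C_HIGH partition consumes 3 * 2048 = 6144 bytes; the remaining 56 bytes are too short for a MIDDLE (3 * 688 = 2064) or LOW (3 * 216 = 648) partition, so the tail loops finish the job with fourteen 4-byte CRC32 steps (56 & ~7 = 56) and zero trailing byte steps (56 & 7 = 0).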
#else
void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
Register tmp1, Register tmp2, Register tmp3,
Register tmp4, Register tmp5, Register tmp6,
XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
bool is_pclmulqdq_supported) {
uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
Label L_wordByWord;
Label L_byteByByteProlog;
Label L_byteByByte;
Label L_exit;
if (is_pclmulqdq_supported) {
const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);
const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
} else {
const_or_pre_comp_const_index[0] = 1;
const_or_pre_comp_const_index[1] = 0;
const_or_pre_comp_const_index[2] = 3;
const_or_pre_comp_const_index[3] = 2;
const_or_pre_comp_const_index[4] = 5;
const_or_pre_comp_const_index[5] = 4;
}
crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
in2, in1, in_out,
tmp1, tmp2, tmp3,
w_xtmp1, w_xtmp2, w_xtmp3,
tmp4, tmp5,
tmp6);
crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
in2, in1, in_out,
tmp1, tmp2, tmp3,
w_xtmp1, w_xtmp2, w_xtmp3,
tmp4, tmp5,
tmp6);
crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
in2, in1, in_out,
tmp1, tmp2, tmp3,
w_xtmp1, w_xtmp2, w_xtmp3,
tmp4, tmp5,
tmp6);
movl(tmp1, in2);
andl(tmp1, 0x00000007);
negl(tmp1);
addl(tmp1, in2);
addl(tmp1, in1);
BIND(L_wordByWord);
cmpl(in1, tmp1);
jcc(Assembler::greaterEqual, L_byteByByteProlog);
crc32(in_out, Address(in1,0), 4);
addl(in1, 4);
jmp(L_wordByWord);
BIND(L_byteByByteProlog);
andl(in2, 0x00000007);
movl(tmp2, 1);
BIND(L_byteByByte);
cmpl(tmp2, in2);
jccb(Assembler::greater, L_exit);
movb(tmp1, Address(in1, 0));
crc32(in_out, tmp1, 1);
incl(in1);
incl(tmp2);
jmp(L_byteByByte);
BIND(L_exit);
}
#endif // LP64
#undef BIND
#undef BLOCK_COMMENT

View File

@ -1278,9 +1278,42 @@ public:
Register raxReg);
#endif
// CRC32 code for java.util.zip.CRC32::updateBytes() intrinsic.
void update_byte_crc32(Register crc, Register val, Register table);
void kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp);
// CRC32C code for java.util.zip.CRC32C::updateBytes() intrinsic
// Note on a naming convention:
// Prefix w = register only used on a Westmere+ architecture
// Prefix n = register only used on a Nehalem architecture
#ifdef _LP64
void crc32c_ipl_alg4(Register in_out, uint32_t n,
Register tmp1, Register tmp2, Register tmp3);
#else
void crc32c_ipl_alg4(Register in_out, uint32_t n,
Register tmp1, Register tmp2, Register tmp3,
XMMRegister xtmp1, XMMRegister xtmp2);
#endif
void crc32c_pclmulqdq(XMMRegister w_xtmp1,
Register in_out,
uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
XMMRegister w_xtmp2,
Register tmp1,
Register n_tmp2, Register n_tmp3);
void crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
Register tmp1, Register tmp2,
Register n_tmp3);
void crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
Register in_out1, Register in_out2, Register in_out3,
Register tmp1, Register tmp2, Register tmp3,
XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
Register tmp4, Register tmp5,
Register n_tmp6);
void crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
Register tmp1, Register tmp2, Register tmp3,
Register tmp4, Register tmp5, Register tmp6,
XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
bool is_pclmulqdq_supported);
// Fold 128-bit data chunk
void fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset);
void fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf);

View File

@ -2991,6 +2991,63 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
/**
* Arguments:
*
* Inputs:
* rsp(4) - int crc
* rsp(8) - byte* buf
* rsp(12) - int length
* rsp(16) - table_start - optional (present only when doing a library_call,
* not used by x86 algorithm)
*
* Output:
* rax - int crc result
*/
address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
assert(UseCRC32CIntrinsics, "need SSE4_2");
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
address start = __ pc();
const Register crc = rax; // crc
const Register buf = rcx; // source java byte array address
const Register len = rdx; // length
const Register d = rbx;
const Register g = rsi;
const Register h = rdi;
const Register empty = 0; // will never be used; kept only so that the signature
// of crc32c_ipl_alg2_alt2 stays the same
// between the 64-bit and 32-bit versions
assert_different_registers(crc, buf, len, d, g, h);
BLOCK_COMMENT("Entry:");
__ enter(); // required for proper stackwalking of RuntimeStub frame
Address crc_arg(rsp, 4 + 4 + 0); // ESP + 4 (return address) plus
// an additional 4 because __ enter
// has just pushed ebp onto the stack
Address buf_arg(rsp, 4 + 4 + 4);
Address len_arg(rsp, 4 + 4 + 8);
// Load up:
__ movl(crc, crc_arg);
__ movl(buf, buf_arg);
__ movl(len, len_arg);
__ push(d);
__ push(g);
__ push(h);
__ crc32c_ipl_alg2_alt2(crc, buf, len,
d, g, h,
empty, empty, empty,
xmm0, xmm1, xmm2,
is_pclmulqdq_supported);
__ pop(h);
__ pop(g);
__ pop(d);
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
return start;
}
// Safefetch stubs.
void generate_safefetch(const char* name, int size, address* entry,
address* fault_pc, address* continuation_pc) {
@ -3204,6 +3261,13 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
}
if (UseCRC32CIntrinsics) {
bool supports_clmul = VM_Version::supports_clmul();
StubRoutines::x86::generate_CRC32C_table(supports_clmul);
StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
}
}

View File

@ -3958,6 +3958,64 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
/**
* Arguments:
*
* Inputs:
* c_rarg0 - int crc
* c_rarg1 - byte* buf
* c_rarg2 - long length
* c_rarg3 - table_start - optional (present only when doing a library_call,
* not used by x86 algorithm)
*
* Output:
* rax - int crc result
*/
address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
assert(UseCRC32CIntrinsics, "need SSE4_2");
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
address start = __ pc();
//reg.arg int#0 int#1 int#2 int#3 int#4 int#5 float regs
//Windows RCX RDX R8 R9 none none XMM0..XMM3
//Lin / Sol RDI RSI RDX RCX R8 R9 XMM0..XMM7
const Register crc = c_rarg0; // crc
const Register buf = c_rarg1; // source java byte array address
const Register len = c_rarg2; // length
const Register a = rax;
const Register j = r9;
const Register k = r10;
const Register l = r11;
#ifdef _WIN64
const Register y = rdi;
const Register z = rsi;
#else
const Register y = rcx;
const Register z = r8;
#endif
assert_different_registers(crc, buf, len, a, j, k, l, y, z);
BLOCK_COMMENT("Entry:");
__ enter(); // required for proper stackwalking of RuntimeStub frame
#ifdef _WIN64
__ push(y);
__ push(z);
#endif
__ crc32c_ipl_alg2_alt2(crc, buf, len,
a, j, k,
l, y, z,
c_farg0, c_farg1, c_farg2,
is_pclmulqdq_supported);
__ movl(rax, crc);
#ifdef _WIN64
__ pop(z);
__ pop(y);
#endif
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
return start;
}
/**
* Arguments:
@ -4302,6 +4360,13 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
}
if (UseCRC32CIntrinsics) {
bool supports_clmul = VM_Version::supports_clmul();
StubRoutines::x86::generate_CRC32C_table(supports_clmul);
StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
}
}
void generate_all() {

View File

@ -27,6 +27,7 @@
#include "runtime/frame.inline.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "crc32c.h"
// Implementation of the platform-specific part of StubRoutines - for
// a description of how to extend it, see the stubRoutines.hpp file.
@ -130,3 +131,107 @@ juint StubRoutines::x86::_crc_table[] =
0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL,
0x2d02ef8dUL
};
#define D 32
#define P 0x82F63B78 // Reflection of Castagnoli (0x11EDC6F41)
#define TILL_CYCLE 31
uint32_t _crc32c_pow_2k_table[TILL_CYCLE]; // because _crc32c_pow_2k_table[TILL_CYCLE == 31] == _crc32c_pow_2k_table[0]
// A. Kadatch and B. Jenkins / Everything we know about CRC but afraid to forget September 3, 2010 8
// Listing 1: Multiplication of normalized polynomials
// "a" and "b" occupy D least significant bits.
uint32_t crc32c_multiply(uint32_t a, uint32_t b) {
uint32_t product = 0;
uint32_t b_pow_x_table[D + 1]; // b_pow_x_table[k] = (b * x**k) mod P
b_pow_x_table[0] = b;
for (int k = 0; k < D; ++k) {
// If "a" has non-zero coefficient at x**k,/ add ((b * x**k) mod P) to the result.
if ((a & (uint64_t)(1 << (D - 1 - k))) != 0) product ^= b_pow_x_table[k];
// Compute b_pow_x_table[k+1] = (b ** x**(k+1)) mod P.
if (b_pow_x_table[k] & 1) {
// If degree of (b_pow_x_table[k] * x) is D, then
// degree of (b_pow_x_table[k] * x - P) is less than D.
b_pow_x_table[k + 1] = (b_pow_x_table[k] >> 1) ^ P;
}
else {
b_pow_x_table[k + 1] = b_pow_x_table[k] >> 1;
}
}
return product;
}
#undef D
#undef P
// A. Kadatch and B. Jenkins / Everything we know about CRC but afraid to forget September 3, 2010 9
void crc32c_init_pow_2k(void) {
// _crc32c_pow_2k_table[0] =
// x^(2^0) mod P(x) = x mod P(x) = x
// Since we are operating on reflected values
// x = 10b, reflect(x) = 0x40000000
_crc32c_pow_2k_table[0] = 0x40000000;
for (int k = 1; k < TILL_CYCLE; k++) {
// _crc32c_pow_2k_table[k] = _crc32c_pow_2k_table[k-1]^2 mod P(x)
uint32_t tmp = _crc32c_pow_2k_table[k - 1];
_crc32c_pow_2k_table[k] = crc32c_multiply(tmp, tmp);
}
}
// x^N mod P(x)
uint32_t crc32c_f_pow_n(uint32_t n) {
// result = 1 (polynomial)
uint32_t one, result = 0x80000000, i = 0;
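// Iterate over the bits of n; for each set bit multiply the result by x^(2^i) mod P(x),
// i.e. square-and-multiply using the precomputed _crc32c_pow_2k_table.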
while (one = (n & 1), (n == 1 || n - one > 0)) {
if (one) {
result = crc32c_multiply(result, _crc32c_pow_2k_table[i]);
}
n >>= 1;
i++;
}
return result;
}
juint *StubRoutines::x86::_crc32c_table;
void StubRoutines::x86::generate_CRC32C_table(bool is_pclmulqdq_table_supported) {
static juint pow_n[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
crc32c_init_pow_2k();
pow_n[0] = crc32c_f_pow_n(CRC32C_HIGH * 8); // 8N * 8 = 64N
pow_n[1] = crc32c_f_pow_n(CRC32C_HIGH * 8 * 2); // 128N
pow_n[2] = crc32c_f_pow_n(CRC32C_MIDDLE * 8);
pow_n[3] = crc32c_f_pow_n(CRC32C_MIDDLE * 8 * 2);
pow_n[4] = crc32c_f_pow_n(CRC32C_LOW * 8);
pow_n[CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1] =
crc32c_f_pow_n(CRC32C_LOW * 8 * 2);
if (is_pclmulqdq_table_supported) {
_crc32c_table = pow_n;
} else {
static julong pclmulqdq_table[CRC32C_NUM_PRECOMPUTED_CONSTANTS * 256];
for (int j = 0; j < CRC32C_NUM_PRECOMPUTED_CONSTANTS; j++) {
juint X_CONST = pow_n[j];
for (int64_t i = 0; i < 256; i++) { // to force 64 bit wide computations
// S. Gueron / Information Processing Letters 112 (2012) 184
// Algorithm 3: Generating a carry-less multiplication lookup table.
// Input: A 32-bit constant, X_CONST.
// Output: A table of 256 entries, each one is a 64-bit quadword,
// that can be used for computing "byte" * X_CONST, for a given byte.
pclmulqdq_table[j * 256 + i] =
((i & 1) * X_CONST) ^ ((i & 2) * X_CONST) ^ ((i & 4) * X_CONST) ^
((i & 8) * X_CONST) ^ ((i & 16) * X_CONST) ^ ((i & 32) * X_CONST) ^
((i & 64) * X_CONST) ^ ((i & 128) * X_CONST);
}
}
_crc32c_table = (juint*)pclmulqdq_table;
}
}
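A worked example of the Algorithm 3 formula above (editorial): for X_CONST = 0x5 and i = 3 the entry is (1 * 0x5) ^ (2 * 0x5) = 0x5 ^ 0xA = 0xF, which is exactly the carry-less product of 3 and 0x5; so at run time pclmulqdq_table[j * 256 + b] yields "byte b" * X_CONST, as consumed by crc32c_ipl_alg4 in macroAssembler_x86.cpp.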

View File

@ -36,6 +36,8 @@
// masks and table for CRC32
static uint64_t _crc_by128_masks[];
static juint _crc_table[];
// table for CRC32C
static juint* _crc32c_table;
// swap mask for ghash
static address _ghash_long_swap_mask_addr;
static address _ghash_byte_swap_mask_addr;
@ -46,5 +48,6 @@
static address crc_by128_masks_addr() { return (address)_crc_by128_masks; }
static address ghash_long_swap_mask_addr() { return _ghash_long_swap_mask_addr; }
static address ghash_byte_swap_mask_addr() { return _ghash_byte_swap_mask_addr; }
static void generate_CRC32C_table(bool is_pclmulqdq_supported);
#endif // CPU_X86_VM_STUBROUTINES_X86_32_HPP

View File

@ -790,18 +790,25 @@ address InterpreterGenerator::generate_CRC32_updateBytes_entry(AbstractInterpret
const Register buf = rdx; // source java byte array address
const Register len = rdi; // length
// value x86_32
// interp. arg ptr ESP + 4
// int java.util.zip.CRC32.updateBytes(int crc, byte[] b, int off, int len)
// 3 2 1 0
// int java.util.zip.CRC32.updateByteBuffer(int crc, long buf, int off, int len)
// 4 2,3 1 0
// Arguments are reversed on java expression stack
__ movl(len, Address(rsp, wordSize)); // Length
__ movl(len, Address(rsp, 4 + 0)); // Length
// Calculate address of start element
if (kind == Interpreter::java_util_zip_CRC32_updateByteBuffer) {
__ movptr(buf, Address(rsp, 3*wordSize)); // long buf
__ addptr(buf, Address(rsp, 2*wordSize)); // + offset
__ movl(crc, Address(rsp, 5*wordSize)); // Initial CRC
__ movptr(buf, Address(rsp, 4 + 2 * wordSize)); // long buf
__ addptr(buf, Address(rsp, 4 + 1 * wordSize)); // + offset
__ movl(crc, Address(rsp, 4 + 4 * wordSize)); // Initial CRC
} else {
__ movptr(buf, Address(rsp, 3*wordSize)); // byte[] array
__ movptr(buf, Address(rsp, 4 + 2 * wordSize)); // byte[] array
__ addptr(buf, arrayOopDesc::base_offset_in_bytes(T_BYTE)); // + header size
__ addptr(buf, Address(rsp, 2*wordSize)); // + offset
__ movl(crc, Address(rsp, 4*wordSize)); // Initial CRC
__ addptr(buf, Address(rsp, 4 + 1 * wordSize)); // + offset
__ movl(crc, Address(rsp, 4 + 3 * wordSize)); // Initial CRC
}
__ super_call_VM_leaf(CAST_FROM_FN_PTR(address, StubRoutines::updateBytesCRC32()), crc, buf, len);
@ -822,6 +829,53 @@ address InterpreterGenerator::generate_CRC32_updateBytes_entry(AbstractInterpret
return generate_native_entry(false);
}
/**
* Method entry for static native methods:
* int java.util.zip.CRC32C.updateBytes(int crc, byte[] b, int off, int end)
* int java.util.zip.CRC32C.updateByteBuffer(int crc, long address, int off, int end)
*/
address InterpreterGenerator::generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) {
if (UseCRC32CIntrinsics) {
address entry = __ pc();
// Load parameters
const Register crc = rax; // crc
const Register buf = rcx; // source java byte array address
const Register len = rdx; // length
const Register end = len;
// value x86_32
// interp. arg ptr ESP + 4
// int java.util.zip.CRC32C.updateBytes(int crc, byte[] b, int off, int end)
// 3 2 1 0
// int java.util.zip.CRC32C.updateByteBuffer(int crc, long address, int off, int end)
// 4 2,3 1 0
// Arguments are reversed on java expression stack
__ movl(end, Address(rsp, 4 + 0)); // end
__ subl(len, Address(rsp, 4 + 1 * wordSize)); // end - offset == length
// Calculate address of start element
if (kind == Interpreter::java_util_zip_CRC32C_updateDirectByteBuffer) {
__ movptr(buf, Address(rsp, 4 + 2 * wordSize)); // long address
__ addptr(buf, Address(rsp, 4 + 1 * wordSize)); // + offset
__ movl(crc, Address(rsp, 4 + 4 * wordSize)); // Initial CRC
} else {
__ movptr(buf, Address(rsp, 4 + 2 * wordSize)); // byte[] array
__ addptr(buf, arrayOopDesc::base_offset_in_bytes(T_BYTE)); // + header size
__ addptr(buf, Address(rsp, 4 + 1 * wordSize)); // + offset
__ movl(crc, Address(rsp, 4 + 3 * wordSize)); // Initial CRC
}
__ super_call_VM_leaf(CAST_FROM_FN_PTR(address, StubRoutines::updateBytesCRC32C()), crc, buf, len);
// result in rax
// _areturn
__ pop(rdi); // get return address
__ mov(rsp, rsi); // set sp to sender sp
__ jmp(rdi);
return entry;
}
return generate_native_entry(false);
}
/**
* Method entry for static native method:
* java.lang.Float.intBitsToFloat(int bits)

View File

@ -804,6 +804,57 @@ address InterpreterGenerator::generate_CRC32_updateBytes_entry(AbstractInterpret
return generate_native_entry(false);
}
/**
* Method entry for static native methods:
* int java.util.zip.CRC32C.updateBytes(int crc, byte[] b, int off, int end)
* int java.util.zip.CRC32C.updateByteBuffer(int crc, long address, int off, int end)
*/
address InterpreterGenerator::generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) {
if (UseCRC32CIntrinsics) {
address entry = __ pc();
// Load parameters
const Register crc = c_rarg0; // crc
const Register buf = c_rarg1; // source java byte array address
const Register len = c_rarg2;
const Register off = c_rarg3; // offset
const Register end = len;
// Arguments are reversed on java expression stack
// Calculate address of start element
if (kind == Interpreter::java_util_zip_CRC32C_updateDirectByteBuffer) {
__ movptr(buf, Address(rsp, 3 * wordSize)); // long buf
__ movl2ptr(off, Address(rsp, 2 * wordSize)); // offset
__ addq(buf, off); // + offset
__ movl(crc, Address(rsp, 5 * wordSize)); // Initial CRC
// Note on 5 * wordSize vs. 4 * wordSize:
// * int java.util.zip.CRC32C.updateByteBuffer(int crc, long address, int off, int end)
// 4 2,3 1 0
// end starts at SP + 8
// The Java(R) Virtual Machine Specification Java SE 7 Edition
// 4.10.2.3. Values of Types long and double
// "When calculating operand stack length, values of type long and double have length two."
} else {
__ movptr(buf, Address(rsp, 3 * wordSize)); // byte[] array
__ addptr(buf, arrayOopDesc::base_offset_in_bytes(T_BYTE)); // + header size
__ movl2ptr(off, Address(rsp, 2 * wordSize)); // offset
__ addq(buf, off); // + offset
__ movl(crc, Address(rsp, 4 * wordSize)); // Initial CRC
}
__ movl(end, Address(rsp, wordSize)); // end
__ subl(end, off); // end - off
__ super_call_VM_leaf(CAST_FROM_FN_PTR(address, StubRoutines::updateBytesCRC32C()), crc, buf, len);
// result in rax
// _areturn
__ pop(rdi); // get return address
__ mov(rsp, r13); // set sp to sender sp
__ jmp(rdi);
return entry;
}
return generate_native_entry(false);
}
// Interpreter stub for calling a native method. (asm interpreter)
// This sets up a somewhat different looking stack for calling the
// native method than the typical interpreter frame setup.

View File

@ -661,6 +661,18 @@ void VM_Version::get_processor_features() {
FLAG_SET_DEFAULT(UseCRC32Intrinsics, false);
}
if (supports_sse4_2()) {
if (FLAG_IS_DEFAULT(UseCRC32CIntrinsics)) {
UseCRC32CIntrinsics = true;
}
}
else if (UseCRC32CIntrinsics) {
if (!FLAG_IS_DEFAULT(UseCRC32CIntrinsics)) {
warning("CRC32C intrinsics are not available on this CPU");
}
FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
}
// The AES intrinsic stubs require AES instruction support (of course)
// but also require sse3 mode for instructions it use.
if (UseAES && (UseSSE > 2)) {
@ -704,12 +716,6 @@ void VM_Version::get_processor_features() {
FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
}
if (UseCRC32CIntrinsics) {
if (!FLAG_IS_DEFAULT(UseCRC32CIntrinsics))
warning("CRC32C intrinsics are not available on this CPU");
FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
}
if (UseAdler32Intrinsics) {
warning("Adler32Intrinsics not available on this CPU.");
FLAG_SET_DEFAULT(UseAdler32Intrinsics, false);

View File

@ -42,4 +42,5 @@
// Not supported
address generate_CRC32_update_entry() { return NULL; }
address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; }
address generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; }
#endif // CPU_ZERO_VM_INTERPRETERGENERATOR_ZERO_HPP

View File

@ -90,6 +90,8 @@ class AbstractInterpreter: AllStatic {
java_util_zip_CRC32_update, // implementation of java.util.zip.CRC32.update()
java_util_zip_CRC32_updateBytes, // implementation of java.util.zip.CRC32.updateBytes()
java_util_zip_CRC32_updateByteBuffer, // implementation of java.util.zip.CRC32.updateByteBuffer()
java_util_zip_CRC32C_updateBytes, // implementation of java.util.zip.CRC32C.updateBytes(crc, b[], off, end)
java_util_zip_CRC32C_updateDirectByteBuffer, // implementation of java.util.zip.CRC32C.updateDirectByteBuffer(crc, address, off, end)
java_lang_Float_intBitsToFloat, // implementation of java.lang.Float.intBitsToFloat()
java_lang_Float_floatToRawIntBits, // implementation of java.lang.Float.floatToRawIntBits()
java_lang_Double_longBitsToDouble, // implementation of java.lang.Double.longBitsToDouble()

View File

@ -234,6 +234,13 @@ AbstractInterpreter::MethodKind AbstractInterpreter::method_kind(methodHandle m)
case vmIntrinsics::_updateByteBufferCRC32 : return java_util_zip_CRC32_updateByteBuffer;
}
}
if (UseCRC32CIntrinsics) {
// Use optimized stub code for CRC32C methods.
switch (m->intrinsic_id()) {
case vmIntrinsics::_updateBytesCRC32C : return java_util_zip_CRC32C_updateBytes;
case vmIntrinsics::_updateDirectByteBufferCRC32C : return java_util_zip_CRC32C_updateDirectByteBuffer;
}
}
switch(m->intrinsic_id()) {
case vmIntrinsics::_intBitsToFloat: return java_lang_Float_intBitsToFloat;
@ -349,6 +356,8 @@ void AbstractInterpreter::print_method_kind(MethodKind kind) {
case java_util_zip_CRC32_update : tty->print("java_util_zip_CRC32_update"); break;
case java_util_zip_CRC32_updateBytes : tty->print("java_util_zip_CRC32_updateBytes"); break;
case java_util_zip_CRC32_updateByteBuffer : tty->print("java_util_zip_CRC32_updateByteBuffer"); break;
case java_util_zip_CRC32C_updateBytes : tty->print("java_util_zip_CRC32C_updateBytes"); break;
case java_util_zip_CRC32C_updateDirectByteBuffer: tty->print("java_util_zip_CRC32C_updateDirectByteByffer"); break;
default:
if (kind >= method_handle_invoke_FIRST &&
kind <= method_handle_invoke_LAST) {
@ -567,6 +576,10 @@ address InterpreterGenerator::generate_method_entry(
: // fall thru
case Interpreter::java_util_zip_CRC32_updateByteBuffer
: entry_point = generate_CRC32_updateBytes_entry(kind); break;
case Interpreter::java_util_zip_CRC32C_updateBytes
: // fall thru
case Interpreter::java_util_zip_CRC32C_updateDirectByteBuffer
: entry_point = generate_CRC32C_updateBytes_entry(kind); break;
#if defined(TARGET_ARCH_x86) && !defined(_LP64)
// On x86_32 platforms, a special entry is generated for the following four methods.
// On other platforms the normal entry is used to enter these methods.
@ -582,9 +595,9 @@ address InterpreterGenerator::generate_method_entry(
case Interpreter::java_lang_Float_intBitsToFloat:
case Interpreter::java_lang_Float_floatToRawIntBits:
case Interpreter::java_lang_Double_longBitsToDouble:
case Interpreter::java_lang_Double_doubleToRawLongBits:
entry_point = generate_native_entry(false);
break;
#endif // defined(TARGET_ARCH_x86) && !defined(_LP64)
#endif // CC_INTERP
default:

View File

@ -418,6 +418,11 @@ void TemplateInterpreterGenerator::generate_all() {
method_entry(java_util_zip_CRC32_updateByteBuffer)
}
if (UseCRC32CIntrinsics) {
method_entry(java_util_zip_CRC32C_updateBytes)
method_entry(java_util_zip_CRC32C_updateDirectByteBuffer)
}
method_entry(java_lang_Float_intBitsToFloat);
method_entry(java_lang_Float_floatToRawIntBits);
method_entry(java_lang_Double_longBitsToDouble);

View File

@ -136,8 +136,9 @@ address StubRoutines::_sha512_implCompress = NULL;
address StubRoutines::_sha512_implCompressMB = NULL;
address StubRoutines::_updateBytesCRC32 = NULL;
address StubRoutines::_crc_table_adr = NULL;
address StubRoutines::_crc32c_table_addr = NULL;
address StubRoutines::_updateBytesCRC32C = NULL;
address StubRoutines::_updateBytesAdler32 = NULL;

View File

@ -197,6 +197,7 @@ class StubRoutines: AllStatic {
static address _updateBytesCRC32;
static address _crc_table_adr;
static address _crc32c_table_addr;
static address _updateBytesCRC32C;
static address _updateBytesAdler32;
@ -364,6 +365,7 @@ class StubRoutines: AllStatic {
static address updateBytesCRC32() { return _updateBytesCRC32; }
static address crc_table_addr() { return _crc_table_adr; }
static address crc32c_table_addr() { return _crc32c_table_addr; }
static address updateBytesCRC32C() { return _updateBytesCRC32C; }
static address updateBytesAdler32() { return _updateBytesAdler32; }

View File

@ -832,6 +832,7 @@ typedef CompactHashtable<Symbol*, char> SymbolCompactHashTable;
static_field(StubRoutines, _ghash_processBlocks, address) \
static_field(StubRoutines, _updateBytesCRC32, address) \
static_field(StubRoutines, _crc_table_adr, address) \
static_field(StubRoutines, _crc32c_table_addr, address) \
static_field(StubRoutines, _updateBytesCRC32C, address) \
static_field(StubRoutines, _multiplyToLen, address) \
static_field(StubRoutines, _squareToLen, address) \