8073583: C2 support for CRC32C on SPARC

Reviewed-by: jrose, kvn
2015-06-29 00:10:01 -07:00 · 2015-06-29 00:10:01 -07:00 · e2533553f6
commit e2533553f6
parent 4a826139e3
21 changed files with 746 additions and 19 deletions
--- a/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp
+++ b/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp
@ -199,6 +199,12 @@ void VM_Version::get_processor_features() {
    UseCRC32Intrinsics = true;
  }

+  if (UseCRC32CIntrinsics) {
+    if (!FLAG_IS_DEFAULT(UseCRC32CIntrinsics))
+      warning("CRC32C intrinsics are not available on this CPU");
+    FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
+  }
+
  if (auxv & (HWCAP_SHA1 | HWCAP_SHA2)) {
    if (FLAG_IS_DEFAULT(UseSHA)) {
      FLAG_SET_DEFAULT(UseSHA, true);
--- a/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp
+++ b/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp
@ -191,6 +191,13 @@ void VM_Version::initialize() {
    FLAG_SET_DEFAULT(UseSHA256Intrinsics, false);
    FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
  }
+
+  if (UseCRC32CIntrinsics) {
+    if (!FLAG_IS_DEFAULT(UseCRC32CIntrinsics))
+      warning("CRC32C intrinsics are not available on this CPU");
+    FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
+  }
+
  // Adjust RTM (Restricted Transactional Memory) flags.
  if (!has_tcheck() && UseRTMLocking) {
    // Can't continue because UseRTMLocking affects UseBiasedLocking flag
--- a/hotspot/src/cpu/sparc/vm/assembler_sparc.hpp
+++ b/hotspot/src/cpu/sparc/vm/assembler_sparc.hpp
@ -128,8 +128,11 @@ class Assembler : public AbstractAssembler  {
    faligndata_op3 = 0x36,
    flog3_op3    = 0x36,
    edge_op3     = 0x36,
+    fzero_op3    = 0x36,
    fsrc_op3     = 0x36,
+    fnot_op3     = 0x36,
    xmulx_op3    = 0x36,
+    crc32c_op3   = 0x36,
    impdep2_op3  = 0x37,
    stpartialf_op3 = 0x37,
    jmpl_op3     = 0x38,
@ -231,7 +234,9 @@ class Assembler : public AbstractAssembler  {

    sha1_opf           = 0x141,
    sha256_opf         = 0x142,
-    sha512_opf         = 0x143
+    sha512_opf         = 0x143,
+
+    crc32c_opf         = 0x147
  };

  enum op5s {
@ -600,6 +605,11 @@ class Assembler : public AbstractAssembler  {
    return x & ((1 << 10) - 1);
  }

+  // create a low12 __value__ (not a field) for a given a 32-bit constant
+  static int low12( int x ) {
+    return x & ((1 << 12) - 1);
+  }
+
  // AES crypto instructions supported only on certain processors
  static void aes_only() { assert( VM_Version::has_aes(), "This instruction only works on SPARC with AES instructions support"); }

@ -608,6 +618,9 @@ class Assembler : public AbstractAssembler  {
  static void sha256_only() { assert( VM_Version::has_sha256(), "This instruction only works on SPARC with SHA256"); }
  static void sha512_only() { assert( VM_Version::has_sha512(), "This instruction only works on SPARC with SHA512"); }

+  // CRC32C instruction supported only on certain processors
+  static void crc32c_only() { assert( VM_Version::has_crc32c(), "This instruction only works on SPARC with CRC32C"); }
+
  // instruction only in VIS1
  static void vis1_only() { assert( VM_Version::has_vis1(), "This instruction only works on SPARC with VIS1"); }

@ -1022,6 +1035,7 @@ public:

  void nop() { emit_int32( op(branch_op) | op2(sethi_op2) ); }

+  void sw_count() { emit_int32( op(branch_op) | op2(sethi_op2) | 0x3f0 ); }

  // pp 202

@ -1198,8 +1212,14 @@ public:

  void faligndata( FloatRegister s1, FloatRegister s2, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(faligndata_op3) | fs1(s1, FloatRegisterImpl::D) | opf(faligndata_opf) | fs2(s2, FloatRegisterImpl::D)); }

+  void fzero( FloatRegisterImpl::Width w, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, w) | op3(fzero_op3) | opf(0x62 - w)); }
+
  void fsrc2( FloatRegisterImpl::Width w, FloatRegister s2, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, w) | op3(fsrc_op3) | opf(0x7A - w) | fs2(s2, w)); }

+  void fnot1( FloatRegisterImpl::Width w, FloatRegister s1, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, w) | op3(fnot_op3) | fs1(s1, w) | opf(0x6C - w)); }
+
+  void fpmerge( FloatRegister s1, FloatRegister s2, FloatRegister d ) { vis1_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(0x36) | fs1(s1, FloatRegisterImpl::S) | opf(0x4b) | fs2(s2, FloatRegisterImpl::S)); }
+
  void stpartialf( Register s1, Register s2, FloatRegister d, int ia = -1 ) { vis1_only(); emit_int32( op(ldst_op) | fd(d, FloatRegisterImpl::D) | op3(stpartialf_op3) | rs1(s1) | imm_asi(ia) | rs2(s2)); }

  //  VIS2 instructions
@ -1224,6 +1244,10 @@ public:
  void sha256() { sha256_only();  emit_int32( op(arith_op) | op3(sha_op3) | opf(sha256_opf)); }
  void sha512() { sha512_only();  emit_int32( op(arith_op) | op3(sha_op3) | opf(sha512_opf)); }

+  // CRC32C instruction
+
+  void crc32c( FloatRegister s1, FloatRegister s2, FloatRegister d ) { crc32c_only();  emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(crc32c_op3) | fs1(s1, FloatRegisterImpl::D) | opf(crc32c_opf) | fs2(s2, FloatRegisterImpl::D)); }
+
  // Creation
  Assembler(CodeBuffer* code) : AbstractAssembler(code) {
 #ifdef CHECK_DELAY
--- a/hotspot/src/cpu/sparc/vm/macroAssembler_sparc.cpp
+++ b/hotspot/src/cpu/sparc/vm/macroAssembler_sparc.cpp
@ -956,6 +956,7 @@ void MacroAssembler::set64(jlong value, Register d, Register tmp) {

  int hi = (int)(value >> 32);
  int lo = (int)(value & ~0);
+  int bits_33to2 = (int)((value >> 2) & ~0);
  // (Matcher::isSimpleConstant64 knows about the following optimizations.)
  if (Assembler::is_simm13(lo) && value == lo) {
    or3(G0, lo, d);
@ -964,6 +965,12 @@ void MacroAssembler::set64(jlong value, Register d, Register tmp) {
    if (low10(lo) != 0)
      or3(d, low10(lo), d);
  }
+  else if ((hi >> 2) == 0) {
+    Assembler::sethi(bits_33to2, d);  // hardware version zero-extends to upper 32
+    sllx(d, 2, d);
+    if (low12(lo) != 0)
+      or3(d, low12(lo), d);
+  }
  else if (hi == -1) {
    Assembler::sethi(~lo, d);  // hardware version zero-extends to upper 32
    xor3(d, low10(lo) ^ ~low10(~0), d);
@ -4351,3 +4358,52 @@ void MacroAssembler::bis_zeroing(Register to, Register count, Register temp, Lab
  cmp_and_brx_short(to, end, Assembler::lessUnsigned, Assembler::pt, small_loop);
  nop(); // Separate short branches
 }
+
+/**
+ * Update CRC-32[C] with a byte value according to constants in table
+ *
+ * @param [in,out]crc   Register containing the crc.
+ * @param [in]val       Register containing the byte to fold into the CRC.
+ * @param [in]table     Register containing the table of crc constants.
+ *
+ * uint32_t crc;
+ * val = crc_table[(val ^ crc) & 0xFF];
+ * crc = val ^ (crc >> 8);
+ */
+void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
+  xor3(val, crc, val);
+  and3(val, 0xFF, val);
+  sllx(val, 2, val);
+  lduw(table, val, val);
+  srlx(crc, 8, crc);
+  xor3(val, crc, crc);
+}
+
+// Reverse byte order of lower 32 bits, assuming upper 32 bits all zeros
+void MacroAssembler::reverse_bytes_32(Register src, Register dst, Register tmp) {
+  srlx(src, 24, dst);
+
+  sllx(src, 32+8, tmp);
+  srlx(tmp, 32+24, tmp);
+  sllx(tmp, 8, tmp);
+  or3(dst, tmp, dst);
+
+  sllx(src, 32+16, tmp);
+  srlx(tmp, 32+24, tmp);
+  sllx(tmp, 16, tmp);
+  or3(dst, tmp, dst);
+
+  sllx(src, 32+24, tmp);
+  srlx(tmp, 32, tmp);
+  or3(dst, tmp, dst);
+}
+
+void MacroAssembler::movitof_revbytes(Register src, FloatRegister dst, Register tmp1, Register tmp2) {
+  reverse_bytes_32(src, tmp1, tmp2);
+  movxtod(tmp1, dst);
+}
+
+void MacroAssembler::movftoi_revbytes(FloatRegister src, Register dst, Register tmp1, Register tmp2) {
+  movdtox(src, tmp1);
+  reverse_bytes_32(tmp1, dst, tmp2);
+}
--- a/hotspot/src/cpu/sparc/vm/macroAssembler_sparc.hpp
+++ b/hotspot/src/cpu/sparc/vm/macroAssembler_sparc.hpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -903,6 +903,10 @@ public:
  inline void ldf(FloatRegisterImpl::Width w, Register s1, RegisterOrConstant s2, FloatRegister d);
  inline void ldf(FloatRegisterImpl::Width w, const Address& a, FloatRegister d, int offset = 0);

+  // little-endian
+  inline void ldxl(Register s1, Register s2, Register d) { ldxa(s1, s2, ASI_PRIMARY_LITTLE, d); }
+  inline void ldfl(FloatRegisterImpl::Width w, Register s1, Register s2, FloatRegister d) { ldfa(w, s1, s2, ASI_PRIMARY_LITTLE, d); }
+
  // membar psuedo instruction.  takes into account target memory model.
  inline void membar( Assembler::Membar_mask_bits const7a );

@ -1436,6 +1440,14 @@ public:
  // Use BIS for zeroing
  void bis_zeroing(Register to, Register count, Register temp, Label& Ldone);

+  // Update CRC-32[C] with a byte value according to constants in table
+  void update_byte_crc32(Register crc, Register val, Register table);
+
+  // Reverse byte order of lower 32 bits, assuming upper 32 bits all zeros
+  void reverse_bytes_32(Register src, Register dst, Register tmp);
+  void movitof_revbytes(Register src, FloatRegister dst, Register tmp1, Register tmp2);
+  void movftoi_revbytes(FloatRegister src, Register dst, Register tmp1, Register tmp2);
+
 #undef VIRTUAL
 };

--- a/hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp
+++ b/hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp
@ -4910,6 +4910,206 @@ class StubGenerator: public StubCodeGenerator {
      return start;
  }

+#define CHUNK_LEN   128          /* 128 x 8B = 1KB */
+#define CHUNK_K1    0x1307a0206  /* reverseBits(pow(x, CHUNK_LEN*8*8*3 - 32) mod P(x)) << 1 */
+#define CHUNK_K2    0x1a0f717c4  /* reverseBits(pow(x, CHUNK_LEN*8*8*2 - 32) mod P(x)) << 1 */
+#define CHUNK_K3    0x0170076fa  /* reverseBits(pow(x, CHUNK_LEN*8*8*1 - 32) mod P(x)) << 1 */
+
+  /**
+   *  Arguments:
+   *
+   * Inputs:
+   *   O0   - int   crc
+   *   O1   - byte* buf
+   *   O2   - int   len
+   *   O3   - int*  table
+   *
+   * Output:
+   *   O0   - int crc result
+   */
+  address generate_updateBytesCRC32C() {
+    assert(UseCRC32CIntrinsics, "need CRC32C instruction");
+
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
+    address start = __ pc();
+
+    const Register crc   = O0;  // crc
+    const Register buf   = O1;  // source java byte array address
+    const Register len   = O2;  // number of bytes
+    const Register table = O3;  // byteTable
+
+    Label L_crc32c_head, L_crc32c_aligned;
+    Label L_crc32c_parallel, L_crc32c_parallel_loop;
+    Label L_crc32c_serial, L_crc32c_x32_loop, L_crc32c_x8, L_crc32c_x8_loop;
+    Label L_crc32c_done, L_crc32c_tail, L_crc32c_return;
+
+    __ cmp_and_br_short(len, 0, Assembler::lessEqual, Assembler::pn, L_crc32c_return);
+
+    // clear upper 32 bits of crc
+    __ clruwu(crc);
+
+    __ and3(buf, 7, G4);
+    __ cmp_and_brx_short(G4, 0, Assembler::equal, Assembler::pt, L_crc32c_aligned);
+
+    __ mov(8, G1);
+    __ sub(G1, G4, G4);
+
+    // ------ process the misaligned head (7 bytes or less) ------
+    __ BIND(L_crc32c_head);
+
+    // crc = (crc >>> 8) ^ byteTable[(crc ^ b) & 0xFF];
+    __ ldub(buf, 0, G1);
+    __ update_byte_crc32(crc, G1, table);
+
+    __ inc(buf);
+    __ dec(len);
+    __ cmp_and_br_short(len, 0, Assembler::equal, Assembler::pn, L_crc32c_return);
+    __ dec(G4);
+    __ cmp_and_br_short(G4, 0, Assembler::greater, Assembler::pt, L_crc32c_head);
+
+    // ------ process the 8-byte-aligned body ------
+    __ BIND(L_crc32c_aligned);
+    __ nop();
+    __ cmp_and_br_short(len, 8, Assembler::less, Assembler::pn, L_crc32c_tail);
+
+    // reverse the byte order of lower 32 bits to big endian, and move to FP side
+    __ movitof_revbytes(crc, F0, G1, G3);
+
+    __ set(CHUNK_LEN*8*4, G4);
+    __ cmp_and_br_short(len, G4, Assembler::less, Assembler::pt, L_crc32c_serial);
+
+    // ------ process four 1KB chunks in parallel ------
+    __ BIND(L_crc32c_parallel);
+
+    __ fzero(FloatRegisterImpl::D, F2);
+    __ fzero(FloatRegisterImpl::D, F4);
+    __ fzero(FloatRegisterImpl::D, F6);
+
+    __ mov(CHUNK_LEN - 1, G4);
+    __ BIND(L_crc32c_parallel_loop);
+    // schedule ldf's ahead of crc32c's to hide the load-use latency
+    __ ldf(FloatRegisterImpl::D, buf, 0,            F8);
+    __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*8,  F10);
+    __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*16, F12);
+    __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*24, F14);
+    __ crc32c(F0, F8,  F0);
+    __ crc32c(F2, F10, F2);
+    __ crc32c(F4, F12, F4);
+    __ crc32c(F6, F14, F6);
+    __ inc(buf, 8);
+    __ dec(G4);
+    __ cmp_and_br_short(G4, 0, Assembler::greater, Assembler::pt, L_crc32c_parallel_loop);
+
+    __ ldf(FloatRegisterImpl::D, buf, 0,            F8);
+    __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*8,  F10);
+    __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*16, F12);
+    __ crc32c(F0, F8,  F0);
+    __ crc32c(F2, F10, F2);
+    __ crc32c(F4, F12, F4);
+
+    __ inc(buf, CHUNK_LEN*24);
+    __ ldfl(FloatRegisterImpl::D, buf, G0, F14);  // load in little endian
+    __ inc(buf, 8);
+
+    __ prefetch(buf, 0,            Assembler::severalReads);
+    __ prefetch(buf, CHUNK_LEN*8,  Assembler::severalReads);
+    __ prefetch(buf, CHUNK_LEN*16, Assembler::severalReads);
+    __ prefetch(buf, CHUNK_LEN*24, Assembler::severalReads);
+
+    // move to INT side, and reverse the byte order of lower 32 bits to little endian
+    __ movftoi_revbytes(F0, O4, G1, G4);
+    __ movftoi_revbytes(F2, O5, G1, G4);
+    __ movftoi_revbytes(F4, G5, G1, G4);
+
+    // combine the results of 4 chunks
+    __ set64(CHUNK_K1, G3, G1);
+    __ xmulx(O4, G3, O4);
+    __ set64(CHUNK_K2, G3, G1);
+    __ xmulx(O5, G3, O5);
+    __ set64(CHUNK_K3, G3, G1);
+    __ xmulx(G5, G3, G5);
+
+    __ movdtox(F14, G4);
+    __ xor3(O4, O5, O5);
+    __ xor3(G5, O5, O5);
+    __ xor3(G4, O5, O5);
+
+    // reverse the byte order to big endian, via stack, and move to FP side
+    __ add(SP, -8, G1);
+    __ srlx(G1, 3, G1);
+    __ sllx(G1, 3, G1);
+    __ stx(O5, G1, G0);
+    __ ldfl(FloatRegisterImpl::D, G1, G0, F2);  // load in little endian
+
+    __ crc32c(F6, F2, F0);
+
+    __ set(CHUNK_LEN*8*4, G4);
+    __ sub(len, G4, len);
+    __ cmp_and_br_short(len, G4, Assembler::greaterEqual, Assembler::pt, L_crc32c_parallel);
+    __ nop();
+    __ cmp_and_br_short(len, 0, Assembler::equal, Assembler::pt, L_crc32c_done);
+
+    __ BIND(L_crc32c_serial);
+
+    __ mov(32, G4);
+    __ cmp_and_br_short(len, G4, Assembler::less, Assembler::pn, L_crc32c_x8);
+
+    // ------ process 32B chunks ------
+    __ BIND(L_crc32c_x32_loop);
+    __ ldf(FloatRegisterImpl::D, buf, 0, F2);
+    __ inc(buf, 8);
+    __ crc32c(F0, F2, F0);
+    __ ldf(FloatRegisterImpl::D, buf, 0, F2);
+    __ inc(buf, 8);
+    __ crc32c(F0, F2, F0);
+    __ ldf(FloatRegisterImpl::D, buf, 0, F2);
+    __ inc(buf, 8);
+    __ crc32c(F0, F2, F0);
+    __ ldf(FloatRegisterImpl::D, buf, 0, F2);
+    __ inc(buf, 8);
+    __ crc32c(F0, F2, F0);
+    __ dec(len, 32);
+    __ cmp_and_br_short(len, G4, Assembler::greaterEqual, Assembler::pt, L_crc32c_x32_loop);
+
+    __ BIND(L_crc32c_x8);
+    __ nop();
+    __ cmp_and_br_short(len, 8, Assembler::less, Assembler::pt, L_crc32c_done);
+
+    // ------ process 8B chunks ------
+    __ BIND(L_crc32c_x8_loop);
+    __ ldf(FloatRegisterImpl::D, buf, 0, F2);
+    __ inc(buf, 8);
+    __ crc32c(F0, F2, F0);
+    __ dec(len, 8);
+    __ cmp_and_br_short(len, 8, Assembler::greaterEqual, Assembler::pt, L_crc32c_x8_loop);
+
+    __ BIND(L_crc32c_done);
+
+    // move to INT side, and reverse the byte order of lower 32 bits to little endian
+    __ movftoi_revbytes(F0, crc, G1, G3);
+
+    __ cmp_and_br_short(len, 0, Assembler::equal, Assembler::pt, L_crc32c_return);
+
+    // ------ process the misaligned tail (7 bytes or less) ------
+    __ BIND(L_crc32c_tail);
+
+    // crc = (crc >>> 8) ^ byteTable[(crc ^ b) & 0xFF];
+    __ ldub(buf, 0, G1);
+    __ update_byte_crc32(crc, G1, table);
+
+    __ inc(buf);
+    __ dec(len);
+    __ cmp_and_br_short(len, 0, Assembler::greater, Assembler::pt, L_crc32c_tail);
+
+    __ BIND(L_crc32c_return);
+    __ nop();
+    __ retl();
+    __ delayed()->nop();
+
+    return start;
+  }
+
  void generate_initial() {
    // Generates all stubs and initializes the entry points

@ -5001,6 +5201,11 @@ class StubGenerator: public StubCodeGenerator {
      StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
    }
+
+    // generate CRC32C intrinsic code
+    if (UseCRC32CIntrinsics) {
+      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
+    }
  }


--- a/hotspot/src/cpu/sparc/vm/stubRoutines_sparc.hpp
+++ b/hotspot/src/cpu/sparc/vm/stubRoutines_sparc.hpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -41,7 +41,7 @@ static bool returns_to_call_stub(address return_pc)   {
 enum /* platform_dependent_constants */ {
  // %%%%%%%% May be able to shrink this a lot
  code_size1 = 20000,           // simply increase if too small (assembler will crash if too small)
-  code_size2 = 23000            // simply increase if too small (assembler will crash if too small)
+  code_size2 = 24000            // simply increase if too small (assembler will crash if too small)
 };

 class Sparc {
--- a/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp
+++ b/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp
@ -230,7 +230,7 @@ void VM_Version::initialize() {
  assert((OptoLoopAlignment % relocInfo::addr_unit()) == 0, "alignment is not a multiple of NOP size");

  char buf[512];
-  jio_snprintf(buf, sizeof(buf), "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
+  jio_snprintf(buf, sizeof(buf), "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
               (has_v9() ? ", v9" : (has_v8() ? ", v8" : "")),
               (has_hardware_popc() ? ", popc" : ""),
               (has_vis1() ? ", vis1" : ""),
@ -242,6 +242,7 @@ void VM_Version::initialize() {
               (has_sha1() ? ", sha1" : ""),
               (has_sha256() ? ", sha256" : ""),
               (has_sha512() ? ", sha512" : ""),
+               (has_crc32c() ? ", crc32c" : ""),
               (is_ultra3() ? ", ultra3" : ""),
               (is_sun4v() ? ", sun4v" : ""),
               (is_niagara_plus() ? ", niagara_plus" : (is_niagara() ? ", niagara" : "")),
@ -363,6 +364,23 @@ void VM_Version::initialize() {
    }
  }

+  // SPARC T4 and above should have support for CRC32C instruction
+  if (has_crc32c()) {
+    if (UseVIS > 2) { // CRC32C intrinsics use VIS3 instructions
+      if (FLAG_IS_DEFAULT(UseCRC32CIntrinsics)) {
+        FLAG_SET_DEFAULT(UseCRC32CIntrinsics, true);
+      }
+    } else {
+      if (UseCRC32CIntrinsics) {
+        warning("SPARC CRC32C intrinsics require VIS3 instruction support. Intrinsics will be disabled.");
+        FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
+      }
+    }
+  } else if (UseCRC32CIntrinsics) {
+    warning("CRC32C instruction is not available on this CPU");
+    FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
+  }
+
  if (FLAG_IS_DEFAULT(ContendedPaddingWidth) &&
    (cache_line_size > ContendedPaddingWidth))
    ContendedPaddingWidth = cache_line_size;
--- a/hotspot/src/cpu/sparc/vm/vm_version_sparc.hpp
+++ b/hotspot/src/cpu/sparc/vm/vm_version_sparc.hpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -53,7 +53,8 @@ protected:
    aes_instructions     = 19,
    sha1_instruction     = 20,
    sha256_instruction   = 21,
-    sha512_instruction   = 22
+    sha512_instruction   = 22,
+    crc32c_instruction   = 23
  };

  enum Feature_Flag_Set {
@ -83,6 +84,7 @@ protected:
    sha1_instruction_m      = 1 << sha1_instruction,
    sha256_instruction_m    = 1 << sha256_instruction,
    sha512_instruction_m    = 1 << sha512_instruction,
+    crc32c_instruction_m    = 1 << crc32c_instruction,

    generic_v8_m        = v8_instructions_m | hardware_mul32_m | hardware_div32_m | hardware_fsmuld_m,
    generic_v9_m        = generic_v8_m | v9_instructions_m,
@ -141,6 +143,7 @@ public:
  static bool has_sha1()                { return (_features & sha1_instruction_m) != 0; }
  static bool has_sha256()              { return (_features & sha256_instruction_m) != 0; }
  static bool has_sha512()              { return (_features & sha512_instruction_m) != 0; }
+  static bool has_crc32c()              { return (_features & crc32c_instruction_m) != 0; }

  static bool supports_compare_and_exchange()
                                        { return has_v9(); }
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp
@ -699,6 +699,12 @@ void VM_Version::get_processor_features() {
    FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
  }

+  if (UseCRC32CIntrinsics) {
+    if (!FLAG_IS_DEFAULT(UseCRC32CIntrinsics))
+      warning("CRC32C intrinsics are not available on this CPU");
+    FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
+  }
+
  // Adjust RTM (Restricted Transactional Memory) flags
  if (!supports_rtm() && UseRTMLocking) {
    // Can't continue because UseRTMLocking affects UseBiasedLocking flag
--- a/hotspot/src/os_cpu/solaris_sparc/vm/vm_version_solaris_sparc.cpp
+++ b/hotspot/src/os_cpu/solaris_sparc/vm/vm_version_solaris_sparc.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2015, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -363,6 +363,11 @@ int VM_Version::platform_features(int features) {
 #endif
  if (av & AV_SPARC_CBCOND)       features |= cbcond_instructions_m;

+#ifndef AV_SPARC_CRC32C
+#define AV_SPARC_CRC32C 0x20000000  /* crc32c instruction supported */
+#endif
+  if (av & AV_SPARC_CRC32C)       features |= crc32c_instruction_m;
+
 #ifndef AV_SPARC_AES
 #define AV_SPARC_AES 0x00020000  /* aes instrs supported */
 #endif
--- a/hotspot/src/share/vm/classfile/vmSymbols.hpp
+++ b/hotspot/src/share/vm/classfile/vmSymbols.hpp
@ -863,6 +863,12 @@
   do_name(     updateByteBuffer_name,                           "updateByteBuffer")                                    \
   do_signature(updateByteBuffer_signature,                      "(IJII)I")                                             \
                                                                                                                        \
+  /* support for java.util.zip.CRC32C */                                                                                \
+  do_class(java_util_zip_CRC32C,          "java/util/zip/CRC32C")                                                       \
+  do_intrinsic(_updateBytesCRC32C,         java_util_zip_CRC32C,  updateBytes_name, updateBytes_signature,       F_S)   \
+  do_intrinsic(_updateDirectByteBufferCRC32C, java_util_zip_CRC32C, updateDirectByteBuffer_name, updateByteBuffer_signature, F_S) \
+   do_name(     updateDirectByteBuffer_name,                     "updateDirectByteBuffer")                              \
+                                                                                                                        \
  /* support for sun.misc.Unsafe */                                                                                     \
  do_class(sun_misc_Unsafe,               "sun/misc/Unsafe")                                                            \
                                                                                                                        \
--- a/hotspot/src/share/vm/opto/escape.cpp
+++ b/hotspot/src/share/vm/opto/escape.cpp
@ -962,6 +962,7 @@ void ConnectionGraph::process_call_arguments(CallNode *call) {
                 (strcmp(call->as_CallLeaf()->_name, "g1_wb_pre")  == 0 ||
                  strcmp(call->as_CallLeaf()->_name, "g1_wb_post") == 0 ||
                  strcmp(call->as_CallLeaf()->_name, "updateBytesCRC32") == 0 ||
+                  strcmp(call->as_CallLeaf()->_name, "updateBytesCRC32C") == 0 ||
                  strcmp(call->as_CallLeaf()->_name, "aescrypt_encryptBlock") == 0 ||
                  strcmp(call->as_CallLeaf()->_name, "aescrypt_decryptBlock") == 0 ||
                  strcmp(call->as_CallLeaf()->_name, "cipherBlockChaining_encryptAESCrypt") == 0 ||
--- a/hotspot/src/share/vm/opto/library_call.cpp
+++ b/hotspot/src/share/vm/opto/library_call.cpp
@ -197,7 +197,7 @@ class LibraryCallKit : public GraphKit {
  CallJavaNode* generate_method_call_virtual(vmIntrinsics::ID method_id) {
    return generate_method_call(method_id, true, false);
  }
-  Node * load_field_from_object(Node * fromObj, const char * fieldName, const char * fieldTypeString, bool is_exact, bool is_static);
+  Node * load_field_from_object(Node * fromObj, const char * fieldName, const char * fieldTypeString, bool is_exact, bool is_static, ciInstanceKlass * fromKls);

  Node* make_string_method_node(int opcode, Node* str1_start, Node* cnt1, Node* str2_start, Node* cnt2);
  Node* make_string_method_node(int opcode, Node* str1, Node* str2);
@ -291,6 +291,9 @@ class LibraryCallKit : public GraphKit {
  bool inline_updateCRC32();
  bool inline_updateBytesCRC32();
  bool inline_updateByteBufferCRC32();
+  Node* get_table_from_crc32c_class(ciInstanceKlass *crc32c_class);
+  bool inline_updateBytesCRC32C();
+  bool inline_updateDirectByteBufferCRC32C();
  bool inline_multiplyToLen();
  bool inline_squareToLen();
  bool inline_mulAdd();
@ -539,6 +542,11 @@ CallGenerator* Compile::make_vm_intrinsic(ciMethod* m, bool is_virtual) {
    if (!UseCRC32Intrinsics) return NULL;
    break;

+  case vmIntrinsics::_updateBytesCRC32C:
+  case vmIntrinsics::_updateDirectByteBufferCRC32C:
+    if (!UseCRC32CIntrinsics) return NULL;
+    break;
+
  case vmIntrinsics::_incrementExactI:
  case vmIntrinsics::_addExactI:
    if (!Matcher::match_rule_supported(Op_OverflowAddI) || !UseMathExactIntrinsics) return NULL;
@ -947,6 +955,11 @@ bool LibraryCallKit::try_to_inline(int predicate) {
  case vmIntrinsics::_updateByteBufferCRC32:
    return inline_updateByteBufferCRC32();

+  case vmIntrinsics::_updateBytesCRC32C:
+    return inline_updateBytesCRC32C();
+  case vmIntrinsics::_updateDirectByteBufferCRC32C:
+    return inline_updateDirectByteBufferCRC32C();
+
  case vmIntrinsics::_profileBoolean:
    return inline_profileBoolean();
  case vmIntrinsics::_isCompileConstant:
@ -5536,6 +5549,106 @@ bool LibraryCallKit::inline_updateByteBufferCRC32() {
  return true;
 }

+//------------------------------get_table_from_crc32c_class-----------------------
+Node * LibraryCallKit::get_table_from_crc32c_class(ciInstanceKlass *crc32c_class) {
+  Node* table = load_field_from_object(NULL, "byteTable", "[I", /*is_exact*/ false, /*is_static*/ true, crc32c_class);
+  assert (table != NULL, "wrong version of java.util.zip.CRC32C");
+
+  return table;
+}
+
+//------------------------------inline_updateBytesCRC32C-----------------------
+//
+// Calculate CRC32C for byte[] array.
+// int java.util.zip.CRC32C.updateBytes(int crc, byte[] buf, int off, int end)
+//
+bool LibraryCallKit::inline_updateBytesCRC32C() {
+  assert(UseCRC32CIntrinsics, "need CRC32C instruction support");
+  assert(callee()->signature()->size() == 4, "updateBytes has 4 parameters");
+  assert(callee()->holder()->is_loaded(), "CRC32C class must be loaded");
+  // no receiver since it is a static method
+  Node* crc     = argument(0); // type: int
+  Node* src     = argument(1); // type: oop
+  Node* offset  = argument(2); // type: int
+  Node* end     = argument(3); // type: int
+
+  Node* length = _gvn.transform(new SubINode(end, offset));
+
+  const Type* src_type = src->Value(&_gvn);
+  const TypeAryPtr* top_src = src_type->isa_aryptr();
+  if (top_src  == NULL || top_src->klass()  == NULL) {
+    // failed array check
+    return false;
+  }
+
+  // Figure out the size and type of the elements we will be copying.
+  BasicType src_elem = src_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
+  if (src_elem != T_BYTE) {
+    return false;
+  }
+
+  // 'src_start' points to src array + scaled offset
+  Node* src_start = array_element_address(src, offset, src_elem);
+
+  // static final int[] byteTable in class CRC32C
+  Node* table = get_table_from_crc32c_class(callee()->holder());
+  Node* table_start = array_element_address(table, intcon(0), T_INT);
+
+  // We assume that range check is done by caller.
+  // TODO: generate range check (offset+length < src.length) in debug VM.
+
+  // Call the stub.
+  address stubAddr = StubRoutines::updateBytesCRC32C();
+  const char *stubName = "updateBytesCRC32C";
+
+  Node* call = make_runtime_call(RC_LEAF, OptoRuntime::updateBytesCRC32C_Type(),
+                                 stubAddr, stubName, TypePtr::BOTTOM,
+                                 crc, src_start, length, table_start);
+  Node* result = _gvn.transform(new ProjNode(call, TypeFunc::Parms));
+  set_result(result);
+  return true;
+}
+
+//------------------------------inline_updateDirectByteBufferCRC32C-----------------------
+//
+// Calculate CRC32C for DirectByteBuffer.
+// int java.util.zip.CRC32C.updateDirectByteBuffer(int crc, long buf, int off, int end)
+//
+bool LibraryCallKit::inline_updateDirectByteBufferCRC32C() {
+  assert(UseCRC32CIntrinsics, "need CRC32C instruction support");
+  assert(callee()->signature()->size() == 5, "updateDirectByteBuffer has 4 parameters and one is long");
+  assert(callee()->holder()->is_loaded(), "CRC32C class must be loaded");
+  // no receiver since it is a static method
+  Node* crc     = argument(0); // type: int
+  Node* src     = argument(1); // type: long
+  Node* offset  = argument(3); // type: int
+  Node* end     = argument(4); // type: int
+
+  Node* length = _gvn.transform(new SubINode(end, offset));
+
+  src = ConvL2X(src);  // adjust Java long to machine word
+  Node* base = _gvn.transform(new CastX2PNode(src));
+  offset = ConvI2X(offset);
+
+  // 'src_start' points to src array + scaled offset
+  Node* src_start = basic_plus_adr(top(), base, offset);
+
+  // static final int[] byteTable in class CRC32C
+  Node* table = get_table_from_crc32c_class(callee()->holder());
+  Node* table_start = array_element_address(table, intcon(0), T_INT);
+
+  // Call the stub.
+  address stubAddr = StubRoutines::updateBytesCRC32C();
+  const char *stubName = "updateBytesCRC32C";
+
+  Node* call = make_runtime_call(RC_LEAF, OptoRuntime::updateBytesCRC32C_Type(),
+                                 stubAddr, stubName, TypePtr::BOTTOM,
+                                 crc, src_start, length, table_start);
+  Node* result = _gvn.transform(new ProjNode(call, TypeFunc::Parms));
+  set_result(result);
+  return true;
+}
+
 //----------------------------inline_reference_get----------------------------
 // public T java.lang.ref.Reference.get();
 bool LibraryCallKit::inline_reference_get() {
@ -5571,18 +5684,28 @@ bool LibraryCallKit::inline_reference_get() {


 Node * LibraryCallKit::load_field_from_object(Node * fromObj, const char * fieldName, const char * fieldTypeString,
-                                              bool is_exact=true, bool is_static=false) {
+                                              bool is_exact=true, bool is_static=false,
+                                              ciInstanceKlass * fromKls=NULL) {
+  if (fromKls == NULL) {
+    const TypeInstPtr* tinst = _gvn.type(fromObj)->isa_instptr();
+    assert(tinst != NULL, "obj is null");
+    assert(tinst->klass()->is_loaded(), "obj is not loaded");
+    assert(!is_exact || tinst->klass_is_exact(), "klass not exact");
+    fromKls = tinst->klass()->as_instance_klass();
+  } else {
+    assert(is_static, "only for static field access");
+  }
+  ciField* field = fromKls->get_field_by_name(ciSymbol::make(fieldName),
+                                              ciSymbol::make(fieldTypeString),
+                                              is_static);

-  const TypeInstPtr* tinst = _gvn.type(fromObj)->isa_instptr();
-  assert(tinst != NULL, "obj is null");
-  assert(tinst->klass()->is_loaded(), "obj is not loaded");
-  assert(!is_exact || tinst->klass_is_exact(), "klass not exact");
-
-  ciField* field = tinst->klass()->as_instance_klass()->get_field_by_name(ciSymbol::make(fieldName),
-                                                                          ciSymbol::make(fieldTypeString),
-                                                                          is_static);
-  if (field == NULL) return (Node *) NULL;
  assert (field != NULL, "undefined field");
+  if (field == NULL) return (Node *) NULL;
+
+  if (is_static) {
+    const TypeInstPtr* tip = TypeInstPtr::make(fromKls->java_mirror());
+    fromObj = makecon(tip);
+  }

  // Next code  copied from Parse::do_get_xxx():

--- a/hotspot/src/share/vm/opto/runtime.cpp
+++ b/hotspot/src/share/vm/opto/runtime.cpp
@ -851,6 +851,29 @@ const TypeFunc* OptoRuntime::updateBytesCRC32_Type() {
  return TypeFunc::make(domain, range);
 }

+/**
+ * int updateBytesCRC32C(int crc, byte* buf, int len, int* table)
+ */
+const TypeFunc* OptoRuntime::updateBytesCRC32C_Type() {
+  // create input type (domain)
+  int num_args      = 4;
+  int argcnt = num_args;
+  const Type** fields = TypeTuple::fields(argcnt);
+  int argp = TypeFunc::Parms;
+  fields[argp++] = TypeInt::INT;        // crc
+  fields[argp++] = TypePtr::NOTNULL;    // buf
+  fields[argp++] = TypeInt::INT;        // len
+  fields[argp++] = TypePtr::NOTNULL;    // table
+  assert(argp == TypeFunc::Parms+argcnt, "correct decoding");
+  const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields);
+
+  // result type needed
+  fields = TypeTuple::fields(1);
+  fields[TypeFunc::Parms+0] = TypeInt::INT; // crc result
+  const TypeTuple* range = TypeTuple::make(TypeFunc::Parms+1, fields);
+  return TypeFunc::make(domain, range);
+}
+
 // for cipherBlockChaining calls of aescrypt encrypt/decrypt, four pointers and a length, returning int
 const TypeFunc* OptoRuntime::cipherBlockChaining_aescrypt_Type() {
  // create input type (domain)
--- a/hotspot/src/share/vm/opto/runtime.hpp
+++ b/hotspot/src/share/vm/opto/runtime.hpp
@ -319,6 +319,7 @@ private:
  static const TypeFunc* ghash_processBlocks_Type();

  static const TypeFunc* updateBytesCRC32_Type();
+  static const TypeFunc* updateBytesCRC32C_Type();

  // leaf on stack replacement interpreter accessor types
  static const TypeFunc* osr_end_Type();
--- a/hotspot/src/share/vm/runtime/globals.hpp
+++ b/hotspot/src/share/vm/runtime/globals.hpp
@ -848,6 +848,9 @@ public:
  product(bool, UseCRC32Intrinsics, false,                                  \
          "use intrinsics for java.util.zip.CRC32")                         \
                                                                            \
+  product(bool, UseCRC32CIntrinsics, false,                                 \
+          "use intrinsics for java.util.zip.CRC32C")                        \
+                                                                            \
  develop(bool, TraceCallFixup, false,                                      \
          "Trace all call fixups")                                          \
                                                                            \
--- a/hotspot/src/share/vm/runtime/stubRoutines.cpp
+++ b/hotspot/src/share/vm/runtime/stubRoutines.cpp
@ -137,6 +137,8 @@ address StubRoutines::_sha512_implCompressMB = NULL;
 address StubRoutines::_updateBytesCRC32 = NULL;
 address StubRoutines::_crc_table_adr = NULL;

+address StubRoutines::_updateBytesCRC32C = NULL;
+
 address StubRoutines::_multiplyToLen = NULL;
 address StubRoutines::_squareToLen = NULL;
 address StubRoutines::_mulAdd = NULL;
--- a/hotspot/src/share/vm/runtime/stubRoutines.hpp
+++ b/hotspot/src/share/vm/runtime/stubRoutines.hpp
@ -197,6 +197,8 @@ class StubRoutines: AllStatic {
  static address _updateBytesCRC32;
  static address _crc_table_adr;

+  static address _updateBytesCRC32C;
+
  static address _multiplyToLen;
  static address _squareToLen;
  static address _mulAdd;
@ -359,6 +361,8 @@ class StubRoutines: AllStatic {
  static address updateBytesCRC32()    { return _updateBytesCRC32; }
  static address crc_table_addr()      { return _crc_table_adr; }

+  static address updateBytesCRC32C()   { return _updateBytesCRC32C; }
+
  static address multiplyToLen()       {return _multiplyToLen; }
  static address squareToLen()         {return _squareToLen; }
  static address mulAdd()              {return _mulAdd; }
--- a/hotspot/src/share/vm/runtime/vmStructs.cpp
+++ b/hotspot/src/share/vm/runtime/vmStructs.cpp
@ -830,6 +830,7 @@ typedef CompactHashtable<Symbol*, char>       SymbolCompactHashTable;
     static_field(StubRoutines,                _ghash_processBlocks,                          address)                               \
     static_field(StubRoutines,                _updateBytesCRC32,                             address)                               \
     static_field(StubRoutines,                _crc_table_adr,                                address)                               \
+     static_field(StubRoutines,                _updateBytesCRC32C,                            address)                               \
     static_field(StubRoutines,                _multiplyToLen,                                address)                               \
     static_field(StubRoutines,                _squareToLen,                                  address)                               \
     static_field(StubRoutines,                _mulAdd,                                       address)                               \
--- a/hotspot/test/compiler/intrinsics/crc32c/TestCRC32C.java
+++ b/hotspot/test/compiler/intrinsics/crc32c/TestCRC32C.java
@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/**
+ * @test
+ * @bug 8073583
+ * @summary C2 support for CRC32C on SPARC
+ *
+ * @run main/othervm/timeout=600 -Xbatch TestCRC32C -m
+ */
+
+import java.nio.ByteBuffer;
+import java.util.zip.Checksum;
+import java.util.zip.CRC32C;
+
+public class TestCRC32C {
+    public static void main(String[] args) {
+        int offset = Integer.getInteger("offset", 0);
+        int msgSize = Integer.getInteger("msgSize", 512);
+        boolean multi = false;
+        int iters = 20000;
+        int warmupIters = 20000;
+
+        if (args.length > 0) {
+            if (args[0].equals("-m")) {
+                multi = true;
+            } else {
+                iters = Integer.valueOf(args[0]);
+            }
+            if (args.length > 1) {
+                warmupIters = Integer.valueOf(args[1]);
+            }
+        }
+
+        if (multi) {
+            test_multi(warmupIters);
+            return;
+        }
+
+        System.out.println(" offset = " + offset);
+        System.out.println("msgSize = " + msgSize + " bytes");
+        System.out.println("  iters = " + iters);
+
+        byte[] b = initializedBytes(msgSize, offset);
+
+        CRC32C crc0 = new CRC32C();
+        CRC32C crc1 = new CRC32C();
+        CRC32C crc2 = new CRC32C();
+
+        crc0.update(b, offset, msgSize);
+
+        System.out.println("-------------------------------------------------------");
+
+        /* warm up */
+        for (int i = 0; i < warmupIters; i++) {
+            crc1.reset();
+            crc1.update(b, offset, msgSize);
+        }
+
+        /* measure performance */
+        long start = System.nanoTime();
+        for (int i = 0; i < iters; i++) {
+            crc1.reset();
+            crc1.update(b, offset, msgSize);
+        }
+        long end = System.nanoTime();
+        double total = (double)(end - start)/1e9;         // in seconds
+        double thruput = (double)msgSize*iters/1e6/total; // in MB/s
+        System.out.println("CRC32C.update(byte[]) runtime = " + total + " seconds");
+        System.out.println("CRC32C.update(byte[]) throughput = " + thruput + " MB/s");
+
+        /* check correctness */
+        for (int i = 0; i < iters; i++) {
+            crc1.reset();
+            crc1.update(b, offset, msgSize);
+            if (!check(crc0, crc1)) break;
+        }
+        report("CRCs", crc0, crc1);
+
+        System.out.println("-------------------------------------------------------");
+
+        ByteBuffer buf = ByteBuffer.allocateDirect(msgSize);
+        buf.put(b, offset, msgSize);
+        buf.flip();
+
+        /* warm up */
+        for (int i = 0; i < warmupIters; i++) {
+            crc2.reset();
+            crc2.update(buf);
+            buf.rewind();
+        }
+
+        /* measure performance */
+        start = System.nanoTime();
+        for (int i = 0; i < iters; i++) {
+            crc2.reset();
+            crc2.update(buf);
+            buf.rewind();
+        }
+        end = System.nanoTime();
+        total = (double)(end - start)/1e9;         // in seconds
+        thruput = (double)msgSize*iters/1e6/total; // in MB/s
+        System.out.println("CRC32C.update(ByteBuffer) runtime = " + total + " seconds");
+        System.out.println("CRC32C.update(ByteBuffer) throughput = " + thruput + " MB/s");
+
+        /* check correctness */
+        for (int i = 0; i < iters; i++) {
+            crc2.reset();
+            crc2.update(buf);
+            buf.rewind();
+            if (!check(crc0, crc2)) break;
+        }
+        report("CRCs", crc0, crc2);
+
+        System.out.println("-------------------------------------------------------");
+    }
+
+    private static void report(String s, Checksum crc0, Checksum crc1) {
+        System.out.printf("%s: crc0 = %08x, crc1 = %08x\n",
+                          s, crc0.getValue(), crc1.getValue());
+    }
+
+    private static boolean check(Checksum crc0, Checksum crc1) {
+        if (crc0.getValue() != crc1.getValue()) {
+            System.err.printf("ERROR: crc0 = %08x, crc1 = %08x\n",
+                              crc0.getValue(), crc1.getValue());
+            return false;
+        }
+        return true;
+    }
+
+    private static byte[] initializedBytes(int M, int offset) {
+        byte[] bytes = new byte[M + offset];
+        for (int i = 0; i < offset; i++) {
+            bytes[i] = (byte) i;
+        }
+        for (int i = offset; i < bytes.length; i++) {
+            bytes[i] = (byte) (i - offset);
+        }
+        return bytes;
+    }
+
+    private static void test_multi(int iters) {
+        int len1 = 8;    // the  8B/iteration loop
+        int len2 = 32;   // the 32B/iteration loop
+        int len3 = 4096; // the 4KB/iteration loop
+
+        byte[] b = initializedBytes(len3*16, 0);
+        int[] offsets = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 64, 128, 256, 512 };
+        int[] sizes = { 0, 1, 2, 3, 4, 5, 6, 7,
+                        len1, len1+1, len1+2, len1+3, len1+4, len1+5, len1+6, len1+7,
+                        len1*2, len1*2+1, len1*2+3, len1*2+5, len1*2+7,
+                        len2, len2+1, len2+3, len2+5, len2+7,
+                        len2*2, len2*4, len2*8, len2*16, len2*32, len2*64,
+                        len3, len3+1, len3+3, len3+5, len3+7,
+                        len3*2, len3*4, len3*8,
+                        len1+len2, len1+len2+1, len1+len2+3, len1+len2+5, len1+len2+7,
+                        len1+len3, len1+len3+1, len1+len3+3, len1+len3+5, len1+len3+7,
+                        len2+len3, len2+len3+1, len2+len3+3, len2+len3+5, len2+len3+7,
+                        len1+len2+len3, len1+len2+len3+1, len1+len2+len3+3,
+                        len1+len2+len3+5, len1+len2+len3+7,
+                        (len1+len2+len3)*2, (len1+len2+len3)*2+1, (len1+len2+len3)*2+3,
+                        (len1+len2+len3)*2+5, (len1+len2+len3)*2+7,
+                        (len1+len2+len3)*3, (len1+len2+len3)*3-1, (len1+len2+len3)*3-3,
+                        (len1+len2+len3)*3-5, (len1+len2+len3)*3-7 };
+        CRC32C[] crc0 = new CRC32C[offsets.length*sizes.length];
+        CRC32C[] crc1 = new CRC32C[offsets.length*sizes.length];
+        int i, j, k;
+
+        System.out.printf("testing %d cases ...\n", offsets.length*sizes.length);
+
+        /* set the result from interpreter as reference */
+        for (i = 0; i < offsets.length; i++) {
+            for (j = 0; j < sizes.length; j++) {
+                crc0[i*sizes.length + j] = new CRC32C();
+                crc1[i*sizes.length + j] = new CRC32C();
+                crc0[i*sizes.length + j].update(b, offsets[i], sizes[j]);
+            }
+        }
+
+        /* warm up the JIT compiler and get result */
+        for (k = 0; k < iters; k++) {
+            for (i = 0; i < offsets.length; i++) {
+                for (j = 0; j < sizes.length; j++) {
+                    crc1[i*sizes.length + j].reset();
+                    crc1[i*sizes.length + j].update(b, offsets[i], sizes[j]);
+                }
+            }
+        }
+
+        /* check correctness */
+        for (i = 0; i < offsets.length; i++) {
+            for (j = 0; j < sizes.length; j++) {
+                if (!check(crc0[i*sizes.length + j], crc1[i*sizes.length + j])) {
+                    System.out.printf("offsets[%d] = %d", i, offsets[i]);
+                    System.out.printf("\tsizes[%d] = %d\n", j, sizes[j]);
+                }
+            }
+        }
+    }
+}