8134553: CRC32C implementations for x86/x64 targets

Reviewed-by: kvn
Tomasz Wojtowicz 2015-09-16 15:54:32 -07:00 committed by Vladimir Kozlov
parent d49d1ea740
commit 61b77b8590
24 changed files with 1092 additions and 19 deletions

View File

@ -48,6 +48,7 @@ private:
address generate_Reference_get_entry();
address generate_CRC32_update_entry();
address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind);
address generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; }
void lock_method(void);
void generate_stack_overflow_check(void);

View File

@ -38,5 +38,6 @@
address generate_CRC32_update_entry();
address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind);
address generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; }
#endif // CPU_PPC_VM_INTERPRETERGENERATOR_PPC_HPP

View File

@ -48,4 +48,5 @@
// Not supported
address generate_CRC32_update_entry() { return NULL; }
address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; }
address generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; }
#endif // CPU_SPARC_VM_INTERPRETERGENERATOR_SPARC_HPP

View File

@ -1604,6 +1604,85 @@ void Assembler::cpuid() {
emit_int8((unsigned char)0xA2);
}
// Opcode / Instruction Op / En 64 - Bit Mode Compat / Leg Mode Description Implemented
// F2 0F 38 F0 / r CRC32 r32, r / m8 RM Valid Valid Accumulate CRC32 on r / m8. v
// F2 REX 0F 38 F0 / r CRC32 r32, r / m8* RM Valid N.E. Accumulate CRC32 on r / m8. -
// F2 REX.W 0F 38 F0 / r CRC32 r64, r / m8 RM Valid N.E. Accumulate CRC32 on r / m8. -
//
// F2 0F 38 F1 / r CRC32 r32, r / m16 RM Valid Valid Accumulate CRC32 on r / m16. v
//
// F2 0F 38 F1 / r CRC32 r32, r / m32 RM Valid Valid Accumulate CRC32 on r / m32. v
//
// F2 REX.W 0F 38 F1 / r CRC32 r64, r / m64 RM Valid N.E. Accumulate CRC32 on r / m64. v
void Assembler::crc32(Register crc, Register v, int8_t sizeInBytes) {
assert(VM_Version::supports_sse4_2(), "");
int8_t w = 0x01;
Prefix p = Prefix_EMPTY;
emit_int8((int8_t)0xF2);
switch (sizeInBytes) {
case 1:
w = 0;
break;
case 2:
case 4:
break;
LP64_ONLY(case 8:)
// This instruction is not valid in 32-bit mode
// Note:
// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
//
// Page B - 72 Vol. 2C says
// qwreg2 to qwreg 1111 0010 : 0100 1R0B : 0000 1111 : 0011 1000 : 1111 0000 : 11 qwreg1 qwreg2
// mem64 to qwreg 1111 0010 : 0100 1R0B : 0000 1111 : 0011 1000 : 1111 0000 : mod qwreg r / m
// F0!!!
// while 3 - 208 Vol. 2A says
// F2 REX.W 0F 38 F1 / r CRC32 r64, r / m64 RM Valid N.E. Accumulate CRC32 on r / m64.
//
// the 0 on the last bit is reserved for a different flavor of this instruction:
// F2 REX.W 0F 38 F0 / r CRC32 r64, r / m8 RM Valid N.E. Accumulate CRC32 on r / m8.
p = REX_W;
break;
default:
assert(0, "Unsupported value for a sizeInBytes argument");
break;
}
LP64_ONLY(prefix(crc, v, p);)
emit_int8((int8_t)0x0F);
emit_int8(0x38);
emit_int8((int8_t)(0xF0 | w));
emit_int8(0xC0 | ((crc->encoding() & 0x7) << 3) | (v->encoding() & 7));
}
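As a quick worked example of the encoding above (editorial note, not part of the patch): crc32(rax, rcx, 4) on a 32-bit target emits no prefix byte and produces F2 0F 38 F1 C1, i.e. the F2 prefix, the 0F 38 F1 opcode (w == 1 selects the r/m32 form; w == 0 would give F0, the r/m8 form), and ModRM C1 (mod = 11, reg = rax = 000, r/m = rcx = 001).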
void Assembler::crc32(Register crc, Address adr, int8_t sizeInBytes) {
assert(VM_Version::supports_sse4_2(), "");
InstructionMark im(this);
int8_t w = 0x01;
Prefix p = Prefix_EMPTY;
emit_int8((int8_t)0xF2);
switch (sizeInBytes) {
case 1:
w = 0;
break;
case 2:
case 4:
break;
LP64_ONLY(case 8:)
// This instruction is not valid in 32-bit mode
p = REX_W;
break;
default:
assert(0, "Unsupported value for a sizeInBytes argument");
break;
}
LP64_ONLY(prefix(crc, adr, p);)
emit_int8((int8_t)0x0F);
emit_int8(0x38);
emit_int8((int8_t)(0xF0 | w));
emit_operand(crc, adr);
}
void Assembler::cvtdq2pd(XMMRegister dst, XMMRegister src) {
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
emit_simd_arith_nonds(0xE6, dst, src, VEX_SIMD_F3, /* no_mask_reg */ false, /* legacy_mode */ true);
@ -6223,6 +6302,14 @@ void Assembler::shldl(Register dst, Register src) {
emit_int8((unsigned char)(0xC0 | src->encoding() << 3 | dst->encoding()));
}
// 0F A4 / r ib
void Assembler::shldl(Register dst, Register src, int8_t imm8) {
emit_int8(0x0F);
emit_int8((unsigned char)0xA4);
emit_int8((unsigned char)(0xC0 | src->encoding() << 3 | dst->encoding()));
emit_int8(imm8);
}
void Assembler::shrdl(Register dst, Register src) {
emit_int8(0x0F);
emit_int8((unsigned char)0xAD);
@ -6408,6 +6495,40 @@ void Assembler::prefix(Register reg) {
}
}
void Assembler::prefix(Register dst, Register src, Prefix p) {
if (src->encoding() >= 8) {
p = (Prefix)(p | REX_B);
}
if (dst->encoding() >= 8) {
p = (Prefix)( p | REX_R);
}
if (p != Prefix_EMPTY) {
// do not generate an empty prefix
prefix(p);
}
}
void Assembler::prefix(Register dst, Address adr, Prefix p) {
if (adr.base_needs_rex()) {
if (adr.index_needs_rex()) {
assert(false, "prefix(Register dst, Address adr, Prefix p) does not support handling of an X");
} else {
prefix(REX_B);
}
} else {
if (adr.index_needs_rex()) {
assert(false, "prefix(Register dst, Address adr, Prefix p) does not support handling of an X");
}
}
if (dst->encoding() >= 8) {
p = (Prefix)(p | REX_R);
}
if (p != Prefix_EMPTY) {
// do not generate an empty prefix
prefix(p);
}
}
void Assembler::prefix(Address adr) {
if (adr.base_needs_rex()) {
if (adr.index_needs_rex()) {

View File

@ -506,7 +506,8 @@ class Assembler : public AbstractAssembler {
VEX_3bytes = 0xC4,
VEX_2bytes = 0xC5,
EVEX_4bytes = 0x62,
Prefix_EMPTY = 0x0
};
enum VexPrefix {
@ -615,6 +616,8 @@ private:
int prefixq_and_encode(int dst_enc, int src_enc);
void prefix(Register reg);
void prefix(Register dst, Register src, Prefix p);
void prefix(Register dst, Address adr, Prefix p);
void prefix(Address adr);
void prefixq(Address adr);
@ -1177,6 +1180,10 @@ private:
// Identify processor type and features
void cpuid();
// CRC32C
void crc32(Register crc, Register v, int8_t sizeInBytes);
void crc32(Register crc, Address adr, int8_t sizeInBytes);
// Convert Scalar Double-Precision Floating-Point Value to Scalar Single-Precision Floating-Point Value
void cvtsd2ss(XMMRegister dst, XMMRegister src);
void cvtsd2ss(XMMRegister dst, Address src);
@ -1783,6 +1790,7 @@ private:
void setb(Condition cc, Register dst);
void shldl(Register dst, Register src);
void shldl(Register dst, Register src, int8_t imm8);
void shll(Register dst, int imm8);
void shll(Register dst);

View File

@ -37,6 +37,8 @@ inline int Assembler::prefix_and_encode(int dst_enc, int src_enc, bool byteinst)
inline int Assembler::prefixq_and_encode(int dst_enc, int src_enc) { return dst_enc << 3 | src_enc; }
inline void Assembler::prefix(Register reg) {}
inline void Assembler::prefix(Register dst, Register src, Prefix p) {}
inline void Assembler::prefix(Register dst, Address adr, Prefix p) {}
inline void Assembler::prefix(Address adr) {}
inline void Assembler::prefixq(Address adr) {}

View File

@ -0,0 +1,66 @@
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
enum {
// S. Gueron / Information Processing Letters 112 (2012) 184
// shows that anything above 6K and below 32K is a good choice
// 32K does not deliver any further performance gains
// 6K=8*256 (*3 as we compute 3 blocks together)
//
// Thus we select the smallest such value, so that it applies to the largest number
// of buffer sizes.
CRC32C_HIGH = 8 * 256,
// empirical
// based on ubench study using methodology described in
// V. Gopal et al. / Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction April 2011 8
//
// arbitrary value between 27 and 256
CRC32C_MIDDLE = 8 * 86,
// V. Gopal et al. / Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction April 2011 9
// shows that 240 and 1024 are equally good choices as 216 == 8*27
//
// We select the smallest value which resulted in a significant performance improvement over
// the sequential version
CRC32C_LOW = 8 * 27,
CRC32C_NUM_ChunkSizeInBytes = 3,
// We need to compute the constants x^(64N) and x^(128N) for each "chunk" size
CRC32C_NUM_PRECOMPUTED_CONSTANTS = ( 2 * CRC32C_NUM_ChunkSizeInBytes )
};
// Notes:
// 1. Why do we need a "chunk" approach?
// The overhead of computing the required constants (powers of x) for an arbitrary buffer of size N is significant
// (the implementation approaches plain library-call performance).
// 2. Why only 3 "chunks"?
// Performance experiments showed that HIGH+LOW alone did not deliver a stable speedup
// curve.
//
// Disclaimer:
// If you ever decide to increase/decrease the number of "chunks", be sure to modify
// a) constants table generation (hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp)
// b) constant fetch from that table (macroAssembler_x86.cpp)
// c) unrolled for loop (macroAssembler_x86.cpp)
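A minimal scalar sketch (editorial, not part of the patch) of how these chunk sizes drive the partitioning; process_three_blocks is a hypothetical stand-in for the three-stream CRC plus recombination implemented in macroAssembler_x86.cpp, while crc32c_u8 is a plain software byte update using the reflected Castagnoli polynomial:

#include <stddef.h>
#include <stdint.h>

// Hypothetical helper: three interleaved CRC streams over A|B|C plus recombination.
uint32_t process_three_blocks(uint32_t crc, const uint8_t* p, size_t block);

// Software byte-at-a-time CRC32C update (reflected polynomial 0x82F63B78).
static uint32_t crc32c_u8(uint32_t crc, uint8_t b) {
  crc ^= b;
  for (int i = 0; i < 8; i++)
    crc = (crc >> 1) ^ (0x82F63B78u & (0u - (crc & 1u)));
  return crc;
}

static uint32_t crc32c_chunked(uint32_t crc, const uint8_t* buf, size_t len) {
  const size_t chunk[] = { 8 * 256, 8 * 86, 8 * 27 };   // CRC32C_HIGH, _MIDDLE, _LOW
  for (int c = 0; c < 3; c++) {
    while (len >= 3 * chunk[c]) {                       // consume A|B|C, chunk[c] bytes each
      crc  = process_three_blocks(crc, buf, chunk[c]);
      buf += 3 * chunk[c];
      len -= 3 * chunk[c];
    }
  }
  while (len--) crc = crc32c_u8(crc, *buf++);           // remaining tail, byte by byte
  return crc;
}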

View File

@ -42,6 +42,7 @@
address generate_Reference_get_entry();
address generate_CRC32_update_entry();
address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind);
address generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind);
#ifndef _LP64
address generate_Float_intBitsToFloat_entry();
address generate_Float_floatToRawIntBits_entry();

View File

@ -45,6 +45,7 @@
#include "gc/g1/g1SATBCardTableModRefBS.hpp"
#include "gc/g1/heapRegion.hpp"
#endif // INCLUDE_ALL_GCS
#include "crc32c.h"
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
@ -8636,6 +8637,471 @@ void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Regi
notl(crc); // ~c
}
#ifdef _LP64
// S. Gueron / Information Processing Letters 112 (2012) 184
// Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
// Input: A 32 bit value B = [byte3, byte2, byte1, byte0].
// Output: the 64-bit carry-less product of B * CONST
void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
Register tmp1, Register tmp2, Register tmp3) {
lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
if (n > 0) {
addq(tmp3, n * 256 * 8);
}
// Q1 = TABLEExt[n][B & 0xFF];
movl(tmp1, in);
andl(tmp1, 0x000000FF);
shll(tmp1, 3);
addq(tmp1, tmp3);
movq(tmp1, Address(tmp1, 0));
// Q2 = TABLEExt[n][B >> 8 & 0xFF];
movl(tmp2, in);
shrl(tmp2, 8);
andl(tmp2, 0x000000FF);
shll(tmp2, 3);
addq(tmp2, tmp3);
movq(tmp2, Address(tmp2, 0));
shlq(tmp2, 8);
xorq(tmp1, tmp2);
// Q3 = TABLEExt[n][B >> 16 & 0xFF];
movl(tmp2, in);
shrl(tmp2, 16);
andl(tmp2, 0x000000FF);
shll(tmp2, 3);
addq(tmp2, tmp3);
movq(tmp2, Address(tmp2, 0));
shlq(tmp2, 16);
xorq(tmp1, tmp2);
// Q4 = TABLEExt[n][B >> 24 & 0xFF];
shrl(in, 24);
andl(in, 0x000000FF);
shll(in, 3);
addq(in, tmp3);
movq(in, Address(in, 0));
shlq(in, 24);
xorq(in, tmp1);
// return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
}
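The lookup sequence above boils down to the following scalar form (editorial sketch; the assumed table layout is 256 64-bit entries per level n, i.e. base + n * 256 * 8, matching the addressing in the code):

#include <stdint.h>

// Carry-less multiply of a 32-bit value by a constant via a 256-entry, 64-bit table
// (Gueron, Algorithm 4); `table` already points at the level selected by n.
static uint64_t crc32c_clmul_by_table(uint32_t b, const uint64_t* table) {
  uint64_t q1 = table[ b        & 0xFF];
  uint64_t q2 = table[(b >>  8) & 0xFF] <<  8;
  uint64_t q3 = table[(b >> 16) & 0xFF] << 16;
  uint64_t q4 = table[(b >> 24) & 0xFF] << 24;
  return q1 ^ q2 ^ q3 ^ q4;   // Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24
}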
void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
Register in_out,
uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
XMMRegister w_xtmp2,
Register tmp1,
Register n_tmp2, Register n_tmp3) {
if (is_pclmulqdq_supported) {
movdl(w_xtmp1, in_out); // modified blindly
movl(tmp1, const_or_pre_comp_const_index);
movdl(w_xtmp2, tmp1);
pclmulqdq(w_xtmp1, w_xtmp2, 0);
movdq(in_out, w_xtmp1);
} else {
crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
}
}
// Recombination Alternative 2: No bit-reflections
// T1 = (CRC_A * U1) << 1
// T2 = (CRC_B * U2) << 1
// C1 = T1 >> 32
// C2 = T2 >> 32
// T1 = T1 & 0xFFFFFFFF
// T2 = T2 & 0xFFFFFFFF
// T1 = CRC32(0, T1)
// T2 = CRC32(0, T2)
// C1 = C1 ^ T1
// C2 = C2 ^ T2
// CRC = C1 ^ C2 ^ CRC_C
void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
Register tmp1, Register tmp2,
Register n_tmp3) {
crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
shlq(in_out, 1);
movl(tmp1, in_out);
shrq(in_out, 32);
xorl(tmp2, tmp2);
crc32(tmp2, tmp1, 4);
xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
shlq(in1, 1);
movl(tmp1, in1);
shrq(in1, 32);
xorl(tmp2, tmp2);
crc32(tmp2, tmp1, 4);
xorl(in1, tmp2);
xorl(in_out, in1);
xorl(in_out, in2);
}
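The same recombination written out in scalar form (editorial sketch; crc32c_u32 is a hypothetical wrapper around the 32-bit CRC32 instruction, and t1/t2 are the carry-less products produced by crc32c_pclmulqdq above):

#include <stdint.h>

uint32_t crc32c_u32(uint32_t crc, uint32_t val);   // hypothetical CRC32 r32, r/m32 wrapper

static uint32_t crc32c_recombine(uint64_t t1, uint64_t t2, uint32_t crc_c) {
  t1 <<= 1;                                        // T1 = (CRC_A * U1) << 1
  t2 <<= 1;                                        // T2 = (CRC_B * U2) << 1
  uint32_t c1 = (uint32_t)(t1 >> 32) ^ crc32c_u32(0, (uint32_t)t1);
  uint32_t c2 = (uint32_t)(t2 >> 32) ^ crc32c_u32(0, (uint32_t)t2);
  return c1 ^ c2 ^ crc_c;                          // CRC = C1 ^ C2 ^ CRC_C
}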
// Set N to a predefined value
// Subtract it from the length of the buffer
// execute in a loop:
// CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
// for i = 1 to N do
// CRC_A = CRC32(CRC_A, A[i])
// CRC_B = CRC32(CRC_B, B[i])
// CRC_C = CRC32(CRC_C, C[i])
// end for
// Recombine
void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
Register in_out1, Register in_out2, Register in_out3,
Register tmp1, Register tmp2, Register tmp3,
XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
Register tmp4, Register tmp5,
Register n_tmp6) {
Label L_processPartitions;
Label L_processPartition;
Label L_exit;
bind(L_processPartitions);
cmpl(in_out1, 3 * size);
jcc(Assembler::less, L_exit);
xorl(tmp1, tmp1);
xorl(tmp2, tmp2);
movq(tmp3, in_out2);
addq(tmp3, size);
bind(L_processPartition);
crc32(in_out3, Address(in_out2, 0), 8);
crc32(tmp1, Address(in_out2, size), 8);
crc32(tmp2, Address(in_out2, size * 2), 8);
addq(in_out2, 8);
cmpq(in_out2, tmp3);
jcc(Assembler::less, L_processPartition);
crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
w_xtmp1, w_xtmp2, w_xtmp3,
tmp4, tmp5,
n_tmp6);
addq(in_out2, 2 * size);
subl(in_out1, 3 * size);
jmp(L_processPartitions);
bind(L_exit);
}
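For reference, the inner partition loop above in scalar form (editorial sketch; crc32c_u64 is a hypothetical wrapper around the 64-bit CRC32 instruction):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

uint32_t crc32c_u64(uint32_t crc, uint64_t val);   // hypothetical CRC32 r64, r/m64 wrapper

static uint64_t load_u64(const uint8_t* p) {       // unaligned 8-byte load
  uint64_t v;
  memcpy(&v, p, sizeof(v));
  return v;
}

// One partition pass: three interleaved CRC streams over blocks A, B, C of
// `size` bytes each, located at p, p + size and p + 2 * size.
static void crc32c_partition(uint32_t& crc_a, uint32_t& crc_b, uint32_t& crc_c,
                             const uint8_t* p, size_t size) {
  for (size_t i = 0; i < size; i += 8) {
    crc_a = crc32c_u64(crc_a, load_u64(p + i));
    crc_b = crc32c_u64(crc_b, load_u64(p + size + i));
    crc_c = crc32c_u64(crc_c, load_u64(p + 2 * size + i));
  }
}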
#else
void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n,
Register tmp1, Register tmp2, Register tmp3,
XMMRegister xtmp1, XMMRegister xtmp2) {
lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
if (n > 0) {
addl(tmp3, n * 256 * 8);
}
// Q1 = TABLEExt[n][B & 0xFF];
movl(tmp1, in_out);
andl(tmp1, 0x000000FF);
shll(tmp1, 3);
addl(tmp1, tmp3);
movq(xtmp1, Address(tmp1, 0));
// Q2 = TABLEExt[n][B >> 8 & 0xFF];
movl(tmp2, in_out);
shrl(tmp2, 8);
andl(tmp2, 0x000000FF);
shll(tmp2, 3);
addl(tmp2, tmp3);
movq(xtmp2, Address(tmp2, 0));
psllq(xtmp2, 8);
pxor(xtmp1, xtmp2);
// Q3 = TABLEExt[n][B >> 16 & 0xFF];
movl(tmp2, in_out);
shrl(tmp2, 16);
andl(tmp2, 0x000000FF);
shll(tmp2, 3);
addl(tmp2, tmp3);
movq(xtmp2, Address(tmp2, 0));
psllq(xtmp2, 16);
pxor(xtmp1, xtmp2);
// Q4 = TABLEExt[n][B >> 24 & 0xFF];
shrl(in_out, 24);
andl(in_out, 0x000000FF);
shll(in_out, 3);
addl(in_out, tmp3);
movq(xtmp2, Address(in_out, 0));
psllq(xtmp2, 24);
pxor(xtmp1, xtmp2); // Result in CXMM
// return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
}
void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
Register in_out,
uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
XMMRegister w_xtmp2,
Register tmp1,
Register n_tmp2, Register n_tmp3) {
if (is_pclmulqdq_supported) {
movdl(w_xtmp1, in_out);
movl(tmp1, const_or_pre_comp_const_index);
movdl(w_xtmp2, tmp1);
pclmulqdq(w_xtmp1, w_xtmp2, 0);
// Keep result in XMM since GPR is 32 bit in length
} else {
crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2);
}
}
void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
Register tmp1, Register tmp2,
Register n_tmp3) {
crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
psllq(w_xtmp1, 1);
movdl(tmp1, w_xtmp1);
psrlq(w_xtmp1, 32);
movdl(in_out, w_xtmp1);
xorl(tmp2, tmp2);
crc32(tmp2, tmp1, 4);
xorl(in_out, tmp2);
psllq(w_xtmp2, 1);
movdl(tmp1, w_xtmp2);
psrlq(w_xtmp2, 32);
movdl(in1, w_xtmp2);
xorl(tmp2, tmp2);
crc32(tmp2, tmp1, 4);
xorl(in1, tmp2);
xorl(in_out, in1);
xorl(in_out, in2);
}
void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
Register in_out1, Register in_out2, Register in_out3,
Register tmp1, Register tmp2, Register tmp3,
XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
Register tmp4, Register tmp5,
Register n_tmp6) {
Label L_processPartitions;
Label L_processPartition;
Label L_exit;
bind(L_processPartitions);
cmpl(in_out1, 3 * size);
jcc(Assembler::less, L_exit);
xorl(tmp1, tmp1);
xorl(tmp2, tmp2);
movl(tmp3, in_out2);
addl(tmp3, size);
bind(L_processPartition);
crc32(in_out3, Address(in_out2, 0), 4);
crc32(tmp1, Address(in_out2, size), 4);
crc32(tmp2, Address(in_out2, size*2), 4);
crc32(in_out3, Address(in_out2, 0+4), 4);
crc32(tmp1, Address(in_out2, size+4), 4);
crc32(tmp2, Address(in_out2, size*2+4), 4);
addl(in_out2, 8);
cmpl(in_out2, tmp3);
jcc(Assembler::less, L_processPartition);
push(tmp3);
push(in_out1);
push(in_out2);
tmp4 = tmp3;
tmp5 = in_out1;
n_tmp6 = in_out2;
crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
w_xtmp1, w_xtmp2, w_xtmp3,
tmp4, tmp5,
n_tmp6);
pop(in_out2);
pop(in_out1);
pop(tmp3);
addl(in_out2, 2 * size);
subl(in_out1, 3 * size);
jmp(L_processPartitions);
bind(L_exit);
}
#endif //LP64
#ifdef _LP64
// Algorithm 2: Pipelined usage of the CRC32 instruction.
// Input: A buffer I of L bytes.
// Output: the CRC32C value of the buffer.
// Notations:
// Write L = 24N + r, with N = floor (L/24).
// r = L mod 24 (0 <= r < 24).
// Consider I as the concatenation of A|B|C|R, where A, B and C each consist of
// N quadwords, and R consists of r bytes.
// A[j] = I [8j+7:8j], j= 0, 1, ..., N-1
// B[j] = I [N + 8j+7:N + 8j], j= 0, 1, ..., N-1
// C[j] = I [2N + 8j+7:2N + 8j], j= 0, 1, ..., N-1
// if r > 0 R[j] = I [3N +j], j= 0, 1, ...,r-1
void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
Register tmp1, Register tmp2, Register tmp3,
Register tmp4, Register tmp5, Register tmp6,
XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
bool is_pclmulqdq_supported) {
uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
Label L_wordByWord;
Label L_byteByByteProlog;
Label L_byteByByte;
Label L_exit;
if (is_pclmulqdq_supported ) {
const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1);
const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
} else {
const_or_pre_comp_const_index[0] = 1;
const_or_pre_comp_const_index[1] = 0;
const_or_pre_comp_const_index[2] = 3;
const_or_pre_comp_const_index[3] = 2;
const_or_pre_comp_const_index[4] = 5;
const_or_pre_comp_const_index[5] = 4;
}
crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
in2, in1, in_out,
tmp1, tmp2, tmp3,
w_xtmp1, w_xtmp2, w_xtmp3,
tmp4, tmp5,
tmp6);
crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
in2, in1, in_out,
tmp1, tmp2, tmp3,
w_xtmp1, w_xtmp2, w_xtmp3,
tmp4, tmp5,
tmp6);
crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
in2, in1, in_out,
tmp1, tmp2, tmp3,
w_xtmp1, w_xtmp2, w_xtmp3,
tmp4, tmp5,
tmp6);
movl(tmp1, in2);
andl(tmp1, 0x00000007);
negl(tmp1);
addl(tmp1, in2);
addq(tmp1, in1);
BIND(L_wordByWord);
cmpq(in1, tmp1);
jcc(Assembler::greaterEqual, L_byteByByteProlog);
crc32(in_out, Address(in1, 0), 4);
addq(in1, 4);
jmp(L_wordByWord);
BIND(L_byteByByteProlog);
andl(in2, 0x00000007);
movl(tmp2, 1);
BIND(L_byteByByte);
cmpl(tmp2, in2);
jccb(Assembler::greater, L_exit);
crc32(in_out, Address(in1, 0), 1);
incq(in1);
incl(tmp2);
jmp(L_byteByByte);
BIND(L_exit);
}
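A worked example of the flow above (editorial, using the chunk sizes from crc32c.h): for a 6200-byte buffer, one CRC32C_HIGH partition consumes 3 * 2048 = 6144 bytes; the remaining 56 bytes are too short for a MIDDLE (3 * 688 = 2064) or LOW (3 * 216 = 648) partition, so the tail loops finish the job with fourteen 4-byte CRC32 steps (56 & ~7 = 56) and zero trailing byte steps (56 & 7 = 0).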
#else
void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
Register tmp1, Register tmp2, Register tmp3,
Register tmp4, Register tmp5, Register tmp6,
XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
bool is_pclmulqdq_supported) {
uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
Label L_wordByWord;
Label L_byteByByteProlog;
Label L_byteByByte;
Label L_exit;
if (is_pclmulqdq_supported) {
const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);
const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
} else {
const_or_pre_comp_const_index[0] = 1;
const_or_pre_comp_const_index[1] = 0;
const_or_pre_comp_const_index[2] = 3;
const_or_pre_comp_const_index[3] = 2;
const_or_pre_comp_const_index[4] = 5;
const_or_pre_comp_const_index[5] = 4;
}
crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
in2, in1, in_out,
tmp1, tmp2, tmp3,
w_xtmp1, w_xtmp2, w_xtmp3,
tmp4, tmp5,
tmp6);
crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
in2, in1, in_out,
tmp1, tmp2, tmp3,
w_xtmp1, w_xtmp2, w_xtmp3,
tmp4, tmp5,
tmp6);
crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
in2, in1, in_out,
tmp1, tmp2, tmp3,
w_xtmp1, w_xtmp2, w_xtmp3,
tmp4, tmp5,
tmp6);
movl(tmp1, in2);
andl(tmp1, 0x00000007);
negl(tmp1);
addl(tmp1, in2);
addl(tmp1, in1);
BIND(L_wordByWord);
cmpl(in1, tmp1);
jcc(Assembler::greaterEqual, L_byteByByteProlog);
crc32(in_out, Address(in1,0), 4);
addl(in1, 4);
jmp(L_wordByWord);
BIND(L_byteByByteProlog);
andl(in2, 0x00000007);
movl(tmp2, 1);
BIND(L_byteByByte);
cmpl(tmp2, in2);
jccb(Assembler::greater, L_exit);
movb(tmp1, Address(in1, 0));
crc32(in_out, tmp1, 1);
incl(in1);
incl(tmp2);
jmp(L_byteByByte);
BIND(L_exit);
}
#endif // LP64
#undef BIND
#undef BLOCK_COMMENT

View File

@ -1278,9 +1278,42 @@ public:
Register raxReg);
#endif
// CRC32 code for java.util.zip.CRC32::updateBytes() intrinsic.
void update_byte_crc32(Register crc, Register val, Register table);
void kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp);
// CRC32C code for java.util.zip.CRC32C::updateBytes() intrinsic
// Note on a naming convention:
// Prefix w = register only used on a Westmere+ architecture
// Prefix n = register only used on a Nehalem architecture
#ifdef _LP64
void crc32c_ipl_alg4(Register in_out, uint32_t n,
Register tmp1, Register tmp2, Register tmp3);
#else
void crc32c_ipl_alg4(Register in_out, uint32_t n,
Register tmp1, Register tmp2, Register tmp3,
XMMRegister xtmp1, XMMRegister xtmp2);
#endif
void crc32c_pclmulqdq(XMMRegister w_xtmp1,
Register in_out,
uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
XMMRegister w_xtmp2,
Register tmp1,
Register n_tmp2, Register n_tmp3);
void crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
Register tmp1, Register tmp2,
Register n_tmp3);
void crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
Register in_out1, Register in_out2, Register in_out3,
Register tmp1, Register tmp2, Register tmp3,
XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
Register tmp4, Register tmp5,
Register n_tmp6);
void crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
Register tmp1, Register tmp2, Register tmp3,
Register tmp4, Register tmp5, Register tmp6,
XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
bool is_pclmulqdq_supported);
// Fold 128-bit data chunk
void fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset);
void fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf);

View File

@ -2991,6 +2991,63 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
/**
* Arguments:
*
* Inputs:
* rsp(4) - int crc
* rsp(8) - byte* buf
* rsp(12) - int length
* rsp(16) - table_start - optional (present only when doing a library_call,
* not used by x86 algorithm)
*
* Output:
* rax - int crc result
*/
address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
assert(UseCRC32CIntrinsics, "need SSE4_2");
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
address start = __ pc();
const Register crc = rax; // crc
const Register buf = rcx; // source java byte array address
const Register len = rdx; // length
const Register d = rbx;
const Register g = rsi;
const Register h = rdi;
const Register empty = 0; // will never be used; kept only so that the signature
// of crc32c_ipl_alg2_alt2 stays the same
// between the 64-bit and 32-bit versions
assert_different_registers(crc, buf, len, d, g, h);
BLOCK_COMMENT("Entry:");
__ enter(); // required for proper stackwalking of RuntimeStub frame
Address crc_arg(rsp, 4 + 4 + 0); // ESP + 4 (return address) plus
// an additional 4 because __ enter
// has just pushed ebp onto the stack
Address buf_arg(rsp, 4 + 4 + 4);
Address len_arg(rsp, 4 + 4 + 8);
// Load up:
__ movl(crc, crc_arg);
__ movl(buf, buf_arg);
__ movl(len, len_arg);
__ push(d);
__ push(g);
__ push(h);
__ crc32c_ipl_alg2_alt2(crc, buf, len,
d, g, h,
empty, empty, empty,
xmm0, xmm1, xmm2,
is_pclmulqdq_supported);
__ pop(h);
__ pop(g);
__ pop(d);
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
return start;
}
// Safefetch stubs.
void generate_safefetch(const char* name, int size, address* entry,
address* fault_pc, address* continuation_pc) {
@ -3204,6 +3261,13 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
}
if (UseCRC32CIntrinsics) {
bool supports_clmul = VM_Version::supports_clmul();
StubRoutines::x86::generate_CRC32C_table(supports_clmul);
StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
}
}

View File

@ -3958,6 +3958,64 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
/**
* Arguments:
*
* Inputs:
* c_rarg0 - int crc
* c_rarg1 - byte* buf
* c_rarg2 - long length
* c_rarg3 - table_start - optional (present only when doing a library_call,
* not used by x86 algorithm)
*
* Output:
* rax - int crc result
*/
address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
assert(UseCRC32CIntrinsics, "need SSE4_2");
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
address start = __ pc();
//reg.arg int#0 int#1 int#2 int#3 int#4 int#5 float regs
//Windows RCX RDX R8 R9 none none XMM0..XMM3
//Lin / Sol RDI RSI RDX RCX R8 R9 XMM0..XMM7
const Register crc = c_rarg0; // crc
const Register buf = c_rarg1; // source java byte array address
const Register len = c_rarg2; // length
const Register a = rax;
const Register j = r9;
const Register k = r10;
const Register l = r11;
#ifdef _WIN64
const Register y = rdi;
const Register z = rsi;
#else
const Register y = rcx;
const Register z = r8;
#endif
assert_different_registers(crc, buf, len, a, j, k, l, y, z);
BLOCK_COMMENT("Entry:");
__ enter(); // required for proper stackwalking of RuntimeStub frame
#ifdef _WIN64
__ push(y);
__ push(z);
#endif
__ crc32c_ipl_alg2_alt2(crc, buf, len,
a, j, k,
l, y, z,
c_farg0, c_farg1, c_farg2,
is_pclmulqdq_supported);
__ movl(rax, crc);
#ifdef _WIN64
__ pop(z);
__ pop(y);
#endif
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
return start;
}
/**
* Arguments:
@ -4302,6 +4360,13 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
}
if (UseCRC32CIntrinsics) {
bool supports_clmul = VM_Version::supports_clmul();
StubRoutines::x86::generate_CRC32C_table(supports_clmul);
StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
}
}
void generate_all() {

View File

@ -27,6 +27,7 @@
#include "runtime/frame.inline.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "crc32c.h"
// Implementation of the platform-specific part of StubRoutines - for
// a description of how to extend it, see the stubRoutines.hpp file.
@ -130,3 +131,107 @@ juint StubRoutines::x86::_crc_table[] =
0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL,
0x2d02ef8dUL
};
#define D 32
#define P 0x82F63B78 // Reflection of Castagnoli (0x11EDC6F41)
#define TILL_CYCLE 31
uint32_t _crc32c_pow_2k_table[TILL_CYCLE]; // because _crc32c_pow_2k_table[TILL_CYCLE == 31] == _crc32c_pow_2k_table[0]
// A. Kadatch and B. Jenkins / Everything we know about CRC but afraid to forget September 3, 2010 8
// Listing 1: Multiplication of normalized polynomials
// "a" and "b" occupy D least significant bits.
uint32_t crc32c_multiply(uint32_t a, uint32_t b) {
uint32_t product = 0;
uint32_t b_pow_x_table[D + 1]; // b_pow_x_table[k] = (b * x**k) mod P
b_pow_x_table[0] = b;
for (int k = 0; k < D; ++k) {
// If "a" has non-zero coefficient at x**k,/ add ((b * x**k) mod P) to the result.
if ((a & (uint64_t)(1 << (D - 1 - k))) != 0) product ^= b_pow_x_table[k];
// Compute b_pow_x_table[k+1] = (b ** x**(k+1)) mod P.
if (b_pow_x_table[k] & 1) {
// If degree of (b_pow_x_table[k] * x) is D, then
// degree of (b_pow_x_table[k] * x - P) is less than D.
b_pow_x_table[k + 1] = (b_pow_x_table[k] >> 1) ^ P;
}
else {
b_pow_x_table[k + 1] = b_pow_x_table[k] >> 1;
}
}
return product;
}
#undef D
#undef P
// A. Kadatch and B. Jenkins / Everything we know about CRC but afraid to forget September 3, 2010 9
void crc32c_init_pow_2k(void) {
// _crc32c_pow_2k_table[0] =
// x^(2^0) mod P(x) = x mod P(x) = x
// Since we are operating on reflected values
// x = 10b, reflect(x) = 0x40000000
_crc32c_pow_2k_table[0] = 0x40000000;
for (int k = 1; k < TILL_CYCLE; k++) {
// _crc32c_pow_2k_table[k] = _crc32c_pow_2k_table[k-1]^2 mod P(x)
uint32_t tmp = _crc32c_pow_2k_table[k - 1];
_crc32c_pow_2k_table[k] = crc32c_multiply(tmp, tmp);
}
}
// x^N mod P(x)
uint32_t crc32c_f_pow_n(uint32_t n) {
// result = 1 (polynomial)
uint32_t one, result = 0x80000000, i = 0;
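// Iterate over the bits of n; for each set bit multiply the result by x^(2^i) mod P(x),
// i.e. square-and-multiply using the precomputed _crc32c_pow_2k_table.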
while (one = (n & 1), (n == 1 || n - one > 0)) {
if (one) {
result = crc32c_multiply(result, _crc32c_pow_2k_table[i]);
}
n >>= 1;
i++;
}
return result;
}
juint *StubRoutines::x86::_crc32c_table;
void StubRoutines::x86::generate_CRC32C_table(bool is_pclmulqdq_table_supported) {
static juint pow_n[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
crc32c_init_pow_2k();
pow_n[0] = crc32c_f_pow_n(CRC32C_HIGH * 8); // 8N * 8 = 64N
pow_n[1] = crc32c_f_pow_n(CRC32C_HIGH * 8 * 2); // 128N
pow_n[2] = crc32c_f_pow_n(CRC32C_MIDDLE * 8);
pow_n[3] = crc32c_f_pow_n(CRC32C_MIDDLE * 8 * 2);
pow_n[4] = crc32c_f_pow_n(CRC32C_LOW * 8);
pow_n[CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1] =
crc32c_f_pow_n(CRC32C_LOW * 8 * 2);
if (is_pclmulqdq_table_supported) {
_crc32c_table = pow_n;
} else {
static julong pclmulqdq_table[CRC32C_NUM_PRECOMPUTED_CONSTANTS * 256];
for (int j = 0; j < CRC32C_NUM_PRECOMPUTED_CONSTANTS; j++) {
juint X_CONST = pow_n[j];
for (int64_t i = 0; i < 256; i++) { // to force 64 bit wide computations
// S. Gueron / Information Processing Letters 112 (2012) 184
// Algorithm 3: Generating a carry-less multiplication lookup table.
// Input: A 32-bit constant, X_CONST.
// Output: A table of 256 entries, each one is a 64-bit quadword,
// that can be used for computing "byte" * X_CONST, for a given byte.
pclmulqdq_table[j * 256 + i] =
((i & 1) * X_CONST) ^ ((i & 2) * X_CONST) ^ ((i & 4) * X_CONST) ^
((i & 8) * X_CONST) ^ ((i & 16) * X_CONST) ^ ((i & 32) * X_CONST) ^
((i & 64) * X_CONST) ^ ((i & 128) * X_CONST);
}
}
_crc32c_table = (juint*)pclmulqdq_table;
}
}
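A worked example of the Algorithm 3 formula above (editorial): for X_CONST = 0x5 and i = 3 the entry is (1 * 0x5) ^ (2 * 0x5) = 0x5 ^ 0xA = 0xF, which is exactly the carry-less product of 3 and 0x5; so at run time pclmulqdq_table[j * 256 + b] yields "byte b" * X_CONST, as consumed by crc32c_ipl_alg4 in macroAssembler_x86.cpp.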

View File

@ -36,6 +36,8 @@
// masks and table for CRC32
static uint64_t _crc_by128_masks[];
static juint _crc_table[];
// table for CRC32C
static juint* _crc32c_table;
// swap mask for ghash
static address _ghash_long_swap_mask_addr;
static address _ghash_byte_swap_mask_addr;
@ -46,5 +48,6 @@
static address crc_by128_masks_addr() { return (address)_crc_by128_masks; }
static address ghash_long_swap_mask_addr() { return _ghash_long_swap_mask_addr; }
static address ghash_byte_swap_mask_addr() { return _ghash_byte_swap_mask_addr; }
static void generate_CRC32C_table(bool is_pclmulqdq_supported);
#endif // CPU_X86_VM_STUBROUTINES_X86_32_HPP

View File

@ -790,18 +790,25 @@ address InterpreterGenerator::generate_CRC32_updateBytes_entry(AbstractInterpret
const Register buf = rdx; // source java byte array address
const Register len = rdi; // length
// value x86_32
// interp. arg ptr ESP + 4
// int java.util.zip.CRC32.updateBytes(int crc, byte[] b, int off, int len)
// 3 2 1 0
// int java.util.zip.CRC32.updateByteBuffer(int crc, long buf, int off, int len)
// 4 2,3 1 0
// Arguments are reversed on java expression stack
__ movl(len, Address(rsp, wordSize)); // Length
__ movl(len, Address(rsp, 4 + 0)); // Length
// Calculate address of start element
if (kind == Interpreter::java_util_zip_CRC32_updateByteBuffer) {
__ movptr(buf, Address(rsp, 3*wordSize)); // long buf
__ addptr(buf, Address(rsp, 2*wordSize)); // + offset
__ movl(crc, Address(rsp, 5*wordSize)); // Initial CRC
__ movptr(buf, Address(rsp, 4 + 2 * wordSize)); // long buf
__ addptr(buf, Address(rsp, 4 + 1 * wordSize)); // + offset
__ movl(crc, Address(rsp, 4 + 4 * wordSize)); // Initial CRC
} else {
__ movptr(buf, Address(rsp, 3*wordSize)); // byte[] array
__ movptr(buf, Address(rsp, 4 + 2 * wordSize)); // byte[] array
__ addptr(buf, arrayOopDesc::base_offset_in_bytes(T_BYTE)); // + header size
__ addptr(buf, Address(rsp, 2*wordSize)); // + offset
__ movl(crc, Address(rsp, 4*wordSize)); // Initial CRC
__ addptr(buf, Address(rsp, 4 + 1 * wordSize)); // + offset
__ movl(crc, Address(rsp, 4 + 3 * wordSize)); // Initial CRC
}
__ super_call_VM_leaf(CAST_FROM_FN_PTR(address, StubRoutines::updateBytesCRC32()), crc, buf, len);
@ -822,6 +829,53 @@ address InterpreterGenerator::generate_CRC32_updateBytes_entry(AbstractInterpret
return generate_native_entry(false);
}
/**
* Method entry for static native methods:
* int java.util.zip.CRC32C.updateBytes(int crc, byte[] b, int off, int end)
* int java.util.zip.CRC32C.updateByteBuffer(int crc, long address, int off, int end)
*/
address InterpreterGenerator::generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) {
if (UseCRC32CIntrinsics) {
address entry = __ pc();
// Load parameters
const Register crc = rax; // crc
const Register buf = rcx; // source java byte array address
const Register len = rdx; // length
const Register end = len;
// value x86_32
// interp. arg ptr ESP + 4
// int java.util.zip.CRC32C.updateBytes(int crc, byte[] b, int off, int end)
// 3 2 1 0
// int java.util.zip.CRC32C.updateByteBuffer(int crc, long address, int off, int end)
// 4 2,3 1 0
// Arguments are reversed on java expression stack
__ movl(end, Address(rsp, 4 + 0)); // end
__ subl(len, Address(rsp, 4 + 1 * wordSize)); // end - offset == length
// Calculate address of start element
if (kind == Interpreter::java_util_zip_CRC32C_updateDirectByteBuffer) {
__ movptr(buf, Address(rsp, 4 + 2 * wordSize)); // long address
__ addptr(buf, Address(rsp, 4 + 1 * wordSize)); // + offset
__ movl(crc, Address(rsp, 4 + 4 * wordSize)); // Initial CRC
} else {
__ movptr(buf, Address(rsp, 4 + 2 * wordSize)); // byte[] array
__ addptr(buf, arrayOopDesc::base_offset_in_bytes(T_BYTE)); // + header size
__ addptr(buf, Address(rsp, 4 + 1 * wordSize)); // + offset
__ movl(crc, Address(rsp, 4 + 3 * wordSize)); // Initial CRC
}
__ super_call_VM_leaf(CAST_FROM_FN_PTR(address, StubRoutines::updateBytesCRC32C()), crc, buf, len);
// result in rax
// _areturn
__ pop(rdi); // get return address
__ mov(rsp, rsi); // set sp to sender sp
__ jmp(rdi);
return entry;
}
return generate_native_entry(false);
}
/**
* Method entry for static native method:
* java.lang.Float.intBitsToFloat(int bits)

View File

@ -804,6 +804,57 @@ address InterpreterGenerator::generate_CRC32_updateBytes_entry(AbstractInterpret
return generate_native_entry(false);
}
/**
* Method entry for static native methods:
* int java.util.zip.CRC32C.updateBytes(int crc, byte[] b, int off, int end)
* int java.util.zip.CRC32C.updateByteBuffer(int crc, long address, int off, int end)
*/
address InterpreterGenerator::generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) {
if (UseCRC32CIntrinsics) {
address entry = __ pc();
// Load parameters
const Register crc = c_rarg0; // crc
const Register buf = c_rarg1; // source java byte array address
const Register len = c_rarg2;
const Register off = c_rarg3; // offset
const Register end = len;
// Arguments are reversed on java expression stack
// Calculate address of start element
if (kind == Interpreter::java_util_zip_CRC32C_updateDirectByteBuffer) {
__ movptr(buf, Address(rsp, 3 * wordSize)); // long buf
__ movl2ptr(off, Address(rsp, 2 * wordSize)); // offset
__ addq(buf, off); // + offset
__ movl(crc, Address(rsp, 5 * wordSize)); // Initial CRC
// Note on 5 * wordSize vs. 4 * wordSize:
// * int java.util.zip.CRC32C.updateByteBuffer(int crc, long address, int off, int end)
// 4 2,3 1 0
// end starts at SP + 8
// The Java(R) Virtual Machine Specification Java SE 7 Edition
// 4.10.2.3. Values of Types long and double
// "When calculating operand stack length, values of type long and double have length two."
} else {
__ movptr(buf, Address(rsp, 3 * wordSize)); // byte[] array
__ addptr(buf, arrayOopDesc::base_offset_in_bytes(T_BYTE)); // + header size
__ movl2ptr(off, Address(rsp, 2 * wordSize)); // offset
__ addq(buf, off); // + offset
__ movl(crc, Address(rsp, 4 * wordSize)); // Initial CRC
}
__ movl(end, Address(rsp, wordSize)); // end
__ subl(end, off); // end - off
__ super_call_VM_leaf(CAST_FROM_FN_PTR(address, StubRoutines::updateBytesCRC32C()), crc, buf, len);
// result in rax
// _areturn
__ pop(rdi); // get return address
__ mov(rsp, r13); // set sp to sender sp
__ jmp(rdi);
return entry;
}
return generate_native_entry(false);
}
// Interpreter stub for calling a native method. (asm interpreter)
// This sets up a somewhat different looking stack for calling the
// native method than the typical interpreter frame setup.

View File

@ -661,6 +661,18 @@ void VM_Version::get_processor_features() {
FLAG_SET_DEFAULT(UseCRC32Intrinsics, false);
}
if (supports_sse4_2()) {
if (FLAG_IS_DEFAULT(UseCRC32CIntrinsics)) {
UseCRC32CIntrinsics = true;
}
}
else if (UseCRC32CIntrinsics) {
if (!FLAG_IS_DEFAULT(UseCRC32CIntrinsics)) {
warning("CRC32C intrinsics are not available on this CPU");
}
FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
}
// The AES intrinsic stubs require AES instruction support (of course)
// but also require sse3 mode for instructions it use.
if (UseAES && (UseSSE > 2)) {
@ -704,12 +716,6 @@ void VM_Version::get_processor_features() {
FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
}
if (UseCRC32CIntrinsics) {
if (!FLAG_IS_DEFAULT(UseCRC32CIntrinsics))
warning("CRC32C intrinsics are not available on this CPU");
FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
}
if (UseAdler32Intrinsics) {
warning("Adler32Intrinsics not available on this CPU.");
FLAG_SET_DEFAULT(UseAdler32Intrinsics, false);

View File

@ -42,4 +42,5 @@
// Not supported
address generate_CRC32_update_entry() { return NULL; }
address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; }
address generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; }
#endif // CPU_ZERO_VM_INTERPRETERGENERATOR_ZERO_HPP

View File

@ -90,6 +90,8 @@ class AbstractInterpreter: AllStatic {
java_util_zip_CRC32_update, // implementation of java.util.zip.CRC32.update()
java_util_zip_CRC32_updateBytes, // implementation of java.util.zip.CRC32.updateBytes()
java_util_zip_CRC32_updateByteBuffer, // implementation of java.util.zip.CRC32.updateByteBuffer()
java_util_zip_CRC32C_updateBytes, // implementation of java.util.zip.CRC32C.updateBytes(crc, b[], off, end)
java_util_zip_CRC32C_updateDirectByteBuffer, // implementation of java.util.zip.CRC32C.updateDirectByteBuffer(crc, address, off, end)
java_lang_Float_intBitsToFloat, // implementation of java.lang.Float.intBitsToFloat()
java_lang_Float_floatToRawIntBits, // implementation of java.lang.Float.floatToRawIntBits()
java_lang_Double_longBitsToDouble, // implementation of java.lang.Double.longBitsToDouble()

View File

@ -234,6 +234,13 @@ AbstractInterpreter::MethodKind AbstractInterpreter::method_kind(methodHandle m)
case vmIntrinsics::_updateByteBufferCRC32 : return java_util_zip_CRC32_updateByteBuffer;
}
}
if (UseCRC32CIntrinsics) {
// Use optimized stub code for CRC32C methods.
switch (m->intrinsic_id()) {
case vmIntrinsics::_updateBytesCRC32C : return java_util_zip_CRC32C_updateBytes;
case vmIntrinsics::_updateDirectByteBufferCRC32C : return java_util_zip_CRC32C_updateDirectByteBuffer;
}
}
switch(m->intrinsic_id()) {
case vmIntrinsics::_intBitsToFloat: return java_lang_Float_intBitsToFloat;
@ -349,6 +356,8 @@ void AbstractInterpreter::print_method_kind(MethodKind kind) {
case java_util_zip_CRC32_update : tty->print("java_util_zip_CRC32_update"); break;
case java_util_zip_CRC32_updateBytes : tty->print("java_util_zip_CRC32_updateBytes"); break;
case java_util_zip_CRC32_updateByteBuffer : tty->print("java_util_zip_CRC32_updateByteBuffer"); break;
case java_util_zip_CRC32C_updateBytes : tty->print("java_util_zip_CRC32C_updateBytes"); break;
case java_util_zip_CRC32C_updateDirectByteBuffer: tty->print("java_util_zip_CRC32C_updateDirectByteByffer"); break;
default:
if (kind >= method_handle_invoke_FIRST &&
kind <= method_handle_invoke_LAST) {
@ -567,6 +576,10 @@ address InterpreterGenerator::generate_method_entry(
: // fall thru
case Interpreter::java_util_zip_CRC32_updateByteBuffer
: entry_point = generate_CRC32_updateBytes_entry(kind); break;
case Interpreter::java_util_zip_CRC32C_updateBytes
: // fall thru
case Interpreter::java_util_zip_CRC32C_updateDirectByteBuffer
: entry_point = generate_CRC32C_updateBytes_entry(kind); break;
#if defined(TARGET_ARCH_x86) && !defined(_LP64)
// On x86_32 platforms, a special entry is generated for the following four methods.
// On other platforms the normal entry is used to enter these methods.
@ -582,9 +595,9 @@ address InterpreterGenerator::generate_method_entry(
case Interpreter::java_lang_Float_intBitsToFloat:
case Interpreter::java_lang_Float_floatToRawIntBits:
case Interpreter::java_lang_Double_longBitsToDouble:
case Interpreter::java_lang_Double_doubleToRawLongBits:
entry_point = generate_native_entry(false);
break;
#endif // defined(TARGET_ARCH_x86) && !defined(_LP64)
#endif // CC_INTERP
default:

View File

@ -418,6 +418,11 @@ void TemplateInterpreterGenerator::generate_all() {
method_entry(java_util_zip_CRC32_updateByteBuffer)
}
if (UseCRC32CIntrinsics) {
method_entry(java_util_zip_CRC32C_updateBytes)
method_entry(java_util_zip_CRC32C_updateDirectByteBuffer)
}
method_entry(java_lang_Float_intBitsToFloat);
method_entry(java_lang_Float_floatToRawIntBits);
method_entry(java_lang_Double_longBitsToDouble);

View File

@ -136,8 +136,9 @@ address StubRoutines::_sha512_implCompress = NULL;
address StubRoutines::_sha512_implCompressMB = NULL;
address StubRoutines::_updateBytesCRC32 = NULL;
address StubRoutines::_crc_table_adr = NULL;
address StubRoutines::_crc32c_table_addr = NULL;
address StubRoutines::_updateBytesCRC32C = NULL;
address StubRoutines::_updateBytesAdler32 = NULL;

View File

@ -197,6 +197,7 @@ class StubRoutines: AllStatic {
static address _updateBytesCRC32;
static address _crc_table_adr;
static address _crc32c_table_addr;
static address _updateBytesCRC32C;
static address _updateBytesAdler32;
@ -364,6 +365,7 @@ class StubRoutines: AllStatic {
static address updateBytesCRC32() { return _updateBytesCRC32; }
static address crc_table_addr() { return _crc_table_adr; }
static address crc32c_table_addr() { return _crc32c_table_addr; }
static address updateBytesCRC32C() { return _updateBytesCRC32C; }
static address updateBytesAdler32() { return _updateBytesAdler32; }

View File

@ -832,6 +832,7 @@ typedef CompactHashtable<Symbol*, char> SymbolCompactHashTable;
static_field(StubRoutines, _ghash_processBlocks, address) \
static_field(StubRoutines, _updateBytesCRC32, address) \
static_field(StubRoutines, _crc_table_adr, address) \
static_field(StubRoutines, _crc32c_table_addr, address) \
static_field(StubRoutines, _updateBytesCRC32C, address) \
static_field(StubRoutines, _multiplyToLen, address) \
static_field(StubRoutines, _squareToLen, address) \