8275047: Optimize existing fill stubs for AVX-512 target

Reviewed-by: kvn, redestad
Jatin Bhateja 2021-10-26 12:34:56 +00:00
parent 63e0f344e9
commit 4be88d5482
6 changed files with 262 additions and 43 deletions

src/hotspot/cpu/x86/macroAssembler_x86.cpp

@@ -5026,7 +5026,7 @@ void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, X
   BIND(L_loop);
   if (MaxVectorSize >= 32) {
-    fill64_avx(base, 0, xtmp, use64byteVector);
+    fill64(base, 0, xtmp, use64byteVector);
   } else {
     movdqu(Address(base, 0), xtmp);
     movdqu(Address(base, 16), xtmp);
@@ -5043,7 +5043,7 @@ void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, X
   if (use64byteVector) {
     addptr(cnt, 8);
     jccb(Assembler::equal, L_end);
-    fill64_masked_avx(3, base, 0, xtmp, mask, cnt, rtmp, true);
+    fill64_masked(3, base, 0, xtmp, mask, cnt, rtmp, true);
     jmp(L_end);
   } else {
     addptr(cnt, 4);
@@ -5062,7 +5062,7 @@ void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, X
   addptr(cnt, 4);
   jccb(Assembler::lessEqual, L_end);
   if (UseAVX > 2 && MaxVectorSize >= 32 && VM_Version::supports_avx512vl()) {
-    fill32_masked_avx(3, base, 0, xtmp, mask, cnt, rtmp);
+    fill32_masked(3, base, 0, xtmp, mask, cnt, rtmp);
   } else {
     decrement(cnt);
@@ -5086,7 +5086,7 @@ void MacroAssembler::clear_mem(Register base, int cnt, Register rtmp, XMMRegiste
   // 64 byte initialization loop.
   vpxor(xtmp, xtmp, xtmp, use64byteVector ? AVX_512bit : AVX_256bit);
   for (int i = 0; i < vector64_count; i++) {
-    fill64_avx(base, i * 64, xtmp, use64byteVector);
+    fill64(base, i * 64, xtmp, use64byteVector);
   }
   // Clear remaining 64 byte tail.
@@ -5207,6 +5207,15 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
   Label L_exit;
   Label L_fill_2_bytes, L_fill_4_bytes;
 
+#if defined(COMPILER2) && defined(_LP64)
+  if(MaxVectorSize >=32 &&
+     VM_Version::supports_avx512vlbw() &&
+     VM_Version::supports_bmi2()) {
+    generate_fill_avx3(t, to, value, count, rtmp, xtmp);
+    return;
+  }
+#endif
+
   int shift = -1;
   switch (t) {
     case T_BYTE:
@@ -5427,6 +5436,30 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
   BIND(L_exit);
 }
 
+void MacroAssembler::evpbroadcast(BasicType type, XMMRegister dst, Register src, int vector_len) {
+  switch(type) {
+    case T_BYTE:
+    case T_BOOLEAN:
+      evpbroadcastb(dst, src, vector_len);
+      break;
+    case T_SHORT:
+    case T_CHAR:
+      evpbroadcastw(dst, src, vector_len);
+      break;
+    case T_INT:
+    case T_FLOAT:
+      evpbroadcastd(dst, src, vector_len);
+      break;
+    case T_LONG:
+    case T_DOUBLE:
+      evpbroadcastq(dst, src, vector_len);
+      break;
+    default:
+      fatal("Unhandled type : %s", type2name(type));
+      break;
+  }
+}
+
 // encode char[] to byte[] in ISO_8859_1 or ASCII
 //@IntrinsicCandidate
 //private static int implEncodeISOArray(byte[] sa, int sp,
@@ -8236,59 +8269,234 @@ void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMR
 #if COMPILER2_OR_JVMCI
 
+void MacroAssembler::fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask,
+                                 Register length, Register temp, int vec_enc) {
+  // Computing mask for predicated vector store.
+  movptr(temp, -1);
+  bzhiq(temp, temp, length);
+  kmov(mask, temp);
+  evmovdqu(bt, mask, dst, xmm, vec_enc);
+}
+
 // Set memory operation for length "less than" 64 bytes.
-void MacroAssembler::fill64_masked_avx(uint shift, Register dst, int disp,
+void MacroAssembler::fill64_masked(uint shift, Register dst, int disp,
                        XMMRegister xmm, KRegister mask, Register length,
                        Register temp, bool use64byteVector) {
   assert(MaxVectorSize >= 32, "vector length should be >= 32");
-  assert(shift != 0, "shift value should be 1 (short),2(int) or 3(long)");
   BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
   if (!use64byteVector) {
-    fill32_avx(dst, disp, xmm);
+    fill32(dst, disp, xmm);
     subptr(length, 32 >> shift);
-    fill32_masked_avx(shift, dst, disp + 32, xmm, mask, length, temp);
+    fill32_masked(shift, dst, disp + 32, xmm, mask, length, temp);
   } else {
     assert(MaxVectorSize == 64, "vector length != 64");
-    movl(temp, 1);
-    shlxl(temp, temp, length);
-    subptr(temp, 1);
-    kmovwl(mask, temp);
-    evmovdqu(type[shift], mask, Address(dst, disp), xmm, Assembler::AVX_512bit);
+    fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_512bit);
   }
 }
 
-void MacroAssembler::fill32_masked_avx(uint shift, Register dst, int disp,
+void MacroAssembler::fill32_masked(uint shift, Register dst, int disp,
                        XMMRegister xmm, KRegister mask, Register length,
                        Register temp) {
   assert(MaxVectorSize >= 32, "vector length should be >= 32");
-  assert(shift != 0, "shift value should be 1 (short), 2(int) or 3(long)");
   BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
-  movl(temp, 1);
-  shlxl(temp, temp, length);
-  subptr(temp, 1);
-  kmovwl(mask, temp);
-  evmovdqu(type[shift], mask, Address(dst, disp), xmm, Assembler::AVX_256bit);
+  fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_256bit);
 }
 
-void MacroAssembler::fill32_avx(Register dst, int disp, XMMRegister xmm) {
+void MacroAssembler::fill32(Register dst, int disp, XMMRegister xmm) {
   assert(MaxVectorSize >= 32, "vector length should be >= 32");
   vmovdqu(Address(dst, disp), xmm);
 }
 
-void MacroAssembler::fill64_avx(Register dst, int disp, XMMRegister xmm, bool use64byteVector) {
+void MacroAssembler::fill64(Register dst, int disp, XMMRegister xmm, bool use64byteVector) {
   assert(MaxVectorSize >= 32, "vector length should be >= 32");
   BasicType type[] = {T_BYTE, T_SHORT, T_INT, T_LONG};
   if (!use64byteVector) {
-    fill32_avx(dst, disp, xmm);
-    fill32_avx(dst, disp + 32, xmm);
+    fill32(dst, disp, xmm);
+    fill32(dst, disp + 32, xmm);
   } else {
     evmovdquq(Address(dst, disp), xmm, Assembler::AVX_512bit);
   }
 }
+
+#ifdef _LP64
+void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register value,
+                                        Register count, Register rtmp, XMMRegister xtmp) {
+  Label L_exit;
+  Label L_fill_start;
+  Label L_fill_64_bytes;
+  Label L_fill_96_bytes;
+  Label L_fill_128_bytes;
+  Label L_fill_128_bytes_loop;
+  Label L_fill_128_loop_header;
+  Label L_fill_128_bytes_loop_header;
+  Label L_fill_128_bytes_loop_pre_header;
+  Label L_fill_zmm_sequence;
+
+  int shift = -1;
+  switch(type) {
+    case T_BYTE:  shift = 0;
+      break;
+    case T_SHORT: shift = 1;
+      break;
+    case T_INT:   shift = 2;
+      break;
+    /* Uncomment when LONG fill stubs are supported.
+    case T_LONG:  shift = 3;
+      break;
+    */
+    default:
+      fatal("Unhandled type: %s\n", type2name(type));
+  }
+
+  if (AVX3Threshold != 0 || MaxVectorSize == 32) {
+
+    if (MaxVectorSize == 64) {
+      cmpq(count, AVX3Threshold >> shift);
+      jcc(Assembler::greater, L_fill_zmm_sequence);
+    }
+
+    evpbroadcast(type, xtmp, value, Assembler::AVX_256bit);
+
+    bind(L_fill_start);
+
+    cmpq(count, 32 >> shift);
+    jccb(Assembler::greater, L_fill_64_bytes);
+    fill32_masked(shift, to, 0, xtmp, k2, count, rtmp);
+    jmp(L_exit);
+
+    bind(L_fill_64_bytes);
+    cmpq(count, 64 >> shift);
+    jccb(Assembler::greater, L_fill_96_bytes);
+    fill64_masked(shift, to, 0, xtmp, k2, count, rtmp);
+    jmp(L_exit);
+
+    bind(L_fill_96_bytes);
+    cmpq(count, 96 >> shift);
+    jccb(Assembler::greater, L_fill_128_bytes);
+    fill64(to, 0, xtmp);
+    subq(count, 64 >> shift);
+    fill32_masked(shift, to, 64, xtmp, k2, count, rtmp);
+    jmp(L_exit);
+
+    bind(L_fill_128_bytes);
+    cmpq(count, 128 >> shift);
+    jccb(Assembler::greater, L_fill_128_bytes_loop_pre_header);
+    fill64(to, 0, xtmp);
+    fill32(to, 64, xtmp);
+    subq(count, 96 >> shift);
+    fill32_masked(shift, to, 96, xtmp, k2, count, rtmp);
+    jmp(L_exit);
+
+    bind(L_fill_128_bytes_loop_pre_header);
+    {
+      mov(rtmp, to);
+      andq(rtmp, 31);
+      jccb(Assembler::zero, L_fill_128_bytes_loop_header);
+      negq(rtmp);
+      addq(rtmp, 32);
+      mov64(r8, -1L);
+      bzhiq(r8, r8, rtmp);
+      kmovql(k2, r8);
+      evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, Assembler::AVX_256bit);
+      addq(to, rtmp);
+      shrq(rtmp, shift);
+      subq(count, rtmp);
+    }
+
+    cmpq(count, 128 >> shift);
+    jcc(Assembler::less, L_fill_start);
+
+    bind(L_fill_128_bytes_loop_header);
+    subq(count, 128 >> shift);
+
+    align32();
+    bind(L_fill_128_bytes_loop);
+    fill64(to, 0, xtmp);
+    fill64(to, 64, xtmp);
+    addq(to, 128);
+    subq(count, 128 >> shift);
+    jccb(Assembler::greaterEqual, L_fill_128_bytes_loop);
+
+    addq(count, 128 >> shift);
+    jcc(Assembler::zero, L_exit);
+    jmp(L_fill_start);
+  }
+
+  if (MaxVectorSize == 64) {
+    // Sequence using 64 byte ZMM register.
+    Label L_fill_128_bytes_zmm;
+    Label L_fill_192_bytes_zmm;
+    Label L_fill_192_bytes_loop_zmm;
+    Label L_fill_192_bytes_loop_header_zmm;
+    Label L_fill_192_bytes_loop_pre_header_zmm;
+    Label L_fill_start_zmm_sequence;
+
+    bind(L_fill_zmm_sequence);
+    evpbroadcast(type, xtmp, value, Assembler::AVX_512bit);
+
+    bind(L_fill_start_zmm_sequence);
+
+    cmpq(count, 64 >> shift);
+    jccb(Assembler::greater, L_fill_128_bytes_zmm);
+    fill64_masked(shift, to, 0, xtmp, k2, count, rtmp, true);
+    jmp(L_exit);
+
+    bind(L_fill_128_bytes_zmm);
+    cmpq(count, 128 >> shift);
+    jccb(Assembler::greater, L_fill_192_bytes_zmm);
+    fill64(to, 0, xtmp, true);
+    subq(count, 64 >> shift);
+    fill64_masked(shift, to, 64, xtmp, k2, count, rtmp, true);
+    jmp(L_exit);
+
+    bind(L_fill_192_bytes_zmm);
+    cmpq(count, 192 >> shift);
+    jccb(Assembler::greater, L_fill_192_bytes_loop_pre_header_zmm);
+    fill64(to, 0, xtmp, true);
+    fill64(to, 64, xtmp, true);
+    subq(count, 128 >> shift);
+    fill64_masked(shift, to, 128, xtmp, k2, count, rtmp, true);
+    jmp(L_exit);
+
+    bind(L_fill_192_bytes_loop_pre_header_zmm);
+    {
+      movq(rtmp, to);
+      andq(rtmp, 63);
+      jccb(Assembler::zero, L_fill_192_bytes_loop_header_zmm);
+      negq(rtmp);
+      addq(rtmp, 64);
+      mov64(r8, -1L);
+      bzhiq(r8, r8, rtmp);
+      kmovql(k2, r8);
+      evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, Assembler::AVX_512bit);
+      addq(to, rtmp);
+      shrq(rtmp, shift);
+      subq(count, rtmp);
+    }
+
+    cmpq(count, 192 >> shift);
+    jcc(Assembler::less, L_fill_start_zmm_sequence);
+
+    bind(L_fill_192_bytes_loop_header_zmm);
+    subq(count, 192 >> shift);
+
+    align32();
+    bind(L_fill_192_bytes_loop_zmm);
+    fill64(to, 0, xtmp, true);
+    fill64(to, 64, xtmp, true);
+    fill64(to, 128, xtmp, true);
+    addq(to, 192);
+    subq(count, 192 >> shift);
+    jccb(Assembler::greaterEqual, L_fill_192_bytes_loop_zmm);
+
+    addq(count, 192 >> shift);
+    jcc(Assembler::zero, L_exit);
+    jmp(L_fill_start_zmm_sequence);
+  }
+  bind(L_exit);
+}
+#endif
 #endif //COMPILER2_OR_JVMCI
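
Note: the masked tail store added above (MacroAssembler::fill_masked) is the heart of the new stubs. BZHI truncates an all-ones word to `length` low bits, that word becomes an AVX-512 opmask, and a predicated store writes only the first `length` lanes, so the tail of a fill never writes past the requested count. The following is a minimal standalone sketch of the same idea in C++ AVX-512 intrinsics; the function name and parameters are illustrative only and are not part of this commit. It assumes AVX512BW, AVX512VL, and BMI2 (e.g. compile with -mavx512bw -mavx512vl -mbmi2):

// Sketch of the BZHI + opmask tail-store technique used by fill_masked.
#include <immintrin.h>
#include <stdint.h>

// Fill the first len bytes (len < 32) of dst with value, using one
// predicated 256-bit store that never touches dst[len..31].
static void fill_tail_32(uint8_t* dst, uint8_t value, uint64_t len) {
  __m256i v = _mm256_set1_epi8((char)value);          // broadcast, like evpbroadcastb
  uint64_t bits = _bzhi_u64(~0ULL, (unsigned)len);    // all-ones truncated to len bits
  __mmask32 k = (__mmask32)bits;                      // one mask bit per byte lane
  _mm256_mask_storeu_epi8(dst, k, v);                 // masked-out lanes are not written
}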

src/hotspot/cpu/x86/macroAssembler_x86.hpp

@@ -1305,6 +1305,7 @@ public:
   void evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
                int comparison, bool is_signed, int vector_len, Register scratch_reg);
+  void evpbroadcast(BasicType type, XMMRegister dst, Register src, int vector_len);
 
   // Emit comparison instruction for the specified comparison predicate.
   void vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, ComparisonPredicate cond, Width width, int vector_len, Register scratch_reg);
@@ -1838,17 +1839,20 @@ public:
   void byte_array_inflate(Register src, Register dst, Register len,
                           XMMRegister tmp1, Register tmp2, KRegister mask = knoreg);
 
-  void fill64_masked_avx(uint shift, Register dst, int disp,
+  void fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask,
+                   Register length, Register temp, int vec_enc);
+
+  void fill64_masked(uint shift, Register dst, int disp,
                          XMMRegister xmm, KRegister mask, Register length,
                          Register temp, bool use64byteVector = false);
 
-  void fill32_masked_avx(uint shift, Register dst, int disp,
+  void fill32_masked(uint shift, Register dst, int disp,
                          XMMRegister xmm, KRegister mask, Register length,
                          Register temp);
 
-  void fill32_avx(Register dst, int disp, XMMRegister xmm);
+  void fill32(Register dst, int disp, XMMRegister xmm);
 
-  void fill64_avx(Register dst, int dis, XMMRegister xmm, bool use64byteVector = false);
+  void fill64(Register dst, int dis, XMMRegister xmm, bool use64byteVector = false);
 
 #ifdef _LP64
   void convert_f2i(Register dst, XMMRegister src);

@@ -1885,6 +1889,10 @@ public:
   void copy64_avx(Register dst, Register src, Register index, XMMRegister xmm,
                   bool conjoint, int shift = Address::times_1, int offset = 0,
                   bool use64byteVector = false);
+
+  void generate_fill_avx3(BasicType type, Register to, Register value,
+                          Register count, Register rtmp, XMMRegister xtmp);
+
 #endif // COMPILER2_OR_JVMCI
 #endif // _LP64

src/hotspot/cpu/x86/stubGenerator_x86_64.cpp

@@ -2113,13 +2113,14 @@ class StubGenerator: public StubCodeGenerator {
     BLOCK_COMMENT("Entry:");
 
-    const Register to    = c_rarg0;  // source array address
+    const Register to    = c_rarg0;  // destination array address
     const Register value = c_rarg1;  // value
     const Register count = c_rarg2;  // elements count
+    __ mov(r11, count);
 
     __ enter(); // required for proper stackwalking of RuntimeStub frame
 
-    __ generate_fill(t, aligned, to, value, count, rax, xmm0);
+    __ generate_fill(t, aligned, to, value, r11, rax, xmm0);
 
     __ vzeroupper();
     __ leave(); // required for proper stackwalking of RuntimeStub frame

src/hotspot/cpu/x86/stubRoutines_x86.hpp

@@ -33,7 +33,7 @@ static bool returns_to_call_stub(address return_pc) { return return_pc == _call_
 enum platform_dependent_constants {
   code_size1 = 20000 LP64_ONLY(+10000), // simply increase if too small (assembler will crash if too small)
-  code_size2 = 35300 LP64_ONLY(+32000)  // simply increase if too small (assembler will crash if too small)
+  code_size2 = 35300 LP64_ONLY(+35000)  // simply increase if too small (assembler will crash if too small)
 };
 
 class x86 {

src/hotspot/cpu/x86/vm_version_x86.cpp

@@ -1469,6 +1469,14 @@ void VM_Version::get_processor_features() {
 #endif
   }
 
+#ifdef COMPILER2
+  if (FLAG_IS_DEFAULT(OptimizeFill)) {
+    if (MaxVectorSize < 32 || !VM_Version::supports_avx512vlbw()) {
+      OptimizeFill = false;
+    }
+  }
+#endif
+
 #ifdef _LP64
   if (UseSSE42Intrinsics) {
     if (FLAG_IS_DEFAULT(UseVectorizedMismatchIntrinsic)) {
@@ -1585,12 +1593,6 @@ void VM_Version::get_processor_features() {
     // Modern processors allow misaligned memory operations for vectors.
     AlignVector = !UseUnalignedLoadStores;
   }
-  if (FLAG_IS_DEFAULT(OptimizeFill)) {
-    // 8247307: On x86, the auto-vectorized loop array fill code shows
-    // better performance than the array fill stubs. We should reenable
-    // this after the x86 stubs get improved.
-    OptimizeFill = false;
-  }
 #endif // COMPILER2
 
   if (FLAG_IS_DEFAULT(AllocatePrefetchInstr)) {

test/micro/org/openjdk/bench/java/util/ArraysFill.java

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2018, 2021, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -36,12 +36,12 @@ import org.openjdk.jmh.annotations.Warmup;
 import java.util.Arrays;
 import java.util.concurrent.TimeUnit;
 
-@BenchmarkMode(Mode.AverageTime)
-@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@BenchmarkMode(Mode.Throughput)
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
 @State(Scope.Thread)
 public class ArraysFill {
 
-    @Param({"10", "266", "2048"})
+    @Param({"10", "16", "31", "59", "89", "126", "250", "266", "511", "1021", "2047", "2048", "4095", "8195"})
     public int size;
 
     public byte[] testByteArray;