8302358: Behavior of adler32 changes after JDK-8300208

Reviewed-by: kvn, jbhateja
This commit is contained in:
Sandhya Viswanathan 2023-02-17 21:31:42 +00:00
parent 86b9fce980
commit 7c60b9c98f
2 changed files with 13 additions and 12 deletions

View File

@ -110,7 +110,7 @@ address StubGenerator::generate_updateBytesAdler32() {
const XMMRegister xtmp4 = xmm9;
const XMMRegister xtmp5 = xmm10;
Label SLOOP1, SPRELOOP1A_AVX2, SLOOP1A_AVX2, SLOOP1A_AVX3, AVX3_REDUCE, SKIP_LOOP_1A;
Label SLOOP1, SLOOP1A_AVX2, SLOOP1A_AVX3, AVX3_REDUCE, SKIP_LOOP_1A;
Label SKIP_LOOP_1A_AVX3, FINISH, LT64, DO_FINAL, FINAL_LOOP, ZERO_SIZE, END;
__ enter(); // required for proper stackwalking of RuntimeStub frame
@ -133,6 +133,7 @@ address StubGenerator::generate_updateBytesAdler32() {
__ movdl(xa, init_d); //vmovd - 32bit
__ bind(SLOOP1);
__ vpxor(yb, yb, yb, VM_Version::supports_avx512vl() ? Assembler::AVX_512bit : Assembler::AVX_256bit);
__ movl(s, LIMIT);
__ cmpl(s, size);
__ cmovl(Assembler::above, s, size); // s = min(size, LIMIT)
@ -144,10 +145,8 @@ address StubGenerator::generate_updateBytesAdler32() {
if (VM_Version::supports_avx512vl()) {
// AVX2 performs better for smaller inputs because of its leaner post-loop reduction sequence.
__ cmpl(s, MAX2(128, VM_Version::avx3_threshold()));
__ jcc(Assembler::belowEqual, SPRELOOP1A_AVX2);
__ jcc(Assembler::belowEqual, SLOOP1A_AVX2);
__ lea(end, Address(s, data, Address::times_1, - (2*CHUNKSIZE -1)));
__ vpxor(yb, yb, yb, Assembler::AVX_512bit);
// Some notes on vectorized main loop algorithm.
// Additions are performed in slices of 16 bytes in the main loop.
@ -164,8 +163,8 @@ address StubGenerator::generate_updateBytesAdler32() {
// Since addition was performed in chunks of 16 bytes, to match the scalar implementation the
// 0th lane element must be repeatedly added 16 times, the 1st element 15 times, and so on.
// Thus we first multiply yb by 16 followed by subtracting appropriately scaled ya value.
// yb = 16 x yb - [0 - 15] x ya
// = 64 x [0 - 15] + 48 x [16 - 31] + 32 x [32 - 47] + 16 x [48 - 63] - [0 - 15] x ya
// yb = 16 x yb - [a0 - a15] x ya
// = 64 x [a0 - a15] + 48 x [a16 - a31] + 32 x [a32 - a47] + 16 x [a48 - a63] - [a0 - a15] x ya
// = 64 x a0 + 63 x a1 + 62 x a2 ...... + a63
__ bind(SLOOP1A_AVX3);
__ evpmovzxbd(ydata0, Address(data, 0), Assembler::AVX_512bit);
@ -220,8 +219,6 @@ address StubGenerator::generate_updateBytesAdler32() {
}
__ align32();
__ bind(SPRELOOP1A_AVX2);
__ vpxor(yb, yb, yb, Assembler::AVX_256bit);
__ bind(SLOOP1A_AVX2);
__ vbroadcastf128(ydata, Address(data, 0), Assembler::AVX_256bit);
__ addptr(data, CHUNKSIZE);

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2015, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -146,7 +146,7 @@ public class TestAdler32 {
if (adler0.getValue() != adler1.getValue()) {
System.err.printf("ERROR: adler0 = %08x, adler1 = %08x\n",
adler0.getValue(), adler1.getValue());
throw new AssertionError("TEST FAILED", null);
return false;
}
return true;
}
@ -166,6 +166,7 @@ public class TestAdler32 {
int len1 = 8; // the 8B/iteration loop
int len2 = 32; // the 32B/iteration loop
int len3 = 4096; // the 4KB/iteration loop
int len4 = 5552; // the adler limit
byte[] b = initializedBytes(len3*16, 0);
int[] offsets = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 64, 128, 256, 512 };
@ -176,6 +177,8 @@ public class TestAdler32 {
len2*2, len2*4, len2*8, len2*16, len2*32, len2*64,
len3, len3+1, len3+3, len3+5, len3+7,
len3*2, len3*4, len3*8,
len4, len4+1, len4+3, len4+5, len4+7, len4+len1, len4+len2, len4+len3,
len4*2, len4*4, len4*2+1, len4*4+4,
len1+len2, len1+len2+1, len1+len2+3, len1+len2+5, len1+len2+7,
len1+len3, len1+len3+1, len1+len3+3, len1+len3+5, len1+len3+7,
len2+len3, len2+len3+1, len2+len3+3, len2+len3+5, len2+len3+7,
@ -214,8 +217,9 @@ public class TestAdler32 {
for (i = 0; i < offsets.length; i++) {
for (j = 0; j < sizes.length; j++) {
if (!check(adler0[i*sizes.length + j], adler1[i*sizes.length + j])) {
System.out.printf("offsets[%d] = %d", i, offsets[i]);
System.out.printf("\tsizes[%d] = %d\n", j, sizes[j]);
System.out.printf("Failed at: offsets[%d] = %d", i, offsets[i]);
System.out.printf(", sizes[%d] = %d\n", j, sizes[j]);
throw new AssertionError("TEST FAILED", null);
}
}
}