8324874: AArch64: crypto pmull based CRC32/CRC32C intrinsics clobber V8-V15 registers
Reviewed-by: aph, ngasson
This commit is contained in:
parent
b02599d22e
commit
4cd318756d
src/hotspot/cpu/aarch64
test/hotspot/jtreg/compiler/intrinsics/zip
@ -4265,108 +4265,117 @@ void MacroAssembler::kernel_crc32_common_fold_using_crypto_pmull(Register crc, R
|
||||
}
|
||||
add(table, table, table_offset);
|
||||
|
||||
// Registers v0..v7 are used as data registers.
|
||||
// Registers v16..v31 are used as tmp registers.
|
||||
sub(buf, buf, 0x10);
|
||||
ldrq(v1, Address(buf, 0x10));
|
||||
ldrq(v2, Address(buf, 0x20));
|
||||
ldrq(v3, Address(buf, 0x30));
|
||||
ldrq(v4, Address(buf, 0x40));
|
||||
ldrq(v5, Address(buf, 0x50));
|
||||
ldrq(v6, Address(buf, 0x60));
|
||||
ldrq(v7, Address(buf, 0x70));
|
||||
ldrq(v8, Address(pre(buf, 0x80)));
|
||||
ldrq(v0, Address(buf, 0x10));
|
||||
ldrq(v1, Address(buf, 0x20));
|
||||
ldrq(v2, Address(buf, 0x30));
|
||||
ldrq(v3, Address(buf, 0x40));
|
||||
ldrq(v4, Address(buf, 0x50));
|
||||
ldrq(v5, Address(buf, 0x60));
|
||||
ldrq(v6, Address(buf, 0x70));
|
||||
ldrq(v7, Address(pre(buf, 0x80)));
|
||||
|
||||
movi(v25, T4S, 0);
|
||||
mov(v25, S, 0, crc);
|
||||
eor(v1, T16B, v1, v25);
|
||||
movi(v31, T4S, 0);
|
||||
mov(v31, S, 0, crc);
|
||||
eor(v0, T16B, v0, v31);
|
||||
|
||||
ldrq(v0, Address(table));
|
||||
// Register v16 contains constants from the crc table.
|
||||
ldrq(v16, Address(table));
|
||||
b(CRC_by128_loop);
|
||||
|
||||
align(OptoLoopAlignment);
|
||||
BIND(CRC_by128_loop);
|
||||
pmull (v9, T1Q, v1, v0, T1D);
|
||||
pmull2(v10, T1Q, v1, v0, T2D);
|
||||
ldrq(v1, Address(buf, 0x10));
|
||||
eor3(v1, T16B, v9, v10, v1);
|
||||
pmull (v17, T1Q, v0, v16, T1D);
|
||||
pmull2(v18, T1Q, v0, v16, T2D);
|
||||
ldrq(v0, Address(buf, 0x10));
|
||||
eor3(v0, T16B, v17, v18, v0);
|
||||
|
||||
pmull (v11, T1Q, v2, v0, T1D);
|
||||
pmull2(v12, T1Q, v2, v0, T2D);
|
||||
ldrq(v2, Address(buf, 0x20));
|
||||
eor3(v2, T16B, v11, v12, v2);
|
||||
pmull (v19, T1Q, v1, v16, T1D);
|
||||
pmull2(v20, T1Q, v1, v16, T2D);
|
||||
ldrq(v1, Address(buf, 0x20));
|
||||
eor3(v1, T16B, v19, v20, v1);
|
||||
|
||||
pmull (v13, T1Q, v3, v0, T1D);
|
||||
pmull2(v14, T1Q, v3, v0, T2D);
|
||||
ldrq(v3, Address(buf, 0x30));
|
||||
eor3(v3, T16B, v13, v14, v3);
|
||||
pmull (v21, T1Q, v2, v16, T1D);
|
||||
pmull2(v22, T1Q, v2, v16, T2D);
|
||||
ldrq(v2, Address(buf, 0x30));
|
||||
eor3(v2, T16B, v21, v22, v2);
|
||||
|
||||
pmull (v15, T1Q, v4, v0, T1D);
|
||||
pmull2(v16, T1Q, v4, v0, T2D);
|
||||
ldrq(v4, Address(buf, 0x40));
|
||||
eor3(v4, T16B, v15, v16, v4);
|
||||
pmull (v23, T1Q, v3, v16, T1D);
|
||||
pmull2(v24, T1Q, v3, v16, T2D);
|
||||
ldrq(v3, Address(buf, 0x40));
|
||||
eor3(v3, T16B, v23, v24, v3);
|
||||
|
||||
pmull (v17, T1Q, v5, v0, T1D);
|
||||
pmull2(v18, T1Q, v5, v0, T2D);
|
||||
ldrq(v5, Address(buf, 0x50));
|
||||
eor3(v5, T16B, v17, v18, v5);
|
||||
pmull (v25, T1Q, v4, v16, T1D);
|
||||
pmull2(v26, T1Q, v4, v16, T2D);
|
||||
ldrq(v4, Address(buf, 0x50));
|
||||
eor3(v4, T16B, v25, v26, v4);
|
||||
|
||||
pmull (v19, T1Q, v6, v0, T1D);
|
||||
pmull2(v20, T1Q, v6, v0, T2D);
|
||||
ldrq(v6, Address(buf, 0x60));
|
||||
eor3(v6, T16B, v19, v20, v6);
|
||||
pmull (v27, T1Q, v5, v16, T1D);
|
||||
pmull2(v28, T1Q, v5, v16, T2D);
|
||||
ldrq(v5, Address(buf, 0x60));
|
||||
eor3(v5, T16B, v27, v28, v5);
|
||||
|
||||
pmull (v21, T1Q, v7, v0, T1D);
|
||||
pmull2(v22, T1Q, v7, v0, T2D);
|
||||
ldrq(v7, Address(buf, 0x70));
|
||||
eor3(v7, T16B, v21, v22, v7);
|
||||
pmull (v29, T1Q, v6, v16, T1D);
|
||||
pmull2(v30, T1Q, v6, v16, T2D);
|
||||
ldrq(v6, Address(buf, 0x70));
|
||||
eor3(v6, T16B, v29, v30, v6);
|
||||
|
||||
pmull (v23, T1Q, v8, v0, T1D);
|
||||
pmull2(v24, T1Q, v8, v0, T2D);
|
||||
ldrq(v8, Address(pre(buf, 0x80)));
|
||||
eor3(v8, T16B, v23, v24, v8);
|
||||
// Reuse registers v23, v24.
|
||||
// Using them won't block the first instruction of the next iteration.
|
||||
pmull (v23, T1Q, v7, v16, T1D);
|
||||
pmull2(v24, T1Q, v7, v16, T2D);
|
||||
ldrq(v7, Address(pre(buf, 0x80)));
|
||||
eor3(v7, T16B, v23, v24, v7);
|
||||
|
||||
subs(len, len, 0x80);
|
||||
br(Assembler::GE, CRC_by128_loop);
|
||||
|
||||
// fold into 512 bits
|
||||
ldrq(v0, Address(table, 0x10));
|
||||
// Use v31 for constants because v16 may still be in use.
|
||||
ldrq(v31, Address(table, 0x10));
|
||||
|
||||
pmull (v10, T1Q, v1, v0, T1D);
|
||||
pmull2(v11, T1Q, v1, v0, T2D);
|
||||
eor3(v1, T16B, v10, v11, v5);
|
||||
pmull (v17, T1Q, v0, v31, T1D);
|
||||
pmull2(v18, T1Q, v0, v31, T2D);
|
||||
eor3(v0, T16B, v17, v18, v4);
|
||||
|
||||
pmull (v12, T1Q, v2, v0, T1D);
|
||||
pmull2(v13, T1Q, v2, v0, T2D);
|
||||
eor3(v2, T16B, v12, v13, v6);
|
||||
pmull (v19, T1Q, v1, v31, T1D);
|
||||
pmull2(v20, T1Q, v1, v31, T2D);
|
||||
eor3(v1, T16B, v19, v20, v5);
|
||||
|
||||
pmull (v14, T1Q, v3, v0, T1D);
|
||||
pmull2(v15, T1Q, v3, v0, T2D);
|
||||
eor3(v3, T16B, v14, v15, v7);
|
||||
pmull (v21, T1Q, v2, v31, T1D);
|
||||
pmull2(v22, T1Q, v2, v31, T2D);
|
||||
eor3(v2, T16B, v21, v22, v6);
|
||||
|
||||
pmull (v16, T1Q, v4, v0, T1D);
|
||||
pmull2(v17, T1Q, v4, v0, T2D);
|
||||
eor3(v4, T16B, v16, v17, v8);
|
||||
pmull (v23, T1Q, v3, v31, T1D);
|
||||
pmull2(v24, T1Q, v3, v31, T2D);
|
||||
eor3(v3, T16B, v23, v24, v7);
|
||||
|
||||
// fold into 128 bits
|
||||
ldrq(v5, Address(table, 0x20));
|
||||
pmull (v10, T1Q, v1, v5, T1D);
|
||||
pmull2(v11, T1Q, v1, v5, T2D);
|
||||
eor3(v4, T16B, v4, v10, v11);
|
||||
// Use v17 for constants because v31 may still be in use.
|
||||
ldrq(v17, Address(table, 0x20));
|
||||
pmull (v25, T1Q, v0, v17, T1D);
|
||||
pmull2(v26, T1Q, v0, v17, T2D);
|
||||
eor3(v3, T16B, v3, v25, v26);
|
||||
|
||||
ldrq(v6, Address(table, 0x30));
|
||||
pmull (v12, T1Q, v2, v6, T1D);
|
||||
pmull2(v13, T1Q, v2, v6, T2D);
|
||||
eor3(v4, T16B, v4, v12, v13);
|
||||
// Use v18 for constants because v17 may still be in use.
|
||||
ldrq(v18, Address(table, 0x30));
|
||||
pmull (v27, T1Q, v1, v18, T1D);
|
||||
pmull2(v28, T1Q, v1, v18, T2D);
|
||||
eor3(v3, T16B, v3, v27, v28);
|
||||
|
||||
ldrq(v7, Address(table, 0x40));
|
||||
pmull (v14, T1Q, v3, v7, T1D);
|
||||
pmull2(v15, T1Q, v3, v7, T2D);
|
||||
eor3(v1, T16B, v4, v14, v15);
|
||||
// Use v19 for constants because v18 may still be in use.
|
||||
ldrq(v19, Address(table, 0x40));
|
||||
pmull (v29, T1Q, v2, v19, T1D);
|
||||
pmull2(v30, T1Q, v2, v19, T2D);
|
||||
eor3(v0, T16B, v3, v29, v30);
|
||||
|
||||
add(len, len, 0x80);
|
||||
add(buf, buf, 0x10);
|
||||
|
||||
mov(tmp0, v1, D, 0);
|
||||
mov(tmp1, v1, D, 1);
|
||||
mov(tmp0, v0, D, 0);
|
||||
mov(tmp1, v0, D, 1);
|
||||
}
|
||||
|
||||
SkipIfEqual::SkipIfEqual(
|
||||
|
163
test/hotspot/jtreg/compiler/intrinsics/zip/TestFpRegsABI.java
Normal file
163
test/hotspot/jtreg/compiler/intrinsics/zip/TestFpRegsABI.java
Normal file
@ -0,0 +1,163 @@
|
||||
/*
|
||||
* Copyright Amazon.com Inc. or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* @test TestFpRegsABI
|
||||
* @bug 8324874
|
||||
* @summary The ABI for the Arm 64-bit Architecture requires a callee to preserve registers v8-v15 across subroutine calls
|
||||
*
|
||||
* @run main/othervm -XX:-TieredCompilation -Xbatch -XX:CompileCommand=inline,*::calcValue compiler.intrinsics.zip.TestFpRegsABI
|
||||
* @run main/othervm -XX:-TieredCompilation -Xbatch -XX:CompileCommand=dontinline,*::calcValue compiler.intrinsics.zip.TestFpRegsABI
|
||||
* @run main/othervm -XX:+TieredCompilation -XX:TieredStopAtLevel=1 -Xbatch -XX:CompileCommand=inline,*::calcValue compiler.intrinsics.zip.TestFpRegsABI
|
||||
* @run main/othervm -XX:+TieredCompilation -XX:TieredStopAtLevel=1 -Xbatch -XX:CompileCommand=dontinline,*::calcValue compiler.intrinsics.zip.TestFpRegsABI
|
||||
* @run main/othervm -Xbatch -XX:CompileCommand=inline,*::calcValue compiler.intrinsics.zip.TestFpRegsABI
|
||||
* @run main/othervm -Xbatch -XX:CompileCommand=dontinline,*::calcValue compiler.intrinsics.zip.TestFpRegsABI
|
||||
* @run main/othervm -Xint compiler.intrinsics.zip.TestFpRegsABI
|
||||
*/
|
||||
|
||||
package compiler.intrinsics.zip;
|
||||
|
||||
import java.util.zip.Checksum;
|
||||
import java.util.zip.CRC32;
|
||||
import java.util.zip.CRC32C;
|
||||
|
||||
/**
 * Regression test for JDK-8324874: checks that the CRC32/CRC32C checksum
 * intrinsics do not corrupt floating-point registers that the platform ABI
 * requires a callee to preserve (e.g. v8-v15 in the ARM64 ABI).
 *
 * Each iteration computes a floating-point value, runs a checksum update,
 * and then recomputes the same floating-point value. A mismatch between the
 * two values, or a wrong checksum, fails the test with a RuntimeException.
 */
public class TestFpRegsABI {
    // Number of iterations each scenario is run for.
    private static final int ITERATIONS = 20_000;

    // Expected checksum values for the contents of buf.
    private static final long EXPECTED_CRC32  = 0x00000000b70b4c26L;
    private static final long EXPECTED_CRC32C = 0x000000002cdf6e8fL;

    // Input data: 1024 bytes where element i holds (byte)i.
    private static final byte[] buf;

    static {
        buf = new byte[1024];
        for (int i = 0; i < buf.length; ++i) {
            buf[i] = (byte)i;
        }
    }

    /**
     * Scenario with a minimal floating-point computation around the
     * checksum update.
     */
    private static class RegressionTest {
        private final Checksum checksum;

        RegressionTest(Checksum checksum) {
            this.checksum = checksum;
        }

        /**
         * Runs the check ITERATIONS times over the given data.
         *
         * @param buf           data to checksum
         * @param expectedValue checksum value expected for buf
         * @throws RuntimeException if the checksum is wrong or the
         *         floating-point results differ
         */
        public void run(byte[] buf, long expectedValue) {
            for (int i = 0; i < ITERATIONS; ++i) {
                runIteration(buf, expectedValue);
            }
        }

        // If checksum intrinsic does not save fp registers as ABI requires,
        // the second call of calcValue might produce a wrong result.
        private void runIteration(byte[] buf, long expectedValue) {
            int v1 = calcValue(buf);
            checksum.reset();
            checksum.update(buf, 0, buf.length);
            long checksumValue = checksum.getValue();
            if (checksumValue != expectedValue) {
                System.err.printf("ERROR: checksum = 0x%016x, expected = 0x%016x\n",
                                  checksumValue, expectedValue);
                throw new RuntimeException("Checksum Error");
            }
            int v2 = calcValue(buf);
            if (v1 != v2) {
                throw new RuntimeException("Expect v2(" + v2 + ") to equal v1(" + v1 + ")");
            }
        }

        // Single floating-point multiplication by the constant 2.5.
        private int calcValue(byte[] buf) {
            return (int)(2.5 * buf.length);
        }
    }

    /**
     * Scenario with a floating-point heavy computation around the checksum
     * update.
     */
    private static class TestIntrinsic {
        private final Checksum checksum;

        TestIntrinsic(Checksum checksum) {
            this.checksum = checksum;
        }

        /**
         * Runs the check ITERATIONS times over the given data.
         *
         * @param buf           data to checksum
         * @param expectedValue checksum value expected for buf
         * @throws RuntimeException if the checksum is wrong or the
         *         floating-point results differ
         */
        public void run(byte[] buf, long expectedValue) {
            for (int i = 0; i < ITERATIONS; ++i) {
                runIteration(buf, expectedValue);
            }
        }

        // If checksum intrinsic does not save fp registers as ABI requires,
        // the second call of calcValue might produce a wrong result.
        private void runIteration(byte[] buf, long expectedValue) {
            int v1 = calcValue(buf);
            checksum.reset();
            checksum.update(buf, 0, buf.length);
            long checksumValue = checksum.getValue();
            if (checksumValue != expectedValue) {
                System.err.printf("ERROR: checksum = 0x%016x, expected = 0x%016x\n",
                                  checksumValue, expectedValue);
                throw new RuntimeException("Checksum Error");
            }
            int v2 = calcValue(buf);
            if (v1 != v2) {
                throw new RuntimeException("Expect v2(" + v2 + ") to equal v1(" + v1 + ")");
            }
        }

        // ABI can require some fp registers to be saved by a callee, e.g. v8-15 in ARM64 ABI.
        // We create fp register pressure to get as many fp registers used as possible.
        // The 24 statements below are intentionally written out one by one;
        // do not fold them into an inner loop.
        private int calcValue(byte[] buf) {
            double v = 0.0;
            for (int i = 24; i <= buf.length; i += 24) {
                v += buf[i - 1] * ((double)i - 1.0) + (double)i - 1.0;
                v += buf[i - 2] * ((double)i - 2.0) + (double)i - 2.0;
                v += buf[i - 3] * ((double)i - 3.0) + (double)i - 3.0;
                v += buf[i - 4] * ((double)i - 4.0) + (double)i - 4.0;
                v += buf[i - 5] * ((double)i - 5.0) + (double)i - 5.0;
                v += buf[i - 6] * ((double)i - 6.0) + (double)i - 6.0;
                v += buf[i - 7] * ((double)i - 7.0) + (double)i - 7.0;
                v += buf[i - 8] * ((double)i - 8.0) + (double)i - 8.0;
                v += buf[i - 9] * ((double)i - 9.0) + (double)i - 9.0;
                v += buf[i - 10] * ((double)i - 10.0) + (double)i - 10.0;
                v += buf[i - 11] * ((double)i - 11.0) + (double)i - 11.0;
                v += buf[i - 12] * ((double)i - 12.0) + (double)i - 12.0;
                v += buf[i - 13] * ((double)i - 13.0) + (double)i - 13.0;
                v += buf[i - 14] * ((double)i - 14.0) + (double)i - 14.0;
                v += buf[i - 15] * ((double)i - 15.0) + (double)i - 15.0;
                v += buf[i - 16] * ((double)i - 16.0) + (double)i - 16.0;
                v += buf[i - 17] * ((double)i - 17.0) + (double)i - 17.0;
                v += buf[i - 18] * ((double)i - 18.0) + (double)i - 18.0;
                v += buf[i - 19] * ((double)i - 19.0) + (double)i - 19.0;
                v += buf[i - 20] * ((double)i - 20.0) + (double)i - 20.0;
                v += buf[i - 21] * ((double)i - 21.0) + (double)i - 21.0;
                v += buf[i - 22] * ((double)i - 22.0) + (double)i - 22.0;
                v += buf[i - 23] * ((double)i - 23.0) + (double)i - 23.0;
                v += buf[i - 24] * ((double)i - 24.0) + (double)i - 24.0;
            }
            return (int)v;
        }
    }

    /**
     * Runs both scenarios against CRC32 and CRC32C.
     *
     * @param argv unused
     * @throws RuntimeException if any scenario detects a wrong checksum or
     *         a changed floating-point result
     */
    public static void main(final String[] argv) {
        new TestIntrinsic(new CRC32()).run(buf, EXPECTED_CRC32);
        new TestIntrinsic(new CRC32C()).run(buf, EXPECTED_CRC32C);
        new RegressionTest(new CRC32()).run(buf, EXPECTED_CRC32);
        new RegressionTest(new CRC32C()).run(buf, EXPECTED_CRC32C);
    }
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user