From a138ebeb528a92ab135e9fc072de50b6bb056ac9 Mon Sep 17 00:00:00 2001 From: Ed Nevill Date: Thu, 8 Oct 2015 13:14:46 +0000 Subject: [PATCH] 8139043: aarch64: add support for adler32 intrinsic Add adler32 support like 8132081 for sparc Reviewed-by: kvn --- .../cpu/aarch64/vm/stubGenerator_aarch64.cpp | 273 ++++++++++++++++++ .../src/cpu/aarch64/vm/vm_version_aarch64.cpp | 5 +- 2 files changed, 275 insertions(+), 3 deletions(-) diff --git a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp index 56799453938..82b44450128 100644 --- a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp +++ b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp @@ -2395,6 +2395,274 @@ class StubGenerator: public StubCodeGenerator { return start; } + /*** + * Arguments: + * + * Inputs: + * c_rarg0 - int adler + * c_rarg1 - byte* buff + * c_rarg2 - int len + * + * Output: + * c_rarg0 - int adler result + */ + address generate_updateBytesAdler32() { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); + address start = __ pc(); + + Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; + + // Aliases + Register adler = c_rarg0; + Register s1 = c_rarg0; + Register s2 = c_rarg3; + Register buff = c_rarg1; + Register len = c_rarg2; + Register nmax = r4; + Register base = r5; + Register count = r6; + Register temp0 = rscratch1; + Register temp1 = rscratch2; + Register temp2 = r7; + + // Max number of bytes we can process before having to take the mod + // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 + unsigned long BASE = 0xfff1; + unsigned long NMAX = 0x15B0; + + __ mov(base, BASE); + __ mov(nmax, NMAX); + + // s1 is initialized to the lower 16 bits of adler + // s2 is initialized to the upper 16 bits of adler + __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) + __ uxth(s1, adler); // s1 = (adler & 0xffff) + + // The pipelined loop needs at least 16 elements for 1 iteration + // It does check this, but it is more effective to skip to the cleanup loop + __ cmp(len, 16); + __ br(Assembler::HS, L_nmax); + __ cbz(len, L_combine); + + __ bind(L_simple_by1_loop); + __ ldrb(temp0, Address(__ post(buff, 1))); + __ add(s1, s1, temp0); + __ add(s2, s2, s1); + __ subs(len, len, 1); + __ br(Assembler::HI, L_simple_by1_loop); + + // s1 = s1 % BASE + __ subs(temp0, s1, base); + __ csel(s1, temp0, s1, Assembler::HS); + + // s2 = s2 % BASE + __ lsr(temp0, s2, 16); + __ lsl(temp1, temp0, 4); + __ sub(temp1, temp1, temp0); + __ add(s2, temp1, s2, ext::uxth); + + __ subs(temp0, s2, base); + __ csel(s2, temp0, s2, Assembler::HS); + + __ b(L_combine); + + __ bind(L_nmax); + __ subs(len, len, nmax); + __ sub(count, nmax, 16); + __ br(Assembler::LO, L_by16); + + __ bind(L_nmax_loop); + + __ ldp(temp0, temp1, Address(__ post(buff, 16))); + + __ add(s1, s1, temp0, ext::uxtb); + __ ubfx(temp2, temp0, 8, 8); + __ add(s2, s2, s1); + __ add(s1, s1, temp2); + __ ubfx(temp2, temp0, 16, 8); + __ add(s2, s2, s1); + __ add(s1, s1, temp2); + __ ubfx(temp2, temp0, 24, 8); + __ add(s2, s2, s1); + __ add(s1, s1, temp2); + __ ubfx(temp2, temp0, 32, 8); + __ add(s2, s2, s1); + __ add(s1, s1, temp2); + __ ubfx(temp2, temp0, 40, 8); + __ add(s2, s2, s1); + __ add(s1, s1, temp2); + __ ubfx(temp2, temp0, 48, 8); + __ add(s2, s2, s1); + __ add(s1, s1, temp2); + __ add(s2, s2, s1); + __ add(s1, s1, temp0, Assembler::LSR, 56); + __ add(s2, s2, s1); + + __ add(s1, s1, temp1, ext::uxtb); + __ ubfx(temp2, temp1, 8, 8); + __ add(s2, s2, s1); + __ add(s1, s1, temp2); + __ ubfx(temp2, temp1, 16, 8); + __ add(s2, s2, s1); + __ add(s1, s1, temp2); + __ ubfx(temp2, temp1, 24, 8); + __ add(s2, s2, s1); + __ add(s1, s1, temp2); + __ ubfx(temp2, temp1, 32, 8); + __ add(s2, s2, s1); + __ add(s1, s1, temp2); + __ ubfx(temp2, temp1, 40, 8); + __ add(s2, s2, s1); + __ add(s1, s1, temp2); + __ ubfx(temp2, temp1, 48, 8); + __ add(s2, s2, s1); + __ add(s1, s1, temp2); + __ add(s2, s2, s1); + __ add(s1, s1, temp1, Assembler::LSR, 56); + __ add(s2, s2, s1); + + __ subs(count, count, 16); + __ br(Assembler::HS, L_nmax_loop); + + // s1 = s1 % BASE + __ lsr(temp0, s1, 16); + __ lsl(temp1, temp0, 4); + __ sub(temp1, temp1, temp0); + __ add(temp1, temp1, s1, ext::uxth); + + __ lsr(temp0, temp1, 16); + __ lsl(s1, temp0, 4); + __ sub(s1, s1, temp0); + __ add(s1, s1, temp1, ext:: uxth); + + __ subs(temp0, s1, base); + __ csel(s1, temp0, s1, Assembler::HS); + + // s2 = s2 % BASE + __ lsr(temp0, s2, 16); + __ lsl(temp1, temp0, 4); + __ sub(temp1, temp1, temp0); + __ add(temp1, temp1, s2, ext::uxth); + + __ lsr(temp0, temp1, 16); + __ lsl(s2, temp0, 4); + __ sub(s2, s2, temp0); + __ add(s2, s2, temp1, ext:: uxth); + + __ subs(temp0, s2, base); + __ csel(s2, temp0, s2, Assembler::HS); + + __ subs(len, len, nmax); + __ sub(count, nmax, 16); + __ br(Assembler::HS, L_nmax_loop); + + __ bind(L_by16); + __ adds(len, len, count); + __ br(Assembler::LO, L_by1); + + __ bind(L_by16_loop); + + __ ldp(temp0, temp1, Address(__ post(buff, 16))); + + __ add(s1, s1, temp0, ext::uxtb); + __ ubfx(temp2, temp0, 8, 8); + __ add(s2, s2, s1); + __ add(s1, s1, temp2); + __ ubfx(temp2, temp0, 16, 8); + __ add(s2, s2, s1); + __ add(s1, s1, temp2); + __ ubfx(temp2, temp0, 24, 8); + __ add(s2, s2, s1); + __ add(s1, s1, temp2); + __ ubfx(temp2, temp0, 32, 8); + __ add(s2, s2, s1); + __ add(s1, s1, temp2); + __ ubfx(temp2, temp0, 40, 8); + __ add(s2, s2, s1); + __ add(s1, s1, temp2); + __ ubfx(temp2, temp0, 48, 8); + __ add(s2, s2, s1); + __ add(s1, s1, temp2); + __ add(s2, s2, s1); + __ add(s1, s1, temp0, Assembler::LSR, 56); + __ add(s2, s2, s1); + + __ add(s1, s1, temp1, ext::uxtb); + __ ubfx(temp2, temp1, 8, 8); + __ add(s2, s2, s1); + __ add(s1, s1, temp2); + __ ubfx(temp2, temp1, 16, 8); + __ add(s2, s2, s1); + __ add(s1, s1, temp2); + __ ubfx(temp2, temp1, 24, 8); + __ add(s2, s2, s1); + __ add(s1, s1, temp2); + __ ubfx(temp2, temp1, 32, 8); + __ add(s2, s2, s1); + __ add(s1, s1, temp2); + __ ubfx(temp2, temp1, 40, 8); + __ add(s2, s2, s1); + __ add(s1, s1, temp2); + __ ubfx(temp2, temp1, 48, 8); + __ add(s2, s2, s1); + __ add(s1, s1, temp2); + __ add(s2, s2, s1); + __ add(s1, s1, temp1, Assembler::LSR, 56); + __ add(s2, s2, s1); + + __ subs(len, len, 16); + __ br(Assembler::HS, L_by16_loop); + + __ bind(L_by1); + __ adds(len, len, 15); + __ br(Assembler::LO, L_do_mod); + + __ bind(L_by1_loop); + __ ldrb(temp0, Address(__ post(buff, 1))); + __ add(s1, temp0, s1); + __ add(s2, s2, s1); + __ subs(len, len, 1); + __ br(Assembler::HS, L_by1_loop); + + __ bind(L_do_mod); + // s1 = s1 % BASE + __ lsr(temp0, s1, 16); + __ lsl(temp1, temp0, 4); + __ sub(temp1, temp1, temp0); + __ add(temp1, temp1, s1, ext::uxth); + + __ lsr(temp0, temp1, 16); + __ lsl(s1, temp0, 4); + __ sub(s1, s1, temp0); + __ add(s1, s1, temp1, ext:: uxth); + + __ subs(temp0, s1, base); + __ csel(s1, temp0, s1, Assembler::HS); + + // s2 = s2 % BASE + __ lsr(temp0, s2, 16); + __ lsl(temp1, temp0, 4); + __ sub(temp1, temp1, temp0); + __ add(temp1, temp1, s2, ext::uxth); + + __ lsr(temp0, temp1, 16); + __ lsl(s2, temp0, 4); + __ sub(s2, s2, temp0); + __ add(s2, s2, temp1, ext:: uxth); + + __ subs(temp0, s2, base); + __ csel(s2, temp0, s2, Assembler::HS); + + // Combine lower bits and higher bits + __ bind(L_combine); + __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) + + __ ret(lr); + + return start; + } + /** * Arguments: * @@ -3613,6 +3881,11 @@ class StubGenerator: public StubCodeGenerator { StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); } + // generate Adler32 intrinsics code + if (UseAdler32Intrinsics) { + StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); + } + // Safefetch stubs. generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, &StubRoutines::_safefetch32_fault_pc, diff --git a/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp b/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp index daf6163142f..37c0e5bbb63 100644 --- a/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp +++ b/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp @@ -178,9 +178,8 @@ void VM_Version::get_processor_features() { warning("UseCRC32 specified, but not supported on this CPU"); } - if (UseAdler32Intrinsics) { - warning("Adler32Intrinsics not available on this CPU."); - FLAG_SET_DEFAULT(UseAdler32Intrinsics, false); + if (FLAG_IS_DEFAULT(UseAdler32Intrinsics)) { + FLAG_SET_DEFAULT(UseAdler32Intrinsics, true); } if (auxv & HWCAP_AES) {