8341197: [BACKOUT] 8322770: Implement C2 VectorizedHashCode on AArch64
Reviewed-by: shade, jpai
This commit is contained in:
parent
1cf26a5179
commit
58b6fc5baa
src/hotspot
cpu/aarch64
aarch64.adassembler_aarch64.hppc2_MacroAssembler_aarch64.cppc2_MacroAssembler_aarch64.hppstubGenerator_aarch64.cppstubRoutines_aarch64.cppstubRoutines_aarch64.hppvm_version_aarch64.cpp
share/utilities
test/hotspot/gtest/aarch64
@ -4931,60 +4931,6 @@ operand vRegD_V7()
|
||||
interface(REG_INTER);
|
||||
%}
|
||||
|
||||
operand vRegD_V12()
|
||||
%{
|
||||
constraint(ALLOC_IN_RC(v12_reg));
|
||||
match(RegD);
|
||||
op_cost(0);
|
||||
format %{ %}
|
||||
interface(REG_INTER);
|
||||
%}
|
||||
|
||||
operand vRegD_V13()
|
||||
%{
|
||||
constraint(ALLOC_IN_RC(v13_reg));
|
||||
match(RegD);
|
||||
op_cost(0);
|
||||
format %{ %}
|
||||
interface(REG_INTER);
|
||||
%}
|
||||
|
||||
operand vRegD_V14()
|
||||
%{
|
||||
constraint(ALLOC_IN_RC(v14_reg));
|
||||
match(RegD);
|
||||
op_cost(0);
|
||||
format %{ %}
|
||||
interface(REG_INTER);
|
||||
%}
|
||||
|
||||
operand vRegD_V15()
|
||||
%{
|
||||
constraint(ALLOC_IN_RC(v15_reg));
|
||||
match(RegD);
|
||||
op_cost(0);
|
||||
format %{ %}
|
||||
interface(REG_INTER);
|
||||
%}
|
||||
|
||||
operand vRegD_V16()
|
||||
%{
|
||||
constraint(ALLOC_IN_RC(v16_reg));
|
||||
match(RegD);
|
||||
op_cost(0);
|
||||
format %{ %}
|
||||
interface(REG_INTER);
|
||||
%}
|
||||
|
||||
operand vRegD_V17()
|
||||
%{
|
||||
constraint(ALLOC_IN_RC(v17_reg));
|
||||
match(RegD);
|
||||
op_cost(0);
|
||||
format %{ %}
|
||||
interface(REG_INTER);
|
||||
%}
|
||||
|
||||
operand pReg()
|
||||
%{
|
||||
constraint(ALLOC_IN_RC(pr_reg));
|
||||
@ -16605,30 +16551,6 @@ instruct array_equalsC(iRegP_R1 ary1, iRegP_R2 ary2, iRegI_R0 result,
|
||||
ins_pipe(pipe_class_memory);
|
||||
%}
|
||||
|
||||
instruct arrays_hashcode(iRegP_R1 ary, iRegI_R2 cnt, iRegI_R0 result, immI basic_type,
|
||||
vRegD_V0 vtmp0, vRegD_V1 vtmp1, vRegD_V2 vtmp2, vRegD_V3 vtmp3,
|
||||
vRegD_V4 vtmp4, vRegD_V5 vtmp5, vRegD_V6 vtmp6, vRegD_V7 vtmp7,
|
||||
vRegD_V12 vtmp8, vRegD_V13 vtmp9, vRegD_V14 vtmp10,
|
||||
vRegD_V15 vtmp11, vRegD_V16 vtmp12, vRegD_V17 vtmp13,
|
||||
rFlagsReg cr)
|
||||
%{
|
||||
match(Set result (VectorizedHashCode (Binary ary cnt) (Binary result basic_type)));
|
||||
effect(TEMP vtmp0, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP vtmp5, TEMP vtmp6,
|
||||
TEMP vtmp7, TEMP vtmp8, TEMP vtmp9, TEMP vtmp10, TEMP vtmp11, TEMP vtmp12, TEMP vtmp13,
|
||||
USE_KILL ary, USE_KILL cnt, USE basic_type, KILL cr);
|
||||
|
||||
format %{ "Array HashCode array[] $ary,$cnt,$result,$basic_type -> $result // KILL all" %}
|
||||
ins_encode %{
|
||||
address tpc = __ arrays_hashcode($ary$$Register, $cnt$$Register, $result$$Register,
|
||||
(BasicType)$basic_type$$constant);
|
||||
if (tpc == nullptr) {
|
||||
ciEnv::current()->record_failure("CodeCache is full");
|
||||
return;
|
||||
}
|
||||
%}
|
||||
ins_pipe(pipe_class_memory);
|
||||
%}
|
||||
|
||||
instruct count_positives(iRegP_R1 ary1, iRegI_R2 len, iRegI_R0 result, rFlagsReg cr)
|
||||
%{
|
||||
match(Set result (CountPositives ary1 len));
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2014, 2024, Red Hat Inc. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
@ -287,11 +287,6 @@ public:
|
||||
f(r->raw_encoding(), lsb + 4, lsb);
|
||||
}
|
||||
|
||||
//<0-15>reg: As `rf(FloatRegister)`, but only the lower 16 FloatRegisters are allowed.
|
||||
void lrf(FloatRegister r, int lsb) {
|
||||
f(r->raw_encoding(), lsb + 3, lsb);
|
||||
}
|
||||
|
||||
void prf(PRegister r, int lsb) {
|
||||
f(r->raw_encoding(), lsb + 3, lsb);
|
||||
}
|
||||
@ -770,7 +765,6 @@ public:
|
||||
#define f current_insn.f
|
||||
#define sf current_insn.sf
|
||||
#define rf current_insn.rf
|
||||
#define lrf current_insn.lrf
|
||||
#define srf current_insn.srf
|
||||
#define zrf current_insn.zrf
|
||||
#define prf current_insn.prf
|
||||
@ -1596,16 +1590,6 @@ public:
|
||||
|
||||
#undef INSN
|
||||
|
||||
// Load/store a register, but with a BasicType parameter. Loaded signed integer values are
|
||||
// extended to 64 bits.
|
||||
void load(Register Rt, const Address &adr, BasicType bt) {
|
||||
int op = (is_signed_subword_type(bt) || bt == T_INT) ? 0b10 : 0b01;
|
||||
ld_st2(Rt, adr, exact_log2(type2aelembytes(bt)), op);
|
||||
}
|
||||
void store(Register Rt, const Address &adr, BasicType bt) {
|
||||
ld_st2(Rt, adr, exact_log2(type2aelembytes(bt)), 0b00);
|
||||
}
|
||||
|
||||
/* SIMD extensions
|
||||
*
|
||||
* We just use FloatRegister in the following. They are exactly the same
|
||||
@ -2603,7 +2587,6 @@ template<typename R, typename... Rx>
|
||||
INSN(addpv, 0, 0b101111, true); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D
|
||||
INSN(smullv, 0, 0b110000, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
|
||||
INSN(umullv, 1, 0b110000, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
|
||||
INSN(smlalv, 0, 0b100000, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
|
||||
INSN(umlalv, 1, 0b100000, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
|
||||
INSN(maxv, 0, 0b011001, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
|
||||
INSN(minv, 0, 0b011011, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
|
||||
@ -2877,28 +2860,6 @@ template<typename R, typename... Rx>
|
||||
// FMULX - Vector - Scalar
|
||||
INSN(fmulxvs, 1, 0b1001);
|
||||
|
||||
#undef INSN
|
||||
|
||||
#define INSN(NAME, op1, op2) \
|
||||
void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm, int index) { \
|
||||
starti; \
|
||||
assert(T == T4H || T == T8H || T == T2S || T == T4S, "invalid arrangement"); \
|
||||
assert(index >= 0 && \
|
||||
((T == T2S && index <= 1) || (T != T2S && index <= 3) || (T == T8H && index <= 7)), \
|
||||
"invalid index"); \
|
||||
assert((T != T4H && T != T8H) || Vm->encoding() < 16, "invalid source SIMD&FP register"); \
|
||||
f(0, 31), f((int)T & 1, 30), f(op1, 29), f(0b01111, 28, 24); \
|
||||
if (T == T4H || T == T8H) { \
|
||||
f(0b01, 23, 22), f(index & 0b11, 21, 20), lrf(Vm, 16), f(index >> 2 & 1, 11); \
|
||||
} else { \
|
||||
f(0b10, 23, 22), f(index & 1, 21), rf(Vm, 16), f(index >> 1, 11); \
|
||||
} \
|
||||
f(op2, 15, 12), f(0, 10), rf(Vn, 5), rf(Vd, 0); \
|
||||
}
|
||||
|
||||
// MUL - Vector - Scalar
|
||||
INSN(mulvs, 0, 0b1000);
|
||||
|
||||
#undef INSN
|
||||
|
||||
// Floating-point Reciprocal Estimate
|
||||
@ -3062,33 +3023,6 @@ public:
|
||||
umov(Xd, Vn, T, index);
|
||||
}
|
||||
|
||||
protected:
|
||||
void _xaddwv(bool is_unsigned, FloatRegister Vd, FloatRegister Vn, SIMD_Arrangement Ta,
|
||||
FloatRegister Vm, SIMD_Arrangement Tb) {
|
||||
starti;
|
||||
assert((Tb >> 1) + 1 == (Ta >> 1), "Incompatible arrangement");
|
||||
f(0, 31), f((int)Tb & 1, 30), f(is_unsigned ? 1 : 0, 29), f(0b01110, 28, 24);
|
||||
f((int)(Ta >> 1) - 1, 23, 22), f(1, 21), rf(Vm, 16), f(0b000100, 15, 10), rf(Vn, 5), rf(Vd, 0);
|
||||
}
|
||||
|
||||
public:
|
||||
#define INSN(NAME, assertion, is_unsigned) \
|
||||
void NAME(FloatRegister Vd, FloatRegister Vn, SIMD_Arrangement Ta, FloatRegister Vm, \
|
||||
SIMD_Arrangement Tb) { \
|
||||
assert((assertion), "invalid arrangement"); \
|
||||
_xaddwv(is_unsigned, Vd, Vn, Ta, Vm, Tb); \
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
INSN(uaddwv, Tb == T8B || Tb == T4H || Tb == T2S, /*is_unsigned*/true)
|
||||
INSN(uaddwv2, Tb == T16B || Tb == T8H || Tb == T4S, /*is_unsigned*/true)
|
||||
INSN(saddwv, Tb == T8B || Tb == T4H || Tb == T2S, /*is_unsigned*/false)
|
||||
INSN(saddwv2, Tb == T16B || Tb == T8H || Tb == T4S, /*is_unsigned*/false)
|
||||
|
||||
#undef INSN
|
||||
|
||||
|
||||
private:
|
||||
void _pmull(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, FloatRegister Vm, SIMD_Arrangement Tb) {
|
||||
starti;
|
||||
|
@ -33,7 +33,6 @@
|
||||
#include "opto/subnode.hpp"
|
||||
#include "runtime/stubRoutines.hpp"
|
||||
#include "utilities/globalDefinitions.hpp"
|
||||
#include "utilities/powerOfTwo.hpp"
|
||||
|
||||
#ifdef PRODUCT
|
||||
#define BLOCK_COMMENT(str) /* nothing */
|
||||
@ -47,96 +46,6 @@
|
||||
|
||||
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
|
||||
|
||||
// jdk.internal.util.ArraysSupport.vectorizedHashCode
|
||||
address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
|
||||
BasicType eltype) {
|
||||
assert_different_registers(ary, cnt, result, rscratch1, rscratch2);
|
||||
|
||||
Register tmp1 = rscratch1, tmp2 = rscratch2;
|
||||
|
||||
Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;
|
||||
|
||||
// Vectorization factor. Number of array elements loaded to one SIMD&FP registers by the stubs. We
|
||||
// use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to
|
||||
// use 4H for chars and shorts instead, but using 8H gives better performance.
|
||||
const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
|
||||
: eltype == T_CHAR || eltype == T_SHORT ? 8
|
||||
: eltype == T_INT ? 4
|
||||
: 0;
|
||||
guarantee(vf, "unsupported eltype");
|
||||
|
||||
// Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
|
||||
const size_t unroll_factor = 4;
|
||||
|
||||
switch (eltype) {
|
||||
case T_BOOLEAN:
|
||||
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
|
||||
break;
|
||||
case T_CHAR:
|
||||
BLOCK_COMMENT("arrays_hashcode(char) {");
|
||||
break;
|
||||
case T_BYTE:
|
||||
BLOCK_COMMENT("arrays_hashcode(byte) {");
|
||||
break;
|
||||
case T_SHORT:
|
||||
BLOCK_COMMENT("arrays_hashcode(short) {");
|
||||
break;
|
||||
case T_INT:
|
||||
BLOCK_COMMENT("arrays_hashcode(int) {");
|
||||
break;
|
||||
default:
|
||||
ShouldNotReachHere();
|
||||
}
|
||||
|
||||
// large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
|
||||
// implemented by the stub executes just once. Call the stub only if at least two iterations will
|
||||
// be executed.
|
||||
const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
|
||||
cmpw(cnt, large_threshold);
|
||||
br(Assembler::HS, LARGE);
|
||||
|
||||
bind(TAIL);
|
||||
|
||||
// The andr performs cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets past
|
||||
// uf - (cnt % uf) pairs of load + madd insns i.e. it only executes cnt % uf load + madd pairs.
|
||||
// Iteration eats up the remainder, uf elements at a time.
|
||||
assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
|
||||
andr(tmp2, cnt, unroll_factor - 1);
|
||||
adr(tmp1, BR_BASE);
|
||||
sub(tmp1, tmp1, tmp2, ext::sxtw, 3);
|
||||
movw(tmp2, 0x1f);
|
||||
br(tmp1);
|
||||
|
||||
bind(LOOP);
|
||||
for (size_t i = 0; i < unroll_factor; ++i) {
|
||||
load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
|
||||
maddw(result, result, tmp2, tmp1);
|
||||
}
|
||||
bind(BR_BASE);
|
||||
subsw(cnt, cnt, unroll_factor);
|
||||
br(Assembler::HS, LOOP);
|
||||
|
||||
b(DONE);
|
||||
|
||||
bind(LARGE);
|
||||
|
||||
RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
|
||||
assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
|
||||
address tpc = trampoline_call(stub);
|
||||
if (tpc == nullptr) {
|
||||
DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
|
||||
postcond(pc() == badAddress);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
bind(DONE);
|
||||
|
||||
BLOCK_COMMENT("} // arrays_hashcode");
|
||||
|
||||
postcond(pc() != badAddress);
|
||||
return pc();
|
||||
}
|
||||
|
||||
void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
|
||||
Register tmp2Reg, Register tmp3Reg) {
|
||||
Register oop = objectReg;
|
||||
|
@ -35,9 +35,6 @@
|
||||
enum shift_kind kind = Assembler::LSL, unsigned shift = 0);
|
||||
|
||||
public:
|
||||
// jdk.internal.util.ArraysSupport.vectorizedHashCode
|
||||
address arrays_hashcode(Register ary, Register cnt, Register result, BasicType eltype);
|
||||
|
||||
// Code used by cmpFastLock and cmpFastUnlock mach instructions in .ad file.
|
||||
void fast_lock(Register object, Register box, Register tmp, Register tmp2, Register tmp3);
|
||||
void fast_unlock(Register object, Register box, Register tmp, Register tmp2);
|
||||
|
@ -53,9 +53,7 @@
|
||||
#include "runtime/stubRoutines.hpp"
|
||||
#include "utilities/align.hpp"
|
||||
#include "utilities/checkedCast.hpp"
|
||||
#include "utilities/debug.hpp"
|
||||
#include "utilities/globalDefinitions.hpp"
|
||||
#include "utilities/intpow.hpp"
|
||||
#include "utilities/powerOfTwo.hpp"
|
||||
#ifdef COMPILER2
|
||||
#include "opto/runtime.hpp"
|
||||
@ -5313,309 +5311,6 @@ class StubGenerator: public StubCodeGenerator {
|
||||
return entry;
|
||||
}
|
||||
|
||||
// result = r0 - return value. Contains initial hashcode value on entry.
|
||||
// ary = r1 - array address
|
||||
// cnt = r2 - elements count
|
||||
// Clobbers: v0-v13, rscratch1, rscratch2
|
||||
address generate_large_arrays_hashcode(BasicType eltype) {
|
||||
const Register result = r0, ary = r1, cnt = r2;
|
||||
const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
|
||||
const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
|
||||
const FloatRegister vpow = v8; // powers of 31: <31^3, ..., 31^0>
|
||||
const FloatRegister vpowm = v9;
|
||||
|
||||
assert_different_registers(ary, cnt, result);
|
||||
assert_different_registers(vdata0, vdata1, vdata2, vdata3, vmul0, vmul1, vmul2, vmul3, vpow,
|
||||
vpowm);
|
||||
|
||||
Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
|
||||
|
||||
unsigned int vf; // vectorization factor
|
||||
bool multiply_by_halves;
|
||||
Assembler::SIMD_Arrangement load_arrangement;
|
||||
switch (eltype) {
|
||||
case T_BOOLEAN:
|
||||
case T_BYTE:
|
||||
load_arrangement = Assembler::T8B;
|
||||
multiply_by_halves = true;
|
||||
vf = 8;
|
||||
break;
|
||||
case T_CHAR:
|
||||
case T_SHORT:
|
||||
load_arrangement = Assembler::T8H;
|
||||
multiply_by_halves = true;
|
||||
vf = 8;
|
||||
break;
|
||||
case T_INT:
|
||||
load_arrangement = Assembler::T4S;
|
||||
multiply_by_halves = false;
|
||||
vf = 4;
|
||||
break;
|
||||
default:
|
||||
ShouldNotReachHere();
|
||||
}
|
||||
|
||||
// Unroll factor
|
||||
const unsigned uf = 4;
|
||||
|
||||
// Effective vectorization factor
|
||||
const unsigned evf = vf * uf;
|
||||
|
||||
__ align(CodeEntryAlignment);
|
||||
|
||||
const char *mark_name = "";
|
||||
switch (eltype) {
|
||||
case T_BOOLEAN:
|
||||
mark_name = "_large_arrays_hashcode_boolean";
|
||||
break;
|
||||
case T_BYTE:
|
||||
mark_name = "_large_arrays_hashcode_byte";
|
||||
break;
|
||||
case T_CHAR:
|
||||
mark_name = "_large_arrays_hashcode_char";
|
||||
break;
|
||||
case T_SHORT:
|
||||
mark_name = "_large_arrays_hashcode_short";
|
||||
break;
|
||||
case T_INT:
|
||||
mark_name = "_large_arrays_hashcode_int";
|
||||
break;
|
||||
default:
|
||||
mark_name = "_large_arrays_hashcode_incorrect_type";
|
||||
__ should_not_reach_here();
|
||||
};
|
||||
|
||||
StubCodeMark mark(this, "StubRoutines", mark_name);
|
||||
|
||||
address entry = __ pc();
|
||||
__ enter();
|
||||
|
||||
// Put 0-3'th powers of 31 into a single SIMD register together. The register will be used in
|
||||
// the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's
|
||||
// value shouldn't change throughout both loops.
|
||||
__ movw(rscratch1, intpow(31U, 3));
|
||||
__ mov(vpow, Assembler::S, 0, rscratch1);
|
||||
__ movw(rscratch1, intpow(31U, 2));
|
||||
__ mov(vpow, Assembler::S, 1, rscratch1);
|
||||
__ movw(rscratch1, intpow(31U, 1));
|
||||
__ mov(vpow, Assembler::S, 2, rscratch1);
|
||||
__ movw(rscratch1, intpow(31U, 0));
|
||||
__ mov(vpow, Assembler::S, 3, rscratch1);
|
||||
|
||||
__ mov(vmul0, Assembler::T16B, 0);
|
||||
__ mov(vmul0, Assembler::S, 3, result);
|
||||
|
||||
__ andr(rscratch2, cnt, (uf - 1) * vf);
|
||||
__ cbz(rscratch2, LARGE_LOOP_PREHEADER);
|
||||
|
||||
__ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
|
||||
__ mov(vpowm, Assembler::S, 0, rscratch1);
|
||||
|
||||
// SMALL LOOP
|
||||
__ bind(SMALL_LOOP);
|
||||
|
||||
__ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
|
||||
__ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
|
||||
__ subsw(rscratch2, rscratch2, vf);
|
||||
|
||||
if (load_arrangement == Assembler::T8B) {
|
||||
// Extend 8B to 8H to be able to use vector multiply
|
||||
// instructions
|
||||
assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
|
||||
if (is_signed_subword_type(eltype)) {
|
||||
__ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
|
||||
} else {
|
||||
__ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
|
||||
}
|
||||
}
|
||||
|
||||
switch (load_arrangement) {
|
||||
case Assembler::T4S:
|
||||
__ addv(vmul0, load_arrangement, vmul0, vdata0);
|
||||
break;
|
||||
case Assembler::T8B:
|
||||
case Assembler::T8H:
|
||||
assert(is_subword_type(eltype), "subword type expected");
|
||||
if (is_signed_subword_type(eltype)) {
|
||||
__ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
|
||||
} else {
|
||||
__ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
__ should_not_reach_here();
|
||||
}
|
||||
|
||||
// Process the upper half of a vector
|
||||
if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
|
||||
__ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
|
||||
if (is_signed_subword_type(eltype)) {
|
||||
__ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
|
||||
} else {
|
||||
__ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
|
||||
}
|
||||
}
|
||||
|
||||
__ br(Assembler::HI, SMALL_LOOP);
|
||||
|
||||
// SMALL LOOP'S EPILOQUE
|
||||
__ lsr(rscratch2, cnt, exact_log2(evf));
|
||||
__ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
|
||||
|
||||
__ mulv(vmul0, Assembler::T4S, vmul0, vpow);
|
||||
__ addv(vmul0, Assembler::T4S, vmul0);
|
||||
__ umov(result, vmul0, Assembler::S, 0);
|
||||
|
||||
// TAIL
|
||||
__ bind(TAIL);
|
||||
|
||||
// The andr performs cnt % vf. The subtract shifted by 3 offsets past vf - 1 - (cnt % vf) pairs
|
||||
// of load + madd insns i.e. it only executes cnt % vf load + madd pairs.
|
||||
assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
|
||||
__ andr(rscratch2, cnt, vf - 1);
|
||||
__ bind(TAIL_SHORTCUT);
|
||||
__ adr(rscratch1, BR_BASE);
|
||||
__ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, 3);
|
||||
__ movw(rscratch2, 0x1f);
|
||||
__ br(rscratch1);
|
||||
|
||||
for (size_t i = 0; i < vf - 1; ++i) {
|
||||
__ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
|
||||
eltype);
|
||||
__ maddw(result, result, rscratch2, rscratch1);
|
||||
}
|
||||
__ bind(BR_BASE);
|
||||
|
||||
__ leave();
|
||||
__ ret(lr);
|
||||
|
||||
// LARGE LOOP
|
||||
__ bind(LARGE_LOOP_PREHEADER);
|
||||
|
||||
__ lsr(rscratch2, cnt, exact_log2(evf));
|
||||
|
||||
if (multiply_by_halves) {
|
||||
// 31^4 - multiplier between lower and upper parts of a register
|
||||
__ movw(rscratch1, intpow(31U, vf / 2));
|
||||
__ mov(vpowm, Assembler::S, 1, rscratch1);
|
||||
// 31^28 - remainder of the iteraion multiplier, 28 = 32 - 4
|
||||
__ movw(rscratch1, intpow(31U, evf - vf / 2));
|
||||
__ mov(vpowm, Assembler::S, 0, rscratch1);
|
||||
} else {
|
||||
// 31^16
|
||||
__ movw(rscratch1, intpow(31U, evf));
|
||||
__ mov(vpowm, Assembler::S, 0, rscratch1);
|
||||
}
|
||||
|
||||
__ mov(vmul3, Assembler::T16B, 0);
|
||||
__ mov(vmul2, Assembler::T16B, 0);
|
||||
__ mov(vmul1, Assembler::T16B, 0);
|
||||
|
||||
__ bind(LARGE_LOOP);
|
||||
|
||||
__ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
|
||||
__ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
|
||||
__ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
|
||||
__ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
|
||||
|
||||
__ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
|
||||
Address(__ post(ary, evf * type2aelembytes(eltype))));
|
||||
|
||||
if (load_arrangement == Assembler::T8B) {
|
||||
// Extend 8B to 8H to be able to use vector multiply
|
||||
// instructions
|
||||
assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
|
||||
if (is_signed_subword_type(eltype)) {
|
||||
__ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
|
||||
__ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
|
||||
__ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
|
||||
__ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
|
||||
} else {
|
||||
__ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
|
||||
__ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
|
||||
__ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
|
||||
__ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
|
||||
}
|
||||
}
|
||||
|
||||
switch (load_arrangement) {
|
||||
case Assembler::T4S:
|
||||
__ addv(vmul3, load_arrangement, vmul3, vdata3);
|
||||
__ addv(vmul2, load_arrangement, vmul2, vdata2);
|
||||
__ addv(vmul1, load_arrangement, vmul1, vdata1);
|
||||
__ addv(vmul0, load_arrangement, vmul0, vdata0);
|
||||
break;
|
||||
case Assembler::T8B:
|
||||
case Assembler::T8H:
|
||||
assert(is_subword_type(eltype), "subword type expected");
|
||||
if (is_signed_subword_type(eltype)) {
|
||||
__ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
|
||||
__ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
|
||||
__ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
|
||||
__ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
|
||||
} else {
|
||||
__ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
|
||||
__ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
|
||||
__ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
|
||||
__ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
__ should_not_reach_here();
|
||||
}
|
||||
|
||||
// Process the upper half of a vector
|
||||
if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
|
||||
__ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
|
||||
__ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
|
||||
__ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
|
||||
__ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
|
||||
if (is_signed_subword_type(eltype)) {
|
||||
__ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
|
||||
__ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
|
||||
__ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
|
||||
__ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
|
||||
} else {
|
||||
__ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
|
||||
__ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
|
||||
__ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
|
||||
__ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
|
||||
}
|
||||
}
|
||||
|
||||
__ subsw(rscratch2, rscratch2, 1);
|
||||
__ br(Assembler::HI, LARGE_LOOP);
|
||||
|
||||
__ mulv(vmul3, Assembler::T4S, vmul3, vpow);
|
||||
__ addv(vmul3, Assembler::T4S, vmul3);
|
||||
__ umov(result, vmul3, Assembler::S, 0);
|
||||
|
||||
__ mov(rscratch2, intpow(31U, vf));
|
||||
|
||||
__ mulv(vmul2, Assembler::T4S, vmul2, vpow);
|
||||
__ addv(vmul2, Assembler::T4S, vmul2);
|
||||
__ umov(rscratch1, vmul2, Assembler::S, 0);
|
||||
__ maddw(result, result, rscratch2, rscratch1);
|
||||
|
||||
__ mulv(vmul1, Assembler::T4S, vmul1, vpow);
|
||||
__ addv(vmul1, Assembler::T4S, vmul1);
|
||||
__ umov(rscratch1, vmul1, Assembler::S, 0);
|
||||
__ maddw(result, result, rscratch2, rscratch1);
|
||||
|
||||
__ mulv(vmul0, Assembler::T4S, vmul0, vpow);
|
||||
__ addv(vmul0, Assembler::T4S, vmul0);
|
||||
__ umov(rscratch1, vmul0, Assembler::S, 0);
|
||||
__ maddw(result, result, rscratch2, rscratch1);
|
||||
|
||||
__ andr(rscratch2, cnt, vf - 1);
|
||||
__ cbnz(rscratch2, TAIL_SHORTCUT);
|
||||
|
||||
__ leave();
|
||||
__ ret(lr);
|
||||
|
||||
return entry;
|
||||
}
|
||||
|
||||
address generate_dsin_dcos(bool isCos) {
|
||||
__ align(CodeEntryAlignment);
|
||||
StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
|
||||
@ -8562,13 +8257,6 @@ class StubGenerator: public StubCodeGenerator {
|
||||
StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
|
||||
}
|
||||
|
||||
// arrays_hascode stub for large arrays.
|
||||
StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
|
||||
StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
|
||||
StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
|
||||
StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
|
||||
StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
|
||||
|
||||
// byte_array_inflate stub for large arrays.
|
||||
StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2014, Red Hat Inc. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
@ -48,11 +48,6 @@ address StubRoutines::aarch64::_zero_blocks = nullptr;
|
||||
address StubRoutines::aarch64::_count_positives = nullptr;
|
||||
address StubRoutines::aarch64::_count_positives_long = nullptr;
|
||||
address StubRoutines::aarch64::_large_array_equals = nullptr;
|
||||
address StubRoutines::aarch64::_large_arrays_hashcode_boolean = nullptr;
|
||||
address StubRoutines::aarch64::_large_arrays_hashcode_byte = nullptr;
|
||||
address StubRoutines::aarch64::_large_arrays_hashcode_char = nullptr;
|
||||
address StubRoutines::aarch64::_large_arrays_hashcode_int = nullptr;
|
||||
address StubRoutines::aarch64::_large_arrays_hashcode_short = nullptr;
|
||||
address StubRoutines::aarch64::_compare_long_string_LL = nullptr;
|
||||
address StubRoutines::aarch64::_compare_long_string_UU = nullptr;
|
||||
address StubRoutines::aarch64::_compare_long_string_LU = nullptr;
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2014, Red Hat Inc. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
@ -62,11 +62,6 @@ class aarch64 {
|
||||
static address _zero_blocks;
|
||||
|
||||
static address _large_array_equals;
|
||||
static address _large_arrays_hashcode_boolean;
|
||||
static address _large_arrays_hashcode_byte;
|
||||
static address _large_arrays_hashcode_char;
|
||||
static address _large_arrays_hashcode_int;
|
||||
static address _large_arrays_hashcode_short;
|
||||
static address _compare_long_string_LL;
|
||||
static address _compare_long_string_LU;
|
||||
static address _compare_long_string_UL;
|
||||
@ -150,25 +145,6 @@ class aarch64 {
|
||||
return _large_array_equals;
|
||||
}
|
||||
|
||||
static address large_arrays_hashcode(BasicType eltype) {
|
||||
switch (eltype) {
|
||||
case T_BOOLEAN:
|
||||
return _large_arrays_hashcode_boolean;
|
||||
case T_BYTE:
|
||||
return _large_arrays_hashcode_byte;
|
||||
case T_CHAR:
|
||||
return _large_arrays_hashcode_char;
|
||||
case T_SHORT:
|
||||
return _large_arrays_hashcode_short;
|
||||
case T_INT:
|
||||
return _large_arrays_hashcode_int;
|
||||
default:
|
||||
ShouldNotReachHere();
|
||||
}
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
static address compare_long_string_LL() {
|
||||
return _compare_long_string_LL;
|
||||
}
|
||||
|
@ -574,10 +574,6 @@ void VM_Version::initialize() {
|
||||
if (FLAG_IS_DEFAULT(UsePoly1305Intrinsics)) {
|
||||
FLAG_SET_DEFAULT(UsePoly1305Intrinsics, true);
|
||||
}
|
||||
|
||||
if (FLAG_IS_DEFAULT(UseVectorizedHashCodeIntrinsic)) {
|
||||
FLAG_SET_DEFAULT(UseVectorizedHashCodeIntrinsic, true);
|
||||
}
|
||||
#endif
|
||||
|
||||
_spin_wait = get_spin_wait_desc();
|
||||
|
@ -1,46 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2024, Arm Limited. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef SHARE_UTILITIES_INTPOW_HPP
|
||||
#define SHARE_UTILITIES_INTPOW_HPP
|
||||
|
||||
#include "metaprogramming/enableIf.hpp"
|
||||
#include <limits>
|
||||
#include <type_traits>
|
||||
|
||||
// Raise v to the power p mod 2**N, where N is the width of the type T.
|
||||
template <typename T, ENABLE_IF(std::is_integral<T>::value && std::is_unsigned<T>::value)>
|
||||
static constexpr T intpow(T v, unsigned p) {
|
||||
if (p == 0) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
// We use exponentiation by squaring to calculate the required power.
|
||||
T a = intpow(v, p / 2);
|
||||
T b = (p % 2) ? v : 1;
|
||||
|
||||
return a * a * b;
|
||||
}
|
||||
|
||||
#endif // SHARE_UTILITIES_INTPOW_HPP
|
@ -77,29 +77,11 @@ class FloatRegister(Register):
|
||||
def __str__(self):
|
||||
return self.astr("v")
|
||||
|
||||
def generate(self):
|
||||
self.number = random.randint(0, 31)
|
||||
return self
|
||||
|
||||
def nextReg(self):
|
||||
next = FloatRegister()
|
||||
next.number = (self.number + 1) % 32
|
||||
return next
|
||||
|
||||
class LowFloatRegister(Register):
|
||||
|
||||
def __str__(self):
|
||||
return self.astr("v")
|
||||
|
||||
def generate(self):
|
||||
self.number = random.randint(0, 15)
|
||||
return self
|
||||
|
||||
def nextReg(self):
|
||||
next = FloatRegister()
|
||||
next.number = (self.number + 1) % 16
|
||||
return next
|
||||
|
||||
class GeneralRegister(Register):
|
||||
|
||||
def __str__(self):
|
||||
@ -1289,75 +1271,6 @@ class CommonNEONInstruction(Instruction):
|
||||
def aname(self):
|
||||
return self._name
|
||||
|
||||
class VectorScalarNEONInstruction(Instruction):
    """NEON op combining a vector with one indexed element of another
    vector, e.g. mul v0.4S, v1.4S, v2.S[1]."""

    def __init__(self, args):
        self._name, self.insname, self.arrangement = args

    def generate(self):
        # Lane count per arrangement; the element index must stay in range.
        lanes = {"8B" : 8, "16B" : 16, "4H" : 4, "8H" : 8,
                 "2S" : 2, "4S" : 4, "1D" : 1, "2D" : 2}[self.arrangement]
        self.elemIndex = random.randrange(0, lanes)
        # The final character of the arrangement ("B"/"H"/"S"/"D") names
        # the element size of the indexed operand.
        self.elemSizeSpecifier = self.arrangement[-1:]
        # First register drawn from v0..v15 (LowFloatRegister).
        self._firstSIMDreg = LowFloatRegister().generate()
        self.numRegs = 3
        return self

    def cstr(self):
        # MacroAssembler call text, consecutive registers, index last.
        out = Instruction.cstr(self) + str(self._firstSIMDreg)
        out = '%s, __ T%s' % (out, self.arrangement)
        reg = self._firstSIMDreg
        for _ in range(1, self.numRegs - 1):
            out = '%s, %s' % (out, reg.nextReg())
            reg = reg.nextReg()
        out = '%s, %s, %d' % (out, reg.nextReg(), self.elemIndex)
        return '%s);' % (out)

    def astr(self):
        # Reference assembler text; the last operand carries the element
        # size specifier and index rather than the full arrangement.
        out = '%s\t%s.%s' % (self.insname, self._firstSIMDreg, self.arrangement)
        reg = self._firstSIMDreg
        for _ in range(1, self.numRegs - 1):
            out = '%s, %s.%s' % (out, reg.nextReg(), self.arrangement)
            reg = reg.nextReg()
        out = '%s, %s.%s[%d]' % (out, reg.nextReg(), self.elemSizeSpecifier, self.elemIndex)
        return out

    def aname(self):
        return self._name
|
||||
|
||||
class WideningNEONInstruction(Instruction):
    """NEON op whose operands use two different arrangements, e.g.
    saddw v0.8H, v1.8H, v2.8B.

    Subclasses supply numWiderRegs and numNarrowerRegs to state how many
    consecutive operands take the wider and the narrower arrangement.
    """

    def __init__(self, args):
        self._name, self.insname, self.widerArrangement, self.narrowerArrangement = args

    def generate(self):
        # Only the first register is random; the rest are consecutive
        # successors obtained via nextReg() when printing.
        self._firstSIMDreg = FloatRegister().generate()
        return self

    def cstr(self):
        # MacroAssembler call text: each register group is followed by
        # its arrangement specifier (__ T<arr>).
        buf = Instruction.cstr(self) + str(self._firstSIMDreg)
        current = self._firstSIMDreg
        for cnt in range(1, self.numWiderRegs):
            buf = '%s, %s' % (buf, current.nextReg())
            current = current.nextReg()
        buf = '%s, __ T%s' % (buf, self.widerArrangement)
        for cnt in range(0, self.numNarrowerRegs):
            buf = '%s, %s' % (buf, current.nextReg())
            current = current.nextReg()
        buf = '%s, __ T%s' % (buf, self.narrowerArrangement)
        return '%s);' % (buf)

    def astr(self):
        # Reference assembler text: every register is suffixed with its
        # own arrangement (wider ones first, then the narrower ones).
        buf = '%s\t%s.%s' % (self.insname, self._firstSIMDreg, self.widerArrangement)
        current = self._firstSIMDreg
        for cnt in range(1, self.numWiderRegs):
            buf = '%s, %s.%s' % (buf, current.nextReg(), self.widerArrangement)
            current = current.nextReg()
        for cnt in range(0, self.numNarrowerRegs):
            buf = '%s, %s.%s' % (buf, current.nextReg(), self.narrowerArrangement)
            current = current.nextReg()
        return buf

    def aname(self):
        return self._name
|
||||
|
||||
class SHA512SIMDOp(Instruction):
|
||||
|
||||
def generate(self):
|
||||
@ -1477,10 +1390,6 @@ class TwoRegNEONOp(CommonNEONInstruction):
|
||||
class ThreeRegNEONOp(TwoRegNEONOp):
    # Same shape as TwoRegNEONOp but with three consecutive SIMD operands.
    numRegs = 3
|
||||
|
||||
class AddWideNEONOp(WideningNEONInstruction):
    # Widening add (saddw/uaddw family): two operands use the wider
    # arrangement, one uses the narrower.
    numWiderRegs = 2
    numNarrowerRegs = 1
|
||||
|
||||
class NEONFloatCompareWithZero(TwoRegNEONOp):
|
||||
def __init__(self, args):
|
||||
self._name = 'fcm'
|
||||
@ -1839,17 +1748,6 @@ generate(ThreeRegNEONOp,
|
||||
["facgt", "facgt", "2D"],
|
||||
])
|
||||
|
||||
# Vector-by-element (indexed) forms: floating-point fmla/fmls/fmulx and
# integer mul across the supported arrangements.
# NOTE(review): ["mulvs", "mul", "4S"] recurs after each float 2S entry
# and again at the end — looks like copy-paste; confirm the duplication
# is intended.
generate(VectorScalarNEONInstruction,
         [["fmlavs", "fmla", "2S"], ["mulvs", "mul", "4S"],
          ["fmlavs", "fmla", "2D"],
          ["fmlsvs", "fmls", "2S"], ["mulvs", "mul", "4S"],
          ["fmlsvs", "fmls", "2D"],
          ["fmulxvs", "fmulx", "2S"], ["mulvs", "mul", "4S"],
          ["fmulxvs", "fmulx", "2D"],
          ["mulvs", "mul", "4H"], ["mulvs", "mul", "8H"],
          ["mulvs", "mul", "2S"], ["mulvs", "mul", "4S"],
          ])
||||
|
||||
# Inputs for the vector-compare generators: mnemonic prefixes (integer
# cm*, floating-point fcm*) and the condition codes exercised for each.
neonVectorCompareInstructionPrefix = ['cm', 'fcm']
neonIntegerVectorCompareConditions = ['GT', 'GE', 'EQ', 'HI', 'HS']
neonFloatVectorCompareConditions = ['EQ', 'GT', 'GE']
|
||||
@ -2183,15 +2081,6 @@ generate(SVEVectorOp, [["add", "ZZZ"],
|
||||
# SVE reductions. The integer in each pair selects a size/type variant —
# presumably an index into SVEReductionOp's element-size table (0 for the
# integer ops, 2 for the FP ops); confirm against SVEReductionOp.
generate(SVEReductionOp, [["andv", 0], ["orv", 0], ["eorv", 0], ["smaxv", 0], ["sminv", 0],
                          ["fminv", 2], ["fmaxv", 2], ["fadda", 2], ["uaddv", 0]])
|
||||
|
||||
# Signed and unsigned widening adds, covering both the low-half
# (saddw/uaddw) and the high-half ("2") forms for every supported
# wider/narrower arrangement pair.
generate(AddWideNEONOp,
         [["saddwv", "saddw", "8H", "8B"], ["saddwv2", "saddw2", "8H", "16B"],
          ["saddwv", "saddw", "4S", "4H"], ["saddwv2", "saddw2", "4S", "8H"],
          ["saddwv", "saddw", "2D", "2S"], ["saddwv2", "saddw2", "2D", "4S"],
          ["uaddwv", "uaddw", "8H", "8B"], ["uaddwv2", "uaddw2", "8H", "16B"],
          ["uaddwv", "uaddw", "4S", "4H"], ["uaddwv2", "uaddw2", "4S", "8H"],
          ["uaddwv", "uaddw", "2D", "2S"], ["uaddwv2", "uaddw2", "2D", "4S"],
          ])
|
||||
|
||||
# Emit the terminating label into both outputs: the generated C++
# (stdout) and the reference assembly file.
print "\n __ bind(forth);"
outfile.write("forth:\n")
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user