Merge
This commit is contained in:
commit
1b0c9f4ae7
@ -1308,6 +1308,7 @@ class Assembler : public AbstractAssembler {
|
||||
inline void li( Register d, int si16);
|
||||
inline void lis( Register d, int si16);
|
||||
inline void addir(Register d, int si16, Register a);
|
||||
inline void subi( Register d, Register a, int si16);
|
||||
|
||||
static bool is_addi(int x) {
|
||||
return ADDI_OPCODE == (x & ADDI_OPCODE_MASK);
|
||||
|
@ -164,6 +164,7 @@ inline void Assembler::divwo_( Register d, Register a, Register b) { emit_int32
|
||||
// Load immediate: forwards to addi_r0ok with R0 as the base operand.
inline void Assembler::li(Register d, int si16) {
  Assembler::addi_r0ok(d, R0, si16);
}
|
||||
// Load immediate shifted: forwards to addis_r0ok with R0 as the base operand.
inline void Assembler::lis(Register d, int si16) {
  Assembler::addis_r0ok(d, R0, si16);
}
|
||||
// Same as addi, but the immediate comes before the source register in the
// parameter list: addir(d, si16, a) emits addi(d, a, si16).
inline void Assembler::addir(Register d, int si16, Register a) {
  Assembler::addi(d, a, si16);
}
|
||||
// Subtract immediate: emitted as addi with the negated immediate.
inline void Assembler::subi(Register d, Register a, int si16) {
  Assembler::addi(d, a, -si16);
}
|
||||
|
||||
// PPC 1, section 3.3.9, Fixed-Point Compare Instructions
|
||||
// Emits one 32-bit instruction word: CMPI_OPCODE OR-ed with the encoded
// bf, l10 and ra fields and the 16-bit signed immediate.
inline void Assembler::cmpi(ConditionRegister f, int l, Register a, int si16) {
  emit_int32(CMPI_OPCODE
             | bf(f)
             | l10(l)
             | ra(a)
             | simm(si16, 16));
}
|
||||
|
@ -129,7 +129,7 @@ void MacroAssembler::calculate_address_from_global_toc(Register dst, address add
|
||||
}
|
||||
}
|
||||
|
||||
int MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
|
||||
address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
|
||||
const int offset = MacroAssembler::offset_to_global_toc(addr);
|
||||
|
||||
const address inst2_addr = a;
|
||||
@ -155,7 +155,7 @@ int MacroAssembler::patch_calculate_address_from_global_toc_at(address a, addres
|
||||
assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
|
||||
set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
|
||||
set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
|
||||
return (int)((intptr_t)addr - (intptr_t)inst1_addr);
|
||||
return inst1_addr;
|
||||
}
|
||||
|
||||
address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
|
||||
@ -201,7 +201,7 @@ address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(addr
|
||||
// clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
|
||||
// ori rx = rx | const.lo
|
||||
// The optional clrldi, if present, is skipped over.
|
||||
int MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
|
||||
address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
|
||||
assert(UseCompressedOops, "Should only patch compressed oops");
|
||||
|
||||
const address inst2_addr = a;
|
||||
@ -227,7 +227,7 @@ int MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop dat
|
||||
|
||||
set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
|
||||
set_imm((int *)inst2_addr, (xd)); // unsigned int
|
||||
return (int)((intptr_t)inst2_addr - (intptr_t)inst1_addr);
|
||||
return inst1_addr;
|
||||
}
|
||||
|
||||
// Get compressed oop or klass constant.
|
||||
@ -5234,6 +5234,40 @@ void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
|
||||
bind(L_post_third_loop_done);
|
||||
} // multiply_128_x_128_loop
|
||||
|
||||
// Emits a multiply-accumulate loop over 32-bit words. Each iteration loads one
// word from 'in' and one from 'out', multiplies the 'in' word by 'k', adds the
// 'out' word and the running carry, stores the result word back to 'out' and
// keeps the bits above bit 31 as the next carry.
//
//   out, in    - base registers of the two word arrays (indexed loads/stores)
//   offset     - byte index into 'out'; decremented by 4 before the first
//                access and again after every iteration (register is clobbered)
//   len        - iteration count; loaded into CTR, then rewritten to the byte
//                index (len - 1) * 4 into 'in' and stepped down by 4 per
//                iteration (register is clobbered)
//   k          - multiplier
//   tmp1, tmp2 - scratch registers
//   carry      - set to 0 on entry, also when the loop is skipped; holds the
//                last carry on exit
//
// Besides the registers above, the sequence uses CCR0 and CTR.
void MacroAssembler::muladd(Register out, Register in,
|
||||
Register offset, Register len, Register k,
|
||||
Register tmp1, Register tmp2, Register carry) {
|
||||
|
||||
// Labels
|
||||
Label LOOP, SKIP;
|
||||
|
||||
// Make sure length is positive.
|
||||
cmpdi (CCR0, len, 0);
|
||||
|
||||
// Prepare variables
|
||||
// These two are emitted before the branch, so they take effect even when the loop is skipped.
|
||||
subi (offset, offset, 4);
|
||||
li (carry, 0);
|
||||
// Consumes the cmpdi result from above: len <= 0 skips the loop.
|
||||
ble (CCR0, SKIP);
|
||||
|
||||
// CTR takes over as the iteration counter; len becomes a byte index into 'in'.
|
||||
mtctr (len);
|
||||
subi (len, len, 1 );
|
||||
sldi (len, len, 2 );
|
||||
|
||||
// Main loop
|
||||
bind(LOOP);
|
||||
lwzx (tmp1, len, in );
|
||||
lwzx (tmp2, offset, out );
|
||||
mulld (tmp1, tmp1, k );
|
||||
add (tmp2, carry, tmp2 );
|
||||
add (tmp2, tmp1, tmp2 );
|
||||
stwx (tmp2, offset, out );
|
||||
// Everything above the stored word becomes the carry for the next iteration.
|
||||
srdi (carry, tmp2, 32 );
|
||||
subi (offset, offset, 4 );
|
||||
subi (len, len, 4 );
|
||||
bdnz (LOOP);
|
||||
bind(SKIP);
|
||||
}
|
||||
|
||||
void MacroAssembler::multiply_to_len(Register x, Register xlen,
|
||||
Register y, Register ylen,
|
||||
Register z, Register zlen,
|
||||
|
@ -105,13 +105,15 @@ class MacroAssembler: public Assembler {
|
||||
};
|
||||
|
||||
inline static bool is_calculate_address_from_global_toc_at(address a, address bound);
|
||||
static int patch_calculate_address_from_global_toc_at(address a, address addr, address bound);
|
||||
// Returns address of first instruction in sequence.
|
||||
static address patch_calculate_address_from_global_toc_at(address a, address bound, address addr);
|
||||
static address get_address_of_calculate_address_from_global_toc_at(address a, address addr);
|
||||
|
||||
#ifdef _LP64
|
||||
// Patch narrow oop constant.
|
||||
inline static bool is_set_narrow_oop(address a, address bound);
|
||||
static int patch_set_narrow_oop(address a, address bound, narrowOop data);
|
||||
// Returns address of first instruction in sequence.
|
||||
static address patch_set_narrow_oop(address a, address bound, narrowOop data);
|
||||
static narrowOop get_narrow_oop(address a, address bound);
|
||||
#endif
|
||||
|
||||
@ -813,6 +815,8 @@ class MacroAssembler: public Assembler {
|
||||
Register yz_idx, Register idx, Register carry,
|
||||
Register product_high, Register product,
|
||||
Register carry2, Register tmp);
|
||||
void muladd(Register out, Register in, Register offset, Register len, Register k,
|
||||
Register tmp1, Register tmp2, Register carry);
|
||||
void multiply_to_len(Register x, Register xlen,
|
||||
Register y, Register ylen,
|
||||
Register z, Register zlen,
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 1997, 2017, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2012, 2015 SAP SE. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
@ -221,13 +221,13 @@ address NativeMovConstReg::set_data_plain(intptr_t data, CodeBlob *cb) {
|
||||
// A calculation relative to the global TOC.
|
||||
if (MacroAssembler::get_address_of_calculate_address_from_global_toc_at(addr, cb->content_begin()) !=
|
||||
(address)data) {
|
||||
const int invalidated_range =
|
||||
MacroAssembler::patch_calculate_address_from_global_toc_at(addr, cb->content_begin(),
|
||||
const address inst2_addr = addr;
|
||||
const address inst1_addr =
|
||||
MacroAssembler::patch_calculate_address_from_global_toc_at(inst2_addr, cb->content_begin(),
|
||||
(address)data);
|
||||
const address start = invalidated_range < 0 ? addr + invalidated_range : addr;
|
||||
// FIXME:
|
||||
const int range = invalidated_range < 0 ? 4 - invalidated_range : 8;
|
||||
ICache::ppc64_flush_icache_bytes(start, range);
|
||||
assert(inst1_addr != NULL && inst1_addr < inst2_addr, "first instruction must be found");
|
||||
const int range = inst2_addr - inst1_addr + BytesPerInstWord;
|
||||
ICache::ppc64_flush_icache_bytes(inst1_addr, range);
|
||||
}
|
||||
next_address = addr + 1 * BytesPerInstWord;
|
||||
} else if (MacroAssembler::is_load_const_at(addr)) {
|
||||
@ -288,15 +288,15 @@ void NativeMovConstReg::set_data(intptr_t data) {
|
||||
}
|
||||
|
||||
void NativeMovConstReg::set_narrow_oop(narrowOop data, CodeBlob *code /* = NULL */) {
|
||||
address addr = addr_at(0);
|
||||
address inst2_addr = addr_at(0);
|
||||
CodeBlob* cb = (code) ? code : CodeCache::find_blob(instruction_address());
|
||||
if (MacroAssembler::get_narrow_oop(addr, cb->content_begin()) == (long)data) return;
|
||||
const int invalidated_range =
|
||||
MacroAssembler::patch_set_narrow_oop(addr, cb->content_begin(), (long)data);
|
||||
const address start = invalidated_range < 0 ? addr + invalidated_range : addr;
|
||||
// FIXME:
|
||||
const int range = invalidated_range < 0 ? 4 - invalidated_range : 8;
|
||||
ICache::ppc64_flush_icache_bytes(start, range);
|
||||
if (MacroAssembler::get_narrow_oop(inst2_addr, cb->content_begin()) == (long)data)
|
||||
return;
|
||||
const address inst1_addr =
|
||||
MacroAssembler::patch_set_narrow_oop(inst2_addr, cb->content_begin(), (long)data);
|
||||
assert(inst1_addr != NULL && inst1_addr < inst2_addr, "first instruction must be found");
|
||||
const int range = inst2_addr - inst1_addr + BytesPerInstWord;
|
||||
ICache::ppc64_flush_icache_bytes(inst1_addr, range);
|
||||
}
|
||||
|
||||
// Do not use an assertion here. Let clients decide whether they only
|
||||
|
@ -3306,6 +3306,267 @@ class StubGenerator: public StubCodeGenerator {
|
||||
BLOCK_COMMENT("} Stub body");
|
||||
}
|
||||
|
||||
/**
|
||||
* Arguments:
|
||||
*
|
||||
* Input:
|
||||
* R3_ARG1 - out address
|
||||
* R4_ARG2 - in address
|
||||
* R5_ARG3 - offset
|
||||
* R6_ARG4 - len
|
||||
* R7_ARG5 - k
|
||||
* Output:
|
||||
* R3_RET - carry
|
||||
*/
|
||||
// Generates the "mulAdd" stub: normalizes the three integer arguments, runs
// MacroAssembler::muladd on the argument registers and returns the resulting
// carry in R3_RET. Returns the entry address of the generated stub.
address generate_mulAdd() {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "mulAdd");

  const address entry = __ function_entry();

  // Incoming arguments.
  const Register out    = R3_ARG1;
  const Register in     = R4_ARG2;
  const Register offset = R5_ARG3;
  const Register len    = R6_ARG4;
  const Register k      = R7_ARG5;

  // Scratch registers handed to muladd.
  const Register tmp1   = R8;
  const Register tmp2   = R9;
  const Register carry  = R10;

  // C2 does not sign-extend signed parameters to the full 64-bit register,
  // so the upper words are normalized here before use.
  __ rldic (offset, offset, 2, 32);  // always positive
  __ clrldi(len, len, 32);           // force zero bits on higher word
  __ clrldi(k, k, 32);               // force zero bits on higher word

  __ muladd(out, in, offset, len, k, tmp1, tmp2, carry);

  // Hand the final carry back in the return register.
  __ mr    (R3_RET, carry);

  __ blr();

  return entry;
}
|
||||
|
||||
/**
|
||||
* Arguments:
|
||||
*
|
||||
* Input:
|
||||
* R3_ARG1 - in address
|
||||
* R4_ARG2 - in length
|
||||
* R5_ARG3 - out address
|
||||
* R6_ARG4 - out length
|
||||
*/
|
||||
// Generates the "squareToLen" stub and returns its entry address.
//
// The emitted code works in four phases, following the begin<...>/end<...>
// markers below:
//   1. LOOP_SQUARE: square every 32-bit input word and store each 64-bit
//      product, shifted right by one bit, into 'out'; the bit shifted out of
//      one product is carried into the top bit of the next stored value.
//   2. LOOP_DIAGONAL_SUM: add the off-diagonal products using
//      MacroAssembler::muladd, followed by an inlined "addOne" carry
//      propagation.
//   3. LOOP_LSHIFT: shift the whole 'out' array left by one bit
//      ("primitiveLeftShift").
//   4. OR the lowest bit of the last input word into the last output word.
//
// The stub returns the 'out' address in R3_RET. It uses CTR and CCR0 and saves
// and restores R14..R28 below the stack pointer without pushing a frame.
address generate_squareToLen() {
|
||||
__ align(CodeEntryAlignment);
|
||||
StubCodeMark mark(this, "StubRoutines", "squareToLen");
|
||||
|
||||
address start = __ function_entry();
|
||||
|
||||
// Arguments. The two length registers get their upper 32 bits cleared
// (clrldi ..., 32) before they are used as 64-bit values.
|
||||
const Register in = R3_ARG1;
|
||||
const Register in_len = R4_ARG2;
|
||||
__ clrldi(in_len, in_len, 32);
|
||||
const Register out = R5_ARG3;
|
||||
const Register out_len = R6_ARG4;
|
||||
__ clrldi(out_len, out_len, 32);
|
||||
|
||||
// output
|
||||
const Register ret = R3_RET;
|
||||
|
||||
// temporaries
|
||||
const Register lplw_s = R7;
|
||||
const Register in_aux = R8;
|
||||
const Register out_aux = R9;
|
||||
const Register piece = R10;
|
||||
const Register product = R14;
|
||||
const Register lplw = R15;
|
||||
const Register i_minus1 = R16;
|
||||
const Register carry = R17;
|
||||
const Register offset = R18;
|
||||
const Register off_aux = R19;
|
||||
const Register t = R20;
|
||||
const Register mlen = R21;
|
||||
const Register len = R22;
|
||||
const Register a = R23;
|
||||
const Register b = R24;
|
||||
const Register i = R25;
|
||||
const Register c = R26;
|
||||
const Register cs = R27;
|
||||
|
||||
// Labels
|
||||
Label SKIP_LSHIFT, SKIP_DIAGONAL_SUM, SKIP_ADDONE, SKIP_MULADD, SKIP_LOOP_SQUARE;
|
||||
Label LOOP_LSHIFT, LOOP_DIAGONAL_SUM, LOOP_ADDONE, LOOP_MULADD, LOOP_SQUARE;
|
||||
|
||||
// Save non-volatile regs (frameless).
|
||||
// The save slots sit at negative offsets from R1_SP, from -8 (R28) down to -120 (R14).
// NOTE(review): R28 is saved and restored although none of the aliases above
// maps to it; SKIP_MULADD and LOOP_MULADD are declared but never bound in this
// block - confirm whether these are leftovers.
|
||||
int current_offs = -8;
|
||||
__ std(R28, current_offs, R1_SP); current_offs -= 8;
|
||||
__ std(R27, current_offs, R1_SP); current_offs -= 8;
|
||||
__ std(R26, current_offs, R1_SP); current_offs -= 8;
|
||||
__ std(R25, current_offs, R1_SP); current_offs -= 8;
|
||||
__ std(R24, current_offs, R1_SP); current_offs -= 8;
|
||||
__ std(R23, current_offs, R1_SP); current_offs -= 8;
|
||||
__ std(R22, current_offs, R1_SP); current_offs -= 8;
|
||||
__ std(R21, current_offs, R1_SP); current_offs -= 8;
|
||||
__ std(R20, current_offs, R1_SP); current_offs -= 8;
|
||||
__ std(R19, current_offs, R1_SP); current_offs -= 8;
|
||||
__ std(R18, current_offs, R1_SP); current_offs -= 8;
|
||||
__ std(R17, current_offs, R1_SP); current_offs -= 8;
|
||||
__ std(R16, current_offs, R1_SP); current_offs -= 8;
|
||||
__ std(R15, current_offs, R1_SP); current_offs -= 8;
|
||||
__ std(R14, current_offs, R1_SP);
|
||||
|
||||
// Store the squares, right shifted one bit (i.e., divided by 2)
|
||||
// The cursors start one element before each array because the loop uses
// update-form accesses (lwzu by 4, stdu by 8).
|
||||
__ subi (out_aux, out, 8);
|
||||
__ subi (in_aux, in, 4);
|
||||
__ cmpwi (CCR0, in_len, 0);
|
||||
// Initialize lplw outside of the loop
|
||||
__ xorr (lplw, lplw, lplw);
|
||||
__ ble (CCR0, SKIP_LOOP_SQUARE); // in_len <= 0
|
||||
__ mtctr (in_len);
|
||||
|
||||
__ bind(LOOP_SQUARE);
|
||||
__ lwzu (piece, 4, in_aux);
|
||||
__ mulld (product, piece, piece);
|
||||
// shift left 63 bits and only keep the MSB
|
||||
__ rldic (lplw_s, lplw, 63, 0);
|
||||
// Remember the unshifted product for the next iteration.
|
||||
__ mr (lplw, product);
|
||||
// shift right 1 bit without sign extension
|
||||
__ srdi (product, product, 1);
|
||||
// join them to the same register and store it
|
||||
__ orr (product, lplw_s, product);
|
||||
#ifdef VM_LITTLE_ENDIAN
|
||||
// Swap low and high words for little endian
|
||||
__ rldicl (product, product, 32, 0);
|
||||
#endif
|
||||
__ stdu (product, 8, out_aux);
|
||||
__ bdnz (LOOP_SQUARE);
|
||||
|
||||
__ bind(SKIP_LOOP_SQUARE);
|
||||
|
||||
// Add in off-diagonal sums
|
||||
__ cmpwi (CCR0, in_len, 0);
|
||||
__ ble (CCR0, SKIP_DIAGONAL_SUM);
|
||||
// Avoid CTR usage here in order to use it at mulAdd
|
||||
__ subi (i_minus1, in_len, 1);
|
||||
__ li (offset, 4);
|
||||
|
||||
__ bind(LOOP_DIAGONAL_SUM);
|
||||
|
||||
// off_aux and len are rebuilt on every pass because muladd modifies the
// registers passed as its offset and len arguments.
|
||||
__ sldi (off_aux, out_len, 2);
|
||||
__ sub (off_aux, off_aux, offset);
|
||||
|
||||
__ mr (len, i_minus1);
|
||||
__ sldi (mlen, i_minus1, 2);
|
||||
// t = input word at byte index mlen; it is the multiplier for this pass.
|
||||
__ lwzx (t, in, mlen);
|
||||
|
||||
__ muladd (out, in, off_aux, len, t, a, b, carry);
|
||||
|
||||
// begin<addOne>
|
||||
// off_aux = out_len*4 - 4 - mlen - offset*4 - 4;
|
||||
__ addi (mlen, mlen, 4);
|
||||
__ sldi (a, out_len, 2);
|
||||
__ subi (a, a, 4);
|
||||
__ sub (a, a, mlen);
|
||||
__ subi (off_aux, offset, 4);
|
||||
__ sub (off_aux, a, off_aux);
|
||||
|
||||
// Add the carry returned by muladd into the output word at off_aux.
|
||||
__ lwzx (b, off_aux, out);
|
||||
__ add (b, b, carry);
|
||||
__ stwx (b, off_aux, out);
|
||||
|
||||
// if (((uint64_t)s >> 32) != 0) {
|
||||
__ srdi_ (a, b, 32);
|
||||
__ beq (CCR0, SKIP_ADDONE);
|
||||
|
||||
// while (--mlen >= 0) {
|
||||
// NOTE(review): the pseudo-code above reads '--mlen >= 0', but the emitted
// test leaves the loop as soon as mlen (a byte count here) reaches 0 (beq) -
// confirm the intended iteration count.
|
||||
__ bind(LOOP_ADDONE);
|
||||
__ subi (mlen, mlen, 4);
|
||||
__ cmpwi (CCR0, mlen, 0);
|
||||
__ beq (CCR0, SKIP_ADDONE);
|
||||
|
||||
// if (--offset_aux < 0) { // Carry out of number
|
||||
__ subi (off_aux, off_aux, 4);
|
||||
__ cmpwi (CCR0, off_aux, 0);
|
||||
__ blt (CCR0, SKIP_ADDONE);
|
||||
|
||||
// } else {
|
||||
__ lwzx (b, off_aux, out);
|
||||
__ addi (b, b, 1);
|
||||
__ stwx (b, off_aux, out);
|
||||
__ cmpwi (CCR0, b, 0);
|
||||
__ bne (CCR0, SKIP_ADDONE);
|
||||
__ b (LOOP_ADDONE);
|
||||
|
||||
__ bind(SKIP_ADDONE);
|
||||
// } } } end<addOne>
|
||||
|
||||
// Next pass: offset advances by 8 bytes and i_minus1 drops by one; the loop
// repeats while i_minus1 >= 0.
|
||||
__ addi (offset, offset, 8);
|
||||
__ subi (i_minus1, i_minus1, 1);
|
||||
__ cmpwi (CCR0, i_minus1, 0);
|
||||
__ bge (CCR0, LOOP_DIAGONAL_SUM);
|
||||
|
||||
__ bind(SKIP_DIAGONAL_SUM);
|
||||
|
||||
// Shift back up and set low bit
|
||||
// Shifts 1 bit left up to len positions. Assumes no leading zeros
|
||||
// begin<primitiveLeftShift>
|
||||
__ cmpwi (CCR0, out_len, 0);
|
||||
__ ble (CCR0, SKIP_LSHIFT);
|
||||
__ li (i, 0);
|
||||
__ lwz (c, 0, out);
|
||||
// NOTE(review): CTR is loaded with out_len - 1, which is 0 when out_len == 1;
// presumably callers always pass out_len >= 2 - confirm.
|
||||
__ subi (b, out_len, 1);
|
||||
__ mtctr (b);
|
||||
|
||||
__ bind(LOOP_LSHIFT);
|
||||
// b = current word, c = following word; the top bit of c is shifted into b.
|
||||
__ mr (b, c);
|
||||
__ addi (cs, i, 4);
|
||||
__ lwzx (c, out, cs);
|
||||
|
||||
__ sldi (b, b, 1);
|
||||
__ srwi (cs, c, 31);
|
||||
__ orr (b, b, cs);
|
||||
__ stwx (b, i, out);
|
||||
|
||||
__ addi (i, i, 4);
|
||||
__ bdnz (LOOP_LSHIFT);
|
||||
|
||||
// The last word (byte index out_len*4 - 4) has no successor and is only shifted.
|
||||
__ sldi (c, out_len, 2);
|
||||
__ subi (c, c, 4);
|
||||
__ lwzx (b, out, c);
|
||||
__ sldi (b, b, 1);
|
||||
__ stwx (b, out, c);
|
||||
|
||||
__ bind(SKIP_LSHIFT);
|
||||
// end<primitiveLeftShift>
|
||||
|
||||
// Set low bit
|
||||
// Last output word |= (last input word & 1). This part is emitted after
// SKIP_LSHIFT, so it also runs when the shift above was skipped.
|
||||
__ sldi (i, in_len, 2);
|
||||
__ subi (i, i, 4);
|
||||
__ lwzx (i, in, i);
|
||||
__ sldi (c, out_len, 2);
|
||||
__ subi (c, c, 4);
|
||||
__ lwzx (b, out, c);
|
||||
|
||||
__ andi (i, i, 1);
|
||||
__ orr (i, b, i);
|
||||
|
||||
__ stwx (i, out, c);
|
||||
|
||||
// Restore non-volatile regs.
|
||||
// Same order and offsets as the save sequence above.
|
||||
current_offs = -8;
|
||||
__ ld(R28, current_offs, R1_SP); current_offs -= 8;
|
||||
__ ld(R27, current_offs, R1_SP); current_offs -= 8;
|
||||
__ ld(R26, current_offs, R1_SP); current_offs -= 8;
|
||||
__ ld(R25, current_offs, R1_SP); current_offs -= 8;
|
||||
__ ld(R24, current_offs, R1_SP); current_offs -= 8;
|
||||
__ ld(R23, current_offs, R1_SP); current_offs -= 8;
|
||||
__ ld(R22, current_offs, R1_SP); current_offs -= 8;
|
||||
__ ld(R21, current_offs, R1_SP); current_offs -= 8;
|
||||
__ ld(R20, current_offs, R1_SP); current_offs -= 8;
|
||||
__ ld(R19, current_offs, R1_SP); current_offs -= 8;
|
||||
__ ld(R18, current_offs, R1_SP); current_offs -= 8;
|
||||
__ ld(R17, current_offs, R1_SP); current_offs -= 8;
|
||||
__ ld(R16, current_offs, R1_SP); current_offs -= 8;
|
||||
__ ld(R15, current_offs, R1_SP); current_offs -= 8;
|
||||
__ ld(R14, current_offs, R1_SP);
|
||||
|
||||
// The stub hands back the 'out' address in the return register.
|
||||
__ mr(ret, out);
|
||||
__ blr();
|
||||
|
||||
return start;
|
||||
}
|
||||
|
||||
/**
|
||||
* Arguments:
|
||||
@ -3500,6 +3761,12 @@ class StubGenerator: public StubCodeGenerator {
|
||||
}
|
||||
#endif
|
||||
|
||||
if (UseSquareToLenIntrinsic) {
|
||||
StubRoutines::_squareToLen = generate_squareToLen();
|
||||
}
|
||||
if (UseMulAddIntrinsic) {
|
||||
StubRoutines::_mulAdd = generate_mulAdd();
|
||||
}
|
||||
if (UseMontgomeryMultiplyIntrinsic) {
|
||||
StubRoutines::_montgomeryMultiply
|
||||
= CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
|
||||
|
@ -258,6 +258,12 @@ void VM_Version::initialize() {
|
||||
FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
|
||||
}
|
||||
|
||||
if (FLAG_IS_DEFAULT(UseSquareToLenIntrinsic)) {
|
||||
UseSquareToLenIntrinsic = true;
|
||||
}
|
||||
if (FLAG_IS_DEFAULT(UseMulAddIntrinsic)) {
|
||||
UseMulAddIntrinsic = true;
|
||||
}
|
||||
if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
|
||||
UseMultiplyToLenIntrinsic = true;
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user