8130653: ppc: implement MultiplyToLen intrinsic

Reviewed-by: simonis
Peter Januschke 2015-07-07 10:40:09 +02:00 committed by Goetz Lindenmaier
parent b5284a93ce
commit a5d8b8bf25
7 changed files with 489 additions and 5 deletions


@@ -1,6 +1,6 @@
/*
- * Copyright (c) 2000, 2014, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2012, 2014 SAP AG. All rights reserved.
+ * Copyright (c) 2000, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2012, 2015 SAP AG. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -225,7 +225,7 @@ inline BasicObjectLock* frame::interpreter_frame_monitor_begin() const {
return (BasicObjectLock *) get_ijava_state();
}
-// SAPJVM ASc 2012-11-21. Return register stack slot addr at which currently interpreted method is found
+// Return register stack slot addr at which currently interpreted method is found.
inline Method** frame::interpreter_frame_method_addr() const {
return (Method**) &(get_ijava_state()->method);
}


@@ -3433,6 +3433,376 @@ void MacroAssembler::char_arrays_equalsImm(Register str1_reg, Register str2_reg,
bind(Ldone_false);
}
// dest_lo += src1 + src2
// dest_hi += carry1 + carry2
void MacroAssembler::add2_with_carry(Register dest_hi,
Register dest_lo,
Register src1, Register src2) {
li(R0, 0);
addc(dest_lo, dest_lo, src1);
adde(dest_hi, dest_hi, R0);
addc(dest_lo, dest_lo, src2);
adde(dest_hi, dest_hi, R0);
}
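
For reference, the two addc/adde pairs implement a 128-bit accumulate: each addend goes into the low word, and each carry-out is folded into the high word. A minimal C++ sketch of the same semantics (hypothetical helper name, assuming a compiler with the non-standard unsigned __int128 extension):

#include <cstdint>

// 128-bit accumulate: (dest_hi:dest_lo) += src1, then += src2.
static void add2_with_carry_ref(uint64_t &dest_hi, uint64_t &dest_lo,
                                uint64_t src1, uint64_t src2) {
  unsigned __int128 acc = ((unsigned __int128)dest_hi << 64) | dest_lo;
  acc += src1;  // addc + adde: carry out of the low word enters the high word
  acc += src2;  // second addc + adde pair
  dest_lo = (uint64_t)acc;
  dest_hi = (uint64_t)(acc >> 64);
}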
// Multiply 64 bit by 64 bit first loop.
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
Register x_xstart,
Register y, Register y_idx,
Register z,
Register carry,
Register product_high, Register product,
Register idx, Register kdx,
Register tmp) {
// jlong carry, x[], y[], z[];
// for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
// huge_128 product = y[idx] * x[xstart] + carry;
// z[kdx] = (jlong)product;
// carry = (jlong)(product >>> 64);
// }
// z[xstart] = carry;
Label L_first_loop, L_first_loop_exit;
Label L_one_x, L_one_y, L_multiply;
addic_(xstart, xstart, -1);
blt(CCR0, L_one_x); // Special case: length of x is 1.
// Load next two integers of x.
sldi(tmp, xstart, LogBytesPerInt);
ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
rldicl(x_xstart, x_xstart, 32, 0);
#endif
align(32, 16);
bind(L_first_loop);
cmpdi(CCR0, idx, 1);
blt(CCR0, L_first_loop_exit);
addi(idx, idx, -2);
beq(CCR0, L_one_y);
// Load next two integers of y.
sldi(tmp, idx, LogBytesPerInt);
ldx(y_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
rldicl(y_idx, y_idx, 32, 0);
#endif
bind(L_multiply);
multiply64(product_high, product, x_xstart, y_idx);
li(tmp, 0);
addc(product, product, carry); // Add carry to result.
adde(product_high, product_high, tmp); // Add carry of the last addition.
addi(kdx, kdx, -2);
// Store result.
#ifdef VM_LITTLE_ENDIAN
rldicl(product, product, 32, 0);
#endif
sldi(tmp, kdx, LogBytesPerInt);
stdx(product, z, tmp);
mr_if_needed(carry, product_high);
b(L_first_loop);
bind(L_one_y); // Load one 32 bit portion of y as (0,value).
lwz(y_idx, 0, y);
b(L_multiply);
bind(L_one_x); // Load one 32 bit portion of x as (0,value).
lwz(x_xstart, 0, x);
b(L_first_loop);
bind(L_first_loop_exit);
}
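
The pseudocode comment above corresponds to the following scalar C++ sketch (hypothetical reference routine, again assuming unsigned __int128); the emitted code additionally covers the single-element cases via L_one_x/L_one_y and performs the VM_LITTLE_ENDIAN word swaps:

#include <cstdint>

static void first_loop_ref(const uint64_t *x, int xstart,
                           const uint64_t *y, int ystart, uint64_t *z) {
  uint64_t carry = 0;
  for (int idx = ystart, kdx = ystart + 1 + xstart; idx >= 0; idx--, kdx--) {
    unsigned __int128 product = (unsigned __int128)y[idx] * x[xstart] + carry;
    z[kdx] = (uint64_t)product;          // low 64 bits
    carry  = (uint64_t)(product >> 64);  // high 64 bits
  }
  z[xstart] = carry;
}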
// Multiply 64 bit by 64 bit and add 128 bit.
void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
Register z, Register yz_idx,
Register idx, Register carry,
Register product_high, Register product,
Register tmp, int offset) {
// huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
// z[kdx] = (jlong)product;
sldi(tmp, idx, LogBytesPerInt);
if (offset) {
addi(tmp, tmp, offset);
}
ldx(yz_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
rldicl(yz_idx, yz_idx, 32, 0);
#endif
multiply64(product_high, product, x_xstart, yz_idx);
ldx(yz_idx, z, tmp);
#ifdef VM_LITTLE_ENDIAN
rldicl(yz_idx, yz_idx, 32, 0);
#endif
add2_with_carry(product_high, product, carry, yz_idx);
sldi(tmp, idx, LogBytesPerInt);
if (offset) {
addi(tmp, tmp, offset);
}
#ifdef VM_LITTLE_ENDIAN
rldicl(product, product, 32, 0);
#endif
stdx(product, z, tmp);
}
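
Stripped of the endian swaps and the optional byte offset, one call performs a single multiply-accumulate step over 64-bit chunks (sketch, names illustrative):

#include <cstdint>

// Returns the new carry; z[idx] receives the low half of the product sum.
static uint64_t multiply_add_step_ref(uint64_t x_xstart, const uint64_t *y,
                                      uint64_t *z, int idx, uint64_t carry) {
  unsigned __int128 product =
      (unsigned __int128)y[idx] * x_xstart + z[idx] + carry;
  z[idx] = (uint64_t)product;
  return (uint64_t)(product >> 64);
}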
// Multiply 128 bit by 128 bit. Unrolled inner loop.
void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
Register y, Register z,
Register yz_idx, Register idx, Register carry,
Register product_high, Register product,
Register carry2, Register tmp) {
// jlong carry, x[], y[], z[];
// int kdx = ystart+1;
// for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
// huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
// z[kdx+idx+1] = (jlong)product;
// jlong carry2 = (jlong)(product >>> 64);
// product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
// z[kdx+idx] = (jlong)product;
// carry = (jlong)(product >>> 64);
// }
// idx += 2;
// if (idx > 0) {
// product = (y[idx-1] * x_xstart) + z[kdx+idx-1] + carry;
// z[kdx+idx-1] = (jlong)product;
// carry = (jlong)(product >>> 64);
// }
Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
const Register jdx = R0;
// Scale the index.
srdi_(jdx, idx, 2);
beq(CCR0, L_third_loop_exit);
mtctr(jdx);
align(32, 16);
bind(L_third_loop);
addi(idx, idx, -4);
multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
mr_if_needed(carry2, product_high);
multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
mr_if_needed(carry, product_high);
bdnz(L_third_loop);
bind(L_third_loop_exit); // Handle any left-over operand parts.
andi_(idx, idx, 0x3);
beq(CCR0, L_post_third_loop_done);
Label L_check_1;
addic_(idx, idx, -2);
blt(CCR0, L_check_1);
multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
mr_if_needed(carry, product_high);
bind(L_check_1);
addi(idx, idx, 0x2);
andi_(idx, idx, 0x1);
addic_(idx, idx, -1);
blt(CCR0, L_post_third_loop_done);
sldi(tmp, idx, LogBytesPerInt);
lwzx(yz_idx, y, tmp);
multiply64(product_high, product, x_xstart, yz_idx);
lwzx(yz_idx, z, tmp);
add2_with_carry(product_high, product, yz_idx, carry);
sldi(tmp, idx, LogBytesPerInt);
stwx(product, z, tmp);
srdi(product, product, 32);
sldi(product_high, product_high, 32);
orr(product, product, product_high);
mr_if_needed(carry, product);
bind(L_post_third_loop_done);
} // multiply_128_x_128_loop
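
In plain C++ the unrolled loop plus the left-over handling reads as follows (a sketch under the same unsigned __int128 assumption; the machine code handles the final 32-bit limb with lwzx/stwx rather than a 64-bit access):

#include <cstdint>

static uint64_t third_loop_ref(uint64_t x_xstart, const uint64_t *y,
                               uint64_t *z, int ystart, uint64_t carry) {
  const int kdx = ystart + 1;
  unsigned __int128 product;
  int idx;
  for (idx = ystart - 2; idx >= 0; idx -= 2) {  // two limbs per iteration
    product = (unsigned __int128)y[idx + 1] * x_xstart
            + z[kdx + idx + 1] + carry;
    z[kdx + idx + 1] = (uint64_t)product;
    uint64_t carry2 = (uint64_t)(product >> 64);
    product = (unsigned __int128)y[idx] * x_xstart + z[kdx + idx] + carry2;
    z[kdx + idx] = (uint64_t)product;
    carry = (uint64_t)(product >> 64);
  }
  idx += 2;
  if (idx > 0) {  // one limb left over
    product = (unsigned __int128)y[idx - 1] * x_xstart
            + z[kdx + idx - 1] + carry;
    z[kdx + idx - 1] = (uint64_t)product;
    carry = (uint64_t)(product >> 64);
  }
  return carry;
}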
void MacroAssembler::multiply_to_len(Register x, Register xlen,
Register y, Register ylen,
Register z, Register zlen,
Register tmp1, Register tmp2,
Register tmp3, Register tmp4,
Register tmp5, Register tmp6,
Register tmp7, Register tmp8,
Register tmp9, Register tmp10,
Register tmp11, Register tmp12,
Register tmp13) {
ShortBranchVerifier sbv(this);
assert_different_registers(x, xlen, y, ylen, z, zlen,
tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
assert_different_registers(x, xlen, y, ylen, z, zlen,
tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
assert_different_registers(x, xlen, y, ylen, z, zlen,
tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
const Register idx = tmp1;
const Register kdx = tmp2;
const Register xstart = tmp3;
const Register y_idx = tmp4;
const Register carry = tmp5;
const Register product = tmp6;
const Register product_high = tmp7;
const Register x_xstart = tmp8;
const Register tmp = tmp9;
// First Loop.
//
// final static long LONG_MASK = 0xffffffffL;
// int xstart = xlen - 1;
// int ystart = ylen - 1;
// long carry = 0;
// for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
// long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
// z[kdx] = (int)product;
// carry = product >>> 32;
// }
// z[xstart] = (int)carry;
mr_if_needed(idx, ylen); // idx = ylen
mr_if_needed(kdx, zlen); // kdx = xlen + ylen
li(carry, 0); // carry = 0
Label L_done;
addic_(xstart, xlen, -1);
blt(CCR0, L_done);
multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
carry, product_high, product, idx, kdx, tmp);
Label L_second_loop;
cmpdi(CCR0, kdx, 0);
beq(CCR0, L_second_loop);
Label L_carry;
addic_(kdx, kdx, -1);
beq(CCR0, L_carry);
// Store lower 32 bits of carry.
sldi(tmp, kdx, LogBytesPerInt);
stwx(carry, z, tmp);
srdi(carry, carry, 32);
addi(kdx, kdx, -1);
bind(L_carry);
// Store upper 32 bits of carry.
sldi(tmp, kdx, LogBytesPerInt);
stwx(carry, z, tmp);
// Second and third (nested) loops.
//
// for (int i = xstart-1; i >= 0; i--) { // Second loop
// carry = 0;
// for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
// long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
// (z[k] & LONG_MASK) + carry;
// z[k] = (int)product;
// carry = product >>> 32;
// }
// z[i] = (int)carry;
// }
//
// Register mapping: i = xstart, j = idx (tmp1), k = kdx (tmp2), carry = tmp5, x[i] = x_xstart.
bind(L_second_loop);
li(carry, 0); // carry = 0;
addic_(xstart, xstart, -1); // i = xstart-1;
blt(CCR0, L_done);
Register zsave = tmp10;
mr(zsave, z);
Label L_last_x;
sldi(tmp, xstart, LogBytesPerInt);
add(z, z, tmp); // z = z + k - j
addi(z, z, 4);
addic_(xstart, xstart, -1); // i = xstart-1;
blt(CCR0, L_last_x);
sldi(tmp, xstart, LogBytesPerInt);
ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
rldicl(x_xstart, x_xstart, 32, 0);
#endif
Label L_third_loop_prologue;
bind(L_third_loop_prologue);
Register xsave = tmp11;
Register xlensave = tmp12;
Register ylensave = tmp13;
mr(xsave, x);
mr(xlensave, xstart);
mr(ylensave, ylen);
multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
carry, product_high, product, x, tmp);
mr(z, zsave);
mr(x, xsave);
mr(xlen, xlensave); // This is the decrement of the loop counter!
mr(ylen, ylensave);
addi(tmp3, xlen, 1);
sldi(tmp, tmp3, LogBytesPerInt);
stwx(carry, z, tmp);
addic_(tmp3, tmp3, -1);
blt(CCR0, L_done);
srdi(carry, carry, 32);
sldi(tmp, tmp3, LogBytesPerInt);
stwx(carry, z, tmp);
b(L_second_loop);
// Next infrequent code is moved outside loops.
bind(L_last_x);
lwz(x_xstart, 0, x);
b(L_third_loop_prologue);
bind(L_done);
} // multiply_to_len
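
Taken together, the first loop and the second/third loop nest implement the schoolbook multiplication of BigInteger.multiplyToLen on 32-bit limbs; the assembly merely processes them two at a time. A compact C++ rendering of the overall algorithm (illustrative only; big-endian limb order, z must hold xlen + ylen ints):

#include <cstdint>

static void multiply_to_len_ref(const uint32_t *x, int xlen,
                                const uint32_t *y, int ylen, uint32_t *z) {
  int xstart = xlen - 1;
  int ystart = ylen - 1;

  uint64_t carry = 0;                       // first loop
  for (int j = ystart, k = ystart + 1 + xstart; j >= 0; j--, k--) {
    uint64_t product = (uint64_t)y[j] * x[xstart] + carry;
    z[k] = (uint32_t)product;
    carry = product >> 32;
  }
  z[xstart] = (uint32_t)carry;

  for (int i = xstart - 1; i >= 0; i--) {   // second loop
    carry = 0;
    for (int j = ystart, k = ystart + 1 + i; j >= 0; j--, k--) {  // third loop
      uint64_t product = (uint64_t)y[j] * x[i] + z[k] + carry;
      z[k] = (uint32_t)product;
      carry = product >> 32;
    }
    z[i] = (uint32_t)carry;
  }
}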
void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
#ifdef ASSERT


@@ -677,6 +677,31 @@ class MacroAssembler: public Assembler {
void char_arrays_equalsImm(Register str1_reg, Register str2_reg, int cntval, Register result_reg,
Register tmp1_reg, Register tmp2_reg);
// Emitters for BigInteger.multiplyToLen intrinsic.
inline void multiply64(Register dest_hi, Register dest_lo,
Register x, Register y);
void add2_with_carry(Register dest_hi, Register dest_lo,
Register src1, Register src2);
void multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
Register y, Register y_idx, Register z,
Register carry, Register product_high, Register product,
Register idx, Register kdx, Register tmp);
void multiply_add_128_x_128(Register x_xstart, Register y, Register z,
Register yz_idx, Register idx, Register carry,
Register product_high, Register product, Register tmp,
int offset);
void multiply_128_x_128_loop(Register x_xstart,
Register y, Register z,
Register yz_idx, Register idx, Register carry,
Register product_high, Register product,
Register carry2, Register tmp);
void multiply_to_len(Register x, Register xlen,
Register y, Register ylen,
Register z, Register zlen,
Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5,
Register tmp6, Register tmp7, Register tmp8, Register tmp9, Register tmp10,
Register tmp11, Register tmp12, Register tmp13);
//
// Debugging
//


@@ -423,6 +423,13 @@ inline void MacroAssembler::trap_range_check_ge(Register a, int si16) {
twi(traptoEqual | traptoGreaterThanUnsigned, a/*reg a*/, si16);
}
// unsigned integer multiplication 64*64 -> 128 bits
inline void MacroAssembler::multiply64(Register dest_hi, Register dest_lo,
Register x, Register y) {
mulld(dest_lo, x, y);
mulhdu(dest_hi, x, y);
}
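
mulld and mulhdu yield the low and high 64 bits of the full unsigned 128-bit product, i.e. (sketch assuming unsigned __int128):

#include <cstdint>

static inline void multiply64_ref(uint64_t &dest_hi, uint64_t &dest_lo,
                                  uint64_t x, uint64_t y) {
  unsigned __int128 p = (unsigned __int128)x * y;
  dest_lo = (uint64_t)p;          // mulld: low 64 bits
  dest_hi = (uint64_t)(p >> 64);  // mulhdu: high 64 bits (unsigned)
}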
#if defined(ABI_ELFv2)
inline address MacroAssembler::function_entry() { return pc(); }
#else


@@ -10930,7 +10930,7 @@ instruct partialSubtypeCheck(iRegPdst result, iRegP_N2P subklass, iRegP_N2P supe
instruct cmpFastLock(flagsReg crx, iRegPdst oop, iRegPdst box, iRegPdst tmp1, iRegPdst tmp2, iRegPdst tmp3) %{
match(Set crx (FastLock oop box));
effect(TEMP tmp1, TEMP tmp2, TEMP tmp3);
-predicate(/*(!UseNewFastLockPPC64 || UseBiasedLocking) &&*/ !Compile::current()->use_rtm());
+predicate(!Compile::current()->use_rtm());
format %{ "FASTLOCK $oop, $box, $tmp1, $tmp2, $tmp3" %}
ins_encode %{


@@ -2053,6 +2053,79 @@ class StubGenerator: public StubCodeGenerator {
__ blr();
}
// Stub for BigInteger::multiplyToLen()
//
// Arguments:
//
// Input:
// R3 - x address
// R4 - x length
// R5 - y address
// R6 - y length
// R7 - z address
// R8 - z length
//
address generate_multiplyToLen() {
StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
address start = __ function_entry();
const Register x = R3;
const Register xlen = R4;
const Register y = R5;
const Register ylen = R6;
const Register z = R7;
const Register zlen = R8;
const Register tmp1 = R2; // TOC not used.
const Register tmp2 = R9;
const Register tmp3 = R10;
const Register tmp4 = R11;
const Register tmp5 = R12;
// non-volatile regs
const Register tmp6 = R31;
const Register tmp7 = R30;
const Register tmp8 = R29;
const Register tmp9 = R28;
const Register tmp10 = R27;
const Register tmp11 = R26;
const Register tmp12 = R25;
const Register tmp13 = R24;
BLOCK_COMMENT("Entry:");
// Save non-volatile regs (frameless).
int current_offs = 8;
__ std(R24, -current_offs, R1_SP); current_offs += 8;
__ std(R25, -current_offs, R1_SP); current_offs += 8;
__ std(R26, -current_offs, R1_SP); current_offs += 8;
__ std(R27, -current_offs, R1_SP); current_offs += 8;
__ std(R28, -current_offs, R1_SP); current_offs += 8;
__ std(R29, -current_offs, R1_SP); current_offs += 8;
__ std(R30, -current_offs, R1_SP); current_offs += 8;
__ std(R31, -current_offs, R1_SP);
__ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5,
tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13);
// Restore non-volatile regs.
current_offs = 8;
__ ld(R24, -current_offs, R1_SP); current_offs += 8;
__ ld(R25, -current_offs, R1_SP); current_offs += 8;
__ ld(R26, -current_offs, R1_SP); current_offs += 8;
__ ld(R27, -current_offs, R1_SP); current_offs += 8;
__ ld(R28, -current_offs, R1_SP); current_offs += 8;
__ ld(R29, -current_offs, R1_SP); current_offs += 8;
__ ld(R30, -current_offs, R1_SP); current_offs += 8;
__ ld(R31, -current_offs, R1_SP);
__ blr(); // Return to caller.
return start;
}
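
With current_offs starting at 8 and growing by 8 per store, the frameless spill area below the stack pointer looks like this (derived from the code above):

// Spill slot layout relative to R1_SP:
//   R24 -> SP-8    R25 -> SP-16   R26 -> SP-24   R27 -> SP-32
//   R28 -> SP-40   R29 -> SP-48   R30 -> SP-56   R31 -> SP-64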
// Initialization
void generate_initial() {
// Generates all stubs and initializes the entry points
@@ -2102,6 +2175,12 @@ class StubGenerator: public StubCodeGenerator {
generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
&StubRoutines::_safefetchN_fault_pc,
&StubRoutines::_safefetchN_continuation_pc);
#ifdef COMPILER2
if (UseMultiplyToLenIntrinsic) {
StubRoutines::_multiplyToLen = generate_multiplyToLen();
}
#endif
}
public:


@@ -198,6 +198,10 @@ void VM_Version::initialize() {
FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
}
if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
UseMultiplyToLenIntrinsic = true;
}
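
The default can still be overridden on the command line, for example to compare against the pure-Java path (standard HotSpot flag syntax; MyBigIntegerBenchmark is a placeholder class name):

java -XX:-UseMultiplyToLenIntrinsic MyBigIntegerBenchmark
java -XX:+PrintFlagsFinal -version | grep UseMultiplyToLenIntrinsic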
// Adjust RTM (Restricted Transactional Memory) flags.
if (!has_tcheck() && UseRTMLocking) {
// Can't continue because UseRTMLocking affects UseBiasedLocking flag
@@ -228,7 +232,6 @@
warning("RTMAbortRatio must be in the range 0 to 100, resetting it to 50");
FLAG_SET_DEFAULT(RTMAbortRatio, 50);
}
-FLAG_SET_ERGO(bool, UseNewFastLockPPC64, false); // Does not implement TM.
guarantee(RTMSpinLoopCount > 0, "unsupported");
#else
// Only C2 does RTM locking optimization.