8277175: Add a parallel multiply method to BigInteger

Reviewed-by: psandoz
This commit is contained in:
Dr Heinz M. Kabutz 2022-02-11 18:49:04 +00:00 committed by Paul Sandoz
parent 0786ddb471
commit 83ffbd2e7a
4 changed files with 601 additions and 19 deletions

View File

@ -36,6 +36,9 @@ import java.io.ObjectStreamField;
import java.util.Arrays;
import java.util.Objects;
import java.util.Random;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.ForkJoinWorkerThread;
import java.util.concurrent.RecursiveTask;
import java.util.concurrent.ThreadLocalRandom;
import jdk.internal.math.DoubleConsts;
@ -1581,7 +1584,30 @@ public class BigInteger extends Number implements Comparable<BigInteger> {
* @return {@code this * val}
public BigInteger multiply(BigInteger val) {
return multiply(val, false);
return multiply(val, false, false, 0);
* Returns a BigInteger whose value is {@code (this * val)}.
* When both {@code this} and {@code val} are large, typically
* in the thousands of bits, parallel multiply might be used.
* This method returns the exact same mathematical result as
* {@link #multiply}.
* @implNote This implementation may offer better algorithmic
* performance when {@code val == this}.
* @implNote Compared to {@link #multiply}, an implementation's
* parallel multiplication algorithm would typically use more
* CPU resources to compute the result faster, and may do so
* with a slight increase in memory consumption.
* @param val value to be multiplied by this BigInteger.
* @return {@code this * val}
* @see #multiply
public BigInteger parallelMultiply(BigInteger val) {
return multiply(val, false, true, 0);
@ -1590,16 +1616,17 @@ public class BigInteger extends Number implements Comparable<BigInteger> {
* @param val value to be multiplied by this BigInteger.
* @param isRecursion whether this is a recursive invocation
* @param parallel whether the multiply should be done in parallel
* @param depth the current recursion depth
* @return {@code this * val}
private BigInteger multiply(BigInteger val, boolean isRecursion) {
private BigInteger multiply(BigInteger val, boolean isRecursion, boolean parallel, int depth) {
if (val.signum == 0 || signum == 0)
return ZERO;
int xlen = mag.length;
if (val == this && xlen > MULTIPLY_SQUARE_THRESHOLD) {
return square();
return square(true, parallel, depth);
int ylen = val.mag.length;
@ -1677,7 +1704,7 @@ public class BigInteger extends Number implements Comparable<BigInteger> {
return multiplyToomCook3(this, val);
return multiplyToomCook3(this, val, parallel, depth);
@ -1844,6 +1871,88 @@ public class BigInteger extends Number implements Comparable<BigInteger> {
private abstract static sealed class RecursiveOp extends RecursiveTask<BigInteger> {
* The recursion depth up to which recursive ops continue to be forked
* when parallel is true. This threshold is only relevant for Toom Cook 3
* multiply and square.
private static final int PARALLEL_FORK_DEPTH_THRESHOLD =
private static final int calculateMaximumDepth(int parallelism) {
return 32 - Integer.numberOfLeadingZeros(parallelism);
final boolean parallel;
* The current recursion depth. Because the depth of this algorithm grows
* only logarithmically, a byte is enough to hold it; an int is not needed.
final byte depth;
private RecursiveOp(boolean parallel, int depth) {
this.parallel = parallel;
this.depth = (byte) depth;
private static int getParallelForkDepthThreshold() {
if (Thread.currentThread() instanceof ForkJoinWorkerThread fjwt) {
return calculateMaximumDepth(fjwt.getPool().getParallelism());
else {
protected RecursiveTask<BigInteger> forkOrInvoke() {
if (parallel && depth <= getParallelForkDepthThreshold()) fork();
else invoke();
return this;
private static final class RecursiveMultiply extends RecursiveOp {
private final BigInteger a;
private final BigInteger b;
public RecursiveMultiply(BigInteger a, BigInteger b, boolean parallel, int depth) {
super(parallel, depth);
this.a = a;
this.b = b;
public BigInteger compute() {
return a.multiply(b, true, parallel, depth);
private static final class RecursiveSquare extends RecursiveOp {
private final BigInteger a;
public RecursiveSquare(BigInteger a, boolean parallel, int depth) {
super(parallel, depth);
this.a = a;
public BigInteger compute() {
return a.square(true, parallel, depth);
private static RecursiveTask<BigInteger> multiply(BigInteger a, BigInteger b, boolean parallel, int depth) {
return new RecursiveMultiply(a, b, parallel, depth).forkOrInvoke();
private static RecursiveTask<BigInteger> square(BigInteger a, boolean parallel, int depth) {
return new RecursiveSquare(a, parallel, depth).forkOrInvoke();
* Multiplies two BigIntegers using a 3-way Toom-Cook multiplication
* algorithm. This is a recursive divide-and-conquer algorithm which is
@ -1872,7 +1981,7 @@ public class BigInteger extends Number implements Comparable<BigInteger> {
* LNCS #4547. Springer, Madrid, Spain, June 21-22, 2007.
private static BigInteger multiplyToomCook3(BigInteger a, BigInteger b) {
private static BigInteger multiplyToomCook3(BigInteger a, BigInteger b, boolean parallel, int depth) {
int alen = a.mag.length;
int blen = b.mag.length;
@ -1896,16 +2005,20 @@ public class BigInteger extends Number implements Comparable<BigInteger> {
BigInteger v0, v1, v2, vm1, vinf, t1, t2, tm1, da1, db1;
v0 = a0.multiply(b0, true);
var v0_task = RecursiveOp.multiply(a0, b0, parallel, depth);
da1 = a2.add(a0);
db1 = b2.add(b0);
vm1 = da1.subtract(a1).multiply(db1.subtract(b1), true);
var vm1_task = RecursiveOp.multiply(da1.subtract(a1), db1.subtract(b1), parallel, depth);
da1 = da1.add(a1);
db1 = db1.add(b1);
v1 = da1.multiply(db1, true);
var v1_task = RecursiveOp.multiply(da1, db1, parallel, depth);
v2 = da1.add(a2).shiftLeft(1).subtract(a0).multiply(
db1.add(b2).shiftLeft(1).subtract(b0), true);
vinf = a2.multiply(b2, true);
db1.add(b2).shiftLeft(1).subtract(b0), true, parallel, depth);
vinf = a2.multiply(b2, true, parallel, depth);
v0 = v0_task.join();
vm1 = vm1_task.join();
v1 = v1_task.join();
// The algorithm requires two divisions by 2 and one by 3.
// All divisions are known to be exact, that is, they do not produce
@ -2071,7 +2184,7 @@ public class BigInteger extends Number implements Comparable<BigInteger> {
* @return <code>this<sup>2</sup></code>
private BigInteger square() {
return square(false);
return square(false, false, 0);
@ -2081,7 +2194,7 @@ public class BigInteger extends Number implements Comparable<BigInteger> {
* @param isRecursion whether this is a recursive invocation
* @return <code>this<sup>2</sup></code>
private BigInteger square(boolean isRecursion) {
private BigInteger square(boolean isRecursion, boolean parallel, int depth) {
if (signum == 0) {
return ZERO;
@ -2103,7 +2216,7 @@ public class BigInteger extends Number implements Comparable<BigInteger> {
return squareToomCook3();
return squareToomCook3(parallel, depth);
@ -2237,7 +2350,7 @@ public class BigInteger extends Number implements Comparable<BigInteger> {
* that has better asymptotic performance than the algorithm used in
* squareToLen or squareKaratsuba.
private BigInteger squareToomCook3() {
private BigInteger squareToomCook3(boolean parallel, int depth) {
int len = mag.length;
// k is the size (in ints) of the lower-order slices.
@ -2254,13 +2367,17 @@ public class BigInteger extends Number implements Comparable<BigInteger> {
a0 = getToomSlice(k, r, 2, len);
BigInteger v0, v1, v2, vm1, vinf, t1, t2, tm1, da1;
v0 = a0.square(true);
var v0_fork = RecursiveOp.square(a0, parallel, depth);
da1 = a2.add(a0);
vm1 = da1.subtract(a1).square(true);
var vm1_fork = RecursiveOp.square(da1.subtract(a1), parallel, depth);
da1 = da1.add(a1);
v1 = da1.square(true);
vinf = a2.square(true);
v2 = da1.add(a2).shiftLeft(1).subtract(a0).square(true);
var v1_fork = RecursiveOp.square(da1, parallel, depth);
vinf = a2.square(true, parallel, depth);
v2 = da1.add(a2).shiftLeft(1).subtract(a0).square(true, parallel, depth);
v0 = v0_fork.join();
vm1 = vm1_fork.join();
v1 = v1_fork.join();
// The algorithm requires two divisions by 2 and one by 3.
// All divisions are known to be exact, that is, they do not produce

View File

@ -0,0 +1,82 @@
* Copyright (c) 1998, 2020, Oracle and/or its affiliates. All rights reserved.
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
* @test
* @run main BigIntegerParallelMultiplyTest
* @summary tests parallelMultiply() method in BigInteger
* @author Heinz Kabutz heinz@javaspecialists.eu
import java.math.BigInteger;
import java.util.function.BinaryOperator;
* This is a simple test class created to ensure that the results
* of multiply() are the same as parallelMultiply(). We calculate
* the Fibonacci numbers using Dijkstra's sum of squares to get
* very large numbers (hundreds of thousands of bits).
* @author Heinz Kabutz, heinz@javaspecialists.eu
public class BigIntegerParallelMultiplyTest {
    /**
     * Computes the n-th Fibonacci number by recursive halving, performing
     * every multiplication through the supplied operator.
     * <p>
     * With {@code half = (n + 1) / 2}, {@code f0 = fibonacci(half - 1)} and
     * {@code f1 = fibonacci(half)}, the result is {@code f0*f0 + f1*f1} for
     * odd {@code n} and {@code (2*f0 + f1) * f1} for even {@code n}.
     *
     * @param n index of the Fibonacci number; must not be negative
     * @param multiplyOperator the multiplication under test
     * @return the n-th Fibonacci number
     * @throws IllegalArgumentException if {@code n} is negative
     */
    public static BigInteger fibonacci(int n, BinaryOperator<BigInteger> multiplyOperator) {
        // A negative n never reaches a base case (n = -1 recurses on -1 again),
        // so reject it up front instead of failing with a StackOverflowError.
        if (n < 0) {
            throw new IllegalArgumentException("n must not be negative but was " + n);
        }
        if (n == 0) return BigInteger.ZERO;
        if (n == 1) return BigInteger.ONE;

        int half = (n + 1) / 2;
        BigInteger f0 = fibonacci(half - 1, multiplyOperator);
        BigInteger f1 = fibonacci(half, multiplyOperator);
        if (n % 2 == 1) {
            BigInteger b0 = multiplyOperator.apply(f0, f0);
            BigInteger b1 = multiplyOperator.apply(f1, f1);
            return b0.add(b1);
        } else {
            BigInteger b0 = f0.shiftLeft(1).add(f1);
            return multiplyOperator.apply(b0, f1);
        }
    }

    /**
     * Compares multiply() and parallelMultiply() for Fibonacci numbers of
     * increasing size, checking each against a known bit count.
     */
    public static void main(String[] args) throws Exception {
        compare(1000, 324);
        compare(10_000, 3473);
        compare(100_000, 34883);
        compare(1_000_000, 347084);
    }

    /**
     * Computes fibonacci(n) once with each multiply method, checks both
     * results against the expected bit count and against each other.
     *
     * @param n index of the Fibonacci number to compute
     * @param expectedBitCount expected {@code bitCount()} of the result
     * @throws AssertionError if a bit count is wrong or the results differ
     */
    private static void compare(int n, int expectedBitCount) {
        BigInteger multiplyResult = fibonacci(n, BigInteger::multiply);
        BigInteger parallelMultiplyResult = fibonacci(n, BigInteger::parallelMultiply);
        checkBitCount(n, expectedBitCount, multiplyResult);
        checkBitCount(n, expectedBitCount, parallelMultiplyResult);
        if (!multiplyResult.equals(parallelMultiplyResult)) {
            throw new AssertionError("multiply() and parallelMultiply() give different results");
        }
    }

    /**
     * Fails if the number of one-bits in {@code number} is not the expected one.
     *
     * @param n index of the Fibonacci number, used only in the message
     * @param expectedBitCount expected {@code bitCount()} of {@code number}
     * @param number the value to check
     * @throws AssertionError if the bit count differs
     */
    private static void checkBitCount(int n, int expectedBitCount, BigInteger number) {
        // bitCount() walks the whole magnitude; compute it once.
        int actualBitCount = number.bitCount();
        if (actualBitCount != expectedBitCount) {
            throw new AssertionError(
                    "bitCount of fibonacci(" + n + ") was expected to be " + expectedBitCount
                            + " but was " + actualBitCount);
        }
    }
}

View File

@ -0,0 +1,322 @@
package org.openjdk.bench.java.math;
import javax.management.MBeanServer;
import javax.management.MalformedObjectNameException;
import javax.management.ObjectName;
import java.lang.management.ManagementFactory;
import java.lang.management.ThreadMXBean;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.math.BigInteger;
import java.util.Arrays;
import java.util.IdentityHashMap;
import java.util.List;
import java.util.Locale;
import java.util.LongSummaryStatistics;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.ForkJoinWorkerThread;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.BinaryOperator;
import java.util.function.LongUnaryOperator;
import java.util.stream.Collectors;
import static java.util.concurrent.ForkJoinPool.defaultForkJoinWorkerThreadFactory;
* Benchmark for checking performance difference between sequential and parallel
* multiply of very large Mersenne primes using BigInteger. We want to measure
* real time, user time, system time and the amount of memory allocated. To
* calculate this, we create our own thread factory for the common ForkJoinPool
* and then use that to measure user time, cpu time and bytes allocated.
* <p>
* We use reflection to discover all methods that match "*ultiply", and use them
* to multiply two very large Mersenne primes together.
* <p>
* <h3>Results on a 1-6-2 machine running Ubuntu linux</h3>
* <p>
* Memory allocation increased from 83.9GB to 84GB, for both the sequential and
* parallel versions. This is an increase of just 0.1%. On this machine, the
* parallel version was 3.8x faster in latency (real time), but it used 2.7x
* more CPU resources.
* <p>
* Testing multiplying Mersenne primes of 2^57885161-1 and 2^82589933-1
* <p>
* <pre>
* openjdk version "18-internal" 2022-03-15
* BigInteger.parallelMultiply()
* real 0m6.288s
* user 1m3.010s
* sys 0m0.027s
* mem 84.0GB
* BigInteger.multiply()
* real 0m23.682s
* user 0m23.530s
* sys 0m0.004s
* mem 84.0GB
* openjdk version "1.8.0_302"
* BigInteger.multiply()
* real 0m25.657s
* user 0m25.390s
* sys 0m0.001s
* mem 83.9GB
* openjdk version ""
* BigInteger.multiply()
* real 0m24.907s
* user 0m24.700s
* sys 0m0.001s
* mem 83.9GB
* openjdk version "10.0.2" 2018-07-17
* BigInteger.multiply()
* real 0m24.632s
* user 0m24.380s
* sys 0m0.004s
* mem 83.9GB
* openjdk version "11.0.12" 2021-07-20 LTS
* BigInteger.multiply()
* real 0m22.114s
* user 0m21.930s
* sys 0m0.001s
* mem 83.9GB
* openjdk version "12.0.2" 2019-07-16
* BigInteger.multiply()
* real 0m23.015s
* user 0m22.830s
* sys 0m0.000s
* mem 83.9GB
* openjdk version "13.0.9" 2021-10-19
* BigInteger.multiply()
* real 0m23.548s
* user 0m23.350s
* sys 0m0.005s
* mem 83.9GB
* openjdk version "14.0.2" 2020-07-14
* BigInteger.multiply()
* real 0m22.918s
* user 0m22.530s
* sys 0m0.131s
* mem 83.9GB
* openjdk version "15.0.5" 2021-10-19
* BigInteger.multiply()
* real 0m22.038s
* user 0m21.750s
* sys 0m0.003s
* mem 83.9GB
* openjdk version "16.0.2" 2021-07-20
* BigInteger.multiply()
* real 0m23.049s
* user 0m22.760s
* sys 0m0.006s
* mem 83.9GB
* openjdk version "17" 2021-09-14
* BigInteger.multiply()
* real 0m22.580s
* user 0m22.310s
* sys 0m0.001s
* mem 83.9GB
* @author Heinz Kabutz, heinz@javaspecialists.eu
public class BigIntegerMersennePrimeMultiply implements ForkJoinPool.ForkJoinWorkerThreadFactory {
// Large Mersenne prime discovered by Curtis Cooper in 2013
private static final int EXPONENT_1 = 57885161;
private static final BigInteger MERSENNE_1 =
// Largest Mersenne prime number discovered by Patrick Laroche in 2018
private static final int EXPONENT_2 = 82589933;
private static final BigInteger MERSENNE_2 =
private static boolean DEBUG = false;
public static void main(String... args) {
System.out.println("Testing multiplying Mersenne primes of " +
"2^" + EXPONENT_1 + "-1 and 2^" + EXPONENT_2 + "-1");
System.out.println("Using the following multiply methods:");
List<Method> methods = Arrays.stream(BigInteger.class.getMethods())
.filter(method -> method.getName().endsWith("ultiply") &&
method.getParameterCount() == 1 &&
method.getParameterTypes()[0] == BigInteger.class)
.peek(method -> System.out.println(" " + method))
for (int i = 0; i < 3; i++) {
private static void test(Method method) {
BinaryOperator<BigInteger> multiplyOperator = (a, b) -> {
try {
return (BigInteger) method.invoke(a, b);
} catch (IllegalAccessException e) {
throw new AssertionError(e);
} catch (InvocationTargetException e) {
throw new AssertionError(e.getCause());
test(method.getName(), multiplyOperator);
private static void test(String description,
BinaryOperator<BigInteger> multiplyOperator) {
System.out.println("BigInteger." + description + "()");
long elapsedTimeInNanos = System.nanoTime();
try {
BigInteger result1 = multiplyOperator.apply(MERSENNE_1, MERSENNE_2);
BigInteger result2 = multiplyOperator.apply(MERSENNE_2, MERSENNE_1);
if (result1.bitLength() != 140475094)
throw new AssertionError("Expected bitLength: 140475094, " +
"but was " + result1.bitLength());
if (result2.bitLength() != 140475094)
throw new AssertionError("Expected bitLength: 140475094, " +
"but was " + result1.bitLength());
} finally {
elapsedTimeInNanos = System.nanoTime() - elapsedTimeInNanos;
LongSummaryStatistics userTimeStatistics = getStatistics(userTime);
LongSummaryStatistics cpuTimeStatistics = getStatistics(cpuTime);
LongSummaryStatistics memoryAllocationStatistics = getStatistics(bytes);
System.out.println("real " + formatTime(elapsedTimeInNanos));
System.out.println("user " + formatTime(userTimeStatistics.getSum()));
System.out.println("sys " +
formatTime(cpuTimeStatistics.getSum() - userTimeStatistics.getSum()));
System.out.println("mem " + formatMemory(memoryAllocationStatistics.getSum(), 1));
private static LongSummaryStatistics getStatistics(Map<Thread, AtomicLong> timeMap) {
return timeMap.entrySet()
.peek(entry -> {
long timeInMs = (counterExtractorMap.get(timeMap)
- entry.getValue().get());
private static void printTime(Map.Entry<Thread, AtomicLong> threadCounter) {
if (DEBUG)
System.out.printf("%s %d%n", threadCounter.getKey(), threadCounter.getValue()
private static void addCounters(Thread thread) {
counterExtractorMap.forEach((map, timeExtractor) -> add(map, thread, timeExtractor));
private static void add(Map<Thread, AtomicLong> time, Thread thread,
LongUnaryOperator timeExtractor) {
time.put(thread, new AtomicLong(timeExtractor.applyAsLong(thread.getId())));
private static void resetAllCounters() {
private static void resetTimes(Map<Thread, AtomicLong> timeMap, LongUnaryOperator timeMethod) {
timeMap.forEach((thread, time) ->
private static final Map<Thread, AtomicLong> userTime =
new ConcurrentHashMap<>();
private static final Map<Thread, AtomicLong> cpuTime =
new ConcurrentHashMap<>();
private static final Map<Thread, AtomicLong> bytes =
new ConcurrentHashMap<>();
private static final ThreadMXBean tmb = ManagementFactory.getThreadMXBean();
private static final Map<Map<Thread, AtomicLong>, LongUnaryOperator> counterExtractorMap =
new IdentityHashMap<>();
static {
counterExtractorMap.put(userTime, tmb::getThreadUserTime);
counterExtractorMap.put(cpuTime, tmb::getThreadCpuTime);
counterExtractorMap.put(bytes, BigIntegerMersennePrimeMultiply::threadAllocatedBytes);
/**
 * Creates a worker thread for the given pool by delegating to
 * defaultForkJoinWorkerThreadFactory.
 *
 * @param pool the pool the new thread will work in
 * @return the thread produced by the default factory
 */
public final ForkJoinWorkerThread newThread(ForkJoinPool pool) {
ForkJoinWorkerThread thread = defaultForkJoinWorkerThreadFactory.newThread(pool);
// NOTE(review): the thread is returned without being passed to
// addCounters(Thread), and no other visible code registers threads in the
// counter maps -- confirm whether a registration call belongs here.
return thread;
private static final String[] SIGNATURE = new String[]{long.class.getName()};
private static final MBeanServer mBeanServer;
private static final ObjectName name;
static {
try {
name = new ObjectName(ManagementFactory.THREAD_MXBEAN_NAME);
mBeanServer = ManagementFactory.getPlatformMBeanServer();
} catch (MalformedObjectNameException e) {
throw new ExceptionInInitializerError(e);
public static long threadAllocatedBytes(long threadId) {
try {
return (long) mBeanServer.invoke(
new Object[]{threadId},
} catch (Exception e) {
throw new IllegalArgumentException(e);
public static String formatMemory(double bytes, int decimals) {
double val;
String unitStr;
if (bytes < 1024) {
val = bytes;
unitStr = "B";
} else if (bytes < 1024 * 1024) {
val = bytes / 1024;
unitStr = "KB";
} else if (bytes < 1024 * 1024 * 1024) {
val = bytes / (1024 * 1024);
unitStr = "MB";
} else if (bytes < 1024 * 1024 * 1024 * 1024L) {
val = bytes / (1024 * 1024 * 1024L);
unitStr = "GB";
} else {
val = bytes / (1024 * 1024 * 1024 * 1024L);
unitStr = "TB";
return String.format(Locale.US, "%." + decimals + "f%s", val, unitStr);
public static String formatTime(long nanos) {
if (nanos < 0) nanos = 0;
long timeInMs = TimeUnit.NANOSECONDS.toMillis(nanos);
long minutes = timeInMs / 60_000;
double remainingMs = (timeInMs % 60_000) / 1000.0;
return String.format(Locale.US, "%dm%.3fs", minutes, remainingMs);

View File

@ -0,0 +1,61 @@
package org.openjdk.bench.java.math;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import java.math.BigInteger;
import java.util.concurrent.TimeUnit;
import java.util.function.BinaryOperator;
* Benchmark for checking performance difference between
* sequential and parallel multiply methods in BigInteger,
* using a large Fibonacci calculation of up to n = 100 million.
* @author Heinz Kabutz, heinz@javaspecialists.eu
// NOTE(review): this file imports the JMH annotations Benchmark, BenchmarkMode,
// OutputTimeUnit and State (plus Mode and Scope), but none of them is visible
// on this class or its methods -- confirm that no annotation lines were lost.
@Fork(value = 2)
@Warmup(iterations = 2)
@Measurement(iterations = 2) // only 2 iterations because each one takes very long
public class BigIntegerParallelMultiply {
/**
 * Computes the n-th Fibonacci number by recursive halving, performing every
 * multiplication through the supplied operator. With half = (n + 1) / 2,
 * f0 = fibonacci(half - 1) and f1 = fibonacci(half), the result is
 * f0*f0 + f1*f1 for odd n and (2*f0 + f1) * f1 for even n.
 *
 * @param n index of the Fibonacci number
 * @param multiplyOperator the multiplication being measured
 * @return the n-th Fibonacci number
 */
private static BigInteger fibonacci(int n, BinaryOperator<BigInteger> multiplyOperator) {
if (n == 0) return BigInteger.ZERO;
if (n == 1) return BigInteger.ONE;
int half = (n + 1) / 2;
BigInteger f0 = fibonacci(half - 1, multiplyOperator);
BigInteger f1 = fibonacci(half, multiplyOperator);
if (n % 2 == 1) {
// Odd n: sum of the squares of the two half-index values.
BigInteger b0 = multiplyOperator.apply(f0, f0);
BigInteger b1 = multiplyOperator.apply(f1, f1);
return b0.add(b1);
} else {
// Even n: (2*f0 + f1) * f1; the doubling is done with a shift.
BigInteger b0 = f0.shiftLeft(1).add(f1);
return multiplyOperator.apply(b0, f1);
// Fibonacci index to compute; one run per listed value.
@Param({"1000000", "10000000", "100000000"})
private int n;
// Computes fibonacci(n) using BigInteger::multiply; the result is discarded.
public void multiply() {
fibonacci(n, BigInteger::multiply);
// Computes fibonacci(n) using BigInteger::parallelMultiply; the result is discarded.
public void parallelMultiply() {
fibonacci(n, BigInteger::parallelMultiply);