From 81ff91ef27a6a856ae2c453a9a9b8333b91da3ab Mon Sep 17 00:00:00 2001 From: Per Minborg <pminborg@openjdk.org> Date: Thu, 12 Sep 2024 18:31:08 +0000 Subject: [PATCH] 8339531: Improve performance of MemorySegment::mismatch Reviewed-by: mcimadamore --- .../java/lang/foreign/MemorySegment.java | 8 +- .../foreign/AbstractMemorySegmentImpl.java | 178 +--------- .../foreign/SegmentBulkOperations.java | 316 ++++++++++++++++++ test/jdk/java/foreign/TestMismatch.java | 64 ++++ .../{CopyTest.java => SegmentBulkCopy.java} | 54 +-- .../{TestFill.java => SegmentBulkFill.java} | 38 ++- .../lang/foreign/SegmentBulkMismatch.java | 112 +++++++ 7 files changed, 559 insertions(+), 211 deletions(-) create mode 100644 src/java.base/share/classes/jdk/internal/foreign/SegmentBulkOperations.java rename test/micro/org/openjdk/bench/java/lang/foreign/{CopyTest.java => SegmentBulkCopy.java} (76%) rename test/micro/org/openjdk/bench/java/lang/foreign/{TestFill.java => SegmentBulkFill.java} (69%) create mode 100644 test/micro/org/openjdk/bench/java/lang/foreign/SegmentBulkMismatch.java diff --git a/src/java.base/share/classes/java/lang/foreign/MemorySegment.java b/src/java.base/share/classes/java/lang/foreign/MemorySegment.java index 38fd36bbb15..cb1f3707db6 100644 --- a/src/java.base/share/classes/java/lang/foreign/MemorySegment.java +++ b/src/java.base/share/classes/java/lang/foreign/MemorySegment.java @@ -43,6 +43,7 @@ import java.util.function.Consumer; import java.util.stream.Stream; import jdk.internal.foreign.AbstractMemorySegmentImpl; import jdk.internal.foreign.MemorySessionImpl; +import jdk.internal.foreign.SegmentBulkOperations; import jdk.internal.foreign.SegmentFactories; import jdk.internal.javac.Restricted; import jdk.internal.reflect.CallerSensitive; @@ -1571,7 +1572,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl { static void copy(MemorySegment srcSegment, long srcOffset, MemorySegment dstSegment, long dstOffset, long bytes) { - 
AbstractMemorySegmentImpl.copy((AbstractMemorySegmentImpl) srcSegment, srcOffset, + SegmentBulkOperations.copy((AbstractMemorySegmentImpl) srcSegment, srcOffset, (AbstractMemorySegmentImpl) dstSegment, dstOffset, bytes); } @@ -2635,8 +2636,9 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl { */ static long mismatch(MemorySegment srcSegment, long srcFromOffset, long srcToOffset, MemorySegment dstSegment, long dstFromOffset, long dstToOffset) { - return AbstractMemorySegmentImpl.mismatch(srcSegment, srcFromOffset, srcToOffset, - dstSegment, dstFromOffset, dstToOffset); + return SegmentBulkOperations.mismatch( + (AbstractMemorySegmentImpl)Objects.requireNonNull(srcSegment), srcFromOffset, srcToOffset, + (AbstractMemorySegmentImpl)Objects.requireNonNull(dstSegment), dstFromOffset, dstToOffset); } /** diff --git a/src/java.base/share/classes/jdk/internal/foreign/AbstractMemorySegmentImpl.java b/src/java.base/share/classes/jdk/internal/foreign/AbstractMemorySegmentImpl.java index 83b11b7ce68..64994af5cb7 100644 --- a/src/java.base/share/classes/jdk/internal/foreign/AbstractMemorySegmentImpl.java +++ b/src/java.base/share/classes/jdk/internal/foreign/AbstractMemorySegmentImpl.java @@ -72,8 +72,6 @@ public abstract sealed class AbstractMemorySegmentImpl implements MemorySegment, SegmentAllocator, BiFunction<String, List<Number>, RuntimeException> permits HeapMemorySegmentImpl, NativeMemorySegmentImpl { - private static final ScopedMemoryAccess SCOPED_MEMORY_ACCESS = ScopedMemoryAccess.getScopedMemoryAccess(); - static final JavaNioAccess NIO_ACCESS = SharedSecrets.getJavaNioAccess(); final long length; @@ -189,53 +187,10 @@ public abstract sealed class AbstractMemorySegmentImpl return StreamSupport.stream(spliterator(elementLayout), false); } - // FILL_NATIVE_THRESHOLD must be a power of two and should be greater than 2^3 - // Update the value for Aarch64 once 8338975 is fixed. 
- private static final long FILL_NATIVE_THRESHOLD = 1L << (Architecture.isAARCH64() ? 10 : 5); - - @Override @ForceInline + @Override public final MemorySegment fill(byte value) { - checkReadOnly(false); - if (length == 0) { - // Implicit state check - checkValidState(); - } else if (length < FILL_NATIVE_THRESHOLD) { - // 0 <= length < FILL_NATIVE_LIMIT : 0...0X...XXXX - - // Handle smaller segments directly without transitioning to native code - final long u = Byte.toUnsignedLong(value); - final long longValue = u << 56 | u << 48 | u << 40 | u << 32 | u << 24 | u << 16 | u << 8 | u; - - int offset = 0; - // 0...0X...X000 - final int limit = (int) (length & (FILL_NATIVE_THRESHOLD - 8)); - for (; offset < limit; offset += 8) { - SCOPED_MEMORY_ACCESS.putLong(sessionImpl(), unsafeGetBase(), unsafeGetOffset() + offset, longValue); - } - int remaining = (int) length - limit; - // 0...0X00 - if (remaining >= 4) { - SCOPED_MEMORY_ACCESS.putInt(sessionImpl(), unsafeGetBase(), unsafeGetOffset() + offset, (int) longValue); - offset += 4; - remaining -= 4; - } - // 0...00X0 - if (remaining >= 2) { - SCOPED_MEMORY_ACCESS.putShort(sessionImpl(), unsafeGetBase(), unsafeGetOffset() + offset, (short) longValue); - offset += 2; - remaining -= 2; - } - // 0...000X - if (remaining == 1) { - SCOPED_MEMORY_ACCESS.putByte(sessionImpl(), unsafeGetBase(), unsafeGetOffset() + offset, value); - } - // We have now fully handled 0...0X...XXXX - } else { - // Handle larger segments via native calls - SCOPED_MEMORY_ACCESS.setMemory(sessionImpl(), unsafeGetBase(), unsafeGetOffset(), length, value); - } - return this; + return SegmentBulkOperations.fill(this, value); } @Override @@ -244,38 +199,6 @@ public abstract sealed class AbstractMemorySegmentImpl return asSlice(0, byteSize, byteAlignment); } - /** - * Mismatch over long lengths. 
- */ - public static long vectorizedMismatchLargeForBytes(MemorySessionImpl aSession, MemorySessionImpl bSession, - Object a, long aOffset, - Object b, long bOffset, - long length) { - long off = 0; - long remaining = length; - int i, size; - boolean lastSubRange = false; - while (remaining > 7 && !lastSubRange) { - if (remaining > Integer.MAX_VALUE) { - size = Integer.MAX_VALUE; - } else { - size = (int) remaining; - lastSubRange = true; - } - i = SCOPED_MEMORY_ACCESS.vectorizedMismatch(aSession, bSession, - a, aOffset + off, - b, bOffset + off, - size, ArraysSupport.LOG2_ARRAY_BYTE_INDEX_SCALE); - if (i >= 0) - return off + i; - - i = size - ~i; - off += i; - remaining -= i; - } - return ~remaining; - } - @Override public final ByteBuffer asByteBuffer() { checkArraySize("ByteBuffer", 1); @@ -314,7 +237,7 @@ public abstract sealed class AbstractMemorySegmentImpl } @ForceInline - private boolean overlaps(AbstractMemorySegmentImpl that) { + boolean overlaps(AbstractMemorySegmentImpl that) { if (unsafeGetBase() == that.unsafeGetBase()) { // both either native or the same heap segment final long thisStart = this.unsafeGetOffset(); final long thatStart = that.unsafeGetOffset(); @@ -334,7 +257,8 @@ public abstract sealed class AbstractMemorySegmentImpl @Override public long mismatch(MemorySegment other) { Objects.requireNonNull(other); - return MemorySegment.mismatch(this, 0, byteSize(), other, 0, other.byteSize()); + return SegmentBulkOperations.mismatch(this, 0, byteSize(), + (AbstractMemorySegmentImpl) other, 0, other.byteSize()); } @Override @@ -650,64 +574,6 @@ public abstract sealed class AbstractMemorySegmentImpl } } - // COPY_NATIVE_THRESHOLD must be a power of two and should be greater than 2^3 - private static final long COPY_NATIVE_THRESHOLD = 1 << 6; - - @ForceInline - public static void copy(AbstractMemorySegmentImpl src, long srcOffset, - AbstractMemorySegmentImpl dst, long dstOffset, - long size) { - - Utils.checkNonNegativeIndex(size, "size"); - // 
Implicit null check for src and dst - src.checkAccess(srcOffset, size, true); - dst.checkAccess(dstOffset, size, false); - - if (size <= 0) { - // Do nothing - } else if (size < COPY_NATIVE_THRESHOLD && !src.overlaps(dst)) { - // 0 < size < FILL_NATIVE_LIMIT : 0...0X...XXXX - // - // Strictly, we could check for !src.asSlice(srcOffset, size).overlaps(dst.asSlice(dstOffset, size) but - // this is a bit slower and it likely very unusual there is any difference in the outcome. Also, if there - // is an overlap, we could tolerate one particular direction of overlap (but not the other). - - // 0...0X...X000 - final int limit = (int) (size & (COPY_NATIVE_THRESHOLD - 8)); - int offset = 0; - for (; offset < limit; offset += 8) { - final long v = SCOPED_MEMORY_ACCESS.getLong(src.sessionImpl(), src.unsafeGetBase(), src.unsafeGetOffset() + srcOffset + offset); - SCOPED_MEMORY_ACCESS.putLong(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset() + dstOffset + offset, v); - } - int remaining = (int) size - offset; - // 0...0X00 - if (remaining >= 4) { - final int v = SCOPED_MEMORY_ACCESS.getInt(src.sessionImpl(), src.unsafeGetBase(),src.unsafeGetOffset() + srcOffset + offset); - SCOPED_MEMORY_ACCESS.putInt(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset() + dstOffset + offset, v); - offset += 4; - remaining -= 4; - } - // 0...00X0 - if (remaining >= 2) { - final short v = SCOPED_MEMORY_ACCESS.getShort(src.sessionImpl(), src.unsafeGetBase(), src.unsafeGetOffset() + srcOffset + offset); - SCOPED_MEMORY_ACCESS.putShort(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset() + dstOffset + offset, v); - offset += 2; - remaining -=2; - } - // 0...000X - if (remaining == 1) { - final byte v = SCOPED_MEMORY_ACCESS.getByte(src.sessionImpl(), src.unsafeGetBase(), src.unsafeGetOffset() + srcOffset + offset); - SCOPED_MEMORY_ACCESS.putByte(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset() + dstOffset + offset, v); - } - // We have now fully handled 
0...0X...XXXX - } else { - // For larger sizes, the transition to native code pays off - SCOPED_MEMORY_ACCESS.copyMemory(src.sessionImpl(), dst.sessionImpl(), - src.unsafeGetBase(), src.unsafeGetOffset() + srcOffset, - dst.unsafeGetBase(), dst.unsafeGetOffset() + dstOffset, size); - } - } - @ForceInline public static void copy(MemorySegment srcSegment, ValueLayout srcElementLayout, long srcOffset, MemorySegment dstSegment, ValueLayout dstElementLayout, long dstOffset, @@ -794,40 +660,6 @@ public abstract sealed class AbstractMemorySegmentImpl } } - public static long mismatch(MemorySegment srcSegment, long srcFromOffset, long srcToOffset, - MemorySegment dstSegment, long dstFromOffset, long dstToOffset) { - AbstractMemorySegmentImpl srcImpl = (AbstractMemorySegmentImpl)Objects.requireNonNull(srcSegment); - AbstractMemorySegmentImpl dstImpl = (AbstractMemorySegmentImpl)Objects.requireNonNull(dstSegment); - long srcBytes = srcToOffset - srcFromOffset; - long dstBytes = dstToOffset - dstFromOffset; - srcImpl.checkAccess(srcFromOffset, srcBytes, true); - dstImpl.checkAccess(dstFromOffset, dstBytes, true); - - long bytes = Math.min(srcBytes, dstBytes); - long i = 0; - if (bytes > 7) { - if (srcImpl.get(JAVA_BYTE, srcFromOffset) != dstImpl.get(JAVA_BYTE, dstFromOffset)) { - return 0; - } - i = AbstractMemorySegmentImpl.vectorizedMismatchLargeForBytes(srcImpl.sessionImpl(), dstImpl.sessionImpl(), - srcImpl.unsafeGetBase(), srcImpl.unsafeGetOffset() + srcFromOffset, - dstImpl.unsafeGetBase(), dstImpl.unsafeGetOffset() + dstFromOffset, - bytes); - if (i >= 0) { - return i; - } - long remaining = ~i; - assert remaining < 8 : "remaining greater than 7: " + remaining; - i = bytes - remaining; - } - for (; i < bytes; i++) { - if (srcImpl.get(JAVA_BYTE, srcFromOffset + i) != dstImpl.get(JAVA_BYTE, dstFromOffset + i)) { - return i; - } - } - return srcBytes != dstBytes ? 
bytes : -1; - } - private static int getScaleFactor(Buffer buffer) { return switch (buffer) { case ByteBuffer _ -> 0; diff --git a/src/java.base/share/classes/jdk/internal/foreign/SegmentBulkOperations.java b/src/java.base/share/classes/jdk/internal/foreign/SegmentBulkOperations.java new file mode 100644 index 00000000000..74953f077e4 --- /dev/null +++ b/src/java.base/share/classes/jdk/internal/foreign/SegmentBulkOperations.java @@ -0,0 +1,316 @@ +/* + * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ + +package jdk.internal.foreign; + +import jdk.internal.misc.ScopedMemoryAccess; +import jdk.internal.util.Architecture; +import jdk.internal.util.ArraysSupport; +import jdk.internal.vm.annotation.ForceInline; + +import java.lang.foreign.MemorySegment; + +/** + * This class contains optimized bulk operation methods that operate on one or several + * memory segments. + * <p> + * Generally, the methods attempt to work with as-large-as-possible units of memory at + * a time. + * <p> + * It should be noted that when invoking scoped memory access get/set operations, it + * is imperative from a performance perspective to convey the sharp types from the + * call site in order for the compiler to pick the correct Unsafe access variant. + */ +public final class SegmentBulkOperations { + + private SegmentBulkOperations() {} + + private static final ScopedMemoryAccess SCOPED_MEMORY_ACCESS = ScopedMemoryAccess.getScopedMemoryAccess(); + + // All the threshold values below MUST be a power of two and should preferably be + // greater or equal to 2^3. + + // Update the FILL value for Aarch64 once 8338975 is fixed. + private static final int NATIVE_THRESHOLD_FILL = powerOfPropertyOr("fill", Architecture.isAARCH64() ? 
10 : 5); + private static final int NATIVE_THRESHOLD_MISMATCH = powerOfPropertyOr("mismatch", 6); + private static final int NATIVE_THRESHOLD_COPY = powerOfPropertyOr("copy", 6); + + @ForceInline + public static MemorySegment fill(AbstractMemorySegmentImpl dst, byte value) { + dst.checkReadOnly(false); + if (dst.length == 0) { + // Implicit state check + dst.checkValidState(); + } else if (dst.length < NATIVE_THRESHOLD_FILL) { + // 0 <= length < FILL_NATIVE_LIMIT : 0...0X...XXXX + + // Handle smaller segments directly without transitioning to native code + final long u = Byte.toUnsignedLong(value); + final long longValue = u << 56 | u << 48 | u << 40 | u << 32 | u << 24 | u << 16 | u << 8 | u; + + int offset = 0; + // 0...0X...X000 + final int limit = (int) (dst.length & (NATIVE_THRESHOLD_FILL - 8)); + for (; offset < limit; offset += 8) { + SCOPED_MEMORY_ACCESS.putLongUnaligned(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset() + offset, longValue, !Architecture.isLittleEndian()); + } + int remaining = (int) dst.length - limit; + // 0...0X00 + if (remaining >= 4) { + SCOPED_MEMORY_ACCESS.putIntUnaligned(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset() + offset, (int) longValue, !Architecture.isLittleEndian()); + offset += 4; + remaining -= 4; + } + // 0...00X0 + if (remaining >= 2) { + SCOPED_MEMORY_ACCESS.putShortUnaligned(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset() + offset, (short) longValue, !Architecture.isLittleEndian()); + offset += 2; + remaining -= 2; + } + // 0...000X + if (remaining == 1) { + SCOPED_MEMORY_ACCESS.putByte(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset() + offset, value); + } + // We have now fully handled 0...0X...XXXX + } else { + // Handle larger segments via native calls + SCOPED_MEMORY_ACCESS.setMemory(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset(), dst.length, value); + } + return dst; + } + + @ForceInline + public static void copy(AbstractMemorySegmentImpl src, 
long srcOffset, + AbstractMemorySegmentImpl dst, long dstOffset, + long size) { + + Utils.checkNonNegativeIndex(size, "size"); + // Implicit null check for src and dst + src.checkAccess(srcOffset, size, true); + dst.checkAccess(dstOffset, size, false); + + if (size <= 0) { + // Do nothing + } else if (size < NATIVE_THRESHOLD_COPY && !src.overlaps(dst)) { + // 0 < size < FILL_NATIVE_LIMIT : 0...0X...XXXX + // + // Strictly, we could check for !src.asSlice(srcOffset, size).overlaps(dst.asSlice(dstOffset, size) but + // this is a bit slower and it likely very unusual there is any difference in the outcome. Also, if there + // is an overlap, we could tolerate one particular direction of overlap (but not the other). + + // 0...0X...X000 + final int limit = (int) (size & (NATIVE_THRESHOLD_COPY - 8)); + int offset = 0; + for (; offset < limit; offset += 8) { + final long v = SCOPED_MEMORY_ACCESS.getLongUnaligned(src.sessionImpl(), src.unsafeGetBase(), src.unsafeGetOffset() + srcOffset + offset, !Architecture.isLittleEndian()); + SCOPED_MEMORY_ACCESS.putLongUnaligned(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset() + dstOffset + offset, v, !Architecture.isLittleEndian()); + } + int remaining = (int) size - offset; + // 0...0X00 + if (remaining >= 4) { + final int v = SCOPED_MEMORY_ACCESS.getIntUnaligned(src.sessionImpl(), src.unsafeGetBase(),src.unsafeGetOffset() + srcOffset + offset, !Architecture.isLittleEndian()); + SCOPED_MEMORY_ACCESS.putIntUnaligned(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset() + dstOffset + offset, v, !Architecture.isLittleEndian()); + offset += 4; + remaining -= 4; + } + // 0...00X0 + if (remaining >= 2) { + final short v = SCOPED_MEMORY_ACCESS.getShortUnaligned(src.sessionImpl(), src.unsafeGetBase(), src.unsafeGetOffset() + srcOffset + offset, !Architecture.isLittleEndian()); + SCOPED_MEMORY_ACCESS.putShortUnaligned(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset() + dstOffset + offset, v, 
!Architecture.isLittleEndian()); + offset += 2; + remaining -=2; + } + // 0...000X + if (remaining == 1) { + final byte v = SCOPED_MEMORY_ACCESS.getByte(src.sessionImpl(), src.unsafeGetBase(), src.unsafeGetOffset() + srcOffset + offset); + SCOPED_MEMORY_ACCESS.putByte(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset() + dstOffset + offset, v); + } + // We have now fully handled 0...0X...XXXX + } else { + // For larger sizes, the transition to native code pays off + SCOPED_MEMORY_ACCESS.copyMemory(src.sessionImpl(), dst.sessionImpl(), + src.unsafeGetBase(), src.unsafeGetOffset() + srcOffset, + dst.unsafeGetBase(), dst.unsafeGetOffset() + dstOffset, size); + } + } + + @ForceInline + public static long mismatch(AbstractMemorySegmentImpl src, long srcFromOffset, long srcToOffset, + AbstractMemorySegmentImpl dst, long dstFromOffset, long dstToOffset) { + final long srcBytes = srcToOffset - srcFromOffset; + final long dstBytes = dstToOffset - dstFromOffset; + src.checkAccess(srcFromOffset, srcBytes, true); + dst.checkAccess(dstFromOffset, dstBytes, true); + + final long length = Math.min(srcBytes, dstBytes); + final boolean srcAndDstBytesDiffer = srcBytes != dstBytes; + + if (length == 0) { + return srcAndDstBytesDiffer ? 
0 : -1; + } else if (length < NATIVE_THRESHOLD_MISMATCH) { + return mismatch(src, srcFromOffset, dst, dstFromOffset, 0, (int) length, srcAndDstBytesDiffer); + } else { + long i; + if (SCOPED_MEMORY_ACCESS.getByte(src.sessionImpl(), src.unsafeGetBase(), src.unsafeGetOffset() + srcFromOffset) != + SCOPED_MEMORY_ACCESS.getByte(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset() + dstFromOffset)) { + return 0; + } + i = vectorizedMismatchLargeForBytes(src.sessionImpl(), dst.sessionImpl(), + src.unsafeGetBase(), src.unsafeGetOffset() + srcFromOffset, + dst.unsafeGetBase(), dst.unsafeGetOffset() + dstFromOffset, + length); + if (i >= 0) { + return i; + } + final long remaining = ~i; + assert remaining < 8 : "remaining greater than 7: " + remaining; + i = length - remaining; + return mismatch(src, srcFromOffset + i, dst, dstFromOffset + i, i, (int) remaining, srcAndDstBytesDiffer); + } + } + + // Mismatch is handled in chunks of 64 (unroll of eight 8s), 8, 4, 2, and 1 byte(s). + @ForceInline + private static long mismatch(AbstractMemorySegmentImpl src, long srcFromOffset, + AbstractMemorySegmentImpl dst, long dstFromOffset, + long start, int length, boolean srcAndDstBytesDiffer) { + int offset = 0; + final int limit = length & (NATIVE_THRESHOLD_MISMATCH - 8); + for (; offset < limit; offset += 8) { + final long s = SCOPED_MEMORY_ACCESS.getLongUnaligned(src.sessionImpl(), src.unsafeGetBase(), src.unsafeGetOffset() + srcFromOffset + offset, !Architecture.isLittleEndian()); + final long d = SCOPED_MEMORY_ACCESS.getLongUnaligned(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset() + dstFromOffset + offset, !Architecture.isLittleEndian()); + if (s != d) { + return start + offset + mismatch(s, d); + } + } + int remaining = length - offset; + // 0...XXX000 + for (; remaining >= 8; remaining -= 8) { + final long s = SCOPED_MEMORY_ACCESS.getLongUnaligned(src.sessionImpl(), src.unsafeGetBase(), src.unsafeGetOffset() + srcFromOffset + offset, 
!Architecture.isLittleEndian()); + final long d = SCOPED_MEMORY_ACCESS.getLongUnaligned(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset() + dstFromOffset + offset, !Architecture.isLittleEndian()); + if (s != d) { + return start + offset + mismatch(s, d); + } + offset += 8; + } + + // 0...0X00 + if (remaining >= 4) { + final int s = SCOPED_MEMORY_ACCESS.getIntUnaligned(src.sessionImpl(), src.unsafeGetBase(), src.unsafeGetOffset() + srcFromOffset + offset, !Architecture.isLittleEndian()); + final int d = SCOPED_MEMORY_ACCESS.getIntUnaligned(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset() + dstFromOffset + offset, !Architecture.isLittleEndian()); + if (s != d) { + return start + offset + mismatch(s, d); + } + offset += 4; + remaining -= 4; + } + // 0...00X0 + if (remaining >= 2) { + final short s = SCOPED_MEMORY_ACCESS.getShortUnaligned(src.sessionImpl(), src.unsafeGetBase(), src.unsafeGetOffset() + srcFromOffset + offset, !Architecture.isLittleEndian()); + final short d = SCOPED_MEMORY_ACCESS.getShortUnaligned(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset() + dstFromOffset + offset, !Architecture.isLittleEndian()); + if (s != d) { + return start + offset + mismatch(s, d); + } + offset += 2; + remaining -= 2; + } + // 0...000X + if (remaining == 1) { + final byte s = SCOPED_MEMORY_ACCESS.getByte(src.sessionImpl(), src.unsafeGetBase(), src.unsafeGetOffset() + srcFromOffset + offset); + final byte d = SCOPED_MEMORY_ACCESS.getByte(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset() + dstFromOffset + offset); + if (s != d) { + return start + offset; + } + } + return srcAndDstBytesDiffer ? (start + length) : -1; + // We have now fully handled 0...0X...XXXX + } + + @ForceInline + private static int mismatch(long first, long second) { + final long x = first ^ second; + return (Architecture.isLittleEndian() + ? 
Long.numberOfTrailingZeros(x) + : Long.numberOfLeadingZeros(x)) / 8; + } + + @ForceInline + private static int mismatch(int first, int second) { + final int x = first ^ second; + return (Architecture.isLittleEndian() + ? Integer.numberOfTrailingZeros(x) + : Integer.numberOfLeadingZeros(x)) / 8; + } + + @ForceInline + private static int mismatch(short first, short second) { + if (Architecture.isLittleEndian()) { + return ((0xff & first) == (0xff & second)) ? 1 : 0; + } else { + return ((0xff00 & first) == (0xff00 & second)) ? 1 : 0; + } + } + + /** + * Mismatch over long lengths. + */ + private static long vectorizedMismatchLargeForBytes(MemorySessionImpl aSession, MemorySessionImpl bSession, + Object a, long aOffset, + Object b, long bOffset, + long length) { + long off = 0; + long remaining = length; + int i, size; + boolean lastSubRange = false; + while (remaining > 7 && !lastSubRange) { + if (remaining > Integer.MAX_VALUE) { + size = Integer.MAX_VALUE; + } else { + size = (int) remaining; + lastSubRange = true; + } + i = SCOPED_MEMORY_ACCESS.vectorizedMismatch(aSession, bSession, + a, aOffset + off, + b, bOffset + off, + size, ArraysSupport.LOG2_ARRAY_BYTE_INDEX_SCALE); + if (i >= 0) + return off + i; + + i = size - ~i; + off += i; + remaining -= i; + } + return ~remaining; + } + + static final String PROPERTY_PATH = "java.lang.foreign.native.threshold.power."; + + // The returned value is a power of two in the interval [2^0, 2^30] + static int powerOfPropertyOr(String name, int defaultPower) { + final int power = Integer.getInteger(PROPERTY_PATH + name, defaultPower); + return 1 << Math.clamp(power, 0, Integer.SIZE - 2); + } + +} diff --git a/test/jdk/java/foreign/TestMismatch.java b/test/jdk/java/foreign/TestMismatch.java index 9549b2508ff..f50621e3415 100644 --- a/test/jdk/java/foreign/TestMismatch.java +++ b/test/jdk/java/foreign/TestMismatch.java @@ -29,7 +29,9 @@ import java.lang.foreign.Arena; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; 
+import java.util.Random; import java.util.concurrent.atomic.AtomicReference; import java.lang.foreign.MemorySegment; @@ -122,6 +124,68 @@ public class TestMismatch { } } + @Test + public void random() { + try (var arena = Arena.ofConfined()) { + var rnd = new Random(42); + for (int size = 1; size < 64; size++) { + // Repeat a fair number of rounds + for (int i = 0; i < 147; i++) { + var src = arena.allocate(size); + // The dst segment might be zero to eight bytes longer + var dst = arena.allocate(size + rnd.nextInt(8 + 1)); + // Fill the src with random data + for (int j = 0; j < size; j++) { + src.set(ValueLayout.JAVA_BYTE, j, randomByte(rnd)); + } + // copy the random data from src to dst + dst.copyFrom(src); + // Fill the rest (if any) of the dst with random data + for (long j = src.byteSize(); j < dst.byteSize(); j++) { + dst.set(ValueLayout.JAVA_BYTE, j, randomByte(rnd)); + } + + if (rnd.nextBoolean()) { + // In this branch, we inject one or more deviating bytes + int beginDiff = rnd.nextInt(size); + int endDiff = rnd.nextInt(beginDiff, size); + for (int d = beginDiff; d <= endDiff; d++) { + byte existing = dst.get(ValueLayout.JAVA_BYTE, d); + // Make sure we never get back the same value + byte mutatedValue; + do { + mutatedValue = randomByte(rnd); + } while (existing == mutatedValue); + dst.set(ValueLayout.JAVA_BYTE, d, mutatedValue); + } + + // They are not equal and differs in position beginDiff + assertEquals(src.mismatch(dst), beginDiff); + assertEquals(dst.mismatch(src), beginDiff); + } else { + // In this branch, there is no injection + + if (src.byteSize() == dst.byteSize()) { + // The content matches and they are of equal size + assertEquals(src.mismatch(dst), -1); + assertEquals(dst.mismatch(src), -1); + } else { + // The content matches but they are of different length + // Remember, the size of src is always smaller or equal + // to the size of dst. 
+ assertEquals(src.mismatch(dst), src.byteSize()); + assertEquals(dst.mismatch(src), src.byteSize()); + } + } + } + } + } + } + + static byte randomByte(Random rnd) { + return (byte) rnd.nextInt(Byte.MIN_VALUE, Byte.MAX_VALUE + 1); + } + @Test(dataProvider = "slices") public void testDifferentValues(MemorySegment s1, MemorySegment s2) { out.format("testDifferentValues s1:%s, s2:%s\n", s1, s2); diff --git a/test/micro/org/openjdk/bench/java/lang/foreign/CopyTest.java b/test/micro/org/openjdk/bench/java/lang/foreign/SegmentBulkCopy.java similarity index 76% rename from test/micro/org/openjdk/bench/java/lang/foreign/CopyTest.java rename to test/micro/org/openjdk/bench/java/lang/foreign/SegmentBulkCopy.java index 8996b1de117..22ef139aac0 100644 --- a/test/micro/org/openjdk/bench/java/lang/foreign/CopyTest.java +++ b/test/micro/org/openjdk/bench/java/lang/foreign/SegmentBulkCopy.java @@ -41,21 +41,16 @@ import java.lang.foreign.MemorySegment; import java.nio.ByteBuffer; import java.util.concurrent.TimeUnit; -import static java.lang.foreign.ValueLayout.*; - @BenchmarkMode(Mode.AverageTime) @Warmup(iterations = 5, time = 500, timeUnit = TimeUnit.MILLISECONDS) @Measurement(iterations = 10, time = 500, timeUnit = TimeUnit.MILLISECONDS) @State(Scope.Thread) @OutputTimeUnit(TimeUnit.NANOSECONDS) @Fork(value = 3) -public class CopyTest { +public class SegmentBulkCopy { - @Param({"0", "1", "2", "3", "4", "5", "6", "7", "8", - "9", "10", "11", "12", "13", "14", "15", "16", - "17", "18", "19", "20", "21", "22", "23", "24", - "25", "26", "27", "28", "29", "30", "31", "32", - "33", "36", "40", "44", "48", "52", "56", "60", "63", "64", "128"}) + @Param({"2", "3", "4", "5", "6", "7", "8", "64", "512", + "4096", "32768", "262144", "2097152", "16777216", "134217728"}) public int ELEM_SIZE; byte[] srcArray; @@ -80,28 +75,37 @@ public class CopyTest { } @Benchmark - public void array_copy() { + public void arrayCopy() { System.arraycopy(srcArray, 0, dstArray, 0, ELEM_SIZE); } @Benchmark 
- public void heap_segment_copy5Arg() { - MemorySegment.copy(heapSrcSegment, 0, heapDstSegment, 0, ELEM_SIZE); - } - - @Benchmark - public void native_segment_copy5Arg() { - MemorySegment.copy(nativeSrcSegment, 0, nativeDstSegment, 0, ELEM_SIZE); - } - - @Benchmark - public void heap_segment_copy7arg() { - MemorySegment.copy(heapSrcSegment, JAVA_BYTE, 0, heapDstSegment, JAVA_BYTE, 0, ELEM_SIZE); - } - - @Benchmark - public void buffer_copy() { + public void bufferCopy() { dstBuffer.put(srcBuffer); } + @Fork(value = 3, jvmArgsAppend = {"-Djava.lang.foreign.native.threshold.power.copy=31"}) + @Benchmark + public void heapSegmentCopyJava() { + MemorySegment.copy(heapSrcSegment, 0, heapDstSegment, 0, ELEM_SIZE); + } + + @Fork(value = 3, jvmArgsAppend = {"-Djava.lang.foreign.native.threshold.power.copy=0"}) + @Benchmark + public void heapSegmentCopyUnsafe() { + MemorySegment.copy(heapSrcSegment, 0, heapDstSegment, 0, ELEM_SIZE); + } + + @Fork(value = 3, jvmArgsAppend = {"-Djava.lang.foreign.native.threshold.power.copy=31"}) + @Benchmark + public void nativeSegmentCopyJava() { + MemorySegment.copy(nativeSrcSegment, 0, nativeDstSegment, 0, ELEM_SIZE); + } + + @Fork(value = 3, jvmArgsAppend = {"-Djava.lang.foreign.native.threshold.power.copy=0"}) + @Benchmark + public void nativeSegmentCopyUnsafe() { + MemorySegment.copy(nativeSrcSegment, 0, nativeDstSegment, 0, ELEM_SIZE); + } + } diff --git a/test/micro/org/openjdk/bench/java/lang/foreign/TestFill.java b/test/micro/org/openjdk/bench/java/lang/foreign/SegmentBulkFill.java similarity index 69% rename from test/micro/org/openjdk/bench/java/lang/foreign/TestFill.java rename to test/micro/org/openjdk/bench/java/lang/foreign/SegmentBulkFill.java index 78719f03bc3..95ca7228969 100644 --- a/test/micro/org/openjdk/bench/java/lang/foreign/TestFill.java +++ b/test/micro/org/openjdk/bench/java/lang/foreign/SegmentBulkFill.java @@ -48,13 +48,10 @@ import java.util.concurrent.TimeUnit; @State(Scope.Thread) 
@OutputTimeUnit(TimeUnit.NANOSECONDS) @Fork(value = 3) -public class TestFill { +public class SegmentBulkFill { - @Param({"0", "1", "2", "3", "4", "5", "6", "7", - "8", "9", "10", "11", "12", "13", "14", "15", - "16", "17", "18", "19", "20", "21", "22", "23", - "24", "25", "26", "27", "28", "29", "30", "31", - "32", "128", "256", "384", "511", "512"}) + @Param({"2", "3", "4", "5", "6", "7", "8", "64", "512", + "4096", "32768", "262144", "2097152", "16777216", "134217728"}) public int ELEM_SIZE; byte[] array; @@ -73,22 +70,43 @@ public class TestFill { } @Benchmark - public void arrays_fill() { + public void arraysFill() { Arrays.fill(array, (byte) 0); } + @Fork(value = 3, jvmArgsAppend = {"-Djava.lang.foreign.native.threshold.power.fill=31"}) @Benchmark - public void heap_segment_fill() { + public void heapSegmentFillJava() { heapSegment.fill((byte) 0); } + @Fork(value = 3, jvmArgsAppend = {"-Djava.lang.foreign.native.threshold.power.fill=0"}) @Benchmark - public void native_segment_fill() { + public void heapSegmentFillUnsafe() { + heapSegment.fill((byte) 0); + } + + @Fork(value = 3, jvmArgsAppend = {"-Djava.lang.foreign.native.threshold.power.fill=31"}) + @Benchmark + public void nativeSegmentFillJava() { nativeSegment.fill((byte) 0); } + @Fork(value = 3, jvmArgsAppend = {"-Djava.lang.foreign.native.threshold.power.fill=0"}) @Benchmark - public void unaligned_segment_fill() { + public void nativeSegmentFillUnsafe() { + nativeSegment.fill((byte) 0); + } + + @Fork(value = 3, jvmArgsAppend = {"-Djava.lang.foreign.native.threshold.power.fill=31"}) + @Benchmark + public void unalignedSegmentFillJava() { + unalignedSegment.fill((byte) 0); + } + + @Fork(value = 3, jvmArgsAppend = {"-Djava.lang.foreign.native.threshold.power.fill=0"}) + @Benchmark + public void unalignedSegmentFillUnsafe() { unalignedSegment.fill((byte) 0); } diff --git a/test/micro/org/openjdk/bench/java/lang/foreign/SegmentBulkMismatch.java 
b/test/micro/org/openjdk/bench/java/lang/foreign/SegmentBulkMismatch.java
new file mode 100644
index 00000000000..5656b2f6b9f
--- /dev/null
+++ b/test/micro/org/openjdk/bench/java/lang/foreign/SegmentBulkMismatch.java
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package org.openjdk.bench.java.lang.foreign;
+
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+
+import java.lang.foreign.Arena;
+import java.lang.foreign.MemorySegment;
+import java.util.Arrays;
+import java.util.Random;
+import java.util.concurrent.TimeUnit;
+
+import static java.lang.foreign.ValueLayout.*;
+
+/**
+ * Measures {@code MemorySegment::mismatch} for native and heap segments whose
+ * contents are identical, with {@link Arrays#mismatch(byte[], byte[])} on the
+ * same data as a baseline.
+ * <p>
+ * The {@code *Java} and {@code *Unsafe} benchmarks make the same call; their forks
+ * differ only in {@code java.lang.foreign.native.threshold.power.mismatch}
+ * (31 vs. 0), which presumably selects the pure-Java or the Unsafe-backed path.
+ */
+@BenchmarkMode(Mode.AverageTime)
+@Warmup(iterations = 5, time = 500, timeUnit = TimeUnit.MILLISECONDS)
+@Measurement(iterations = 10, time = 500, timeUnit = TimeUnit.MILLISECONDS)
+@State(Scope.Thread)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@Fork(value = 3)
+public class SegmentBulkMismatch {
+
+    /** Size, in bytes, of each segment and array being compared. */
+    @Param({"2", "3", "4", "5", "6", "7", "8", "64", "512",
+            "4096", "32768", "262144", "2097152", "16777216", "134217728"})
+    public int ELEM_SIZE;
+
+    MemorySegment srcNative;
+    MemorySegment dstNative;
+    byte[] srcArray;
+    byte[] dstArray;
+    MemorySegment srcHeap;
+    MemorySegment dstHeap;
+
+    /**
+     * Fills the source with reproducible pseudo-random bytes and makes the
+     * destination, the arrays and the heap segments exact copies of it.
+     */
+    @Setup
+    public void setup() {
+        // Always use the same alignment regardless of size
+        srcNative = Arena.ofAuto().allocate(ELEM_SIZE, 16);
+        dstNative = Arena.ofAuto().allocate(ELEM_SIZE, 16);
+        var rnd = new Random(42);
+        for (int i = 0; i < ELEM_SIZE; i++) {
+            // The upper bound is exclusive, hence the "+ 1" to also produce Byte.MAX_VALUE
+            srcNative.set(JAVA_BYTE, i, (byte) rnd.nextInt(Byte.MIN_VALUE, Byte.MAX_VALUE + 1));
+        }
+        dstNative.copyFrom(srcNative);
+        srcArray = srcNative.toArray(JAVA_BYTE);
+        dstArray = dstNative.toArray(JAVA_BYTE);
+        srcHeap = MemorySegment.ofArray(srcArray);
+        dstHeap = MemorySegment.ofArray(dstArray);
+    }
+
+    /** Native-segment mismatch in a fork with the mismatch threshold power set to 31. */
+    @Fork(value = 3, jvmArgsAppend = {"-Djava.lang.foreign.native.threshold.power.mismatch=31"})
+    @Benchmark
+    public long nativeSegmentJava() {
+        return srcNative.mismatch(dstNative);
+    }
+
+    /** Heap-segment mismatch in a fork with the mismatch threshold power set to 31. */
+    @Fork(value = 3, jvmArgsAppend = {"-Djava.lang.foreign.native.threshold.power.mismatch=31"})
+    @Benchmark
+    public long heapSegmentJava() {
+        return srcHeap.mismatch(dstHeap);
+    }
+
+    /** Native-segment mismatch in a fork with the mismatch threshold power set to 0. */
+    @Fork(value = 3, jvmArgsAppend = {"-Djava.lang.foreign.native.threshold.power.mismatch=0"})
+    @Benchmark
+    public long nativeSegmentUnsafe() {
+        return srcNative.mismatch(dstNative);
+    }
+
+    /** Heap-segment mismatch in a fork with the mismatch threshold power set to 0. */
+    @Fork(value = 3, jvmArgsAppend = {"-Djava.lang.foreign.native.threshold.power.mismatch=0"})
+    @Benchmark
+    public long heapSegmentUnsafe() {
+        return srcHeap.mismatch(dstHeap);
+    }
+
+    /** Baseline: {@link Arrays#mismatch(byte[], byte[])} on the backing byte arrays. */
+    @Benchmark
+    public long array() {
+        return Arrays.mismatch(srcArray, dstArray);
+    }
+
+}
+