From 81ff91ef27a6a856ae2c453a9a9b8333b91da3ab Mon Sep 17 00:00:00 2001
From: Per Minborg <pminborg@openjdk.org>
Date: Thu, 12 Sep 2024 18:31:08 +0000
Subject: [PATCH] 8339531: Improve performance of MemorySegment::mismatch

Reviewed-by: mcimadamore
---
 .../java/lang/foreign/MemorySegment.java      |   8 +-
 .../foreign/AbstractMemorySegmentImpl.java    | 178 +---------
 .../foreign/SegmentBulkOperations.java        | 316 ++++++++++++++++++
 test/jdk/java/foreign/TestMismatch.java       |  64 ++++
 .../{CopyTest.java => SegmentBulkCopy.java}   |  54 +--
 .../{TestFill.java => SegmentBulkFill.java}   |  38 ++-
 .../lang/foreign/SegmentBulkMismatch.java     | 112 +++++++
 7 files changed, 559 insertions(+), 211 deletions(-)
 create mode 100644 src/java.base/share/classes/jdk/internal/foreign/SegmentBulkOperations.java
 rename test/micro/org/openjdk/bench/java/lang/foreign/{CopyTest.java => SegmentBulkCopy.java} (76%)
 rename test/micro/org/openjdk/bench/java/lang/foreign/{TestFill.java => SegmentBulkFill.java} (69%)
 create mode 100644 test/micro/org/openjdk/bench/java/lang/foreign/SegmentBulkMismatch.java

diff --git a/src/java.base/share/classes/java/lang/foreign/MemorySegment.java b/src/java.base/share/classes/java/lang/foreign/MemorySegment.java
index 38fd36bbb15..cb1f3707db6 100644
--- a/src/java.base/share/classes/java/lang/foreign/MemorySegment.java
+++ b/src/java.base/share/classes/java/lang/foreign/MemorySegment.java
@@ -43,6 +43,7 @@ import java.util.function.Consumer;
 import java.util.stream.Stream;
 import jdk.internal.foreign.AbstractMemorySegmentImpl;
 import jdk.internal.foreign.MemorySessionImpl;
+import jdk.internal.foreign.SegmentBulkOperations;
 import jdk.internal.foreign.SegmentFactories;
 import jdk.internal.javac.Restricted;
 import jdk.internal.reflect.CallerSensitive;
@@ -1571,7 +1572,7 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
     static void copy(MemorySegment srcSegment, long srcOffset,
                      MemorySegment dstSegment, long dstOffset, long bytes) {
 
-        AbstractMemorySegmentImpl.copy((AbstractMemorySegmentImpl) srcSegment, srcOffset,
+        SegmentBulkOperations.copy((AbstractMemorySegmentImpl) srcSegment, srcOffset,
                 (AbstractMemorySegmentImpl) dstSegment, dstOffset,
                 bytes);
     }
@@ -2635,8 +2636,9 @@ public sealed interface MemorySegment permits AbstractMemorySegmentImpl {
      */
     static long mismatch(MemorySegment srcSegment, long srcFromOffset, long srcToOffset,
                          MemorySegment dstSegment, long dstFromOffset, long dstToOffset) {
-        return AbstractMemorySegmentImpl.mismatch(srcSegment, srcFromOffset, srcToOffset,
-                dstSegment, dstFromOffset, dstToOffset);
+        return SegmentBulkOperations.mismatch(
+                (AbstractMemorySegmentImpl)Objects.requireNonNull(srcSegment), srcFromOffset, srcToOffset,
+                (AbstractMemorySegmentImpl)Objects.requireNonNull(dstSegment), dstFromOffset, dstToOffset);
     }
 
     /**
diff --git a/src/java.base/share/classes/jdk/internal/foreign/AbstractMemorySegmentImpl.java b/src/java.base/share/classes/jdk/internal/foreign/AbstractMemorySegmentImpl.java
index 83b11b7ce68..64994af5cb7 100644
--- a/src/java.base/share/classes/jdk/internal/foreign/AbstractMemorySegmentImpl.java
+++ b/src/java.base/share/classes/jdk/internal/foreign/AbstractMemorySegmentImpl.java
@@ -72,8 +72,6 @@ public abstract sealed class AbstractMemorySegmentImpl
         implements MemorySegment, SegmentAllocator, BiFunction<String, List<Number>, RuntimeException>
         permits HeapMemorySegmentImpl, NativeMemorySegmentImpl {
 
-    private static final ScopedMemoryAccess SCOPED_MEMORY_ACCESS = ScopedMemoryAccess.getScopedMemoryAccess();
-
     static final JavaNioAccess NIO_ACCESS = SharedSecrets.getJavaNioAccess();
 
     final long length;
@@ -189,53 +187,10 @@ public abstract sealed class AbstractMemorySegmentImpl
         return StreamSupport.stream(spliterator(elementLayout), false);
     }
 
-    // FILL_NATIVE_THRESHOLD must be a power of two and should be greater than 2^3
-    // Update the value for Aarch64 once 8338975 is fixed.
-    private static final long FILL_NATIVE_THRESHOLD = 1L << (Architecture.isAARCH64() ? 10 : 5);
-
-    @Override
     @ForceInline
+    @Override
     public final MemorySegment fill(byte value) {
-        checkReadOnly(false);
-        if (length == 0) {
-            // Implicit state check
-            checkValidState();
-        } else if (length < FILL_NATIVE_THRESHOLD) {
-            // 0 <= length < FILL_NATIVE_LIMIT : 0...0X...XXXX
-
-            // Handle smaller segments directly without transitioning to native code
-            final long u = Byte.toUnsignedLong(value);
-            final long longValue = u << 56 | u << 48 | u << 40 | u << 32 | u << 24 | u << 16 | u << 8 | u;
-
-            int offset = 0;
-            // 0...0X...X000
-            final int limit = (int) (length & (FILL_NATIVE_THRESHOLD - 8));
-            for (; offset < limit; offset += 8) {
-                SCOPED_MEMORY_ACCESS.putLong(sessionImpl(), unsafeGetBase(), unsafeGetOffset() + offset, longValue);
-            }
-            int remaining = (int) length - limit;
-            // 0...0X00
-            if (remaining >= 4) {
-                SCOPED_MEMORY_ACCESS.putInt(sessionImpl(), unsafeGetBase(), unsafeGetOffset() + offset, (int) longValue);
-                offset += 4;
-                remaining -= 4;
-            }
-            // 0...00X0
-            if (remaining >= 2) {
-                SCOPED_MEMORY_ACCESS.putShort(sessionImpl(), unsafeGetBase(), unsafeGetOffset() + offset, (short) longValue);
-                offset += 2;
-                remaining -= 2;
-            }
-            // 0...000X
-            if (remaining == 1) {
-                SCOPED_MEMORY_ACCESS.putByte(sessionImpl(), unsafeGetBase(), unsafeGetOffset() + offset, value);
-            }
-            // We have now fully handled 0...0X...XXXX
-        } else {
-            // Handle larger segments via native calls
-            SCOPED_MEMORY_ACCESS.setMemory(sessionImpl(), unsafeGetBase(), unsafeGetOffset(), length, value);
-        }
-        return this;
+        return SegmentBulkOperations.fill(this, value);
     }
 
     @Override
@@ -244,38 +199,6 @@ public abstract sealed class AbstractMemorySegmentImpl
         return asSlice(0, byteSize, byteAlignment);
     }
 
-    /**
-     * Mismatch over long lengths.
-     */
-    public static long vectorizedMismatchLargeForBytes(MemorySessionImpl aSession, MemorySessionImpl bSession,
-                                                        Object a, long aOffset,
-                                                        Object b, long bOffset,
-                                                        long length) {
-        long off = 0;
-        long remaining = length;
-        int i, size;
-        boolean lastSubRange = false;
-        while (remaining > 7 && !lastSubRange) {
-            if (remaining > Integer.MAX_VALUE) {
-                size = Integer.MAX_VALUE;
-            } else {
-                size = (int) remaining;
-                lastSubRange = true;
-            }
-            i = SCOPED_MEMORY_ACCESS.vectorizedMismatch(aSession, bSession,
-                    a, aOffset + off,
-                    b, bOffset + off,
-                    size, ArraysSupport.LOG2_ARRAY_BYTE_INDEX_SCALE);
-            if (i >= 0)
-                return off + i;
-
-            i = size - ~i;
-            off += i;
-            remaining -= i;
-        }
-        return ~remaining;
-    }
-
     @Override
     public final ByteBuffer asByteBuffer() {
         checkArraySize("ByteBuffer", 1);
@@ -314,7 +237,7 @@ public abstract sealed class AbstractMemorySegmentImpl
     }
 
     @ForceInline
-    private boolean overlaps(AbstractMemorySegmentImpl that) {
+    boolean overlaps(AbstractMemorySegmentImpl that) {
         if (unsafeGetBase() == that.unsafeGetBase()) {  // both either native or the same heap segment
             final long thisStart = this.unsafeGetOffset();
             final long thatStart = that.unsafeGetOffset();
@@ -334,7 +257,8 @@ public abstract sealed class AbstractMemorySegmentImpl
     @Override
     public long mismatch(MemorySegment other) {
         Objects.requireNonNull(other);
-        return MemorySegment.mismatch(this, 0, byteSize(), other, 0, other.byteSize());
+        return SegmentBulkOperations.mismatch(this, 0, byteSize(),
+                (AbstractMemorySegmentImpl) other, 0, other.byteSize());
     }
 
     @Override
@@ -650,64 +574,6 @@ public abstract sealed class AbstractMemorySegmentImpl
         }
     }
 
-    // COPY_NATIVE_THRESHOLD must be a power of two and should be greater than 2^3
-    private static final long COPY_NATIVE_THRESHOLD = 1 << 6;
-
-    @ForceInline
-    public static void copy(AbstractMemorySegmentImpl src, long srcOffset,
-                            AbstractMemorySegmentImpl dst, long dstOffset,
-                            long size) {
-
-        Utils.checkNonNegativeIndex(size, "size");
-        // Implicit null check for src and dst
-        src.checkAccess(srcOffset, size, true);
-        dst.checkAccess(dstOffset, size, false);
-
-        if (size <= 0) {
-            // Do nothing
-        } else if (size < COPY_NATIVE_THRESHOLD && !src.overlaps(dst)) {
-            // 0 < size < FILL_NATIVE_LIMIT : 0...0X...XXXX
-            //
-            // Strictly, we could check for !src.asSlice(srcOffset, size).overlaps(dst.asSlice(dstOffset, size) but
-            // this is a bit slower and it likely very unusual there is any difference in the outcome. Also, if there
-            // is an overlap, we could tolerate one particular direction of overlap (but not the other).
-
-            // 0...0X...X000
-            final int limit = (int) (size & (COPY_NATIVE_THRESHOLD - 8));
-            int offset = 0;
-            for (; offset < limit; offset += 8) {
-                final long v = SCOPED_MEMORY_ACCESS.getLong(src.sessionImpl(), src.unsafeGetBase(), src.unsafeGetOffset() + srcOffset + offset);
-                SCOPED_MEMORY_ACCESS.putLong(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset() + dstOffset + offset, v);
-            }
-            int remaining = (int) size - offset;
-            // 0...0X00
-            if (remaining >= 4) {
-                final int v = SCOPED_MEMORY_ACCESS.getInt(src.sessionImpl(), src.unsafeGetBase(),src.unsafeGetOffset() + srcOffset + offset);
-                SCOPED_MEMORY_ACCESS.putInt(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset() + dstOffset + offset, v);
-                offset += 4;
-                remaining -= 4;
-            }
-            // 0...00X0
-            if (remaining >= 2) {
-                final short v = SCOPED_MEMORY_ACCESS.getShort(src.sessionImpl(), src.unsafeGetBase(), src.unsafeGetOffset() + srcOffset + offset);
-                SCOPED_MEMORY_ACCESS.putShort(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset() + dstOffset + offset, v);
-                offset += 2;
-                remaining -=2;
-            }
-            // 0...000X
-            if (remaining == 1) {
-                final byte v = SCOPED_MEMORY_ACCESS.getByte(src.sessionImpl(), src.unsafeGetBase(), src.unsafeGetOffset() + srcOffset + offset);
-                SCOPED_MEMORY_ACCESS.putByte(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset() + dstOffset + offset, v);
-            }
-            // We have now fully handled 0...0X...XXXX
-        } else {
-            // For larger sizes, the transition to native code pays off
-            SCOPED_MEMORY_ACCESS.copyMemory(src.sessionImpl(), dst.sessionImpl(),
-                    src.unsafeGetBase(), src.unsafeGetOffset() + srcOffset,
-                    dst.unsafeGetBase(), dst.unsafeGetOffset() + dstOffset, size);
-        }
-    }
-
     @ForceInline
     public static void copy(MemorySegment srcSegment, ValueLayout srcElementLayout, long srcOffset,
                             MemorySegment dstSegment, ValueLayout dstElementLayout, long dstOffset,
@@ -794,40 +660,6 @@ public abstract sealed class AbstractMemorySegmentImpl
         }
     }
 
-    public static long mismatch(MemorySegment srcSegment, long srcFromOffset, long srcToOffset,
-                                MemorySegment dstSegment, long dstFromOffset, long dstToOffset) {
-        AbstractMemorySegmentImpl srcImpl = (AbstractMemorySegmentImpl)Objects.requireNonNull(srcSegment);
-        AbstractMemorySegmentImpl dstImpl = (AbstractMemorySegmentImpl)Objects.requireNonNull(dstSegment);
-        long srcBytes = srcToOffset - srcFromOffset;
-        long dstBytes = dstToOffset - dstFromOffset;
-        srcImpl.checkAccess(srcFromOffset, srcBytes, true);
-        dstImpl.checkAccess(dstFromOffset, dstBytes, true);
-
-        long bytes = Math.min(srcBytes, dstBytes);
-        long i = 0;
-        if (bytes > 7) {
-            if (srcImpl.get(JAVA_BYTE, srcFromOffset) != dstImpl.get(JAVA_BYTE, dstFromOffset)) {
-                return 0;
-            }
-            i = AbstractMemorySegmentImpl.vectorizedMismatchLargeForBytes(srcImpl.sessionImpl(), dstImpl.sessionImpl(),
-                    srcImpl.unsafeGetBase(), srcImpl.unsafeGetOffset() + srcFromOffset,
-                    dstImpl.unsafeGetBase(), dstImpl.unsafeGetOffset() + dstFromOffset,
-                    bytes);
-            if (i >= 0) {
-                return i;
-            }
-            long remaining = ~i;
-            assert remaining < 8 : "remaining greater than 7: " + remaining;
-            i = bytes - remaining;
-        }
-        for (; i < bytes; i++) {
-            if (srcImpl.get(JAVA_BYTE, srcFromOffset + i) != dstImpl.get(JAVA_BYTE, dstFromOffset + i)) {
-                return i;
-            }
-        }
-        return srcBytes != dstBytes ? bytes : -1;
-    }
-
     private static int getScaleFactor(Buffer buffer) {
         return switch (buffer) {
             case ByteBuffer   _                 -> 0;
diff --git a/src/java.base/share/classes/jdk/internal/foreign/SegmentBulkOperations.java b/src/java.base/share/classes/jdk/internal/foreign/SegmentBulkOperations.java
new file mode 100644
index 00000000000..74953f077e4
--- /dev/null
+++ b/src/java.base/share/classes/jdk/internal/foreign/SegmentBulkOperations.java
@@ -0,0 +1,316 @@
+/*
+ * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.  Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package jdk.internal.foreign;
+
+import jdk.internal.misc.ScopedMemoryAccess;
+import jdk.internal.util.Architecture;
+import jdk.internal.util.ArraysSupport;
+import jdk.internal.vm.annotation.ForceInline;
+
+import java.lang.foreign.MemorySegment;
+
+/**
+ * This class contains optimized bulk operation methods that operate on one or several
+ * memory segments.
+ * <p>
+ * Generally, the methods attempt to work with as-large-as-possible units of memory at
+ * a time.
+ * <p>
+ * It should be noted that when invoking scoped memory access get/set operations, it
+ * is imperative from a performance perspective to convey the sharp types from the
+ * call site in order for the compiler to pick the correct Unsafe access variant.
+ */
+public final class SegmentBulkOperations {
+
+    private SegmentBulkOperations() {}
+
+    private static final ScopedMemoryAccess SCOPED_MEMORY_ACCESS = ScopedMemoryAccess.getScopedMemoryAccess();
+
+    // All the threshold values below MUST be a power of two and should preferably be
+    // greater than or equal to 2^3.
+
+    // Update the FILL value for Aarch64 once 8338975 is fixed.
+    private static final int NATIVE_THRESHOLD_FILL = powerOfPropertyOr("fill", Architecture.isAARCH64() ? 10 : 5);
+    private static final int NATIVE_THRESHOLD_MISMATCH = powerOfPropertyOr("mismatch", 6);
+    private static final int NATIVE_THRESHOLD_COPY = powerOfPropertyOr("copy", 6);
+
+    @ForceInline
+    public static MemorySegment fill(AbstractMemorySegmentImpl dst, byte value) {
+        dst.checkReadOnly(false);
+        if (dst.length == 0) {
+            // Implicit state check
+            dst.checkValidState();
+        } else if (dst.length < NATIVE_THRESHOLD_FILL) {
+            // 0 < length < NATIVE_THRESHOLD_FILL : 0...0X...XXXX
+
+            // Handle smaller segments directly without transitioning to native code
+            final long u = Byte.toUnsignedLong(value);
+            final long longValue = u << 56 | u << 48 | u << 40 | u << 32 | u << 24 | u << 16 | u << 8 | u;
+
+            int offset = 0;
+            // 0...0X...X000
+            final int limit = (int) (dst.length & (NATIVE_THRESHOLD_FILL - 8));
+            for (; offset < limit; offset += 8) {
+                SCOPED_MEMORY_ACCESS.putLongUnaligned(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset() + offset, longValue, !Architecture.isLittleEndian());
+            }
+            int remaining = (int) dst.length - limit;
+            // 0...0X00
+            if (remaining >= 4) {
+                SCOPED_MEMORY_ACCESS.putIntUnaligned(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset() + offset, (int) longValue, !Architecture.isLittleEndian());
+                offset += 4;
+                remaining -= 4;
+            }
+            // 0...00X0
+            if (remaining >= 2) {
+                SCOPED_MEMORY_ACCESS.putShortUnaligned(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset() + offset, (short) longValue, !Architecture.isLittleEndian());
+                offset += 2;
+                remaining -= 2;
+            }
+            // 0...000X
+            if (remaining == 1) {
+                SCOPED_MEMORY_ACCESS.putByte(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset() + offset, value);
+            }
+            // We have now fully handled 0...0X...XXXX
+        } else {
+            // Handle larger segments via native calls
+            SCOPED_MEMORY_ACCESS.setMemory(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset(), dst.length, value);
+        }
+        return dst;
+    }
+
+    @ForceInline
+    public static void copy(AbstractMemorySegmentImpl src, long srcOffset,
+                            AbstractMemorySegmentImpl dst, long dstOffset,
+                            long size) {
+
+        Utils.checkNonNegativeIndex(size, "size");
+        // Implicit null check for src and dst
+        src.checkAccess(srcOffset, size, true);
+        dst.checkAccess(dstOffset, size, false);
+
+        if (size <= 0) {
+            // Do nothing
+        } else if (size < NATIVE_THRESHOLD_COPY && !src.overlaps(dst)) {
+            // 0 < size < NATIVE_THRESHOLD_COPY : 0...0X...XXXX
+            //
+            // Strictly, we could check for !src.asSlice(srcOffset, size).overlaps(dst.asSlice(dstOffset, size)) but
+            // this is a bit slower and it is likely very unusual that there is any difference in the outcome. Also, if
+            // there is an overlap, we could tolerate one particular direction of overlap (but not the other).
+
+            // 0...0X...X000
+            final int limit = (int) (size & (NATIVE_THRESHOLD_COPY - 8));
+            int offset = 0;
+            for (; offset < limit; offset += 8) {
+                final long v = SCOPED_MEMORY_ACCESS.getLongUnaligned(src.sessionImpl(), src.unsafeGetBase(), src.unsafeGetOffset() + srcOffset + offset, !Architecture.isLittleEndian());
+                SCOPED_MEMORY_ACCESS.putLongUnaligned(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset() + dstOffset + offset, v, !Architecture.isLittleEndian());
+            }
+            int remaining = (int) size - offset;
+            // 0...0X00
+            if (remaining >= 4) {
+                final int v = SCOPED_MEMORY_ACCESS.getIntUnaligned(src.sessionImpl(), src.unsafeGetBase(),src.unsafeGetOffset() + srcOffset + offset, !Architecture.isLittleEndian());
+                SCOPED_MEMORY_ACCESS.putIntUnaligned(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset() + dstOffset + offset, v, !Architecture.isLittleEndian());
+                offset += 4;
+                remaining -= 4;
+            }
+            // 0...00X0
+            if (remaining >= 2) {
+                final short v = SCOPED_MEMORY_ACCESS.getShortUnaligned(src.sessionImpl(), src.unsafeGetBase(), src.unsafeGetOffset() + srcOffset + offset, !Architecture.isLittleEndian());
+                SCOPED_MEMORY_ACCESS.putShortUnaligned(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset() + dstOffset + offset, v, !Architecture.isLittleEndian());
+                offset += 2;
+                remaining -=2;
+            }
+            // 0...000X
+            if (remaining == 1) {
+                final byte v = SCOPED_MEMORY_ACCESS.getByte(src.sessionImpl(), src.unsafeGetBase(), src.unsafeGetOffset() + srcOffset + offset);
+                SCOPED_MEMORY_ACCESS.putByte(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset() + dstOffset + offset, v);
+            }
+            // We have now fully handled 0...0X...XXXX
+        } else {
+            // For larger sizes, the transition to native code pays off
+            SCOPED_MEMORY_ACCESS.copyMemory(src.sessionImpl(), dst.sessionImpl(),
+                    src.unsafeGetBase(), src.unsafeGetOffset() + srcOffset,
+                    dst.unsafeGetBase(), dst.unsafeGetOffset() + dstOffset, size);
+        }
+    }
+
+    @ForceInline // Returns the offset (relative to the two 'from' offsets) of the first differing byte, or -1 if both ranges are equal.
+    public static long mismatch(AbstractMemorySegmentImpl src, long srcFromOffset, long srcToOffset,
+                                AbstractMemorySegmentImpl dst, long dstFromOffset, long dstToOffset) {
+        final long srcBytes = srcToOffset - srcFromOffset;
+        final long dstBytes = dstToOffset - dstFromOffset;
+        src.checkAccess(srcFromOffset, srcBytes, true); // implicit null check for src
+        dst.checkAccess(dstFromOffset, dstBytes, true); // implicit null check for dst
+
+        final long length = Math.min(srcBytes, dstBytes); // only the common prefix of the two ranges is compared byte by byte
+        final boolean srcAndDstBytesDiffer = srcBytes != dstBytes; // if true, an all-equal prefix still counts as a mismatch at 'length'
+
+        if (length == 0) {
+            return srcAndDstBytesDiffer ? 0 : -1; // an empty range vs. a non-empty range mismatches at index 0
+        } else if (length < NATIVE_THRESHOLD_MISMATCH) { // short ranges: compare with plain scoped loads
+            return mismatch(src, srcFromOffset, dst, dstFromOffset, 0, (int) length, srcAndDstBytesDiffer);
+        } else { // longer ranges: use vectorizedMismatch for the bulk and plain loads for the tail
+            long i;
+            if (SCOPED_MEMORY_ACCESS.getByte(src.sessionImpl(), src.unsafeGetBase(), src.unsafeGetOffset() + srcFromOffset) !=
+                    SCOPED_MEMORY_ACCESS.getByte(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset() + dstFromOffset)) {
+                return 0; // quick exit when the very first byte already differs
+            }
+            i = vectorizedMismatchLargeForBytes(src.sessionImpl(), dst.sessionImpl(),
+                    src.unsafeGetBase(), src.unsafeGetOffset() + srcFromOffset,
+                    dst.unsafeGetBase(), dst.unsafeGetOffset() + dstFromOffset,
+                    length);
+            if (i >= 0) {
+                return i; // a non-negative result is the mismatch index
+            }
+            final long remaining = ~i; // a negative result is the bitwise complement of the number of bytes left unchecked
+            assert remaining < 8 : "remaining greater than 7: " + remaining;
+            i = length - remaining; // offset at which the unchecked tail begins
+            return mismatch(src, srcFromOffset + i, dst, dstFromOffset + i, i, (int) remaining, srcAndDstBytesDiffer);
+        }
+    }
+
+    // Mismatch is handled in chunks of 64 (unroll of eight 8s), 8, 4, 2, and 1 byte(s). Returns start + (index of the first differing byte); if all 'length' bytes are equal, returns start + length when srcAndDstBytesDiffer, else -1.
+    @ForceInline
+    private static long mismatch(AbstractMemorySegmentImpl src, long srcFromOffset,
+                                 AbstractMemorySegmentImpl dst, long dstFromOffset,
+                                 long start, int length, boolean srcAndDstBytesDiffer) {
+        int offset = 0;
+        final int limit = length & (NATIVE_THRESHOLD_MISMATCH - 8) & ~7; // "& ~7": a multiple of 8 that is <= length even if the threshold is configured below 2^3 (otherwise the loop below could read past 'length')
+        for (; offset < limit; offset += 8) {
+            final long s = SCOPED_MEMORY_ACCESS.getLongUnaligned(src.sessionImpl(), src.unsafeGetBase(), src.unsafeGetOffset() + srcFromOffset + offset, !Architecture.isLittleEndian());
+            final long d = SCOPED_MEMORY_ACCESS.getLongUnaligned(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset() + dstFromOffset + offset, !Architecture.isLittleEndian());
+            if (s != d) {
+                return start + offset + mismatch(s, d);
+            }
+        }
+        int remaining = length - offset;
+        // 0...XXX000 (any whole 8-byte units not already consumed by the loop above)
+        for (; remaining >= 8; remaining -= 8) {
+            final long s = SCOPED_MEMORY_ACCESS.getLongUnaligned(src.sessionImpl(), src.unsafeGetBase(), src.unsafeGetOffset() + srcFromOffset + offset, !Architecture.isLittleEndian());
+            final long d = SCOPED_MEMORY_ACCESS.getLongUnaligned(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset() + dstFromOffset + offset, !Architecture.isLittleEndian());
+            if (s != d) {
+                return start + offset + mismatch(s, d);
+            }
+            offset += 8;
+        }
+
+        // 0...0X00
+        if (remaining >= 4) {
+            final int s = SCOPED_MEMORY_ACCESS.getIntUnaligned(src.sessionImpl(), src.unsafeGetBase(), src.unsafeGetOffset() + srcFromOffset + offset, !Architecture.isLittleEndian());
+            final int d = SCOPED_MEMORY_ACCESS.getIntUnaligned(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset() + dstFromOffset + offset, !Architecture.isLittleEndian());
+            if (s != d) {
+                return start + offset + mismatch(s, d);
+            }
+            offset += 4;
+            remaining -= 4;
+        }
+        // 0...00X0
+        if (remaining >= 2) {
+            final short s = SCOPED_MEMORY_ACCESS.getShortUnaligned(src.sessionImpl(), src.unsafeGetBase(), src.unsafeGetOffset() + srcFromOffset + offset, !Architecture.isLittleEndian());
+            final short d = SCOPED_MEMORY_ACCESS.getShortUnaligned(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset() + dstFromOffset + offset, !Architecture.isLittleEndian());
+            if (s != d) {
+                return start + offset + mismatch(s, d);
+            }
+            offset += 2;
+            remaining -= 2;
+        }
+        // 0...000X
+        if (remaining == 1) {
+            final byte s = SCOPED_MEMORY_ACCESS.getByte(src.sessionImpl(), src.unsafeGetBase(), src.unsafeGetOffset() + srcFromOffset + offset);
+            final byte d = SCOPED_MEMORY_ACCESS.getByte(dst.sessionImpl(), dst.unsafeGetBase(), dst.unsafeGetOffset() + dstFromOffset + offset);
+            if (s != d) {
+                return start + offset;
+            }
+        }
+        return srcAndDstBytesDiffer ? (start + length) : -1; // all compared bytes are equal: the ranges mismatch only if their sizes differ
+        // We have now fully handled 0...0X...XXXX
+    }
+
+    @ForceInline // Expects first != second (as at the call sites); returns the index (0-7) of the first differing byte in memory order.
+    private static int mismatch(long first, long second) {
+        final long x = first ^ second; // the set bits mark where the two values differ
+        return (Architecture.isLittleEndian()
+                ? Long.numberOfTrailingZeros(x) // little-endian: the first byte in memory is the least significant one
+                : Long.numberOfLeadingZeros(x)) / 8; // big-endian: the most significant one; "/ 8" turns a bit index into a byte index
+    }
+
+    @ForceInline // Expects first != second (as at the call sites); returns the index (0-3) of the first differing byte in memory order.
+    private static int mismatch(int first, int second) {
+        final int x = first ^ second; // the set bits mark where the two values differ
+        return (Architecture.isLittleEndian()
+                ? Integer.numberOfTrailingZeros(x) // little-endian: the first byte in memory is the least significant one
+                : Integer.numberOfLeadingZeros(x)) / 8; // big-endian: the most significant one; "/ 8" turns a bit index into a byte index
+    }
+
+    @ForceInline // Expects first != second (as at the call sites); returns the index (0 or 1) of the first differing byte in memory order.
+    private static int mismatch(short first, short second) {
+        if (Architecture.isLittleEndian()) {
+            return ((0xff & first) == (0xff & second)) ? 1 : 0; // little-endian: the low-order byte is the first byte in memory
+        } else {
+            return ((0xff00 & first) == (0xff00 & second)) ? 1 : 0; // big-endian: the high-order byte is the first byte in memory; it must be tested, or a difference in both bytes would be reported at index 1
+        }
+    }
+
+    /**
+     * Mismatch over long lengths: runs vectorizedMismatch over consecutive sub-ranges of at most Integer.MAX_VALUE bytes. Returns the index of the first mismatch if one is reported, otherwise the bitwise complement of the number of trailing bytes left unchecked.
+     */
+    private static long vectorizedMismatchLargeForBytes(MemorySessionImpl aSession, MemorySessionImpl bSession,
+                                                        Object a, long aOffset,
+                                                        Object b, long bOffset,
+                                                        long length) {
+        long off = 0; // number of bytes already known to be equal
+        long remaining = length;
+        int i, size;
+        boolean lastSubRange = false;
+        while (remaining > 7 && !lastSubRange) {
+            if (remaining > Integer.MAX_VALUE) {
+                size = Integer.MAX_VALUE; // the underlying call takes an int length, so each sub-range is capped
+            } else {
+                size = (int) remaining;
+                lastSubRange = true;
+            }
+            i = SCOPED_MEMORY_ACCESS.vectorizedMismatch(aSession, bSession,
+                    a, aOffset + off,
+                    b, bOffset + off,
+                    size, ArraysSupport.LOG2_ARRAY_BYTE_INDEX_SCALE);
+            if (i >= 0)
+                return off + i; // mismatch found inside this sub-range
+
+            i = size - ~i; // ~i is presumably the number of bytes of this sub-range left unchecked, so i becomes the number checked
+            off += i;
+            remaining -= i;
+        }
+        return ~remaining; // negative: callers decode with ~ to get the length of the tail they must still compare
+    }
+
+    static final String PROPERTY_PATH = "java.lang.foreign.native.threshold.power.";
+
+    // Returns 2^p, where p is the integer system property PROPERTY_PATH + name (or defaultPower if it is unset or not a number) clamped to [0, 30]. The returned value is in the interval [2^0, 2^30]
+    static int powerOfPropertyOr(String name, int defaultPower) {
+        final int power = Integer.getInteger(PROPERTY_PATH + name, defaultPower);
+        return 1 << Math.clamp(power, 0, Integer.SIZE - 2);
+    }
+
+}
diff --git a/test/jdk/java/foreign/TestMismatch.java b/test/jdk/java/foreign/TestMismatch.java
index 9549b2508ff..f50621e3415 100644
--- a/test/jdk/java/foreign/TestMismatch.java
+++ b/test/jdk/java/foreign/TestMismatch.java
@@ -29,7 +29,9 @@
 
 import java.lang.foreign.Arena;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
+import java.util.Random;
 import java.util.concurrent.atomic.AtomicReference;
 
 import java.lang.foreign.MemorySegment;
@@ -122,6 +124,68 @@ public class TestMismatch {
         }
     }
 
+    @Test
+    public void random() { // checks mismatch() in both directions on randomly filled segments of 1 to 63 bytes
+        try (var arena = Arena.ofConfined()) {
+            var rnd = new Random(42); // fixed seed, so every run draws the same sequence
+            for (int size = 1; size < 64; size++) {
+                // Repeat a fair number of rounds
+                for (int i = 0; i < 147; i++) {
+                    var src = arena.allocate(size);
+                    // The dst segment might be zero to eight bytes longer
+                    var dst = arena.allocate(size + rnd.nextInt(8 + 1));
+                    // Fill the src with random data
+                    for (int j = 0; j < size; j++) {
+                        src.set(ValueLayout.JAVA_BYTE, j, randomByte(rnd));
+                    }
+                    // copy the random data from src to dst
+                    dst.copyFrom(src);
+                    // Fill the rest (if any) of the dst with random data
+                    for (long j = src.byteSize(); j < dst.byteSize(); j++) {
+                        dst.set(ValueLayout.JAVA_BYTE, j, randomByte(rnd));
+                    }
+
+                    if (rnd.nextBoolean()) {
+                        // In this branch, we inject one or more deviating bytes
+                        int beginDiff = rnd.nextInt(size); // first mutated index, in [0, size)
+                        int endDiff = rnd.nextInt(beginDiff, size); // last mutated index, in [beginDiff, size)
+                        for (int d = beginDiff; d <= endDiff; d++) {
+                            byte existing = dst.get(ValueLayout.JAVA_BYTE, d);
+                            // Make sure we never get back the same value
+                            byte mutatedValue;
+                            do {
+                                mutatedValue = randomByte(rnd);
+                            } while (existing == mutatedValue);
+                            dst.set(ValueLayout.JAVA_BYTE, d, mutatedValue);
+                        }
+
+                        // They are not equal and first differ at position beginDiff
+                        assertEquals(src.mismatch(dst), beginDiff);
+                        assertEquals(dst.mismatch(src), beginDiff);
+                    } else {
+                        // In this branch, there is no injection
+
+                        if (src.byteSize() == dst.byteSize()) {
+                            // The content matches and they are of equal size
+                            assertEquals(src.mismatch(dst), -1);
+                            assertEquals(dst.mismatch(src), -1);
+                        } else {
+                            // The content matches but they are of different length
+                            // Remember, the size of src is always smaller or equal
+                            // to the size of dst.
+                            assertEquals(src.mismatch(dst), src.byteSize());
+                            assertEquals(dst.mismatch(src), src.byteSize());
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    static byte randomByte(Random rnd) { // draws one pseudo-random byte from rnd
+        return (byte) rnd.nextInt(Byte.MIN_VALUE, Byte.MAX_VALUE + 1); // the bound is exclusive, so + 1 lets Byte.MAX_VALUE be drawn
+    }
+
     @Test(dataProvider = "slices")
     public void testDifferentValues(MemorySegment s1, MemorySegment s2) {
         out.format("testDifferentValues s1:%s, s2:%s\n", s1, s2);
diff --git a/test/micro/org/openjdk/bench/java/lang/foreign/CopyTest.java b/test/micro/org/openjdk/bench/java/lang/foreign/SegmentBulkCopy.java
similarity index 76%
rename from test/micro/org/openjdk/bench/java/lang/foreign/CopyTest.java
rename to test/micro/org/openjdk/bench/java/lang/foreign/SegmentBulkCopy.java
index 8996b1de117..22ef139aac0 100644
--- a/test/micro/org/openjdk/bench/java/lang/foreign/CopyTest.java
+++ b/test/micro/org/openjdk/bench/java/lang/foreign/SegmentBulkCopy.java
@@ -41,21 +41,16 @@ import java.lang.foreign.MemorySegment;
 import java.nio.ByteBuffer;
 import java.util.concurrent.TimeUnit;
 
-import static java.lang.foreign.ValueLayout.*;
-
 @BenchmarkMode(Mode.AverageTime)
 @Warmup(iterations = 5, time = 500, timeUnit = TimeUnit.MILLISECONDS)
 @Measurement(iterations = 10, time = 500, timeUnit = TimeUnit.MILLISECONDS)
 @State(Scope.Thread)
 @OutputTimeUnit(TimeUnit.NANOSECONDS)
 @Fork(value = 3)
-public class CopyTest {
+public class SegmentBulkCopy {
 
-    @Param({"0", "1", "2", "3", "4", "5", "6", "7", "8",
-            "9", "10", "11", "12", "13", "14", "15", "16",
-            "17", "18", "19", "20", "21", "22", "23", "24",
-            "25", "26", "27", "28", "29", "30", "31", "32",
-            "33", "36", "40", "44", "48", "52", "56", "60", "63", "64", "128"})
+    @Param({"2", "3", "4", "5", "6", "7", "8", "64", "512",
+            "4096", "32768", "262144", "2097152", "16777216", "134217728"}) // 2..8, then powers of eight up to 8^9
     public int ELEM_SIZE;
 
     byte[] srcArray;
@@ -80,28 +75,37 @@ public class CopyTest {
     }
 
     @Benchmark
-    public void array_copy() {
+    public void arrayCopy() { // copies ELEM_SIZE elements from srcArray to dstArray with System.arraycopy
         System.arraycopy(srcArray, 0, dstArray, 0, ELEM_SIZE);
     }
 
     @Benchmark
-    public void heap_segment_copy5Arg() {
-        MemorySegment.copy(heapSrcSegment, 0, heapDstSegment, 0, ELEM_SIZE);
-    }
-
-    @Benchmark
-    public void native_segment_copy5Arg() {
-        MemorySegment.copy(nativeSrcSegment, 0, nativeDstSegment, 0, ELEM_SIZE);
-    }
-
-    @Benchmark
-    public void heap_segment_copy7arg() {
-        MemorySegment.copy(heapSrcSegment, JAVA_BYTE, 0, heapDstSegment, JAVA_BYTE, 0, ELEM_SIZE);
-    }
-
-    @Benchmark
-    public void buffer_copy() {
+    public void bufferCopy() { // NOTE(review): relative put() advances both buffers' positions; unless they are reset elsewhere, later invocations may have nothing left to copy; confirm
         dstBuffer.put(srcBuffer);
     }
 
+    @Fork(value = 3, jvmArgsAppend = {"-Djava.lang.foreign.native.threshold.power.copy=31"})
+    @Benchmark
+    public void heapSegmentCopyJava() { // NOTE(review): forked with copy threshold power 31; presumably keeps every ELEM_SIZE on the Java code path; confirm
+        MemorySegment.copy(heapSrcSegment, 0, heapDstSegment, 0, ELEM_SIZE);
+    }
+
+    @Fork(value = 3, jvmArgsAppend = {"-Djava.lang.foreign.native.threshold.power.copy=0"})
+    @Benchmark
+    public void heapSegmentCopyUnsafe() { // NOTE(review): forked with copy threshold power 0; presumably routes every copy to the Unsafe-based path; confirm
+        MemorySegment.copy(heapSrcSegment, 0, heapDstSegment, 0, ELEM_SIZE);
+    }
+
+    @Fork(value = 3, jvmArgsAppend = {"-Djava.lang.foreign.native.threshold.power.copy=31"})
+    @Benchmark
+    public void nativeSegmentCopyJava() { // NOTE(review): forked with copy threshold power 31; presumably keeps every ELEM_SIZE on the Java code path; confirm
+        MemorySegment.copy(nativeSrcSegment, 0, nativeDstSegment, 0, ELEM_SIZE);
+    }
+
+    @Fork(value = 3, jvmArgsAppend = {"-Djava.lang.foreign.native.threshold.power.copy=0"})
+    @Benchmark
+    public void nativeSegmentCopyUnsafe() { // NOTE(review): forked with copy threshold power 0; presumably routes every copy to the Unsafe-based path; confirm
+        MemorySegment.copy(nativeSrcSegment, 0, nativeDstSegment, 0, ELEM_SIZE);
+    }
+
 }
diff --git a/test/micro/org/openjdk/bench/java/lang/foreign/TestFill.java b/test/micro/org/openjdk/bench/java/lang/foreign/SegmentBulkFill.java
similarity index 69%
rename from test/micro/org/openjdk/bench/java/lang/foreign/TestFill.java
rename to test/micro/org/openjdk/bench/java/lang/foreign/SegmentBulkFill.java
index 78719f03bc3..95ca7228969 100644
--- a/test/micro/org/openjdk/bench/java/lang/foreign/TestFill.java
+++ b/test/micro/org/openjdk/bench/java/lang/foreign/SegmentBulkFill.java
@@ -48,13 +48,10 @@ import java.util.concurrent.TimeUnit;
 @State(Scope.Thread)
 @OutputTimeUnit(TimeUnit.NANOSECONDS)
 @Fork(value = 3)
-public class TestFill {
+public class SegmentBulkFill {
 
-    @Param({"0", "1", "2", "3", "4", "5", "6", "7",
-            "8", "9", "10", "11", "12", "13", "14", "15",
-            "16", "17", "18", "19", "20", "21", "22", "23",
-            "24", "25", "26", "27", "28", "29", "30", "31",
-            "32", "128", "256", "384", "511", "512"})
+    @Param({"2", "3", "4", "5", "6", "7", "8", "64", "512",
+            "4096", "32768", "262144", "2097152", "16777216", "134217728"}) // 2..8, then powers of eight up to 8^9
     public int ELEM_SIZE;
 
     byte[] array;
@@ -73,22 +70,43 @@ public class TestFill {
     }
 
     @Benchmark
-    public void arrays_fill() {
+    public void arraysFill() { // zero-fills the byte array with Arrays.fill
         Arrays.fill(array, (byte) 0);
     }
 
+    @Fork(value = 3, jvmArgsAppend = {"-Djava.lang.foreign.native.threshold.power.fill=31"})
     @Benchmark
-    public void heap_segment_fill() {
+    public void heapSegmentFillJava() { // NOTE(review): forked with fill threshold power 31; presumably keeps every ELEM_SIZE on the Java code path; confirm
         heapSegment.fill((byte) 0);
     }
 
+    @Fork(value = 3, jvmArgsAppend = {"-Djava.lang.foreign.native.threshold.power.fill=0"})
     @Benchmark
-    public void native_segment_fill() {
+    public void heapSegmentFillUnsafe() { // NOTE(review): forked with fill threshold power 0; presumably routes every fill to the Unsafe-based path; confirm
+        heapSegment.fill((byte) 0);
+    }
+
+    @Fork(value = 3, jvmArgsAppend = {"-Djava.lang.foreign.native.threshold.power.fill=31"})
+    @Benchmark
+    public void nativeSegmentFillJava() { // NOTE(review): forked with fill threshold power 31; presumably keeps every ELEM_SIZE on the Java code path; confirm
         nativeSegment.fill((byte) 0);
     }
 
+    @Fork(value = 3, jvmArgsAppend = {"-Djava.lang.foreign.native.threshold.power.fill=0"})
     @Benchmark
-    public void unaligned_segment_fill() {
+    public void nativeSegmentFillUnsafe() { // NOTE(review): forked with fill threshold power 0; presumably routes every fill to the Unsafe-based path; confirm
+        nativeSegment.fill((byte) 0);
+    }
+
+    @Fork(value = 3, jvmArgsAppend = {"-Djava.lang.foreign.native.threshold.power.fill=31"})
+    @Benchmark
+    public void unalignedSegmentFillJava() { // NOTE(review): forked with fill threshold power 31; presumably keeps every ELEM_SIZE on the Java code path; confirm
+        unalignedSegment.fill((byte) 0);
+    }
+
+    @Fork(value = 3, jvmArgsAppend = {"-Djava.lang.foreign.native.threshold.power.fill=0"})
+    @Benchmark
+    public void unalignedSegmentFillUnsafe() { // NOTE(review): forked with fill threshold power 0; presumably routes every fill to the Unsafe-based path; confirm
         unalignedSegment.fill((byte) 0);
     }
 
diff --git a/test/micro/org/openjdk/bench/java/lang/foreign/SegmentBulkMismatch.java b/test/micro/org/openjdk/bench/java/lang/foreign/SegmentBulkMismatch.java
new file mode 100644
index 00000000000..5656b2f6b9f
--- /dev/null
+++ b/test/micro/org/openjdk/bench/java/lang/foreign/SegmentBulkMismatch.java
@@ -0,0 +1,112 @@
+/*
+ *  Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
+ *  DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ *  This code is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License version 2 only, as
+ *  published by the Free Software Foundation.
+ *
+ *  This code is distributed in the hope that it will be useful, but WITHOUT
+ *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ *  version 2 for more details (a copy is included in the LICENSE file that
+ *  accompanied this code).
+ *
+ *  You should have received a copy of the GNU General Public License version
+ *  2 along with this work; if not, write to the Free Software Foundation,
+ *  Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ *   Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ *  or visit www.oracle.com if you need additional information or have any
+ *  questions.
+ *
+ */
+
+package org.openjdk.bench.java.lang.foreign;
+
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+
+import java.lang.foreign.Arena;
+import java.lang.foreign.MemorySegment;
+import java.util.Arrays;
+import java.util.Random;
+import java.util.concurrent.TimeUnit;
+
+import static java.lang.foreign.ValueLayout.*;
+
+@BenchmarkMode(Mode.AverageTime)
+@Warmup(iterations = 5, time = 500, timeUnit = TimeUnit.MILLISECONDS)
+@Measurement(iterations = 10, time = 500, timeUnit = TimeUnit.MILLISECONDS)
+@State(Scope.Thread)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@Fork(value = 3)
+public class SegmentBulkMismatch {
+    // JMH benchmark of MemorySegment::mismatch on native and heap segments, with Arrays.mismatch on byte arrays alongside
+    @Param({"2", "3", "4", "5", "6", "7", "8", "64", "512",
+            "4096", "32768", "262144", "2097152", "16777216", "134217728"}) // 2..8, then powers of eight up to 8^9
+    public int ELEM_SIZE;
+
+    MemorySegment srcNative;
+    MemorySegment dstNative;
+    byte[] srcArray; // array copy of srcNative, backing srcHeap
+    byte[] dstArray; // array copy of dstNative, backing dstHeap
+    MemorySegment srcHeap;
+    MemorySegment dstHeap;
+
+    @Setup
+    public void setup() {
+        // Same 16-byte alignment for every size; src and dst end up with identical content, so no mismatch exists
+        srcNative = Arena.ofAuto().allocate(ELEM_SIZE, 16);
+        dstNative = Arena.ofAuto().allocate(ELEM_SIZE, 16);
+        var rnd = new Random(42); // fixed seed for reproducible content
+        for (int i = 0; i < ELEM_SIZE; i++) { // nextInt's bound is exclusive, hence MAX_VALUE + 1 to cover all byte values
+            srcNative.set(JAVA_BYTE, i, (byte) rnd.nextInt(Byte.MIN_VALUE, Byte.MAX_VALUE + 1));
+        }
+        dstNative.copyFrom(srcNative);
+        srcArray = srcNative.toArray(JAVA_BYTE);
+        dstArray = dstNative.toArray(JAVA_BYTE);
+        srcHeap = MemorySegment.ofArray(srcArray);
+        dstHeap = MemorySegment.ofArray(dstArray);
+    }
+    // NOTE(review): the *Java / *Unsafe pairs differ only in the forked JVM's mismatch threshold power (31 vs 0); presumably this selects the code path; confirm
+    @Fork(value = 3, jvmArgsAppend = {"-Djava.lang.foreign.native.threshold.power.mismatch=31"})
+    @Benchmark
+    public long nativeSegmentJava() {
+        return srcNative.mismatch(dstNative);
+    }
+
+    @Fork(value = 3, jvmArgsAppend = {"-Djava.lang.foreign.native.threshold.power.mismatch=31"})
+    @Benchmark
+    public long heapSegmentJava() {
+        return srcHeap.mismatch(dstHeap);
+    }
+
+    @Fork(value = 3, jvmArgsAppend = {"-Djava.lang.foreign.native.threshold.power.mismatch=0"})
+    @Benchmark
+    public long nativeSegmentUnsafe() {
+        return srcNative.mismatch(dstNative);
+    }
+
+    @Fork(value = 3, jvmArgsAppend = {"-Djava.lang.foreign.native.threshold.power.mismatch=0"})
+    @Benchmark
+    public long heapSegmentUnsafe() {
+        return srcHeap.mismatch(dstHeap);
+    }
+
+    @Benchmark
+    public long array() { // byte[] counterpart using Arrays.mismatch
+        return Arrays.mismatch(srcArray, dstArray);
+    }
+
+}
+