8339699: Optimize DataOutputStream writeUTF

Reviewed-by: liach, bpb
2024-10-04 22:35:03 +00:00 · 2024-10-04 22:35:03 +00:00 · b42fbf43df
commit b42fbf43df
parent 559289487d
5 changed files with 252 additions and 182 deletions
--- a/src/java.base/share/classes/java/io/DataOutputStream.java
+++ b/src/java.base/share/classes/java/io/DataOutputStream.java
@ -1,5 +1,6 @@
 /*
- * Copyright (c) 1994, 2023, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1994, 2024, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2024, Alibaba Group Holding Limited. All Rights Reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -25,8 +26,13 @@

 package java.io;

+import jdk.internal.access.JavaLangAccess;
+import jdk.internal.access.SharedSecrets;
 import jdk.internal.util.ByteArray;

+import static jdk.internal.util.ModifiedUtf.putChar;
+import static jdk.internal.util.ModifiedUtf.utfLen;
+
 /**
 * A data output stream lets an application write primitive Java data
 * types to an output stream in a portable way. An application can
@ -44,6 +50,8 @@ import jdk.internal.util.ByteArray;
 * @since   1.0
 */
 public class DataOutputStream extends FilterOutputStream implements DataOutput {
+    private static final JavaLangAccess JLA = SharedSecrets.getJavaLangAccess();
+
    /**
     * The number of bytes written to the data output stream so far.
     * If this counter overflows, it will be wrapped to Integer.MAX_VALUE.
@ -352,15 +360,11 @@ public class DataOutputStream extends FilterOutputStream implements DataOutput {
     *             {@code str} would exceed 65535 bytes in length
     * @throws     IOException  if some other I/O error occurs.
     */
+    @SuppressWarnings("deprecation")
    static int writeUTF(String str, DataOutput out) throws IOException {
        final int strlen = str.length();
-        int utflen = strlen; // optimized for ASCII
-
-        for (int i = 0; i < strlen; i++) {
-            int c = str.charAt(i);
-            if (c >= 0x80 || c == 0)
-                utflen += (c >= 0x800) ? 2 : 1;
-        }
+        int countNonZeroAscii = JLA.countNonZeroAscii(str);
+        int utflen = utfLen(str, countNonZeroAscii);

        if (utflen > 65535 || /* overflow */ utflen < strlen)
            throw new UTFDataFormatException(tooLongMsg(str, utflen));
@ -377,25 +381,11 @@ public class DataOutputStream extends FilterOutputStream implements DataOutput {
        int count = 0;
        ByteArray.setUnsignedShort(bytearr, count, utflen);
        count += 2;
-        int i = 0;
-        for (i = 0; i < strlen; i++) { // optimized for initial run of ASCII
-            int c = str.charAt(i);
-            if (c >= 0x80 || c == 0) break;
-            bytearr[count++] = (byte) c;
-        }
+        str.getBytes(0, countNonZeroAscii, bytearr, count);
+        count += countNonZeroAscii;

-        for (; i < strlen; i++) {
-            int c = str.charAt(i);
-            if (c < 0x80 && c != 0) {
-                bytearr[count++] = (byte) c;
-            } else if (c >= 0x800) {
-                bytearr[count++] = (byte) (0xE0 | ((c >> 12) & 0x0F));
-                bytearr[count++] = (byte) (0x80 | ((c >>  6) & 0x3F));
-                bytearr[count++] = (byte) (0x80 | ((c >>  0) & 0x3F));
-            } else {
-                bytearr[count++] = (byte) (0xC0 | ((c >>  6) & 0x1F));
-                bytearr[count++] = (byte) (0x80 | ((c >>  0) & 0x3F));
-            }
+        for (int i = countNonZeroAscii; i < strlen;) {
+            count = putChar(bytearr, count, str.charAt(i++));
        }
        out.write(bytearr, 0, utflen + 2);
        return utflen + 2;
--- a/src/java.base/share/classes/java/io/ObjectOutputStream.java
+++ b/src/java.base/share/classes/java/io/ObjectOutputStream.java
@ -1,5 +1,6 @@
 /*
 * Copyright (c) 1996, 2024, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2024, Alibaba Group Holding Limited. All Rights Reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -34,8 +35,13 @@ import java.util.Objects;
 import java.util.StringJoiner;

 import jdk.internal.util.ByteArray;
+import jdk.internal.access.JavaLangAccess;
+import jdk.internal.access.SharedSecrets;
 import sun.reflect.misc.ReflectUtil;

+import static jdk.internal.util.ModifiedUtf.putChar;
+import static jdk.internal.util.ModifiedUtf.utfLen;
+
 /**
 * An ObjectOutputStream writes primitive data types and graphs of Java objects
 * to an OutputStream.  The objects can be read (reconstituted) using an
@ -169,6 +175,7 @@ import sun.reflect.misc.ReflectUtil;
 public class ObjectOutputStream
    extends OutputStream implements ObjectOutput, ObjectStreamConstants
 {
+    private static final JavaLangAccess JLA = SharedSecrets.getJavaLangAccess();

    private static class Caches {
        /** cache of subclass security audit results */
@ -885,7 +892,7 @@ public class ObjectOutputStream
     *          stream
     */
    public void writeUTF(String str) throws IOException {
-        bout.writeUTF(str);
+        bout.writeUTFInternal(str, false);
    }

    /**
@ -1317,14 +1324,7 @@ public class ObjectOutputStream
     */
    private void writeString(String str, boolean unshared) throws IOException {
        handles.assign(unshared ? null : str);
-        long utflen = bout.getUTFLength(str);
-        if (utflen <= 0xFFFF) {
-            bout.writeByte(TC_STRING);
-            bout.writeUTF(str, utflen);
-        } else {
-            bout.writeByte(TC_LONGSTRING);
-            bout.writeLongUTF(str, utflen);
-        }
+        bout.writeUTFInternal(str, true);
    }

    /**
@ -1994,26 +1994,27 @@ public class ObjectOutputStream
            }
        }

-        public void writeBytes(String s) throws IOException {
-            int endoff = s.length();
-            int cpos = 0;
-            int csize = 0;
-            for (int off = 0; off < endoff; ) {
-                if (cpos >= csize) {
-                    cpos = 0;
-                    csize = Math.min(endoff - off, CHAR_BUF_SIZE);
-                    s.getChars(off, off + csize, cbuf, 0);
-                }
-                if (pos >= MAX_BLOCK_SIZE) {
+        @SuppressWarnings("deprecation")
+        void writeBytes(String s, int len) throws IOException {
+            int pos = this.pos;
+            for (int strpos = 0; strpos < len;) {
+                int rem = MAX_BLOCK_SIZE - pos;
+                int csize = Math.min(len - strpos, rem);
+                s.getBytes(strpos, strpos + csize, buf, pos);
+                pos += csize;
+                strpos += csize;
+
+                if (pos == MAX_BLOCK_SIZE) {
+                    this.pos = pos;
                    drain();
+                    pos = 0;
                }
-                int n = Math.min(csize - cpos, MAX_BLOCK_SIZE - pos);
-                int stop = pos + n;
-                while (pos < stop) {
-                    buf[pos++] = (byte) cbuf[cpos++];
-                }
-                off += n;
            }
+            this.pos = pos;
+        }
+
+        public void writeBytes(String s) throws IOException {
+            writeBytes(s, s.length());
        }

        public void writeChars(String s) throws IOException {
@ -2026,8 +2027,47 @@ public class ObjectOutputStream
            }
        }

-        public void writeUTF(String s) throws IOException {
-            writeUTF(s, getUTFLength(s));
+        /**
+         * Writes {@code str} without a leading type-code byte by delegating to
+         * {@code writeUTFInternal} with {@code writeHeader} set to {@code false}.
+         */
+        public void writeUTF(String str) throws IOException {
+            writeUTFInternal(str, false);
+        }
+
+        /**
+         * Writes {@code str} as a length prefix followed by its modified UTF-8
+         * bytes.
+         *
+         * <p>When the encoded length fits in 16 bits the prefix is a 2-byte
+         * length, optionally preceded by {@code TC_STRING}. Longer strings are
+         * only representable with a header: {@code TC_LONGSTRING} followed by
+         * an 8-byte length.
+         *
+         * @param str         the string to write
+         * @param writeHeader {@code true} to emit the {@code TC_STRING} /
+         *                    {@code TC_LONGSTRING} type code before the length
+         * @throws UTFDataFormatException if {@code writeHeader} is
+         *         {@code false} and the encoded length exceeds 65535 bytes
+         * @throws IOException if an I/O error occurs while writing
+         */
+        private void writeUTFInternal(String str, boolean writeHeader) throws IOException {
+            int strlen = str.length();
+            int countNonZeroAscii = JLA.countNonZeroAscii(str);
+            // utfLen accumulates into an int and can wrap for very large strings.
+            // The bytes added on top of strlen are at most 2 * strlen, which always
+            // fits in an unsigned 32-bit value, so the wrapped difference recovers
+            // that amount exactly and the sum is redone in 64 bits.
+            long utflen = strlen
+                    + Integer.toUnsignedLong(utfLen(str, countNonZeroAscii) - strlen);
+            if (utflen <= 0xFFFFL) {
+                if (writeHeader) {
+                    writeByte(TC_STRING);
+                }
+                writeShort((int) utflen);
+            } else {
+                if (!writeHeader) {
+                    // The header-less form has only a 2-byte length prefix, so it
+                    // cannot carry this value; fail before emitting any byte.
+                    throw new UTFDataFormatException();
+                }
+                writeByte(TC_LONGSTRING);
+                writeLong(utflen);
+            }
+
+            // Leading run of non-zero ASCII is copied in bulk ...
+            if (countNonZeroAscii != 0) {
+                writeBytes(str, countNonZeroAscii);
+            }
+            // ... and whatever follows is encoded char by char.
+            if (countNonZeroAscii != strlen) {
+                writeMoreUTF(str, countNonZeroAscii);
+            }
+        }
+
+        /**
+         * Encodes the characters of {@code str} from index {@code stroff} to the
+         * end of the string into {@code buf}, draining the buffer whenever the
+         * next character would reach {@code MAX_BLOCK_SIZE}.
+         *
+         * @param str    the string being written
+         * @param stroff index of the first character still to be encoded
+         * @throws IOException if draining the buffer fails
+         */
+        private void writeMoreUTF(String str, int stroff) throws IOException {
+            final int end = str.length();
+            // Work on a local copy of the write position; the field is only
+            // synchronised around drain() and on exit.
+            int cursor = this.pos;
+            while (stroff < end) {
+                char c = str.charAt(stroff++);
+                int need;
+                if (c != 0 && c < 0x80) {
+                    need = 1;
+                } else if (c >= 0x800) {
+                    need = 3;
+                } else {
+                    need = 2;
+                }
+                if (cursor + need >= MAX_BLOCK_SIZE) {
+                    // presumably drain() reads this.pos - publish it first
+                    this.pos = cursor;
+                    drain();
+                    cursor = 0;
+                }
+                cursor = putChar(buf, cursor, c);
+            }
+            this.pos = cursor;
         }


@ -2153,112 +2193,6 @@ public class ObjectOutputStream
                }
            }
        }
-
-        /**
-         * Returns the length in bytes of the UTF encoding of the given string.
-         */
-        long getUTFLength(String s) {
-            int len = s.length();
-            long utflen = 0;
-            for (int off = 0; off < len; ) {
-                int csize = Math.min(len - off, CHAR_BUF_SIZE);
-                s.getChars(off, off + csize, cbuf, 0);
-                for (int cpos = 0; cpos < csize; cpos++) {
-                    char c = cbuf[cpos];
-                    if (c >= 0x0001 && c <= 0x007F) {
-                        utflen++;
-                    } else if (c > 0x07FF) {
-                        utflen += 3;
-                    } else {
-                        utflen += 2;
-                    }
-                }
-                off += csize;
-            }
-            return utflen;
-        }
-
-        /**
-         * Writes the given string in UTF format.  This method is used in
-         * situations where the UTF encoding length of the string is already
-         * known; specifying it explicitly avoids a prescan of the string to
-         * determine its UTF length.
-         */
-        void writeUTF(String s, long utflen) throws IOException {
-            if (utflen > 0xFFFFL) {
-                throw new UTFDataFormatException();
-            }
-            writeShort((int) utflen);
-            if (utflen == (long) s.length()) {
-                writeBytes(s);
-            } else {
-                writeUTFBody(s);
-            }
-        }
-
-        /**
-         * Writes given string in "long" UTF format.  "Long" UTF format is
-         * identical to standard UTF, except that it uses an 8 byte header
-         * (instead of the standard 2 bytes) to convey the UTF encoding length.
-         */
-        void writeLongUTF(String s) throws IOException {
-            writeLongUTF(s, getUTFLength(s));
-        }
-
-        /**
-         * Writes given string in "long" UTF format, where the UTF encoding
-         * length of the string is already known.
-         */
-        void writeLongUTF(String s, long utflen) throws IOException {
-            writeLong(utflen);
-            if (utflen == (long) s.length()) {
-                writeBytes(s);
-            } else {
-                writeUTFBody(s);
-            }
-        }
-
-        /**
-         * Writes the "body" (i.e., the UTF representation minus the 2-byte or
-         * 8-byte length header) of the UTF encoding for the given string.
-         */
-        private void writeUTFBody(String s) throws IOException {
-            int limit = MAX_BLOCK_SIZE - 3;
-            int len = s.length();
-            for (int off = 0; off < len; ) {
-                int csize = Math.min(len - off, CHAR_BUF_SIZE);
-                s.getChars(off, off + csize, cbuf, 0);
-                for (int cpos = 0; cpos < csize; cpos++) {
-                    char c = cbuf[cpos];
-                    if (pos <= limit) {
-                        if (c <= 0x007F && c != 0) {
-                            buf[pos++] = (byte) c;
-                        } else if (c > 0x07FF) {
-                            buf[pos + 2] = (byte) (0x80 | ((c >> 0) & 0x3F));
-                            buf[pos + 1] = (byte) (0x80 | ((c >> 6) & 0x3F));
-                            buf[pos + 0] = (byte) (0xE0 | ((c >> 12) & 0x0F));
-                            pos += 3;
-                        } else {
-                            buf[pos + 1] = (byte) (0x80 | ((c >> 0) & 0x3F));
-                            buf[pos + 0] = (byte) (0xC0 | ((c >> 6) & 0x1F));
-                            pos += 2;
-                        }
-                    } else {    // write one byte at a time to normalize block
-                        if (c <= 0x007F && c != 0) {
-                            write(c);
-                        } else if (c > 0x07FF) {
-                            write(0xE0 | ((c >> 12) & 0x0F));
-                            write(0x80 | ((c >> 6) & 0x3F));
-                            write(0x80 | ((c >> 0) & 0x3F));
-                        } else {
-                            write(0xC0 | ((c >> 6) & 0x1F));
-                            write(0x80 | ((c >> 0) & 0x3F));
-                        }
-                    }
-                }
-                off += csize;
-            }
-        }
    }

    /**
--- a/src/java.base/share/classes/jdk/internal/classfile/impl/BufWriterImpl.java
+++ b/src/java.base/share/classes/jdk/internal/classfile/impl/BufWriterImpl.java
@ -38,6 +38,9 @@ import jdk.internal.access.JavaLangAccess;
 import jdk.internal.access.SharedSecrets;
 import jdk.internal.vm.annotation.ForceInline;

+import static jdk.internal.util.ModifiedUtf.putChar;
+import static jdk.internal.util.ModifiedUtf.utfLen;
+
 public final class BufWriterImpl implements BufWriter {
    private static final JavaLangAccess JLA = SharedSecrets.getJavaLangAccess();

@ -162,14 +165,7 @@ public final class BufWriterImpl implements BufWriter {
    void writeUTF(String str) {
        int strlen = str.length();
        int countNonZeroAscii = JLA.countNonZeroAscii(str);
-        int utflen = strlen;
-        if (countNonZeroAscii != strlen) {
-            for (int i = countNonZeroAscii; i < strlen; i++) {
-                int c = str.charAt(i);
-                if (c >= 0x80 || c == 0)
-                    utflen += (c >= 0x800) ? 2 : 1;
-            }
-        }
+        int utflen = utfLen(str, countNonZeroAscii);
        if (utflen > 65535) {
            throw new IllegalArgumentException("string too long");
        }
@ -185,20 +181,8 @@ public final class BufWriterImpl implements BufWriter {
        str.getBytes(0, countNonZeroAscii, elems, offset);
        offset += countNonZeroAscii;

-        for (int i = countNonZeroAscii; i < strlen; ++i) {
-            char c = str.charAt(i);
-            if (c >= '\001' && c <= '\177') {
-                elems[offset++] = (byte) c;
-            } else if (c > '\u07FF') {
-                elems[offset    ] = (byte) (0xE0 | c >> 12 & 0xF);
-                elems[offset + 1] = (byte) (0x80 | c >> 6 & 0x3F);
-                elems[offset + 2] = (byte) (0x80 | c      & 0x3F);
-                offset += 3;
-            } else {
-                elems[offset    ] = (byte) (0xC0 | c >> 6 & 0x1F);
-                elems[offset + 1] = (byte) (0x80 | c      & 0x3F);
-                offset += 2;
-            }
+        for (int i = countNonZeroAscii; i < strlen; i++) {
+            offset = putChar(elems, offset, str.charAt(i));
        }

        this.offset = offset;
--- a/src/java.base/share/classes/jdk/internal/util/ModifiedUtf.java
+++ b/src/java.base/share/classes/jdk/internal/util/ModifiedUtf.java
@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2024, Alibaba Group Holding Limited. All Rights Reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.  Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package jdk.internal.util;
+
+import jdk.internal.vm.annotation.ForceInline;
+
+/**
+ * Static helpers for the modified UTF-8 encoding: writing a single
+ * {@code char} into a byte array and computing the encoded length of a
+ * string. Not instantiable.
+ *
+ * @since 24
+ */
+public final class ModifiedUtf {
+    private ModifiedUtf() {
+    }
+
+    /**
+     * Encodes one {@code char} into {@code buf} starting at {@code offset}.
+     *
+     * <p>A non-zero char below {@code 0x80} takes one byte, a char of
+     * {@code 0x800} or above takes three bytes, and everything else
+     * (including {@code 0}) takes two bytes. No capacity check is made here;
+     * the caller must ensure {@code buf} has room for up to three bytes.
+     *
+     * @param buf    destination array
+     * @param offset index of the first byte to write
+     * @param c      the char to encode
+     * @return the offset just past the last byte written
+     */
+    @ForceInline
+    public static int putChar(byte[] buf, int offset, char c) {
+        if (c != 0 && c < 0x80) {
+            buf[offset++] = (byte) c;
+        } else if (c >= 0x800) {
+            buf[offset    ] = (byte) (0xE0 | c >> 12 & 0x0F);
+            buf[offset + 1] = (byte) (0x80 | c >> 6  & 0x3F);
+            buf[offset + 2] = (byte) (0x80 | c       & 0x3F);
+            offset += 3;
+        } else {
+            buf[offset    ] = (byte) (0xC0 | c >> 6 & 0x1F);
+            buf[offset + 1] = (byte) (0x80 | c      & 0x3F);
+            offset += 2;
+        }
+        return offset;
+    }
+
+    /**
+     * Calculates the encoded length, in bytes, of a string.
+     *
+     * <p>Characters before index {@code countNonZeroAscii} are not inspected
+     * and are counted as one byte each.
+     *
+     * @param str input string
+     * @param countNonZeroAscii the number of non-zero ascii characters in the
+     *        prefix calculated by JLA.countNonZeroAscii(str)
+     * @return the encoded length. The sum is computed in {@code int}
+     *         arithmetic, so it wraps when the true length exceeds
+     *         {@code Integer.MAX_VALUE}; a wrapped result is always smaller
+     *         than {@code str.length()}, which callers can use to detect it.
+     */
+    @ForceInline
+    public static int utfLen(String str, int countNonZeroAscii) {
+        int utflen = str.length();
+        for (int i = utflen - 1; i >= countNonZeroAscii; i--) {
+            int c = str.charAt(i);
+            if (c >= 0x80 || c == 0) {
+                utflen += (c >= 0x800) ? 2 : 1;
+            }
+        }
+        return utflen;
+    }
+}
--- a/test/micro/org/openjdk/bench/java/io/DataOutputStreamBench.java
+++ b/test/micro/org/openjdk/bench/java/io/DataOutputStreamBench.java
@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2024, Alibaba Group Holding Limited. All Rights Reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package org.openjdk.bench.java.io;
+
+import org.openjdk.jmh.annotations.*;
+import org.openjdk.jmh.infra.Blackhole;
+
+import java.io.*;
+import java.nio.charset.StandardCharsets;
+import java.util.HexFormat;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Benchmarks {@code writeUTF} on {@link DataOutputStream} and
+ * {@link ObjectOutputStream} for strings made of an ASCII prefix followed by
+ * characters of the width selected by {@link #charType}.
+ */
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.MICROSECONDS)
+@Fork(2)
+@Measurement(iterations = 6, time = 1)
+@Warmup(iterations = 4, time = 2)
+@State(Scope.Thread)
+public class DataOutputStreamBench {
+
+    @Param({"ascii", "utf8_2_bytes", "utf8_3_bytes", "emoji"})
+    public String charType;
+
+    ByteArrayOutputStream bytesOutput;
+    DataOutputStream dataOutput;
+    ObjectOutputStream objectOutput;
+    String[] strings;
+
+    /**
+     * Builds 128 test strings, the i-th being {@code "A"} repeated i times
+     * followed by the sample sequence repeated i times, and creates both
+     * streams on top of one shared byte buffer.
+     */
+    @Setup(Level.Trial)
+    public void setup() throws Exception {
+        // UTF-8 bytes (hex) of the sample sequence for the selected charType
+        final String hex;
+        switch (charType) {
+            case "ascii":
+                hex = "78";
+                break;
+            case "utf8_2_bytes":
+                hex = "c2a9";
+                break;
+            case "utf8_3_bytes":
+                hex = "e6b8a9";
+                break;
+            case "emoji":
+                hex = "e29da3efb88f";
+                break;
+            default:
+                throw new IllegalArgumentException("bad charType: " + charType);
+        }
+        String sample = new String(HexFormat.of().parseHex(hex), StandardCharsets.UTF_8);
+
+        String[] generated = new String[128];
+        for (int n = 0; n < generated.length; n++) {
+            generated[n] = "A".repeat(n) + sample.repeat(n);
+        }
+        strings = generated;
+
+        bytesOutput = new ByteArrayOutputStream(1024 * 64);
+        dataOutput = new DataOutputStream(bytesOutput);
+        objectOutput = new ObjectOutputStream(bytesOutput);
+    }
+
+    /** Writes every test string through the {@link DataOutputStream}. */
+    @Benchmark
+    public void dataOutwriteUTF(Blackhole bh) throws Exception {
+        bytesOutput.reset();
+        for (int idx = 0; idx < strings.length; idx++) {
+            dataOutput.writeUTF(strings[idx]);
+        }
+        dataOutput.flush();
+        bh.consume(bytesOutput.size());
+    }
+
+    /** Writes every test string through the {@link ObjectOutputStream}. */
+    @Benchmark
+    public void objectWriteUTF(Blackhole bh) throws Exception {
+        bytesOutput.reset();
+        for (int idx = 0; idx < strings.length; idx++) {
+            objectOutput.writeUTF(strings[idx]);
+        }
+        objectOutput.flush();
+        bh.consume(bytesOutput.size());
+    }
+}