8339699: Optimize DataOutputStream writeUTF

Reviewed-by: liach, bpb
This commit is contained in:
Shaojin Wen 2024-10-04 22:35:03 +00:00
parent 559289487d
commit b42fbf43df
5 changed files with 252 additions and 182 deletions

View File

@ -1,5 +1,6 @@
/*
* Copyright (c) 1994, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1994, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2024, Alibaba Group Holding Limited. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -25,8 +26,13 @@
package java.io;
import jdk.internal.access.JavaLangAccess;
import jdk.internal.access.SharedSecrets;
import jdk.internal.util.ByteArray;
import static jdk.internal.util.ModifiedUtf.putChar;
import static jdk.internal.util.ModifiedUtf.utfLen;
/**
* A data output stream lets an application write primitive Java data
* types to an output stream in a portable way. An application can
@ -44,6 +50,8 @@ import jdk.internal.util.ByteArray;
* @since 1.0
*/
public class DataOutputStream extends FilterOutputStream implements DataOutput {
private static final JavaLangAccess JLA = SharedSecrets.getJavaLangAccess();
/**
* The number of bytes written to the data output stream so far.
* If this counter overflows, it will be wrapped to Integer.MAX_VALUE.
@ -352,15 +360,11 @@ public class DataOutputStream extends FilterOutputStream implements DataOutput {
* {@code str} would exceed 65535 bytes in length
* @throws IOException if some other I/O error occurs.
*/
@SuppressWarnings("deprecation")
static int writeUTF(String str, DataOutput out) throws IOException {
final int strlen = str.length();
int utflen = strlen; // optimized for ASCII
for (int i = 0; i < strlen; i++) {
int c = str.charAt(i);
if (c >= 0x80 || c == 0)
utflen += (c >= 0x800) ? 2 : 1;
}
int countNonZeroAscii = JLA.countNonZeroAscii(str);
int utflen = utfLen(str, countNonZeroAscii);
if (utflen > 65535 || /* overflow */ utflen < strlen)
throw new UTFDataFormatException(tooLongMsg(str, utflen));
@ -377,25 +381,11 @@ public class DataOutputStream extends FilterOutputStream implements DataOutput {
int count = 0;
ByteArray.setUnsignedShort(bytearr, count, utflen);
count += 2;
int i = 0;
for (i = 0; i < strlen; i++) { // optimized for initial run of ASCII
int c = str.charAt(i);
if (c >= 0x80 || c == 0) break;
bytearr[count++] = (byte) c;
}
str.getBytes(0, countNonZeroAscii, bytearr, count);
count += countNonZeroAscii;
for (; i < strlen; i++) {
int c = str.charAt(i);
if (c < 0x80 && c != 0) {
bytearr[count++] = (byte) c;
} else if (c >= 0x800) {
bytearr[count++] = (byte) (0xE0 | ((c >> 12) & 0x0F));
bytearr[count++] = (byte) (0x80 | ((c >> 6) & 0x3F));
bytearr[count++] = (byte) (0x80 | ((c >> 0) & 0x3F));
} else {
bytearr[count++] = (byte) (0xC0 | ((c >> 6) & 0x1F));
bytearr[count++] = (byte) (0x80 | ((c >> 0) & 0x3F));
}
for (int i = countNonZeroAscii; i < strlen;) {
count = putChar(bytearr, count, str.charAt(i++));
}
out.write(bytearr, 0, utflen + 2);
return utflen + 2;

View File

@ -1,5 +1,6 @@
/*
* Copyright (c) 1996, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2024, Alibaba Group Holding Limited. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -34,8 +35,13 @@ import java.util.Objects;
import java.util.StringJoiner;
import jdk.internal.util.ByteArray;
import jdk.internal.access.JavaLangAccess;
import jdk.internal.access.SharedSecrets;
import sun.reflect.misc.ReflectUtil;
import static jdk.internal.util.ModifiedUtf.putChar;
import static jdk.internal.util.ModifiedUtf.utfLen;
/**
* An ObjectOutputStream writes primitive data types and graphs of Java objects
* to an OutputStream. The objects can be read (reconstituted) using an
@ -169,6 +175,7 @@ import sun.reflect.misc.ReflectUtil;
public class ObjectOutputStream
extends OutputStream implements ObjectOutput, ObjectStreamConstants
{
private static final JavaLangAccess JLA = SharedSecrets.getJavaLangAccess();
private static class Caches {
/** cache of subclass security audit results */
@ -885,7 +892,7 @@ public class ObjectOutputStream
* stream
*/
public void writeUTF(String str) throws IOException {
bout.writeUTF(str);
bout.writeUTFInternal(str, false);
}
/**
@ -1317,14 +1324,7 @@ public class ObjectOutputStream
*/
private void writeString(String str, boolean unshared) throws IOException {
handles.assign(unshared ? null : str);
long utflen = bout.getUTFLength(str);
if (utflen <= 0xFFFF) {
bout.writeByte(TC_STRING);
bout.writeUTF(str, utflen);
} else {
bout.writeByte(TC_LONGSTRING);
bout.writeLongUTF(str, utflen);
}
bout.writeUTFInternal(str, true);
}
/**
@ -1994,26 +1994,27 @@ public class ObjectOutputStream
}
}
public void writeBytes(String s) throws IOException {
int endoff = s.length();
int cpos = 0;
int csize = 0;
for (int off = 0; off < endoff; ) {
if (cpos >= csize) {
cpos = 0;
csize = Math.min(endoff - off, CHAR_BUF_SIZE);
s.getChars(off, off + csize, cbuf, 0);
}
if (pos >= MAX_BLOCK_SIZE) {
@SuppressWarnings("deprecation")
void writeBytes(String s, int len) throws IOException {
int pos = this.pos;
for (int strpos = 0; strpos < len;) {
int rem = MAX_BLOCK_SIZE - pos;
int csize = Math.min(len - strpos, rem);
s.getBytes(strpos, strpos + csize, buf, pos);
pos += csize;
strpos += csize;
if (pos == MAX_BLOCK_SIZE) {
this.pos = pos;
drain();
pos = 0;
}
int n = Math.min(csize - cpos, MAX_BLOCK_SIZE - pos);
int stop = pos + n;
while (pos < stop) {
buf[pos++] = (byte) cbuf[cpos++];
}
off += n;
}
this.pos = pos;
}
/**
 * Writes every character of {@code s} by delegating to
 * {@link #writeBytes(String, int)} with the full string length.
 *
 * @param s the string to write
 * @throws IOException if the delegate reports an I/O failure
 */
public void writeBytes(String s) throws IOException {
    final int charCount = s.length();
    writeBytes(s, charCount);
}
public void writeChars(String s) throws IOException {
@ -2026,8 +2027,47 @@ public class ObjectOutputStream
}
}
public void writeUTF(String s) throws IOException {
writeUTF(s, getUTFLength(s));
/**
 * Writes {@code str} as a length field followed by its encoded characters,
 * without a leading type-code byte (delegates to {@code writeUTFInternal}
 * with {@code writeHeader == false}).
 *
 * @param str the string to write
 * @throws IOException if the delegate reports an I/O failure
 */
public void writeUTF(String str) throws IOException {
writeUTFInternal(str, false);
}
/**
 * Writes {@code str} in modified UTF-8: an optional type-code byte, a length
 * field, then the encoded characters.
 *
 * <p>When the encoded length fits in 16 bits the length is written as a
 * short (preceded by {@code TC_STRING} if {@code writeHeader} is set).
 * A longer encoding is only representable when a header is written, because
 * the {@code TC_LONGSTRING} byte is the only thing in the output that marks
 * the 8-byte length form.
 *
 * @param str         the string to write
 * @param writeHeader {@code true} to emit {@code TC_STRING} or
 *                    {@code TC_LONGSTRING} before the length field
 * @throws UTFDataFormatException if {@code writeHeader} is {@code false} and
 *         the encoded length exceeds 65535 bytes; nothing is written in
 *         that case
 * @throws IOException if an underlying write fails
 */
private void writeUTFInternal(String str, boolean writeHeader) throws IOException {
    int strlen = str.length();
    int countNonZeroAscii = JLA.countNonZeroAscii(str);
    int utflen = utfLen(str, countNonZeroAscii);
    // utfLen sums in an int. The true length lies in [strlen, 3 * strlen],
    // so a wrapped total is always smaller than strlen.
    boolean overflowed = utflen < strlen;
    if (!overflowed && utflen <= 0xFFFF) {
        if (writeHeader) {
            writeByte(TC_STRING);
        }
        writeShort(utflen);
    } else if (writeHeader) {
        writeByte(TC_LONGSTRING);
        writeLong(overflowed ? longUtfLength(str, countNonZeroAscii) : utflen);
    } else {
        // Without a type code nothing written here distinguishes the two
        // length widths, so a value needing the 8-byte form is rejected.
        throw new UTFDataFormatException();
    }

    if (countNonZeroAscii != 0) {
        // Leading run of non-zero ASCII: one byte per char, copied in bulk.
        writeBytes(str, countNonZeroAscii);
    }
    if (countNonZeroAscii != strlen) {
        writeMoreUTF(str, countNonZeroAscii);
    }
}

/**
 * Computes the modified UTF-8 length of {@code str} with a {@code long}
 * accumulator, for strings whose length does not fit in an {@code int}.
 *
 * @param str               the string to measure
 * @param countNonZeroAscii number of leading chars already known to encode
 *                          to one byte each
 * @return the encoded length in bytes
 */
private static long longUtfLength(String str, int countNonZeroAscii) {
    int strlen = str.length();
    long utflen = strlen;
    for (int i = countNonZeroAscii; i < strlen; i++) {
        char c = str.charAt(i);
        if (c >= 0x80 || c == 0) {
            utflen += (c >= 0x800) ? 2 : 1;
        }
    }
    return utflen;
}
/**
 * Encodes the characters of {@code str} from index {@code stroff} to the end
 * into the block buffer, draining the buffer whenever the next character's
 * encoding would reach {@code MAX_BLOCK_SIZE}.
 *
 * @param str    the string being written
 * @param stroff index of the first character to encode
 * @throws IOException if draining the buffer fails
 */
private void writeMoreUTF(String str, int stroff) throws IOException {
    final int end = str.length();
    // Work on a local copy of the buffer position; publish it before every
    // drain() and once more when done.
    int cursor = this.pos;
    while (stroff < end) {
        final char ch = str.charAt(stroff);
        stroff++;

        final int width;
        if (ch != 0 && ch < 0x80) {
            width = 1;
        } else if (ch >= 0x800) {
            width = 3;
        } else {
            width = 2;
        }

        if (cursor + width >= MAX_BLOCK_SIZE) {
            this.pos = cursor;
            drain();
            cursor = 0;
        }
        cursor = putChar(buf, cursor, ch);
    }
    this.pos = cursor;
}
@ -2153,112 +2193,6 @@ public class ObjectOutputStream
}
}
}
/**
 * Computes how many bytes the UTF encoding of {@code s} occupies: one byte
 * for U+0001..U+007F, three for anything above U+07FF, and two for the rest
 * (including U+0000).
 */
long getUTFLength(String s) {
    final int total = s.length();
    long bytes = 0;
    int done = 0;
    // Pull the string through cbuf in chunks of at most CHAR_BUF_SIZE chars.
    while (done < total) {
        final int chunk = Math.min(total - done, CHAR_BUF_SIZE);
        s.getChars(done, done + chunk, cbuf, 0);
        for (int k = 0; k < chunk; k++) {
            final char ch = cbuf[k];
            if (ch == 0 || ch > 0x007F) {
                bytes += (ch > 0x07FF) ? 3 : 2;
            } else {
                bytes += 1;
            }
        }
        done += chunk;
    }
    return bytes;
}
/**
 * Writes {@code s} in UTF format using a caller-supplied encoded length, so
 * the string does not have to be scanned again to measure it.
 *
 * @throws UTFDataFormatException if {@code utflen} exceeds 0xFFFF
 */
void writeUTF(String s, long utflen) throws IOException {
    if (utflen > 0xFFFFL) {
        throw new UTFDataFormatException();
    }
    writeShort((int) utflen);

    // Encoded length equal to the char count: take the writeBytes path.
    final boolean oneBytePerChar = (utflen == (long) s.length());
    if (!oneBytePerChar) {
        writeUTFBody(s);
        return;
    }
    writeBytes(s);
}
/**
 * Writes {@code s} in "long" UTF format, which matches standard UTF except
 * that the encoded length is carried in an 8-byte header rather than a
 * 2-byte one. The length is measured here and passed on.
 */
void writeLongUTF(String s) throws IOException {
    final long encodedLength = getUTFLength(s);
    writeLongUTF(s, encodedLength);
}
/**
 * Writes {@code s} in "long" UTF format when the caller already knows the
 * encoded length: an 8-byte length followed by the encoded characters.
 */
void writeLongUTF(String s, long utflen) throws IOException {
    writeLong(utflen);

    // Encoded length equal to the char count: take the writeBytes path.
    final boolean oneBytePerChar = (utflen == (long) s.length());
    if (!oneBytePerChar) {
        writeUTFBody(s);
        return;
    }
    writeBytes(s);
}
/**
 * Writes the encoded characters of {@code s}, i.e. the UTF representation
 * without its 2-byte or 8-byte length header.
 */
private void writeUTFBody(String s) throws IOException {
    // Highest buffer position at which a three-byte encoding still ends
    // below MAX_BLOCK_SIZE.
    final int directLimit = MAX_BLOCK_SIZE - 3;
    final int total = s.length();
    int done = 0;
    while (done < total) {
        final int chunk = Math.min(total - done, CHAR_BUF_SIZE);
        s.getChars(done, done + chunk, cbuf, 0);
        for (int k = 0; k < chunk; k++) {
            final char ch = cbuf[k];
            // Near the end of the block, fall back to write(int) one byte at
            // a time to normalize the block.
            final boolean direct = pos <= directLimit;

            if (ch != 0 && ch <= 0x007F) {
                if (direct) {
                    buf[pos++] = (byte) ch;
                } else {
                    write(ch);
                }
            } else if (ch > 0x07FF) {
                final int lead = 0xE0 | ((ch >> 12) & 0x0F);
                final int middle = 0x80 | ((ch >> 6) & 0x3F);
                final int tail = 0x80 | ((ch >> 0) & 0x3F);
                if (direct) {
                    buf[pos + 2] = (byte) tail;
                    buf[pos + 1] = (byte) middle;
                    buf[pos + 0] = (byte) lead;
                    pos += 3;
                } else {
                    write(lead);
                    write(middle);
                    write(tail);
                }
            } else {
                final int lead = 0xC0 | ((ch >> 6) & 0x1F);
                final int tail = 0x80 | ((ch >> 0) & 0x3F);
                if (direct) {
                    buf[pos + 1] = (byte) tail;
                    buf[pos + 0] = (byte) lead;
                    pos += 2;
                } else {
                    write(lead);
                    write(tail);
                }
            }
        }
        done += chunk;
    }
}
}
/**

View File

@ -38,6 +38,9 @@ import jdk.internal.access.JavaLangAccess;
import jdk.internal.access.SharedSecrets;
import jdk.internal.vm.annotation.ForceInline;
import static jdk.internal.util.ModifiedUtf.putChar;
import static jdk.internal.util.ModifiedUtf.utfLen;
public final class BufWriterImpl implements BufWriter {
private static final JavaLangAccess JLA = SharedSecrets.getJavaLangAccess();
@ -162,14 +165,7 @@ public final class BufWriterImpl implements BufWriter {
void writeUTF(String str) {
int strlen = str.length();
int countNonZeroAscii = JLA.countNonZeroAscii(str);
int utflen = strlen;
if (countNonZeroAscii != strlen) {
for (int i = countNonZeroAscii; i < strlen; i++) {
int c = str.charAt(i);
if (c >= 0x80 || c == 0)
utflen += (c >= 0x800) ? 2 : 1;
}
}
int utflen = utfLen(str, countNonZeroAscii);
if (utflen > 65535) {
throw new IllegalArgumentException("string too long");
}
@ -185,20 +181,8 @@ public final class BufWriterImpl implements BufWriter {
str.getBytes(0, countNonZeroAscii, elems, offset);
offset += countNonZeroAscii;
for (int i = countNonZeroAscii; i < strlen; ++i) {
char c = str.charAt(i);
if (c >= '\001' && c <= '\177') {
elems[offset++] = (byte) c;
} else if (c > '\u07FF') {
elems[offset ] = (byte) (0xE0 | c >> 12 & 0xF);
elems[offset + 1] = (byte) (0x80 | c >> 6 & 0x3F);
elems[offset + 2] = (byte) (0x80 | c & 0x3F);
offset += 3;
} else {
elems[offset ] = (byte) (0xC0 | c >> 6 & 0x1F);
elems[offset + 1] = (byte) (0x80 | c & 0x3F);
offset += 2;
}
for (int i = countNonZeroAscii; i < strlen; i++) {
offset = putChar(elems, offset, str.charAt(i));
}
this.offset = offset;

View File

@ -0,0 +1,71 @@
/*
* Copyright (c) 2024, Alibaba Group Holding Limited. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package jdk.internal.util;
import jdk.internal.vm.annotation.ForceInline;
/**
 * Shared helpers for writing strings in modified UTF-8: encoding a single
 * {@code char} into a byte array and computing a string's encoded length.
 *
 * @since 24
 */
public abstract class ModifiedUtf {
    /** Not instantiable; static helpers only. */
    private ModifiedUtf() {
    }

    /**
     * Encodes {@code c} into {@code buf} starting at {@code offset}.
     * U+0001..U+007F takes one byte, U+0000 and U+0080..U+07FF take two,
     * and everything from U+0800 up takes three.
     *
     * @param buf    destination array
     * @param offset index of the first byte to write
     * @param c      the character to encode
     * @return the index just past the last byte written
     */
    @ForceInline
    public static int putChar(byte[] buf, int offset, char c) {
        if (c != 0 && c < 0x80) {
            buf[offset] = (byte) c;
            return offset + 1;
        }
        if (c < 0x800) {
            // Two-byte form; also used for U+0000.
            buf[offset] = (byte) (0xC0 | ((c >> 6) & 0x1F));
            buf[offset + 1] = (byte) (0x80 | (c & 0x3F));
            return offset + 2;
        }
        buf[offset] = (byte) (0xE0 | ((c >> 12) & 0x0F));
        buf[offset + 1] = (byte) (0x80 | ((c >> 6) & 0x3F));
        buf[offset + 2] = (byte) (0x80 | (c & 0x3F));
        return offset + 3;
    }

    /**
     * Computes the encoded length of {@code str}. The first
     * {@code countNonZeroAscii} characters are taken to need one byte each
     * and are not inspected; only the remainder is scanned.
     *
     * <p>The total is accumulated in an {@code int}, so it can wrap for a
     * very long string with many multi-byte characters; callers that need
     * to handle such input must check for that themselves.
     *
     * @param str input string
     * @param countNonZeroAscii the number of non-zero ascii characters in
     *        the prefix calculated by JLA.countNonZeroAscii(str)
     * @return the encoded length in bytes
     */
    @ForceInline
    public static int utfLen(String str, int countNonZeroAscii) {
        final int length = str.length();
        int total = length;
        for (int i = countNonZeroAscii; i < length; i++) {
            final char ch = str.charAt(i);
            if (ch == 0 || ch >= 0x80) {
                total += (ch >= 0x800) ? 2 : 1;
            }
        }
        return total;
    }
}

View File

@ -0,0 +1,91 @@
/*
* Copyright (c) 2024, Alibaba Group Holding Limited. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package org.openjdk.bench.java.io;
import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.infra.Blackhole;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.HexFormat;
import java.util.concurrent.TimeUnit;
/**
 * Measures {@code writeUTF} on {@link DataOutputStream} and
 * {@link ObjectOutputStream} for strings made of an ASCII prefix followed by
 * a repeated sample whose UTF-8 form is chosen by {@link #charType}.
 */
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MICROSECONDS)
@Fork(2)
@Measurement(iterations = 6, time = 1)
@Warmup(iterations = 4, time = 2)
@State(Scope.Thread)
public class DataOutputStreamBench {

    /** Selects the sample that is repeated after the ASCII prefix. */
    @Param({"ascii", "utf8_2_bytes", "utf8_3_bytes", "emoji"})
    public String charType;

    ByteArrayOutputStream bytesOutput;
    DataOutputStream dataOutput;
    ObjectOutputStream objectOutput;
    String[] strings;

    /**
     * Builds 128 test strings (string {@code i} is {@code i} copies of "A"
     * followed by {@code i} copies of the sample) and the streams under
     * test, both of which write into the same byte buffer.
     */
    @Setup(Level.Trial)
    public void setup() throws Exception {
        // UTF-8 bytes of the sample, spelled in hex.
        final String sampleHex;
        switch (charType) {
            case "ascii":
                sampleHex = "78";
                break;
            case "utf8_2_bytes":
                sampleHex = "c2a9";
                break;
            case "utf8_3_bytes":
                sampleHex = "e6b8a9";
                break;
            case "emoji":
                sampleHex = "e29da3efb88f";
                break;
            default:
                throw new IllegalArgumentException("bad charType: " + charType);
        }
        final byte[] sampleBytes = HexFormat.of().parseHex(sampleHex);
        final String sample = new String(sampleBytes, StandardCharsets.UTF_8);

        final String[] generated = new String[128];
        int count = 0;
        while (count < generated.length) {
            generated[count] = "A".repeat(count) + sample.repeat(count);
            count++;
        }
        strings = generated;

        bytesOutput = new ByteArrayOutputStream(1024 * 64);
        dataOutput = new DataOutputStream(bytesOutput);
        objectOutput = new ObjectOutputStream(bytesOutput);
    }

    /** Writes every test string through the {@link DataOutputStream}. */
    @Benchmark
    public void dataOutwriteUTF(Blackhole bh) throws Exception {
        bytesOutput.reset();
        for (int i = 0; i < strings.length; i++) {
            dataOutput.writeUTF(strings[i]);
        }
        dataOutput.flush();
        bh.consume(bytesOutput.size());
    }

    /** Writes every test string through the {@link ObjectOutputStream}. */
    @Benchmark
    public void objectWriteUTF(Blackhole bh) throws Exception {
        bytesOutput.reset();
        for (int i = 0; i < strings.length; i++) {
            objectOutput.writeUTF(strings[i]);
        }
        objectOutput.flush();
        bh.consume(bytesOutput.size());
    }
}