From 91d93715666b9d44a4bc9b709ca9de2c579ee5d1 Mon Sep 17 00:00:00 2001 From: Joe Wang Date: Tue, 27 May 2014 17:26:52 -0700 Subject: [PATCH] 8043592: The basic XML parser based on UKit fails to read XML files encoded in UTF-16BE or LE Reviewed-by: sherman, lancea --- .../jdk/internal/util/xml/impl/Parser.java | 80 ++++++++++++++++--- .../java/util/Properties/LoadAndStoreXML.java | 29 +++++-- 2 files changed, 90 insertions(+), 19 deletions(-) diff --git a/jdk/src/share/classes/jdk/internal/util/xml/impl/Parser.java b/jdk/src/share/classes/jdk/internal/util/xml/impl/Parser.java index 7eb918d7b7b..6db4d4fdb6d 100644 --- a/jdk/src/share/classes/jdk/internal/util/xml/impl/Parser.java +++ b/jdk/src/share/classes/jdk/internal/util/xml/impl/Parser.java @@ -2860,14 +2860,25 @@ public abstract class Parser { } else { // Get encoding from BOM or the xml text decl. reader = bom(is.getByteStream(), ' '); + /** + * [#4.3.3] requires BOM for UTF-16, however, it's not uncommon + * that it may be missing. A mature technique exists in Xerces + * to further check for possible UTF-16 encoding + */ + if (reader == null) { + reader = utf16(is.getByteStream()); + } + if (reader == null) { // Encoding is defined by the xml text decl. reader = enc("UTF-8", is.getByteStream()); expenc = xml(reader); - if (expenc.startsWith("UTF-16")) { - panic(FAULT); // UTF-16 must have BOM [#4.3.3] + if (!expenc.equals("UTF-8")) { + if (expenc.startsWith("UTF-16")) { + panic(FAULT); // UTF-16 must have BOM [#4.3.3] + } + reader = enc(expenc, is.getByteStream()); } - reader = enc(expenc, is.getByteStream()); } else { // Encoding is defined by the BOM. xml(reader); @@ -2956,6 +2967,49 @@ public abstract class Parser { } } + + /** + * Using a mature technique from Xerces, this method checks further after + * the bom method above to see if the encoding is UTF-16 + * + * @param is A byte stream of the entity. + * @return a reader, may be null + * @exception Exception is parser specific exception form panic method. + * @exception IOException + */ + private Reader utf16(InputStream is) + throws Exception { + if (mChIdx != 0) { + //The bom method has read ONE byte into the buffer. + byte b0 = (byte)mChars[0]; + if (b0 == 0x00 || b0 == 0x3C) { + int b1 = is.read(); + int b2 = is.read(); + int b3 = is.read(); + if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) { + // UTF-16, big-endian, no BOM + mChars[0] = (char)(b1); + mChars[mChIdx++] = (char)(b3); + return new ReaderUTF16(is, 'b'); + } else if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) { + // UTF-16, little-endian, no BOM + mChars[0] = (char)(b0); + mChars[mChIdx++] = (char)(b2); + return new ReaderUTF16(is, 'l'); + } else { + /**not every InputStream supports reset, so we have to remember + * the state for further parsing + **/ + mChars[0] = (char)(b0); + mChars[mChIdx++] = (char)(b1); + mChars[mChIdx++] = (char)(b2); + mChars[mChIdx++] = (char)(b3); + } + + } + } + return null; + } /** * Parses the xml text declaration. * @@ -2974,17 +3028,17 @@ public abstract class Parser { String enc = "UTF-8"; char ch; int val; - short st; - // Read the xml text declaration into the buffer - if (mChIdx != 0) { - // The bom method have read ONE char into the buffer. - st = (short) ((mChars[0] == '<') ? 1 : -1); - } else { - st = 0; - } + short st = 0; + int byteRead = mChIdx; //number of bytes read prior to entering this method + while (st >= 0 && mChIdx < mChars.length) { - ch = ((val = reader.read()) >= 0) ? (char) val : EOS; - mChars[mChIdx++] = ch; + if (st < byteRead) { + ch = mChars[st]; + } else { + ch = ((val = reader.read()) >= 0) ? (char) val : EOS; + mChars[mChIdx++] = ch; + } + switch (st) { case 0: // read '<' of xml declaration switch (ch) { diff --git a/jdk/test/java/util/Properties/LoadAndStoreXML.java b/jdk/test/java/util/Properties/LoadAndStoreXML.java index 5353c645b81..241a783053f 100644 --- a/jdk/test/java/util/Properties/LoadAndStoreXML.java +++ b/jdk/test/java/util/Properties/LoadAndStoreXML.java @@ -32,6 +32,7 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.UnsupportedEncodingException; +import java.nio.charset.Charset; import java.nio.file.DirectoryStream; import java.nio.file.Files; import java.nio.file.Path; @@ -47,6 +48,7 @@ import java.util.Properties; import java.util.PropertyPermission; public class LoadAndStoreXML { + static final String bomChar = "\uFEFF"; /** * Simple policy implementation that grants a set of permissions to @@ -125,13 +127,14 @@ public class LoadAndStoreXML { * Sanity test that properties saved with Properties#storeToXML can be * read with Properties#loadFromXML. */ - static void testLoadAndStore(String encoding) throws IOException { + static void testLoadAndStore(String encoding, boolean appendBOM) throws IOException { System.out.println("testLoadAndStore, encoding=" + encoding); Properties props = new Properties(); + props.put("k0", "\u6C34"); props.put("k1", "foo"); props.put("k2", "bar"); - props.put("k3", "\\u0020\\u0391\\u0392\\u0393\\u0394\\u0395\\u0396\\u0397"); + props.put("k3", "\u0020\u0391\u0392\u0393\u0394\u0395\u0396\u0397"); props.put("k4", "\u7532\u9aa8\u6587"); props.put("k5", "/lib/jaxp.properties"); @@ -141,7 +144,17 @@ public class LoadAndStoreXML { throw new RuntimeException("OutputStream closed by storeToXML"); Properties p = new Properties(); - TestInputStream in = new TestInputStream(out.toByteArray()); + TestInputStream in; + if (appendBOM) { + byte[] byteOrderMark = bomChar.getBytes(Charset.forName(encoding)); + byte[] outArray = out.toByteArray(); + byte[] inputArray = new byte[byteOrderMark.length + outArray.length]; + System.arraycopy(byteOrderMark, 0, inputArray, 0, byteOrderMark.length); + System.arraycopy(outArray, 0, inputArray, byteOrderMark.length, outArray.length); + in = new TestInputStream(inputArray); + } else { + in = new TestInputStream(out.toByteArray()); + } p.loadFromXML(in); if (in.isOpen()) throw new RuntimeException("InputStream not closed by loadFromXML"); @@ -231,8 +244,12 @@ public class LoadAndStoreXML { public static void main(String[] args) throws IOException { - testLoadAndStore("UTF-8"); - testLoadAndStore("UTF-16"); + testLoadAndStore("UTF-8", false); + testLoadAndStore("UTF-16", false); + testLoadAndStore("UTF-16BE", false); + testLoadAndStore("UTF-16LE", false); + testLoadAndStore("UTF-16BE", true); + testLoadAndStore("UTF-16LE", true); testLoadWithoutEncoding(); testLoadWithBadEncoding(); testStoreWithBadEncoding(); @@ -250,7 +267,7 @@ public class LoadAndStoreXML { Policy.setPolicy(p); System.setSecurityManager(new SecurityManager()); try { - testLoadAndStore("UTF-8"); + testLoadAndStore("UTF-8", false); } finally { // turn off security manager and restore policy System.setSecurityManager(null);