8207760: SAXException: Invalid UTF-16 surrogate detected: d83c ?

Reviewed-by: lancea, dfuchs
This commit is contained in:
Joe Wang 2018-09-18 09:44:20 -07:00
parent b6180e668e
commit 9ed646a020
4 changed files with 338 additions and 144 deletions

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2014, 2016, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2014, 2018, Oracle and/or its affiliates. All rights reserved.
*/
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -40,6 +40,7 @@ import com.sun.org.apache.xml.internal.serializer.utils.Utils;
* because it is used from another package.
*
* @xsl.usage internal
* @LastModified: Sept 2018
*/
public final class ToHTMLStream extends ToStream
{
@ -1049,7 +1050,7 @@ public final class ToHTMLStream extends ToStream
String name,
String value,
ElemDesc elemDesc)
throws IOException
throws IOException, SAXException
{
writer.write(' ');
@ -1373,7 +1374,7 @@ public final class ToHTMLStream extends ToStream
*/
public void writeAttrString(
final java.io.Writer writer, String string, String encoding)
throws IOException
throws IOException, SAXException
{
final int end = string.length();
if (end > m_attrBuff.length)
@ -1425,13 +1426,16 @@ public final class ToHTMLStream extends ToStream
}
else
{
if (Encodings.isHighUTF16Surrogate(ch))
if (Encodings.isHighUTF16Surrogate(ch) ||
Encodings.isLowUTF16Surrogate(ch))
{
writeUTF16Surrogate(ch, chars, i, end);
i++; // two input characters processed
// this increments by one and the for()
// loop itself increments by another one.
if (writeUTF16Surrogate(ch, chars, i, end) >= 0) {
// move the index if the low surrogate is consumed
// as writeUTF16Surrogate has written the pair
if (Encodings.isHighUTF16Surrogate(ch)) {
i++;
}
}
}
// The next is kind of a hack to keep from escaping in the case

View File

@ -51,7 +51,7 @@ import org.xml.sax.SAXException;
* serializers (xml, html, text ...) that write output to a stream.
*
* @xsl.usage internal
* @LastModified: Feb 2018
* @LastModified: Sept 2018
*/
abstract public class ToStream extends SerializerBase {
@ -193,6 +193,8 @@ abstract public class ToStream extends SerializerBase {
*/
private boolean m_expandDTDEntities = true;
private char m_highSurrogate = 0;
/**
* Default constructor
*/
@ -953,45 +955,46 @@ abstract public class ToStream extends SerializerBase {
* @param ch Character array.
* @param i position Where the surrogate was detected.
* @param end The end index of the significant characters.
* @return 0 if the pair of characters was written out as-is,
* the unicode code point of the character represented by
* the surrogate pair if an entity reference with that value
* was written out.
* @return the status of writing a surrogate pair.
* -1 -- nothing is written
* 0 -- the pair is written as-is
* code point -- the pair is written as an entity reference
*
* @throws IOException
* @throws org.xml.sax.SAXException if invalid UTF-16 surrogate detected.
*/
protected int writeUTF16Surrogate(char c, char ch[], int i, int end)
throws IOException
throws IOException, SAXException
{
int codePoint = 0;
int status = -1;
if (i + 1 >= end)
{
throw new IOException(
Utils.messages.createMessage(
MsgKey.ER_INVALID_UTF16_SURROGATE,
new Object[] { Integer.toHexString((int) c)}));
m_highSurrogate = c;
return status;
}
char high, low;
if (m_highSurrogate == 0) {
high = c;
low = ch[i+1];
status = 0;
} else {
high = m_highSurrogate;
low = c;
m_highSurrogate = 0;
}
final char high = c;
final char low = ch[i+1];
if (!Encodings.isLowUTF16Surrogate(low)) {
throw new IOException(
Utils.messages.createMessage(
MsgKey.ER_INVALID_UTF16_SURROGATE,
new Object[] {
Integer.toHexString((int) c)
+ " "
+ Integer.toHexString(low)}));
throwIOE(high, low);
}
final Writer writer = m_writer;
// If we make it to here we have a valid high, low surrogate pair
if (m_encodingInfo.isInEncoding(c,low)) {
if (m_encodingInfo.isInEncoding(high,low)) {
// If the character formed by the surrogate pair
// is in the encoding, so just write it out
writer.write(ch,i,2);
writer.write(new char[]{high, low}, 0, 2);
}
else {
// Don't know what to do with this char, it is
@ -999,24 +1002,16 @@ abstract public class ToStream extends SerializerBase {
// a surrogate pair, so write out as an entity ref
final String encoding = getEncoding();
if (encoding != null) {
/* The output encoding is known,
* so somthing is wrong.
*/
codePoint = Encodings.toCodePoint(high, low);
// not in the encoding, so write out a character reference
writer.write('&');
writer.write('#');
writer.write(Integer.toString(codePoint));
writer.write(';');
status = writeCharRef(writer, high, low);
} else {
/* The output encoding is not known,
* so just write it out as-is.
*/
writer.write(ch, i, 2);
writer.write(new char[]{high, low}, 0, 2);
}
}
// non-zero only if character reference was written out.
return codePoint;
return status;
}
/**
@ -1106,32 +1101,7 @@ abstract public class ToStream extends SerializerBase {
}
else if (isCData && (!escapingNotNeeded(c)))
{
// if (i != 0)
if (m_cdataTagOpen)
closeCDATA();
// This needs to go into a function...
if (Encodings.isHighUTF16Surrogate(c))
{
writeUTF16Surrogate(c, ch, i, end);
i++ ; // process two input characters
}
else
{
writer.write("&#");
String intStr = Integer.toString((int) c);
writer.write(intStr);
writer.write(';');
}
// if ((i != 0) && (i < (end - 1)))
// if (!m_cdataTagOpen && (i < (end - 1)))
// {
// writer.write(CDATA_DELIMITER_OPEN);
// m_cdataTagOpen = true;
// }
i = handleEscaping(writer, c, ch, i, end);
}
else if (
isCData
@ -1155,31 +1125,46 @@ abstract public class ToStream extends SerializerBase {
}
writer.write(c);
}
// This needs to go into a function...
else if (Encodings.isHighUTF16Surrogate(c))
{
if (m_cdataTagOpen)
closeCDATA();
writeUTF16Surrogate(c, ch, i, end);
i++; // process two input characters
}
else
{
if (m_cdataTagOpen)
closeCDATA();
writer.write("&#");
String intStr = Integer.toString((int) c);
writer.write(intStr);
writer.write(';');
else {
i = handleEscaping(writer, c, ch, i, end);
}
}
}
}
/**
 * Escapes a character that is not written out directly. A UTF-16 surrogate
 * (high or low) is passed to {@code writeUTF16Surrogate}; any other
 * character is written as a numeric character reference.
 *
 * @param writer the writer to write to
 * @param c the current char
 * @param ch the character array
 * @param i the current position
 * @param end the end index of the array
 * @return {@code i + 1} when {@code c} is a high surrogate and
 *         {@code writeUTF16Surrogate} reported a non-negative status
 *         (the following low surrogate was consumed as well);
 *         otherwise {@code i} unchanged
 *
 * @throws IOException propagated from the writer or from
 *         {@code writeUTF16Surrogate}
 * @throws org.xml.sax.SAXException propagated from
 *         {@code writeUTF16Surrogate} or {@code writeCharRef}
 */
private int handleEscaping(Writer writer, char c, char ch[], int i, int end)
        throws IOException, SAXException {
    final boolean highSurrogate = Encodings.isHighUTF16Surrogate(c);

    // Not a surrogate at all: a plain character reference is enough.
    if (!highSurrogate && !Encodings.isLowUTF16Surrogate(c)) {
        writeCharRef(writer, c);
        return i;
    }

    final int status = writeUTF16Surrogate(c, ch, i, end);

    // Only a high surrogate whose pair was actually written consumes the
    // next input character; a negative status means nothing was written.
    return (status >= 0 && highSurrogate) ? i + 1 : i;
}
/**
* Ends an un-escaping section.
*
@ -1246,7 +1231,7 @@ abstract public class ToStream extends SerializerBase {
m_elemContext.m_startTagOpen = false;
}
if (shouldIndent())
if (!m_cdataTagOpen && shouldIndent())
indent();
boolean writeCDataBrackets =
@ -1644,7 +1629,7 @@ abstract public class ToStream extends SerializerBase {
int i,
char ch,
int lastDirty,
boolean fromTextNode) throws IOException
boolean fromTextNode) throws IOException, SAXException
{
int startClean = lastDirty + 1;
// if we have some clean characters accumulated
@ -1723,54 +1708,40 @@ abstract public class ToStream extends SerializerBase {
int len,
boolean fromTextNode,
boolean escLF)
throws IOException
throws IOException, SAXException
{
int pos = accumDefaultEntity(writer, ch, i, chars, len, fromTextNode, escLF);
if (i == pos)
{
if (m_highSurrogate != 0) {
if (!(Encodings.isLowUTF16Surrogate(ch))) {
throwIOE(m_highSurrogate, ch);
}
writeCharRef(writer, m_highSurrogate, ch);
m_highSurrogate = 0;
return ++pos;
}
if (Encodings.isHighUTF16Surrogate(ch))
{
// Should be the UTF-16 low surrogate of the hig/low pair.
char next;
// Unicode code point formed from the high/low pair.
int codePoint = 0;
if (i + 1 >= len)
{
throw new IOException(
Utils.messages.createMessage(
MsgKey.ER_INVALID_UTF16_SURROGATE,
new Object[] { Integer.toHexString(ch)}));
//"Invalid UTF-16 surrogate detected: "
//+Integer.toHexString(ch)+ " ?");
// save for the next read
m_highSurrogate = ch;
pos++;
}
else
{
next = chars[++i];
// the next should be the UTF-16 low surrogate of the high/low pair.
char next = chars[++i];
if (!(Encodings.isLowUTF16Surrogate(next)))
throw new IOException(
Utils.messages.createMessage(
MsgKey
.ER_INVALID_UTF16_SURROGATE,
new Object[] {
Integer.toHexString(ch)
+ " "
+ Integer.toHexString(next)}));
//"Invalid UTF-16 surrogate detected: "
throwIOE(ch, next);
//+Integer.toHexString(ch)+" "+Integer.toHexString(next));
codePoint = Encodings.toCodePoint(ch,next);
writeCharRef(writer, ch, next);
pos += 2; // count the two characters that went into writing out this entity
}
writer.write("&#");
writer.write(Integer.toString(codePoint));
writer.write(';');
pos += 2; // count the two characters that went into writing out this entity
}
else
{
@ -1782,18 +1753,14 @@ abstract public class ToStream extends SerializerBase {
if (isCharacterInC0orC1Range(ch) ||
(XMLVERSION11.equals(getVersion()) && isNELorLSEPCharacter(ch)))
{
writer.write("&#");
writer.write(Integer.toString(ch));
writer.write(';');
writeCharRef(writer, ch);
}
else if ((!escapingNotNeeded(ch) ||
( (fromTextNode && m_charInfo.isSpecialTextChar(ch))
|| (!fromTextNode && m_charInfo.isSpecialAttrChar(ch))))
&& m_elemContext.m_currentElemDepth > 0)
&& m_elemContext.m_currentElemDepth > 0)
{
writer.write("&#");
writer.write(Integer.toString(ch));
writer.write(';');
writeCharRef(writer, ch);
}
else
{
@ -1806,6 +1773,45 @@ abstract public class ToStream extends SerializerBase {
return pos;
}
/**
 * Writes a single character as a decimal numeric character reference,
 * i.e. {@code "&#"} followed by the character's numeric value and
 * {@code ';'}. If a CDATA section is currently open
 * ({@code m_cdataTagOpen}), it is closed first.
 *
 * @param writer the writer
 * @param c the character
 * @throws IOException if the writer fails
 * @throws SAXException propagated from {@code closeCDATA}
 */
private void writeCharRef(Writer writer, char c) throws IOException, SAXException {
    if (m_cdataTagOpen) {
        closeCDATA();
    }

    final String decimalValue = Integer.toString(c);
    writer.write("&#");
    writer.write(decimalValue);
    writer.write(';');
}
/**
 * Writes a surrogate pair as one decimal numeric character reference for
 * the code point the pair forms. If a CDATA section is currently open
 * ({@code m_cdataTagOpen}), it is closed first.
 *
 * @param writer the writer
 * @param high the high surrogate
 * @param low the low surrogate
 * @return the code point computed by {@code Encodings.toCodePoint(high, low)}
 *         that was written out
 * @throws IOException if the writer fails
 * @throws SAXException propagated from {@code closeCDATA}
 */
private int writeCharRef(Writer writer, char high, char low) throws IOException, SAXException {
    if (m_cdataTagOpen) {
        closeCDATA();
    }

    // Combine the two UTF-16 code units into the single code point to emit.
    final int combined = Encodings.toCodePoint(high, low);
    final String decimalValue = String.valueOf(combined);

    writer.write("&#");
    writer.write(decimalValue);
    writer.write(';');
    return combined;
}
/**
 * Always throws an {@code IOException} whose message is built from
 * {@code MsgKey.ER_INVALID_UTF16_SURROGATE} and the two characters
 * rendered in hexadecimal, separated by a single space.
 *
 * @param ch the first character to report
 * @param next the second character to report
 * @throws IOException always
 */
private void throwIOE(char ch, char next) throws IOException {
    final String hexPair = Integer.toHexString(ch) + " " + Integer.toHexString(next);
    final String message = Utils.messages.createMessage(
            MsgKey.ER_INVALID_UTF16_SURROGATE,
            new Object[] {hexPair});
    throw new IOException(message);
}
/**
* Receive notification of the beginning of an element, although this is a
* SAX method additional namespace or attribute information can occur before
@ -2053,7 +2059,7 @@ abstract public class ToStream extends SerializerBase {
Writer writer,
String string,
String encoding)
throws IOException
throws IOException, SAXException
{
final int len = string.length();
if (len > m_attrBuff.length)

View File

@ -1,6 +1,5 @@
/*
* reserved comment block
* DO NOT REMOVE OR ALTER!
* Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
*/
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -34,6 +33,7 @@ import org.xml.sax.SAXException;
* This class converts SAX or SAX-like calls to a
* serialized document for xsl:output method of "text".
* @xsl.usage internal
* @LastModified: Sept 2018
*/
public final class ToTextStream extends ToStream
{
@ -295,23 +295,32 @@ public final class ToTextStream extends ToStream
} else if (m_encodingInfo.isInEncoding(c)) {
writer.write(c);
// one input char processed
} else if (Encodings.isHighUTF16Surrogate(c)) {
} else if (Encodings.isHighUTF16Surrogate(c) ||
Encodings.isLowUTF16Surrogate(c)) {
final int codePoint = writeUTF16Surrogate(c, ch, i, end);
if (codePoint != 0) {
// I think we can just emit the message,
// not crash and burn.
final String integralValue = Integer.toString(codePoint);
final String msg = Utils.messages.createMessage(
MsgKey.ER_ILLEGAL_CHARACTER,
new Object[] { integralValue, encoding });
if (codePoint >= 0) {
// move the index if the low surrogate is consumed
// as writeUTF16Surrogate has written the pair
if (Encodings.isHighUTF16Surrogate(c)) {
i++;
}
//Older behavior was to throw the message,
//but newer gentler behavior is to write a message to System.err
//throw new SAXException(msg);
System.err.println(msg);
// printing to the console is not appropriate, but will leave
// it as is for compatibility.
if (codePoint >0) {
// I think we can just emit the message,
// not crash and burn.
final String integralValue = Integer.toString(codePoint);
final String msg = Utils.messages.createMessage(
MsgKey.ER_ILLEGAL_CHARACTER,
new Object[] { integralValue, encoding });
//Older behavior was to throw the message,
//but newer gentler behavior is to write a message to System.err
//throw new SAXException(msg);
System.err.println(msg);
}
}
i++; // two input chars processed
} else {
// Don't know what to do with this char, it is
// not in the encoding and not a high char in

View File

@ -0,0 +1,175 @@
/*
* Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package transform;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.StringReader;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;
import org.testng.Assert;
import org.testng.annotations.Listeners;
import org.testng.annotations.Test;
import java.util.Random;
import javax.xml.transform.OutputKeys;
import org.testng.annotations.DataProvider;
/*
* @test
* @library /javax/xml/jaxp/libs /javax/xml/jaxp/unittest
* @run testng/othervm transform.JDK8207760
* @summary Verifies that a surrogate pair at the edge of a buffer is properly handled
* @bug 8207760
*/
@Listeners({jaxp.library.FilePolicy.class})
public class JDK8207760 {
    /** Identity stylesheet: copies every node and attribute, no XML declaration. */
    final String xsl8207760 =
            "<xsl:stylesheet version=\"1.0\" xmlns:xsl=\"http://www.w3.org/1999/XSL/Transform\">\n" +
            " <xsl:output omit-xml-declaration=\"yes\" indent=\"no\" />\n" +
            "\n" +
            " <xsl:template match=\"node()|@*\">\n" +
            " <xsl:copy>\n" +
            " <xsl:apply-templates select=\"node()|@*\" />\n" +
            " </xsl:copy>\n" +
            " </xsl:template>\n" +
            "</xsl:stylesheet>\n";

    /** Copies {@code source} and serializes its content as a CDATA section. */
    final String xsl8207760_2 = "<xsl:stylesheet \n" +
            " version=\"1.0\" \n" +
            " xmlns:xsl=\"http://www.w3.org/1999/XSL/Transform\">\n" +
            "\n" +
            " <xsl:output method=\"xml\" indent=\"no\" cdata-section-elements=\"source\"/>\n" +
            "\n" +
            " <xsl:template match=\"source\">\n" +
            " <xsl:copy>\n" +
            " <xsl:apply-templates select=\"node()\" />\n" +
            " </xsl:copy>\n" +
            " </xsl:template>\n" +
            "\n" +
            "</xsl:stylesheet>";

    /**
     * Same as {@link #xsl8207760_2}, but routes the content through an
     * intermediate variable and {@code xsl:value-of}.
     */
    final String xsl8207760_3 = "<xsl:stylesheet \n" +
            " version=\"1.0\" \n" +
            " xmlns:xsl=\"http://www.w3.org/1999/XSL/Transform\">\n" +
            "\n" +
            " <xsl:output method=\"xml\" indent=\"no\" cdata-section-elements=\"source\"/>\n" +
            "\n" +
            " <xsl:template match=\"source\">\n" +
            " <xsl:copy>\n" +
            " <!-- Copy the attributes -->\n" +
            " <xsl:apply-templates select=\"@*\"/>\n" +
            " <!-- Convert the contained nodes (elements and text) into text -->\n" +
            " <xsl:variable name=\"subElementsText\">\n" +
            " <xsl:apply-templates select=\"node()\"/>\n" +
            " </xsl:variable>\n" +
            " <!-- Output the XML directive and the converted nodes -->\n" +
            " <xsl:value-of select=\"$subElementsText\"/>\n" +
            " </xsl:copy>\n" +
            " </xsl:template>\n" +
            "\n" +
            "</xsl:stylesheet>";

    /** Supplies the stylesheets that serialize {@code source} as CDATA. */
    @DataProvider(name = "xsls")
    public Object[][] getDataBug8207760_cdata() {
        return new Object[][]{
            {xsl8207760_2},
            {xsl8207760_3},
        };
    }

    /*
     * @bug 8207760
     * Verifies that a surrogate pair at the edge of a buffer is properly handled
     * when serializing into character data (element text content).
     */
    @Test
    public final void testBug8207760() throws Exception {
        String[] xmls = prepareXML(false);
        Assert.assertEquals(transform(xsl8207760, xmls[0]), xmls[1]);
    }

    /*
     * @bug 8207760
     * Verifies that a surrogate pair at the edge of a buffer is properly handled
     * when serializing into a CDATA section.
     */
    @Test(dataProvider = "xsls")
    public final void testBug8207760_cdata(String xsl) throws Exception {
        String[] xmls = prepareXML(true);
        Assert.assertEquals(transform(xsl, xmls[0]), xmls[1]);
    }

    /**
     * Runs {@code xml} through the stylesheet {@code xsl} with UTF-8 output
     * encoding and returns the result with the platform line separator
     * normalized to {@code "\n"}.
     *
     * @param xsl the stylesheet source
     * @param xml the input document
     * @return the serialized transformation result
     * @throws TransformerException if the stylesheet cannot be compiled or
     *         the transformation fails
     */
    private String transform(String xsl, String xml) throws TransformerException {
        Transformer t = createTransformerFromInputstream(
                new ByteArrayInputStream(xsl.getBytes(StandardCharsets.UTF_8)));
        t.setOutputProperty(OutputKeys.ENCODING, StandardCharsets.UTF_8.name());
        StringWriter sw = new StringWriter();
        t.transform(new StreamSource(new StringReader(xml)), new StreamResult(sw));
        // Literal replacement: the line separator is not meant as a regex.
        return sw.toString().replace(System.lineSeparator(), "\n");
    }

    /**
     * Builds the input document and the expected output.
     *
     * The document is a {@code source} element holding 1023 random filler
     * characters, the surrogate pair U+D83C U+DF42 (code point 127810) and a
     * fixed tail. NOTE(review): the filler length is presumably chosen so
     * that the pair straddles an internal buffer boundary, as the test
     * summary describes; confirm against the parser/serializer buffer size.
     *
     * @param cdata whether the content is wrapped in a CDATA section
     * @return a two-element array: the input XML, then the expected output.
     *         For CDATA the expected output equals the input; otherwise the
     *         XML declaration is absent and the pair appears as the
     *         character reference {@code &#127810;}.
     */
    private String[] prepareXML(boolean cdata) {
        String head = "<?xml version=\"1.0\" encoding=\"UTF-8\"?><source>";
        String tail = "abc 123 </source>";
        if (cdata) {
            head += "<![CDATA[";
            tail = "abc 123 ]]></source>";
        }

        String temp = generateString(1023);
        String xml = head + temp + '\uD83C' + '\uDF42' + tail;
        String expected = cdata
                ? xml
                : "<source>" + temp + "&#127810;" + tail;
        return new String[]{xml, expected};
    }

    /** Alphabet for the random filler: lower-case letters, space and newline. */
    static final char[] CHARS = "abcdefghijklmnopqrstuvwxyz \n".toCharArray();
    StringBuilder sb = new StringBuilder(1024 << 4);
    Random random = new Random();

    /**
     * Returns {@code size} characters drawn at random from {@link #CHARS}.
     * The shared builder is reset on every call.
     */
    private String generateString(int size) {
        sb.setLength(0);
        for (int i = 0; i < size; i++) {
            sb.append(CHARS[random.nextInt(CHARS.length)]);
        }
        return sb.toString();
    }

    /** Compiles the stylesheet read from {@code xslStream} into a transformer. */
    private Transformer createTransformerFromInputstream(InputStream xslStream)
            throws TransformerException {
        return TransformerFactory.newInstance().newTransformer(new StreamSource(xslStream));
    }
}