6928542: Chinese characters in RTF are not decoded

Reviewed-by: prr, psadhukhan
This commit is contained in:
Ichiroh Takiguchi 2023-11-09 01:05:00 +00:00
parent 4e8c0364a2
commit a26f7c03c7
3 changed files with 286 additions and 4 deletions

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
* *
* This code is free software; you can redistribute it and/or modify it * This code is free software; you can redistribute it and/or modify it
@ -26,6 +26,10 @@ package javax.swing.text.rtf;
import java.io.*; import java.io.*;
import java.lang.*; import java.lang.*;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
/** /**
* <b>RTFParser</b> is a subclass of <b>AbstractFilter</b> which understands basic RTF syntax * <b>RTFParser</b> is a subclass of <b>AbstractFilter</b> which understands basic RTF syntax
@ -69,6 +73,11 @@ abstract class RTFParser extends AbstractFilter
private final int S_inblob = 6; // in a \bin blob private final int S_inblob = 6; // in a \bin blob
// For fcharset control word
protected CharsetDecoder decoder = null;
private byte[] ba = new byte[2];
protected ByteBuffer decoderBB = ByteBuffer.wrap(ba);
/** Implemented by subclasses to interpret a parameter-less RTF keyword. /** Implemented by subclasses to interpret a parameter-less RTF keyword.
* The keyword is passed without the leading '/' or any delimiting * The keyword is passed without the leading '/' or any delimiting
* whitespace. */ * whitespace. */
@ -100,6 +109,9 @@ abstract class RTFParser extends AbstractFilter
rtfSpecialsTable['\\'] = true; rtfSpecialsTable['\\'] = true;
} }
// Defined for replacement character
static final char REPLACEMENT_CHAR = '\uFFFD';
public RTFParser() public RTFParser()
{ {
currentCharacters = new StringBuffer(); currentCharacters = new StringBuffer();
@ -109,6 +121,9 @@ abstract class RTFParser extends AbstractFilter
//warnings = System.out; //warnings = System.out;
specialsTable = rtfSpecialsTable; specialsTable = rtfSpecialsTable;
// Initialize byte buffer for CharsetDecoder
decoderBB.clear();
decoderBB.limit(1);
} }
// TODO: Handle wrapup at end of file correctly. // TODO: Handle wrapup at end of file correctly.
@ -182,6 +197,9 @@ abstract class RTFParser extends AbstractFilter
} }
state = S_backslashed; state = S_backslashed;
} else { } else {
// SBCS: ASCII character
// DBCS: Non lead byte
ch = decode(ch);
currentCharacters.append(ch); currentCharacters.append(ch);
} }
break; break;
@ -301,7 +319,9 @@ abstract class RTFParser extends AbstractFilter
if (Character.digit(ch, 16) != -1) if (Character.digit(ch, 16) != -1)
{ {
pendingCharacter = pendingCharacter * 16 + Character.digit(ch, 16); pendingCharacter = pendingCharacter * 16 + Character.digit(ch, 16);
ch = translationTable[pendingCharacter]; // Use translationTable if decoder is not defined
ch = decoder == null ? translationTable[pendingCharacter]
: decode((char)pendingCharacter);
if (ch != 0) if (ch != 0)
handleText(ch); handleText(ch);
} }
@ -360,4 +380,37 @@ abstract class RTFParser extends AbstractFilter
super.close(); super.close();
} }
// For fcharset control word
private char[] ca = new char[1];
private CharBuffer decoderCB = CharBuffer.wrap(ca);
/**
 * Feeds one byte into the per-font charset decoder and reports the result.
 *
 * When no decoder is installed the character is returned unchanged.
 * Otherwise the low byte of {@code ch} is appended to {@code decoderBB}
 * and every byte currently held there is decoded in one go.
 *
 * @param ch the character whose low byte is to be decoded
 * @return the decoded character; {@code 0} when the byte was kept as a
 *         pending lead byte and nothing should be written yet; or
 *         {@code REPLACEMENT_CHAR} when the decoder reported anything
 *         other than underflow
 */
private char decode(char ch) {
    if (decoder == null) {
        return ch;
    }

    decoderBB.put((byte) ch);
    decoderBB.rewind();
    decoderCB.clear();
    CoderResult outcome = decoder.decode(decoderBB, decoderCB, false);

    // Every path resets the decoder; pending bytes are kept in decoderBB,
    // not in the decoder itself.
    decoder.reset();

    boolean producedChar = decoderCB.position() == 1;
    if (outcome.isUnderflow() && !producedChar) {
        // Lead byte: leave it at index 0 and make room for one more byte.
        decoderBB.limit(2);
        decoderBB.position(1);
        return 0; // Callers skip the write operation when 0 is returned
    }

    // Either a character was produced or decoding failed: go back to
    // accepting a single byte.
    decoderBB.clear();
    decoderBB.limit(1);
    if (outcome.isUnderflow()) {
        // Converted to Unicode (including replacement character)
        return ca[0];
    }
    // Fallback, should not be reached
    return REPLACEMENT_CHAR;
}
} }

View File

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 1997, 2021, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
* *
* This code is free software; you can redistribute it and/or modify it * This code is free software; you can redistribute it and/or modify it
@ -32,6 +32,11 @@ import java.io.InputStream;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.io.OutputStream; import java.io.OutputStream;
import java.io.StreamTokenizer; import java.io.StreamTokenizer;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.security.AccessController; import java.security.AccessController;
import java.security.PrivilegedAction; import java.security.PrivilegedAction;
import java.util.Dictionary; import java.util.Dictionary;
@ -87,6 +92,10 @@ class RTFReader extends RTFParser
/** This Dictionary maps Integer font numbers to String font names. */ /** This Dictionary maps Integer font numbers to String font names. */
Dictionary<Integer, String> fontTable; Dictionary<Integer, String> fontTable;
/** This Dictionary maps Integer font numbers to Charset font charset. */
Dictionary<Integer, Charset> fcharsetTable;
/** This Dictionary maps String font charset to String code page. */
static Dictionary<String, String> fcharsetToCP = null;
/** This array maps color indices to Color objects. */ /** This array maps color indices to Color objects. */
Color[] colorTable; Color[] colorTable;
/** This Map maps character style numbers to Style objects. */ /** This Map maps character style numbers to Style objects. */
@ -133,6 +142,7 @@ class RTFReader extends RTFParser
textKeywords.put("emspace", "\u2003"); textKeywords.put("emspace", "\u2003");
textKeywords.put("endash", "\u2013"); textKeywords.put("endash", "\u2013");
textKeywords.put("enspace", "\u2002"); textKeywords.put("enspace", "\u2002");
textKeywords.put("line", "\n");
textKeywords.put("ldblquote", "\u201C"); textKeywords.put("ldblquote", "\u201C");
textKeywords.put("lquote", "\u2018"); textKeywords.put("lquote", "\u2018");
textKeywords.put("ltrmark", "\u200E"); textKeywords.put("ltrmark", "\u200E");
@ -159,7 +169,50 @@ class RTFReader extends RTFParser
defineCharacterSet("ansicpg", latin1TranslationTable); defineCharacterSet("ansicpg", latin1TranslationTable);
} }
/* TODO: per-font font encodings ( \fcharset control word ) ? */ /**
* Windows font charset
*/
// Numeric arguments of the fcharset control word. The static initializer
// that follows maps some of them to a code page name in fcharsetToCP; the
// font table reader falls back to ISO_8859_1 for values without an entry.

// Mapped to "windows-1252".
private static final int ANSI_CHARSET = 0;
// No entry in fcharsetToCP.
private static final int DEFAULT_CHARSET = 1;
// No entry in fcharsetToCP.
private static final int SYMBOL_CHARSET = 2;
// No entry in fcharsetToCP.
private static final int MAC_CHARSET = 77;
// Mapped to "ms932".
private static final int SHIFTJIS_CHARSET = 128;
// Mapped to "ms949".
private static final int HANGUL_CHARSET = 129;
// Mapped to "ms1361".
private static final int JOHAB_CHARSET = 130;
// Mapped to "ms936".
private static final int GB2312_CHARSET = 134;
// Mapped to "ms950".
private static final int CHINESEBIG5_CHARSET = 136;
// Mapped to "windows-1253".
private static final int GREEK_CHARSET = 161;
// Mapped to "windows-1254".
private static final int TURKISH_CHARSET = 162;
// Mapped to "windows-1258".
private static final int VIETNAMESE_CHARSET = 163;
// Mapped to "windows-1255".
private static final int HEBREW_CHARSET = 177;
// Mapped to "windows-1256".
private static final int ARABIC_CHARSET = 178;
// Mapped to "windows-1257".
private static final int BALTIC_CHARSET = 186;
// Mapped to "windows-1251".
private static final int RUSSIAN_CHARSET = 204;
// Mapped to "ms874".
private static final int THAI_CHARSET = 222;
// Mapped to "windows-1250".
private static final int EASTEUROPE_CHARSET = 238;
// No entry in fcharsetToCP.
private static final int OEM_CHARSET = 255;
static {
fcharsetToCP = new Hashtable<String, String>();
fcharsetToCP.put("fcharset" + ANSI_CHARSET, "windows-1252");
fcharsetToCP.put("fcharset" + SHIFTJIS_CHARSET, "ms932");
fcharsetToCP.put("fcharset" + HANGUL_CHARSET, "ms949");
fcharsetToCP.put("fcharset" + JOHAB_CHARSET, "ms1361");
fcharsetToCP.put("fcharset" + GB2312_CHARSET, "ms936");
fcharsetToCP.put("fcharset" + CHINESEBIG5_CHARSET, "ms950");
fcharsetToCP.put("fcharset" + GREEK_CHARSET, "windows-1253");
fcharsetToCP.put("fcharset" + TURKISH_CHARSET, "windows-1254");
fcharsetToCP.put("fcharset" + VIETNAMESE_CHARSET, "windows-1258");
fcharsetToCP.put("fcharset" + HEBREW_CHARSET, "windows-1255");
fcharsetToCP.put("fcharset" + ARABIC_CHARSET, "windows-1256");
fcharsetToCP.put("fcharset" + BALTIC_CHARSET, "windows-1257");
fcharsetToCP.put("fcharset" + RUSSIAN_CHARSET, "windows-1251");
fcharsetToCP.put("fcharset" + THAI_CHARSET, "ms874");
fcharsetToCP.put("fcharset" + EASTEUROPE_CHARSET, "windows-1250");
}
// Defined for replacement character
private static final String REPLACEMENT_CHAR = "\uFFFD";
/** /**
* Creates a new RTFReader instance. Text will be sent to * Creates a new RTFReader instance. Text will be sent to
@ -174,6 +227,7 @@ public RTFReader(StyledDocument destination)
target = destination; target = destination;
parserState = new Hashtable<Object, Object>(); parserState = new Hashtable<Object, Object>();
fontTable = new Hashtable<Integer, String>(); fontTable = new Hashtable<Integer, String>();
fcharsetTable = new Hashtable<Integer, Charset>();
rtfversion = -1; rtfversion = -1;
@ -762,6 +816,25 @@ class FonttblDestination implements Destination
nextFontNumber = parameter; nextFontNumber = parameter;
return true; return true;
} }
// For fcharset control word
if (keyword.equals("fcharset")) {
String fcharset = keyword+parameter;
String csName = fcharsetToCP.get(fcharset);
Charset cs;
if (csName != null) {
try {
cs = Charset.forName(csName);
} catch (IllegalArgumentException iae) {
// Fallback, should not be called
cs = ISO_8859_1;
}
} else {
// Fallback, fcharset control word number is not defined
cs = ISO_8859_1;
}
fcharsetTable.put(nextFontNumber, cs);
return true;
}
return false; return false;
} }
@ -1216,6 +1289,25 @@ abstract class AttributeTrackingDestination implements Destination
if (keyword.equals("f")) { if (keyword.equals("f")) {
parserState.put(keyword, Integer.valueOf(parameter)); parserState.put(keyword, Integer.valueOf(parameter));
// Check lead byte is stored or not
if (decoderBB.position() == 1) {
handleText(REPLACEMENT_CHAR);
}
// Reset decoder byte buffer
decoderBB.clear();
decoderBB.limit(1);
// Check fcharset is used or not
Charset cs = fcharsetTable.get(parameter);
if (cs != null) {
decoder = cs.newDecoder();
decoder.onMalformedInput(CodingErrorAction.REPLACE)
.onUnmappableCharacter(CodingErrorAction.REPLACE);
} else {
// fcharset is not used, use translationTable
decoder = null;
}
return true; return true;
} }
if (keyword.equals("cf")) { if (keyword.equals("cf")) {
@ -1610,6 +1702,12 @@ abstract class TextHandlingDestination
if (keyword.equals("par")) { if (keyword.equals("par")) {
// warnings.println("Ending paragraph."); // warnings.println("Ending paragraph.");
// Check lead byte is stored or not
if (decoderBB.position() == 1) {
handleText(REPLACEMENT_CHAR);
decoderBB.clear();
decoderBB.limit(1);
}
endParagraph(); endParagraph();
return true; return true;
} }

View File

@ -0,0 +1,131 @@
/*
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
* @test
* @bug 6928542
* @summary Verify RTFEditorKit.read() with fcharset
*/
import java.io.ByteArrayInputStream;
import java.io.InputStreamReader;
import java.io.PrintStream;
import javax.swing.text.Document;
import javax.swing.text.Element;
import javax.swing.text.rtf.RTFEditorKit;
import static java.nio.charset.StandardCharsets.ISO_8859_1;
/**
 * Reads an RTF document whose font table assigns a different fcharset to
 * each font and checks that the text extracted by {@code RTFEditorKit}
 * matches the expected Unicode text.
 */
public class RTFReadFontCharsetTest {

    /** RTF input; converted to bytes with ISO-8859-1 before it is read. */
    private static final String RTF_INPUT =
            "{\\rtf1\\fbidis\\ansi\\ansicpg932\\deff0\\nouicomp" +
            "at\\deflang1033\\deflangfe1041{\\fonttbl{\\f0\\fni" +
            "l\\fcharset0 Segoe UI;}{\\f1\\fnil\\fcharset128 Yu" +
            " Gothic UI;}{\\f2\\fswiss\\fprq2\\fcharset129 Malg" +
            "un Gothic;}{\\f3\\fnil\\fcharset134 Microsoft YaHe" +
            "i;}{\\f4\\fnil\\fcharset136 Microsoft JhengHei;}{\\" +
            "f5\\fnil\\fcharset161 Segoe UI;}{\\f6\\fnil\\fcha" +
            "rset162 Segoe UI;}{\\f7\\fnil\\fcharset163 Segoe U" +
            "I;}{\\f8\\fnil\\fcharset177 Segoe UI;}{\\f9\\fnil\\" +
            "fcharset178 Segoe UI;}{\\f10\\fnil\\fcharset186 S" +
            "egoe UI;}{\\f11\\fnil\\fcharset204 Segoe UI;}{\\f1" +
            "2\\fnil\\fcharset222 Leelawadee UI;}{\\f13\\fnil\\" +
            "fcharset0 Leelawadee UI;}{\\f14\\fnil\\fcharset238" +
            " Segoe UI;}}\r\n{\\*\\generator Riched20 10.0.1904" +
            "1}\\viewkind4\\uc1 \r\n\\pard\\ltrpar\\nowidctlpar" +
            "\\sa200\\sl276\\slmult1\\f0\\fs22\\lang1041 Gr\\'f" +
            "cezi - Switzerland 0\\line\\f1\\'82\\'b1\\'82\\'f" +
            "1\\'82\\'c9\\'82\\'bf\\'82\\'cd - Japanese 128\\li" +
            "ne\\f2\\lang17\\'be\\'c8\\'b3\\'e7\\'c7\\'cf\\'bc\\" +
            "'bc\\'bf\\'e4\\lang1041 - Korean 129\\line\\kern" +
            "ing2\\f3\\lang1033\\'c4\\'e3\\'ba\\'c3 - China 134" +
            "\\line\\f4\\'bb\\'4f\\'c6\\'57 - Traditional Chine" +
            "se - Taiwan 136\\line\\kerning0\\f5\\lang17\\'e3\\" +
            "'e5\\'e9\\'e1 \\'f3\\'ef\\'f5 - Greek\\f0\\lang104" +
            "1 161\\line\\f6\\lang17 A\\'f0a\\'e7 - \\f0 Turki" +
            "sh (Tree) 162\\line\\f7\\'fe\\f0\\lang1041 \\lang" +
            "1033 - \\lang17 Vietnam currency\\lang1041 163\\l" +
            "ine\\f8\\rtlch\\lang17\\'f9\\'c8\\'d1\\'ec\\'e5\\'" +
            "c9\\'ed\\f0\\ltrch - Hebrew 177\\line\\f9\\rtlch\\" +
            "lang1025\\'e3\\'d1\\'cd\\'c8\\'c7\\f0\\ltrch\\lan" +
            "g17 - Arabic 178\\line\\kerning2\\f10\\lang1033 A" +
            "\\'e8i\\'fb - Lithuanian (Thank you) 186\\kerning0" +
            "\\f0\\lang1041\\line\\kerning2\\f11\\lang1049\\'c7" +
            "\\'e4\\'f0\\'e0\\'e2\\'f1\\'f2\\'e2\\'f3\\'e9\\'f2" +
            "\\'e5\\f0\\lang1033 - Russian 204\\line\\kerning0" +
            "\\f12\\lang1054\\'ca\\'c7\\'d1\\'ca\\'b4\\'d5 \\f1" +
            "3\\lang1033 - Thailand 222\\line\\kerning2\\f14 cz" +
            "e\\'9c\\'e6 - Polish 238\\par\r\n}\r\n\u0000";

    /** Text the document is expected to contain after reading RTF_INPUT. */
    private static final String EXPECTED_TEXT =
            "Gr\u00fcezi - Switzerland 0\n" +
            "\u3053\u3093\u306b\u3061\u306f - Japanese 128\n" +
            "\uc548\ub155\ud558\uc138\uc694 - Korean 129\n" +
            "\u4f60\u597d - China 134\n" +
            "\u81fa\u7063 - Traditional Chinese - Taiwan 136\n" +
            "\u03b3\u03b5\u03b9\u03b1 \u03c3\u03bf\u03c5 - Greek 161\n" +
            "A\u011fa\u00e7 - Turkish (Tree) 162\n" +
            "\u20ab - Vietnam currency 163\n" +
            "\u05e9\u05b8\u05c1\u05dc\u05d5\u05b9\u05dd - Hebrew 177\n" +
            "\u0645\u0631\u062d\u0628\u0627 - Arabic 178\n" +
            "A\u010di\u016b - Lithuanian (Thank you) 186\n" +
            "\u0417\u0434\u0440\u0430\u0432\u0441\u0442" +
            "\u0432\u0443\u0439\u0442\u0435 - Russian 204\n" +
            "\u0e2a\u0e27\u0e31\u0e2a\u0e14\u0e35 - Thailand 222\n" +
            "cze\u015b\u0107 - Polish 238\n" +
            "\n";

    /**
     * Runs the check.
     *
     * @param args ignored
     * @throws Exception if reading fails, or a RuntimeException when the
     *         extracted text differs from the expected text
     */
    public static void main(String[] args) throws Exception {
        String actual = readDocumentText(RTF_INPUT);
        if (EXPECTED_TEXT.equals(actual)) {
            return;
        }

        // Show both sides in escaped form to make the mismatch readable.
        System.err.println("Read data");
        System.err.println("=========");
        dump(actual, System.err);
        System.err.println("Expected data");
        System.err.println("=============");
        dump(EXPECTED_TEXT, System.err);
        throw new RuntimeException("Test failed");
    }

    /**
     * Loads the given RTF source into a fresh default document and returns
     * the text spanned by the document's default root element.
     */
    private static String readDocumentText(String rtf) throws Exception {
        byte[] rawBytes = rtf.getBytes(ISO_8859_1);
        InputStreamReader reader = new InputStreamReader(
                new ByteArrayInputStream(rawBytes), ISO_8859_1);

        RTFEditorKit editorKit = new RTFEditorKit();
        Document document = editorKit.createDefaultDocument();
        editorKit.read(reader, document, 0);

        Element root = document.getDefaultRootElement();
        int from = root.getStartOffset();
        int length = root.getEndOffset() - from;
        return document.getText(from, length);
    }

    /**
     * Prints the string in escaped form: a backslash is doubled, a newline
     * ends the output line, printable ASCII is printed as is, and every
     * other character is printed as a four-digit hexadecimal escape.
     */
    private static void dump(String s, PrintStream ps) {
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (c == '\n') {
                ps.println();
            } else if (c == '\\') {
                ps.print("\\\\");
            } else if (c < 0x20 || c > 0x7e) {
                ps.printf("\\u%04x", (int) c);
            } else {
                ps.print(c);
            }
        }
    }
}