6928542: Chinese characters in RTF are not decoded
Reviewed-by: prr, psadhukhan
This commit is contained in:
parent
4e8c0364a2
commit
a26f7c03c7
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -26,6 +26,10 @@ package javax.swing.text.rtf;
|
||||
|
||||
import java.io.*;
|
||||
import java.lang.*;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.CharBuffer;
|
||||
import java.nio.charset.CharsetDecoder;
|
||||
import java.nio.charset.CoderResult;
|
||||
|
||||
/**
|
||||
* <b>RTFParser</b> is a subclass of <b>AbstractFilter</b> which understands basic RTF syntax
|
||||
@ -69,6 +73,11 @@ abstract class RTFParser extends AbstractFilter
|
||||
|
||||
private final int S_inblob = 6; // in a \bin blob
|
||||
|
||||
// For fcharset control word
|
||||
protected CharsetDecoder decoder = null;
|
||||
private byte[] ba = new byte[2];
|
||||
protected ByteBuffer decoderBB = ByteBuffer.wrap(ba);
|
||||
|
||||
/** Implemented by subclasses to interpret a parameter-less RTF keyword.
|
||||
* The keyword is passed without the leading '\' or any delimiting
|
||||
* whitespace. */
|
||||
@ -100,6 +109,9 @@ abstract class RTFParser extends AbstractFilter
|
||||
rtfSpecialsTable['\\'] = true;
|
||||
}
|
||||
|
||||
// Defined for replacement character
|
||||
static final char REPLACEMENT_CHAR = '\uFFFD';
|
||||
|
||||
public RTFParser()
|
||||
{
|
||||
currentCharacters = new StringBuffer();
|
||||
@ -109,6 +121,9 @@ abstract class RTFParser extends AbstractFilter
|
||||
//warnings = System.out;
|
||||
|
||||
specialsTable = rtfSpecialsTable;
|
||||
// Initialize byte buffer for CharsetDecoder
|
||||
decoderBB.clear();
|
||||
decoderBB.limit(1);
|
||||
}
|
||||
|
||||
// TODO: Handle wrapup at end of file correctly.
|
||||
@ -182,6 +197,9 @@ abstract class RTFParser extends AbstractFilter
|
||||
}
|
||||
state = S_backslashed;
|
||||
} else {
|
||||
// SBCS: ASCII character
|
||||
// DBCS: Non lead byte
|
||||
ch = decode(ch);
|
||||
currentCharacters.append(ch);
|
||||
}
|
||||
break;
|
||||
@ -301,7 +319,9 @@ abstract class RTFParser extends AbstractFilter
|
||||
if (Character.digit(ch, 16) != -1)
|
||||
{
|
||||
pendingCharacter = pendingCharacter * 16 + Character.digit(ch, 16);
|
||||
ch = translationTable[pendingCharacter];
|
||||
// Use translationTable if decoder is not defined
|
||||
ch = decoder == null ? translationTable[pendingCharacter]
|
||||
: decode((char)pendingCharacter);
|
||||
if (ch != 0)
|
||||
handleText(ch);
|
||||
}
|
||||
@ -360,4 +380,37 @@ abstract class RTFParser extends AbstractFilter
|
||||
super.close();
|
||||
}
|
||||
|
||||
// For fcharset control word
|
||||
private char[] ca = new char[1];
|
||||
private CharBuffer decoderCB = CharBuffer.wrap(ca);
|
||||
|
||||
private char decode(char ch) {
|
||||
if (decoder == null) return ch;
|
||||
decoderBB.put((byte) ch);
|
||||
decoderBB.rewind();
|
||||
decoderCB.clear();
|
||||
CoderResult cr = decoder.decode(decoderBB, decoderCB, false);
|
||||
if (cr.isUnderflow()) {
|
||||
if (decoderCB.position() == 1) {
|
||||
// Converted to Unicode (including replacement character)
|
||||
decoder.reset();
|
||||
decoderBB.clear();
|
||||
decoderBB.limit(1);
|
||||
return ca[0];
|
||||
} else {
|
||||
// Detected lead byte
|
||||
decoder.reset();
|
||||
decoderBB.limit(2);
|
||||
decoderBB.position(1);
|
||||
return 0; // Skip write operation if return value is 0
|
||||
}
|
||||
} else {
|
||||
// Fallback, should not be called
|
||||
decoder.reset();
|
||||
decoderBB.clear();
|
||||
decoderBB.limit(1);
|
||||
return REPLACEMENT_CHAR;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 1997, 2021, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
@ -32,6 +32,11 @@ import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.OutputStream;
|
||||
import java.io.StreamTokenizer;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.CharBuffer;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.CharsetDecoder;
|
||||
import java.nio.charset.CodingErrorAction;
|
||||
import java.security.AccessController;
|
||||
import java.security.PrivilegedAction;
|
||||
import java.util.Dictionary;
|
||||
@ -87,6 +92,10 @@ class RTFReader extends RTFParser
|
||||
|
||||
/** This Dictionary maps Integer font numbers to String font names. */
|
||||
Dictionary<Integer, String> fontTable;
|
||||
/** This Dictionary maps Integer font numbers to Charset font charset. */
|
||||
Dictionary<Integer, Charset> fcharsetTable;
|
||||
/** This Dictionary maps String font charset to String code page. */
|
||||
static Dictionary<String, String> fcharsetToCP = null;
|
||||
/** This array maps color indices to Color objects. */
|
||||
Color[] colorTable;
|
||||
/** This Map maps character style numbers to Style objects. */
|
||||
@ -133,6 +142,7 @@ class RTFReader extends RTFParser
|
||||
textKeywords.put("emspace", "\u2003");
|
||||
textKeywords.put("endash", "\u2013");
|
||||
textKeywords.put("enspace", "\u2002");
|
||||
textKeywords.put("line", "\n");
|
||||
textKeywords.put("ldblquote", "\u201C");
|
||||
textKeywords.put("lquote", "\u2018");
|
||||
textKeywords.put("ltrmark", "\u200E");
|
||||
@ -159,7 +169,50 @@ class RTFReader extends RTFParser
|
||||
defineCharacterSet("ansicpg", latin1TranslationTable);
|
||||
}
|
||||
|
||||
/* TODO: per-font font encodings ( \fcharset control word ) ? */
|
||||
/**
|
||||
* Windows font charset identifiers, i.e. the numeric parameter values of
* the RTF \fcharset control word.
|
||||
*/
|
||||
private static final int ANSI_CHARSET = 0;
|
||||
private static final int DEFAULT_CHARSET = 1;
|
||||
private static final int SYMBOL_CHARSET = 2;
|
||||
private static final int MAC_CHARSET = 77;
|
||||
private static final int SHIFTJIS_CHARSET = 128;
|
||||
private static final int HANGUL_CHARSET = 129;
|
||||
private static final int JOHAB_CHARSET = 130;
|
||||
private static final int GB2312_CHARSET = 134;
|
||||
private static final int CHINESEBIG5_CHARSET = 136;
|
||||
private static final int GREEK_CHARSET = 161;
|
||||
private static final int TURKISH_CHARSET = 162;
|
||||
private static final int VIETNAMESE_CHARSET = 163;
|
||||
private static final int HEBREW_CHARSET = 177;
|
||||
private static final int ARABIC_CHARSET = 178;
|
||||
private static final int BALTIC_CHARSET = 186;
|
||||
private static final int RUSSIAN_CHARSET = 204;
|
||||
private static final int THAI_CHARSET = 222;
|
||||
private static final int EASTEUROPE_CHARSET = 238;
|
||||
private static final int OEM_CHARSET = 255;
|
||||
|
||||
static {
|
||||
fcharsetToCP = new Hashtable<String, String>();
|
||||
fcharsetToCP.put("fcharset" + ANSI_CHARSET, "windows-1252");
|
||||
fcharsetToCP.put("fcharset" + SHIFTJIS_CHARSET, "ms932");
|
||||
fcharsetToCP.put("fcharset" + HANGUL_CHARSET, "ms949");
|
||||
fcharsetToCP.put("fcharset" + JOHAB_CHARSET, "ms1361");
|
||||
fcharsetToCP.put("fcharset" + GB2312_CHARSET, "ms936");
|
||||
fcharsetToCP.put("fcharset" + CHINESEBIG5_CHARSET, "ms950");
|
||||
fcharsetToCP.put("fcharset" + GREEK_CHARSET, "windows-1253");
|
||||
fcharsetToCP.put("fcharset" + TURKISH_CHARSET, "windows-1254");
|
||||
fcharsetToCP.put("fcharset" + VIETNAMESE_CHARSET, "windows-1258");
|
||||
fcharsetToCP.put("fcharset" + HEBREW_CHARSET, "windows-1255");
|
||||
fcharsetToCP.put("fcharset" + ARABIC_CHARSET, "windows-1256");
|
||||
fcharsetToCP.put("fcharset" + BALTIC_CHARSET, "windows-1257");
|
||||
fcharsetToCP.put("fcharset" + RUSSIAN_CHARSET, "windows-1251");
|
||||
fcharsetToCP.put("fcharset" + THAI_CHARSET, "ms874");
|
||||
fcharsetToCP.put("fcharset" + EASTEUROPE_CHARSET, "windows-1250");
|
||||
}
|
||||
|
||||
// Defined for replacement character
|
||||
private static final String REPLACEMENT_CHAR = "\uFFFD";
|
||||
|
||||
/**
|
||||
* Creates a new RTFReader instance. Text will be sent to
|
||||
@ -174,6 +227,7 @@ public RTFReader(StyledDocument destination)
|
||||
target = destination;
|
||||
parserState = new Hashtable<Object, Object>();
|
||||
fontTable = new Hashtable<Integer, String>();
|
||||
fcharsetTable = new Hashtable<Integer, Charset>();
|
||||
|
||||
rtfversion = -1;
|
||||
|
||||
@ -762,6 +816,25 @@ class FonttblDestination implements Destination
|
||||
nextFontNumber = parameter;
|
||||
return true;
|
||||
}
|
||||
// For fcharset control word
|
||||
if (keyword.equals("fcharset")) {
|
||||
String fcharset = keyword+parameter;
|
||||
String csName = fcharsetToCP.get(fcharset);
|
||||
Charset cs;
|
||||
if (csName != null) {
|
||||
try {
|
||||
cs = Charset.forName(csName);
|
||||
} catch (IllegalArgumentException iae) {
|
||||
// Fallback, should not be called
|
||||
cs = ISO_8859_1;
|
||||
}
|
||||
} else {
|
||||
// Fallback, fcharset control word number is not defined
|
||||
cs = ISO_8859_1;
|
||||
}
|
||||
fcharsetTable.put(nextFontNumber, cs);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
@ -1216,6 +1289,25 @@ abstract class AttributeTrackingDestination implements Destination
|
||||
|
||||
if (keyword.equals("f")) {
|
||||
parserState.put(keyword, Integer.valueOf(parameter));
|
||||
|
||||
// Check lead byte is stored or not
|
||||
if (decoderBB.position() == 1) {
|
||||
handleText(REPLACEMENT_CHAR);
|
||||
}
|
||||
// Reset decoder byte buffer
|
||||
decoderBB.clear();
|
||||
decoderBB.limit(1);
|
||||
// Check fcharset is used or not
|
||||
Charset cs = fcharsetTable.get(parameter);
|
||||
if (cs != null) {
|
||||
decoder = cs.newDecoder();
|
||||
decoder.onMalformedInput(CodingErrorAction.REPLACE)
|
||||
.onUnmappableCharacter(CodingErrorAction.REPLACE);
|
||||
} else {
|
||||
// fcharset is not used, use translationTable
|
||||
decoder = null;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
if (keyword.equals("cf")) {
|
||||
@ -1610,6 +1702,12 @@ abstract class TextHandlingDestination
|
||||
|
||||
if (keyword.equals("par")) {
|
||||
// warnings.println("Ending paragraph.");
|
||||
// Check lead byte is stored or not
|
||||
if (decoderBB.position() == 1) {
|
||||
handleText(REPLACEMENT_CHAR);
|
||||
decoderBB.clear();
|
||||
decoderBB.limit(1);
|
||||
}
|
||||
endParagraph();
|
||||
return true;
|
||||
}
|
||||
|
131
test/jdk/javax/swing/text/rtf/RTFReadFontCharsetTest.java
Normal file
131
test/jdk/javax/swing/text/rtf/RTFReadFontCharsetTest.java
Normal file
@ -0,0 +1,131 @@
|
||||
/*
|
||||
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
* @test
|
||||
* @bug 6928542
|
||||
* @summary Verify RTFEditorKit.read() with fcharset
|
||||
*/
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.PrintStream;
|
||||
import javax.swing.text.Document;
|
||||
import javax.swing.text.Element;
|
||||
import javax.swing.text.rtf.RTFEditorKit;
|
||||
|
||||
import static java.nio.charset.StandardCharsets.ISO_8859_1;
|
||||
|
||||
/**
 * Regression test for reading RTF whose font table assigns a different
 * fcharset to each font. The document is parsed with {@code RTFEditorKit}
 * and the resulting plain text is compared with the expected Unicode text.
 */
public class RTFReadFontCharsetTest {

    /** RTF input: one line of text per font/charset, bytes given as hex escapes. */
    private static final String RTF_INPUT =
            "{\\rtf1\\fbidis\\ansi\\ansicpg932\\deff0\\nouicomp" +
            "at\\deflang1033\\deflangfe1041{\\fonttbl{\\f0\\fni" +
            "l\\fcharset0 Segoe UI;}{\\f1\\fnil\\fcharset128 Yu" +
            " Gothic UI;}{\\f2\\fswiss\\fprq2\\fcharset129 Malg" +
            "un Gothic;}{\\f3\\fnil\\fcharset134 Microsoft YaHe" +
            "i;}{\\f4\\fnil\\fcharset136 Microsoft JhengHei;}{\\" +
            "f5\\fnil\\fcharset161 Segoe UI;}{\\f6\\fnil\\fcha" +
            "rset162 Segoe UI;}{\\f7\\fnil\\fcharset163 Segoe U" +
            "I;}{\\f8\\fnil\\fcharset177 Segoe UI;}{\\f9\\fnil\\" +
            "fcharset178 Segoe UI;}{\\f10\\fnil\\fcharset186 S" +
            "egoe UI;}{\\f11\\fnil\\fcharset204 Segoe UI;}{\\f1" +
            "2\\fnil\\fcharset222 Leelawadee UI;}{\\f13\\fnil\\" +
            "fcharset0 Leelawadee UI;}{\\f14\\fnil\\fcharset238" +
            " Segoe UI;}}\r\n{\\*\\generator Riched20 10.0.1904" +
            "1}\\viewkind4\\uc1 \r\n\\pard\\ltrpar\\nowidctlpar" +
            "\\sa200\\sl276\\slmult1\\f0\\fs22\\lang1041 Gr\\'f" +
            "cezi - Switzerland 0\\line\\f1\\'82\\'b1\\'82\\'f" +
            "1\\'82\\'c9\\'82\\'bf\\'82\\'cd - Japanese 128\\li" +
            "ne\\f2\\lang17\\'be\\'c8\\'b3\\'e7\\'c7\\'cf\\'bc\\" +
            "'bc\\'bf\\'e4\\lang1041 - Korean 129\\line\\kern" +
            "ing2\\f3\\lang1033\\'c4\\'e3\\'ba\\'c3 - China 134" +
            "\\line\\f4\\'bb\\'4f\\'c6\\'57 - Traditional Chine" +
            "se - Taiwan 136\\line\\kerning0\\f5\\lang17\\'e3\\" +
            "'e5\\'e9\\'e1 \\'f3\\'ef\\'f5 - Greek\\f0\\lang104" +
            "1 161\\line\\f6\\lang17 A\\'f0a\\'e7 - \\f0 Turki" +
            "sh (Tree) 162\\line\\f7\\'fe\\f0\\lang1041 \\lang" +
            "1033 - \\lang17 Vietnam currency\\lang1041 163\\l" +
            "ine\\f8\\rtlch\\lang17\\'f9\\'c8\\'d1\\'ec\\'e5\\'" +
            "c9\\'ed\\f0\\ltrch - Hebrew 177\\line\\f9\\rtlch\\" +
            "lang1025\\'e3\\'d1\\'cd\\'c8\\'c7\\f0\\ltrch\\lan" +
            "g17 - Arabic 178\\line\\kerning2\\f10\\lang1033 A" +
            "\\'e8i\\'fb - Lithuanian (Thank you) 186\\kerning0" +
            "\\f0\\lang1041\\line\\kerning2\\f11\\lang1049\\'c7" +
            "\\'e4\\'f0\\'e0\\'e2\\'f1\\'f2\\'e2\\'f3\\'e9\\'f2" +
            "\\'e5\\f0\\lang1033 - Russian 204\\line\\kerning0" +
            "\\f12\\lang1054\\'ca\\'c7\\'d1\\'ca\\'b4\\'d5 \\f1" +
            "3\\lang1033 - Thailand 222\\line\\kerning2\\f14 cz" +
            "e\\'9c\\'e6 - Polish 238\\par\r\n}\r\n\u0000";

    /** Text the document must contain once every charset is decoded. */
    private static final String EXPECTED_TEXT =
            "Gr\u00fcezi - Switzerland 0\n" +
            "\u3053\u3093\u306b\u3061\u306f - Japanese 128\n" +
            "\uc548\ub155\ud558\uc138\uc694 - Korean 129\n" +
            "\u4f60\u597d - China 134\n" +
            "\u81fa\u7063 - Traditional Chinese - Taiwan 136\n" +
            "\u03b3\u03b5\u03b9\u03b1 \u03c3\u03bf\u03c5 - Greek 161\n" +
            "A\u011fa\u00e7 - Turkish (Tree) 162\n" +
            "\u20ab - Vietnam currency 163\n" +
            "\u05e9\u05b8\u05c1\u05dc\u05d5\u05b9\u05dd - Hebrew 177\n" +
            "\u0645\u0631\u062d\u0628\u0627 - Arabic 178\n" +
            "A\u010di\u016b - Lithuanian (Thank you) 186\n" +
            "\u0417\u0434\u0440\u0430\u0432\u0441\u0442" +
            "\u0432\u0443\u0439\u0442\u0435 - Russian 204\n" +
            "\u0e2a\u0e27\u0e31\u0e2a\u0e14\u0e35 - Thailand 222\n" +
            "cze\u015b\u0107 - Polish 238\n" +
            "\n";

    /**
     * Parses the RTF fixture and fails with a {@code RuntimeException}
     * when the document text differs from the expected text; both texts
     * are dumped to standard error first.
     *
     * @param args unused
     * @throws Exception if reading the RTF or querying the document fails
     */
    public static void main(String[] args) throws Exception {
        // The fixture is pure 8-bit data, so ISO-8859-1 round-trips it
        // into the byte values the RTF reader is meant to see.
        InputStreamReader reader = new InputStreamReader(
                new ByteArrayInputStream(RTF_INPUT.getBytes(ISO_8859_1)),
                ISO_8859_1);

        RTFEditorKit editorKit = new RTFEditorKit();
        Document document = editorKit.createDefaultDocument();
        editorKit.read(reader, document, 0);

        Element root = document.getDefaultRootElement();
        int from = root.getStartOffset();
        int to = root.getEndOffset();
        String actualText = document.getText(from, to - from);

        if (EXPECTED_TEXT.equals(actualText)) {
            return;
        }

        System.err.println("Read data");
        System.err.println("=========");
        dump(actualText, System.err);
        System.err.println("Expected data");
        System.err.println("=============");
        dump(EXPECTED_TEXT, System.err);
        throw new RuntimeException("Test failed");
    }

    /**
     * Writes {@code s} to {@code ps} in an ASCII-only form: a newline ends
     * the current output line, a backslash is doubled, printable ASCII
     * (0x20 to 0x7e) is written as is, and every other character is
     * written as a backslash, the letter u and four lowercase hex digits.
     *
     * @param s  text to render
     * @param ps destination stream
     */
    private static void dump(String s, PrintStream ps) {
        final int length = s.length();
        for (int i = 0; i < length; i++) {
            final char c = s.charAt(i);
            if (c == '\n') {
                ps.println();
            } else if (c == '\\') {
                ps.print("\\\\");
            } else if (c < 0x20 || c > 0x7e) {
                ps.printf("\\u%04x", (int) c);
            } else {
                ps.print(c);
            }
        }
    }

}
|
Loading…
Reference in New Issue
Block a user