Skip to content

Commit

Permalink
Add support for UTF-8 encoding.
Browse files Browse the repository at this point in the history
Add comments noting details of other encodings
  • Loading branch information
joniles committed Jun 8, 2016
1 parent 4becf02 commit a2ab5fb
Showing 1 changed file with 142 additions and 0 deletions.
142 changes: 142 additions & 0 deletions RTF Parser Kit/src/com/rtfparserkit/parser/standard/Encoding.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,37 @@ class Encoding
public static final Map<String, String> LOCALEID_MAPPING = new HashMap<String, String>();
static
{
// Comment lines based on: https://msdn.microsoft.com/en-us/library/windows/desktop/dd317756(v=vs.85).aspx

// 037 IBM037 IBM EBCDIC US-Canada
// 437 IBM437 OEM United States
// 500 IBM500 IBM EBCDIC International
// 708 ASMO-708 Arabic (ASMO 708)
// 709 Arabic (ASMO-449+, BCON V4)
// 710 Arabic - Transparent Arabic
// 720 DOS-720 Arabic (Transparent ASMO); Arabic (DOS)
// 737 ibm737 OEM Greek (formerly 437G); Greek (DOS)
// 775 ibm775 OEM Baltic; Baltic (DOS)
// 850 ibm850 OEM Multilingual Latin 1; Western European (DOS)
// 852 ibm852 OEM Latin 2; Central European (DOS)
// 855 IBM855 OEM Cyrillic (primarily Russian)
// 857 ibm857 OEM Turkish; Turkish (DOS)
// 858 IBM00858 OEM Multilingual Latin 1 + Euro symbol
// 860 IBM860 OEM Portuguese; Portuguese (DOS)
// 861 ibm861 OEM Icelandic; Icelandic (DOS)
// 862 DOS-862 OEM Hebrew; Hebrew (DOS)
// 863 IBM863 OEM French Canadian; French Canadian (DOS)
// 864 IBM864 OEM Arabic; Arabic (864)
// 865 IBM865 OEM Nordic; Nordic (DOS)
// 866 cp866 OEM Russian; Cyrillic (DOS)
// 869 ibm869 OEM Modern Greek; Greek, Modern (DOS)
// 870 IBM870 IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2
// 874 windows-874 ANSI/OEM Thai (ISO 8859-11); Thai (Windows)
// 875 cp875 IBM EBCDIC Greek Modern
LOCALEID_MAPPING.put("932", "SJIS"); // Japanese
LOCALEID_MAPPING.put("936", "Cp936"); // Simplified Chinese
LOCALEID_MAPPING.put("949", "Cp949"); // Korean
// 950 big5 ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)
LOCALEID_MAPPING.put("1025", "Cp1256"); // Arabic (Saudi Arabia)
LOCALEID_MAPPING.put("1026", "Cp1251"); // Bulgarian
LOCALEID_MAPPING.put("1028", "Cp950"); // Chinese (Taiwan)
Expand All @@ -45,6 +73,7 @@ class Encoding
LOCALEID_MAPPING.put("1041", "SJIS"); // Japanese
LOCALEID_MAPPING.put("1042", "Cp949"); // Korean
LOCALEID_MAPPING.put("1045", "Cp1250"); // Polish
// 1047 IBM01047 IBM EBCDIC Latin 1/Open System
LOCALEID_MAPPING.put("1048", "Cp1250"); // Romanian
LOCALEID_MAPPING.put("1049", "Cp1251"); // Russian
LOCALEID_MAPPING.put("1050", "Cp1250"); // Croatian
Expand All @@ -68,13 +97,28 @@ class Encoding
LOCALEID_MAPPING.put("1091", "Cp1254"); // Uzbek (Latin)
LOCALEID_MAPPING.put("1092", "Cp1251"); // Tatar
LOCALEID_MAPPING.put("1104", "Cp1251"); // Mongolian (Cyrillic)
// 1140 IBM01140 IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)
// 1141 IBM01141 IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)
// 1142 IBM01142 IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)
// 1143 IBM01143 IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)
// 1144 IBM01144 IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)
// 1145 IBM01145 IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)
// 1146 IBM01146 IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)
// 1147 IBM01147 IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)
// 1148 IBM01148 IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)
// 1149 IBM01149 IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)
// 1200 utf-16 Unicode UTF-16, little endian byte order (BMP of ISO 10646)
// 1201 unicodeFFFE Unicode UTF-16, big endian byte order
LOCALEID_MAPPING.put("1250", "Cp1250"); // Windows Latin 2 (Central Europe)
LOCALEID_MAPPING.put("1251", "Cp1251"); // Cyrillic
LOCALEID_MAPPING.put("1252", "Cp1252"); // Latin
LOCALEID_MAPPING.put("1253", "Cp1253"); // Greek
LOCALEID_MAPPING.put("1254", "Cp1254"); // Turkish
LOCALEID_MAPPING.put("1255", "Cp1255"); // Windows Hebrew
LOCALEID_MAPPING.put("1256", "Cp1256"); // Arabic (Iraq)
LOCALEID_MAPPING.put("1257", "Cp1257"); // Baltic
LOCALEID_MAPPING.put("1258", "Cp1258"); // Vietnamese
// 1361 Johab Korean (Johab)
LOCALEID_MAPPING.put("2049", "Cp1256"); // Arabic (Iraq)
LOCALEID_MAPPING.put("2052", "MS936"); // Chinese (PRC)
LOCALEID_MAPPING.put("2074", "Cp1250"); // Serbian (Latin)
Expand All @@ -92,12 +136,110 @@ class Encoding
LOCALEID_MAPPING.put("8193", "Cp1256"); // Arabic (Oman)
LOCALEID_MAPPING.put("9217", "Cp1256"); // Arabic (Yemen)
LOCALEID_MAPPING.put("10000", "MacRoman"); // Mac Roman
// 10001 x-mac-japanese Japanese (Mac)
// 10002 x-mac-chinesetrad MAC Traditional Chinese (Big5); Chinese Traditional (Mac)
// 10003 x-mac-korean Korean (Mac)
// 10004 x-mac-arabic Arabic (Mac)
// 10005 x-mac-hebrew Hebrew (Mac)
// 10006 x-mac-greek Greek (Mac)
// 10007 x-mac-cyrillic Cyrillic (Mac)
// 10008 x-mac-chinesesimp MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac)
// 10010 x-mac-romanian Romanian (Mac)
// 10017 x-mac-ukrainian Ukrainian (Mac)
// 10021 x-mac-thai Thai (Mac)
// 10029 x-mac-ce MAC Latin 2; Central European (Mac)
// 10079 x-mac-icelandic Icelandic (Mac)
// 10081 x-mac-turkish Turkish (Mac)
// 10082 x-mac-croatian Croatian (Mac)
LOCALEID_MAPPING.put("10241", "Cp1256"); // Arabic (Syria)
LOCALEID_MAPPING.put("11265", "Cp1256"); // Arabic (Jordan)
// 12000 utf-32 Unicode UTF-32, little endian byte order
// 12001 utf-32BE Unicode UTF-32, big endian byte order
LOCALEID_MAPPING.put("12289", "Cp1256"); // Arabic (Lebanon)
LOCALEID_MAPPING.put("13313", "Cp1256"); // Arabic (Kuwait)
LOCALEID_MAPPING.put("14337", "Cp1256"); // Arabic (U.A.E.)
LOCALEID_MAPPING.put("15361", "Cp1256"); // Arabic (Bahrain)
LOCALEID_MAPPING.put("16385", "Cp1256"); // Arabic (Qatar)
// 20000 x-Chinese_CNS CNS Taiwan; Chinese Traditional (CNS)
// 20001 x-cp20001 TCA Taiwan
// 20002 x_Chinese-Eten Eten Taiwan; Chinese Traditional (Eten)
// 20003 x-cp20003 IBM5550 Taiwan
// 20004 x-cp20004 TeleText Taiwan
// 20005 x-cp20005 Wang Taiwan
// 20105 x-IA5 IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5)
// 20106 x-IA5-German IA5 German (7-bit)
// 20107 x-IA5-Swedish IA5 Swedish (7-bit)
// 20108 x-IA5-Norwegian IA5 Norwegian (7-bit)
// 20127 us-ascii US-ASCII (7-bit)
// 20261 x-cp20261 T.61
// 20269 x-cp20269 ISO 6937 Non-Spacing Accent
// 20273 IBM273 IBM EBCDIC Germany
// 20277 IBM277 IBM EBCDIC Denmark-Norway
// 20278 IBM278 IBM EBCDIC Finland-Sweden
// 20280 IBM280 IBM EBCDIC Italy
// 20284 IBM284 IBM EBCDIC Latin America-Spain
// 20285 IBM285 IBM EBCDIC United Kingdom
// 20290 IBM290 IBM EBCDIC Japanese Katakana Extended
// 20297 IBM297 IBM EBCDIC France
// 20420 IBM420 IBM EBCDIC Arabic
// 20423 IBM423 IBM EBCDIC Greek
// 20424 IBM424 IBM EBCDIC Hebrew
// 20833 x-EBCDIC-KoreanExtended IBM EBCDIC Korean Extended
// 20838 IBM-Thai IBM EBCDIC Thai
// 20866 koi8-r Russian (KOI8-R); Cyrillic (KOI8-R)
// 20871 IBM871 IBM EBCDIC Icelandic
// 20880 IBM880 IBM EBCDIC Cyrillic Russian
// 20905 IBM905 IBM EBCDIC Turkish
// 20924 IBM00924 IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)
// 20932 EUC-JP Japanese (JIS 0208-1990 and 0212-1990)
// 20936 x-cp20936 Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)
// 20949 x-cp20949 Korean Wansung
// 21025 cp1025 IBM EBCDIC Cyrillic Serbian-Bulgarian
// 21027 (deprecated)
// 21866 koi8-u Ukrainian (KOI8-U); Cyrillic (KOI8-U)
// 28591 iso-8859-1 ISO 8859-1 Latin 1; Western European (ISO)
// 28592 iso-8859-2 ISO 8859-2 Central European; Central European (ISO)
// 28593 iso-8859-3 ISO 8859-3 Latin 3
// 28594 iso-8859-4 ISO 8859-4 Baltic
// 28595 iso-8859-5 ISO 8859-5 Cyrillic
// 28596 iso-8859-6 ISO 8859-6 Arabic
// 28597 iso-8859-7 ISO 8859-7 Greek
// 28598 iso-8859-8 ISO 8859-8 Hebrew; Hebrew (ISO-Visual)
// 28599 iso-8859-9 ISO 8859-9 Turkish
// 28603 iso-8859-13 ISO 8859-13 Estonian
// 28605 iso-8859-15 ISO 8859-15 Latin 9
// 29001 x-Europa Europa 3
// 38598 iso-8859-8-i ISO 8859-8 Hebrew; Hebrew (ISO-Logical)
// 50220 iso-2022-jp ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)
// 50221 csISO2022JP ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)
// 50222 iso-2022-jp ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI)
// 50225 iso-2022-kr ISO 2022 Korean
// 50227 x-cp50227 ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022)
// 50229 ISO 2022 Traditional Chinese
// 50930 EBCDIC Japanese (Katakana) Extended
// 50931 EBCDIC US-Canada and Japanese
// 50933 EBCDIC Korean Extended and Korean
// 50935 EBCDIC Simplified Chinese Extended and Simplified Chinese
// 50936 EBCDIC Simplified Chinese
// 50937 EBCDIC US-Canada and Traditional Chinese
// 50939 EBCDIC Japanese (Latin) Extended and Japanese
// 51932 euc-jp EUC Japanese
// 51936 EUC-CN EUC Simplified Chinese; Chinese Simplified (EUC)
// 51949 euc-kr EUC Korean
// 51950 EUC Traditional Chinese
// 52936 hz-gb-2312 HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)
// 54936 GB18030 Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)
// 57002 x-iscii-de ISCII Devanagari
// 57003 x-iscii-be ISCII Bangla
// 57004 x-iscii-ta ISCII Tamil
// 57005 x-iscii-te ISCII Telugu
// 57006 x-iscii-as ISCII Assamese
// 57007 x-iscii-or ISCII Odia
// 57008 x-iscii-ka ISCII Kannada
// 57009 x-iscii-ma ISCII Malayalam
// 57010 x-iscii-gu ISCII Gujarati
// 57011 x-iscii-pa ISCII Punjabi
LOCALEID_MAPPING.put("65000", null); // UTF-7 - not a supported Java encoding, see: http://stackoverflow.com/questions/19861987/java-io-unsupportedencodingexception-unicode-1-1-utf-7
LOCALEID_MAPPING.put("65001", "UTF-8"); // UTF-8
}
}

0 comments on commit a2ab5fb

Please sign in to comment.