From 8eb6e2ed7b8187262b58a08370926a59369346ef Mon Sep 17 00:00:00 2001 From: Simon Ser Date: Thu, 19 Dec 2019 12:58:17 +0100 Subject: [PATCH] Add support for all charsets Compared to the trimmed-down list of charsets we had before, this adds about 100KiB to the binary size. This isn't enough to justify having a separate package with all charsets. Closes: https://github.com/emersion/go-message/issues/64 --- README.md | 3 +- charset/charset.go | 71 +++++++++++++++++++++++----------------------- 2 files changed, 38 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index 4b6ecde9..14d06d1a 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,8 @@ A Go library for the Internet Message Format. It implements: ## Features * Streaming API -* Automatic encoding and charset handling +* Automatic encoding and charset handling (to decode all charsets, add + `import _ "github.com/emersion/go-message/charset"` to your application) * A [`mail`](https://godoc.org/github.com/emersion/go-message/mail) subpackage to read and write mail messages * DKIM-friendly diff --git a/charset/charset.go b/charset/charset.go index 4cd64f5d..f08775eb 100644 --- a/charset/charset.go +++ b/charset/charset.go @@ -1,4 +1,7 @@ // Package charset provides functions to decode and encode charsets. +// +// It imports all supported charsets, which adds about 1MiB to binaries size. +// Importing the package automatically sets message.CharsetReader. 
package charset import ( @@ -9,37 +12,29 @@ import ( "github.com/emersion/go-message" "golang.org/x/text/encoding" "golang.org/x/text/encoding/charmap" - "golang.org/x/text/encoding/japanese" + "golang.org/x/text/encoding/ianaindex" "golang.org/x/text/encoding/simplifiedchinese" - "golang.org/x/text/encoding/traditionalchinese" ) +// Quirks table for charsets not handled by ianaindex +// +// For aliases, see +// https://www.iana.org/assignments/character-sets/character-sets.xhtml var charsets = map[string]encoding.Encoding{ - "big5": traditionalchinese.Big5, - "euc-jp": japanese.EUCJP, - "gbk": simplifiedchinese.GBK, - "gb2312": simplifiedchinese.GBK, // as GBK is a superset of HZGB2312, so just use GBK - "gb18030": simplifiedchinese.GB18030, // GB18030 Use for parse QQ business mail message - "iso-2022-jp": japanese.ISO2022JP, - "iso-8859-1": charmap.ISO8859_1, - "iso-8859-2": charmap.ISO8859_2, - "iso-8859-3": charmap.ISO8859_3, - "iso-8859-4": charmap.ISO8859_4, - "iso-8859-9": charmap.ISO8859_9, - "iso-8859-10": charmap.ISO8859_10, - "iso-8859-13": charmap.ISO8859_13, - "iso-8859-14": charmap.ISO8859_14, - "iso-8859-15": charmap.ISO8859_15, - "iso-8859-16": charmap.ISO8859_16, - "koi8-r": charmap.KOI8R, - "shift_jis": japanese.ShiftJIS, - "windows-1250": charmap.Windows1250, - "windows-1251": charmap.Windows1251, - "windows-1252": charmap.Windows1252, - "cp1250": charmap.Windows1250, - "cp1251": charmap.Windows1251, - "cp1252": charmap.Windows1252, - "ansi_x3.110-1983": charmap.ISO8859_1, + // us-ascii not handled by ianaindex + "us-ascii": encoding.Nop, + "iso-ir-6": encoding.Nop, + "ansi_x3.4-1968": encoding.Nop, + "ansi_x3.4-1986": encoding.Nop, + "iso_646.irv:1991": encoding.Nop, + "iso646-us": encoding.Nop, + "us": encoding.Nop, + "ibm367": encoding.Nop, + "cp367": encoding.Nop, + "ascii": encoding.Nop, // non-standard + + "ansi_x3.110-1983": charmap.ISO8859_1, // see RFC 1345 page 62, mostly superset of ISO 8859-1 + "gb2312": simplifiedchinese.GBK, // GBK 
is a superset of HZGB2312 } func init() { @@ -48,16 +43,22 @@ func init() { // Reader returns an io.Reader that converts the provided charset to UTF-8. func Reader(charset string, input io.Reader) (io.Reader, error) { - charset = strings.ToLower(charset) - // QUIRK: "ascii" and "utf8" are not in the spec but are common. The - // names ANSI_X3.4-{1968,1986} are historical and recognized as aliases - if charset == "utf-8" || charset == "utf8" || charset == "us-ascii" || charset == "ascii" || strings.HasPrefix(charset, "ansi_x3.4-") { - return input, nil + var err error + enc, ok := charsets[strings.ToLower(charset)] + if !ok { + enc, err = ianaindex.MIME.Encoding(charset) + } + if err != nil { + enc, err = ianaindex.MIME.Encoding("cs" + charset) + } + if err != nil { + return nil, fmt.Errorf("charset %q: %v", charset, err) } - if enc, ok := charsets[charset]; ok { - return enc.NewDecoder().Reader(input), nil + // See https://github.com/golang/go/issues/19421 + if enc == nil { + return nil, fmt.Errorf("charset %q: unsupported charset", charset) } - return nil, fmt.Errorf("unhandled charset %q", charset) + return enc.NewDecoder().Reader(input), nil } // RegisterEncoding registers an encoding. This is intended to be called from