Skip to content

Commit

Permalink
Add support for all charsets
Browse files Browse the repository at this point in the history
Compared to the trimmed-down list of charsets we had before, this adds
about 100KiB to the binary size. This isn't enough to justify having a
separate package with all charsets.

Closes: #64
  • Loading branch information
emersion committed Dec 19, 2019
1 parent 7e4a2fb commit 8eb6e2e
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 36 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ A Go library for the Internet Message Format. It implements:
## Features

* Streaming API
* Automatic encoding and charset handling
* Automatic encoding and charset handling (to decode all charsets, add
`import _ "github.com/emersion/go-message/charset"` to your application)
* A [`mail`](https://godoc.org/github.com/emersion/go-message/mail) subpackage
to read and write mail messages
* DKIM-friendly
Expand Down
71 changes: 36 additions & 35 deletions charset/charset.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
// Package charset provides functions to decode and encode charsets.
//
// It imports all supported charsets, which adds about 1MiB to the binary size.
// Importing the package automatically sets message.CharsetReader.
package charset

import (
Expand All @@ -9,37 +12,29 @@ import (
"github.com/emersion/go-message"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/charmap"
"golang.org/x/text/encoding/japanese"
"golang.org/x/text/encoding/ianaindex"
"golang.org/x/text/encoding/simplifiedchinese"
"golang.org/x/text/encoding/traditionalchinese"
)

// Quirks table for charsets not handled by ianaindex
//
// For aliases, see
// https://www.iana.org/assignments/character-sets/character-sets.xhtml
var charsets = map[string]encoding.Encoding{
"big5": traditionalchinese.Big5,
"euc-jp": japanese.EUCJP,
"gbk": simplifiedchinese.GBK,
"gb2312": simplifiedchinese.GBK, // as GBK is a superset of HZGB2312, so just use GBK
"gb18030": simplifiedchinese.GB18030, // GB18030 Use for parse QQ business mail message
"iso-2022-jp": japanese.ISO2022JP,
"iso-8859-1": charmap.ISO8859_1,
"iso-8859-2": charmap.ISO8859_2,
"iso-8859-3": charmap.ISO8859_3,
"iso-8859-4": charmap.ISO8859_4,
"iso-8859-9": charmap.ISO8859_9,
"iso-8859-10": charmap.ISO8859_10,
"iso-8859-13": charmap.ISO8859_13,
"iso-8859-14": charmap.ISO8859_14,
"iso-8859-15": charmap.ISO8859_15,
"iso-8859-16": charmap.ISO8859_16,
"koi8-r": charmap.KOI8R,
"shift_jis": japanese.ShiftJIS,
"windows-1250": charmap.Windows1250,
"windows-1251": charmap.Windows1251,
"windows-1252": charmap.Windows1252,
"cp1250": charmap.Windows1250,
"cp1251": charmap.Windows1251,
"cp1252": charmap.Windows1252,
"ansi_x3.110-1983": charmap.ISO8859_1,
// us-ascii not handled by ianaindex
"us-ascii": encoding.Nop,
"iso-ir-6": encoding.Nop,
"ansi_x3.4-1968": encoding.Nop,
"ansi_x3.4-1986": encoding.Nop,
"iso_646.irv:1991": encoding.Nop,
"iso646-us": encoding.Nop,
"us": encoding.Nop,
"ibm367": encoding.Nop,
"cp367": encoding.Nop,
"ascii": encoding.Nop, // non-standard

"ansi_x3.110-1983": charmap.ISO8859_1, // see RFC 1345 page 62, mostly superset of ISO 8859-1
"gb2312": simplifiedchinese.GBK, // GBK is a superset of HZGB2312
}

func init() {
Expand All @@ -48,16 +43,22 @@ func init() {

// Reader returns an io.Reader that converts the provided charset to UTF-8.
func Reader(charset string, input io.Reader) (io.Reader, error) {
charset = strings.ToLower(charset)
// QUIRK: "ascii" and "utf8" are not in the spec but are common. The
// names ANSI_X3.4-{1968,1986} are historical and recognized as aliases
if charset == "utf-8" || charset == "utf8" || charset == "us-ascii" || charset == "ascii" || strings.HasPrefix(charset, "ansi_x3.4-") {
return input, nil
var err error
enc, ok := charsets[strings.ToLower(charset)]
if !ok {
enc, err = ianaindex.MIME.Encoding(charset)
}
if err != nil {
enc, err = ianaindex.MIME.Encoding("cs" + charset)
}
if err != nil {
return nil, fmt.Errorf("charset %q: %v", charset, err)
}
if enc, ok := charsets[charset]; ok {
return enc.NewDecoder().Reader(input), nil
// See https://github.com/golang/go/issues/19421
if enc == nil {
return nil, fmt.Errorf("charset %q: unsupported charset", charset)
}
return nil, fmt.Errorf("unhandled charset %q", charset)
return enc.NewDecoder().Reader(input), nil
}

// RegisterEncoding registers an encoding. This is intended to be called from
Expand Down

0 comments on commit 8eb6e2e

Please sign in to comment.