From 4253cdaa5267687e3b7612da7ff105d4414174da Mon Sep 17 00:00:00 2001
From: Sean Cunningham <scunningham@endgame.com>
Date: Wed, 4 Mar 2020 17:11:57 -0500
Subject: [PATCH] JSON encoder may produce invalid utf-8 when provided invalid
 utf-8 message pack string. Fix encoder and add unit test which validates
 expectation.

---
 msgp/json.go      | 62 +++++++++++++++++++++++++++++++++--------------
 msgp/json_test.go | 28 +++++++++++++++++++++
 2 files changed, 72 insertions(+), 18 deletions(-)

diff --git a/msgp/json.go b/msgp/json.go
index 4325860a..77601e52 100644
--- a/msgp/json.go
+++ b/msgp/json.go
@@ -466,7 +466,23 @@ func rwquoted(dst jsWriter, s []byte) (n int, err error) {
 					return
 				}
 				n++
+			case '\t':
+				err = dst.WriteByte('\\')
+				if err != nil {
+					return
+				}
+				n++
+				err = dst.WriteByte('t')
+				if err != nil {
+					return
+				}
+				n++
 			default:
+				// This encodes bytes < 0x20 except for \t, \n and \r.
+				// It also escapes <, >, and &
+				// because they can lead to security holes when
+				// user-controlled strings are rendered into JSON
+				// and served to some browsers.
 				nn, err = dst.WriteString(`\u00`)
 				n += nn
 				if err != nil {
@@ -495,16 +511,23 @@ func rwquoted(dst jsWriter, s []byte) (n int, err error) {
 				if err != nil {
 					return
 				}
-				nn, err = dst.WriteString(`\ufffd`)
-				n += nn
-				if err != nil {
-					return
-				}
-				i += size
-				start = i
-				continue
 			}
+			nn, err = dst.WriteString(`\ufffd`)
+			n += nn
+			if err != nil {
+				return
+			}
+			i += size
+			start = i
+			continue
 		}
+		// U+2028 is LINE SEPARATOR.
+		// U+2029 is PARAGRAPH SEPARATOR.
+		// They are both technically valid characters in JSON strings,
+		// but don't work in JSONP, which has to be evaluated as JavaScript,
+		// and can lead to security holes there. It is valid JSON to
+		// escape them, so we do so unconditionally.
+		// See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion.
 		if c == '\u2028' || c == '\u2029' {
 			if start < i {
 				nn, err = dst.Write(s[start:i])
@@ -512,17 +535,20 @@ func rwquoted(dst jsWriter, s []byte) (n int, err error) {
 				if err != nil {
 					return
 				}
-				nn, err = dst.WriteString(`\u202`)
-				n += nn
-				if err != nil {
-					return
-				}
-				err = dst.WriteByte(hex[c&0xF])
-				if err != nil {
-					return
-				}
-				n++
 			}
+			nn, err = dst.WriteString(`\u202`)
+			n += nn
+			if err != nil {
+				return
+			}
+			err = dst.WriteByte(hex[c&0xF])
+			if err != nil {
+				return
+			}
+			n++
+			i += size
+			start = i
+			continue
 		}
 		i += size
 	}
diff --git a/msgp/json_test.go b/msgp/json_test.go
index 439d4790..3ca01e8c 100644
--- a/msgp/json_test.go
+++ b/msgp/json_test.go
@@ -5,6 +5,7 @@ import (
 	"encoding/json"
 	"reflect"
 	"testing"
+	"unicode/utf8"
 )
 
 func TestCopyJSON(t *testing.T) {
@@ -69,6 +70,33 @@ func TestCopyJSON(t *testing.T) {
 	}
 }
 
+// Encoder should generate valid utf-8 even if passed bad input
+func TestCopyJSONNegativeUTF8(t *testing.T) {
+	// Single string with non-compliant utf-8 byte
+	stringWithBadUTF8 := []byte{
+		0xa1, 0xe0,
+	}
+
+	src := bytes.NewBuffer(stringWithBadUTF8)
+
+	var js bytes.Buffer
+	_, err := CopyToJSON(&js, src)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Even though we provided bad input, should have escaped the naughty character
+	if !utf8.Valid(js.Bytes()) {
+		t.Errorf("Expected JSON to be valid utf-8 even when provided bad input")
+	}
+
+	// Expect a bad character string
+	expected := `"\ufffd"`
+	if js.String() != expected {
+		t.Errorf("Expected: '%s', got: '%s'", expected, js.String())
+	}
+}
+
 func BenchmarkCopyToJSON(b *testing.B) {
 	var buf bytes.Buffer
 	enc := NewWriter(&buf)