diff --git a/README.md b/README.md index 52be4d5..a69da1b 100644 --- a/README.md +++ b/README.md @@ -91,6 +91,11 @@ EXAMPLES BUGS Report bugs at https://github.com/pascaldekloe/colfer/issues + Text validation is not part of the marshalling and unmarshalling + process. C and Go just pass any malformed UTF-8 characters. Java + and JavaScript replace unmappable content with the '?' character + (ASCII 63). + SEE ALSO protoc(1) ``` diff --git a/cmd/colf/main.go b/cmd/colf/main.go index 8fe9841..0681e3a 100644 --- a/cmd/colf/main.go +++ b/cmd/colf/main.go @@ -178,6 +178,10 @@ func init() { tail += "\t\t" + cmd + " -p com/example -x com/example/Parent Java api\n" tail += "\n" + bold + "BUGS" + clear + "\n" tail += "\tReport bugs at https://github.com/pascaldekloe/colfer/issues\n\n" + tail += "\tText validation is not part of the marshalling and unmarshalling\n" + tail += "\tprocess. C and Go just pass any malformed UTF-8 characters. Java\n" + tail += "\tand JavaScript replace unmappable content with the '?' character\n" + tail += "\t(ASCII 63).\n\n" tail += bold + "SEE ALSO\n\t" + clear + "protoc(1)\n" flag.Usage = func() { diff --git a/ecma.go b/ecma.go index c58b999..080e1c1 100644 --- a/ecma.go +++ b/ecma.go @@ -136,15 +136,20 @@ var {{.NameNative}} = new function() { bytes[i++] = c >> 6 | 192; } else { if (c > 0xd7ff && c < 0xdc00) { - if (++ci == s.length) fail('UTF-8 encode: incomplete surrogate pair'); + if (++ci == s.length) { + bytes[i++] = 63; + continue; + } var c2 = s.charCodeAt(ci); - if (c2 < 0xdc00 || c2 > 0xdfff) fail('UTF-8 encode: second char code 0x' + c2.toString(16) + ' at index ' + ci + ' in surrogate pair out of range'); + if (c2 < 0xdc00 || c2 > 0xdfff) { + bytes[i++] = 63; + --ci; + continue; + } c = 0x10000 + ((c & 0x03ff) << 10) + (c2 & 0x03ff); bytes[i++] = c >> 18 | 240; bytes[i++] = c>> 12 & 63 | 128; - } else { // c <= 0xffff - bytes[i++] = c >> 12 | 224; - } + } else bytes[i++] = c >> 12 | 224; bytes[i++] = c >> 6 & 63 | 128; } bytes[i++] = c & 63 | 128; @@ -159,24 +164,21 @@ var {{.NameNative}} = new function() { var c = bytes[i++]; if (c > 127) { if (c > 191 && c < 224) { - if (i >= bytes.length) fail('UTF-8 decode: incomplete 2-byte sequence'); - c = (c & 31) << 6 | bytes[i] & 63; + c = (i >= bytes.length) ? 63 : (c & 31) << 6 | bytes[i++] & 63; } else if (c > 223 && c < 240) { - if (i + 1 >= bytes.length) fail('UTF-8 decode: incomplete 3-byte sequence'); - c = (c & 15) << 12 | (bytes[i] & 63) << 6 | bytes[++i] & 63; + c = (i + 1 >= bytes.length) ? 63 : (c & 15) << 12 | (bytes[i++] & 63) << 6 | bytes[i++] & 63; } else if (c > 239 && c < 248) { - if (i+2 >= bytes.length) fail('UTF-8 decode: incomplete 4-byte sequence'); - c = (c & 7) << 18 | (bytes[i] & 63) << 12 | (bytes[++i] & 63) << 6 | bytes[++i] & 63; - } else fail('UTF-8 decode: unknown multibyte start 0x' + c.toString(16) + ' at index ' + (i - 1)); - ++i; + c = (i+2 >= bytes.length) ? 63 : (c & 7) << 18 | (bytes[i++] & 63) << 12 | (bytes[i++] & 63) << 6 | bytes[i++] & 63; + } else c = 63; } if (c <= 0xffff) s += String.fromCharCode(c); - else if (c <= 0x10ffff) { + else if (c > 0x10ffff) s += '?'; + else { c -= 0x10000; s += String.fromCharCode(c >> 10 | 0xd800) s += String.fromCharCode(c & 0x3FF | 0xdc00) - } else fail('UTF-8 decode: code point 0x' + c.toString(16) + ' exceeds UTF-16 reach'); + } } return s; } diff --git a/ecma/gen/Colfer.js b/ecma/gen/Colfer.js index 2d0507c..02d66c1 100644 --- a/ecma/gen/Colfer.js +++ b/ecma/gen/Colfer.js @@ -649,15 +649,20 @@ var gen = new function() { bytes[i++] = c >> 6 | 192; } else { if (c > 0xd7ff && c < 0xdc00) { - if (++ci == s.length) fail('UTF-8 encode: incomplete surrogate pair'); + if (++ci == s.length) { + bytes[i++] = 63; + continue; + } var c2 = s.charCodeAt(ci); - if (c2 < 0xdc00 || c2 > 0xdfff) fail('UTF-8 encode: second char code 0x' + c2.toString(16) + ' at index ' + ci + ' in surrogate pair out of range'); + if (c2 < 0xdc00 || c2 > 0xdfff) { + bytes[i++] = 63; + --ci; + continue; + } c = 0x10000 + ((c & 0x03ff) << 10) + (c2 & 0x03ff); bytes[i++] = c >> 18 | 240; bytes[i++] = c>> 12 & 63 | 128; - } else { // c <= 0xffff - bytes[i++] = c >> 12 | 224; - } + } else bytes[i++] = c >> 12 | 224; bytes[i++] = c >> 6 & 63 | 128; } bytes[i++] = c & 63 | 128; @@ -672,24 +677,21 @@ var gen = new function() { var c = bytes[i++]; if (c > 127) { if (c > 191 && c < 224) { - if (i >= bytes.length) fail('UTF-8 decode: incomplete 2-byte sequence'); - c = (c & 31) << 6 | bytes[i] & 63; + c = (i >= bytes.length) ? 63 : (c & 31) << 6 | bytes[i++] & 63; } else if (c > 223 && c < 240) { - if (i + 1 >= bytes.length) fail('UTF-8 decode: incomplete 3-byte sequence'); - c = (c & 15) << 12 | (bytes[i] & 63) << 6 | bytes[++i] & 63; + c = (i + 1 >= bytes.length) ? 63 : (c & 15) << 12 | (bytes[i++] & 63) << 6 | bytes[i++] & 63; } else if (c > 239 && c < 248) { - if (i+2 >= bytes.length) fail('UTF-8 decode: incomplete 4-byte sequence'); - c = (c & 7) << 18 | (bytes[i] & 63) << 12 | (bytes[++i] & 63) << 6 | bytes[++i] & 63; - } else fail('UTF-8 decode: unknown multibyte start 0x' + c.toString(16) + ' at index ' + (i - 1)); - ++i; + c = (i+2 >= bytes.length) ? 63 : (c & 7) << 18 | (bytes[i++] & 63) << 12 | (bytes[i++] & 63) << 6 | bytes[i++] & 63; + } else c = 63; } if (c <= 0xffff) s += String.fromCharCode(c); - else if (c <= 0x10ffff) { + else if (c > 0x10ffff) s += '?'; + else { c -= 0x10000; s += String.fromCharCode(c >> 10 | 0xd800) s += String.fromCharCode(c & 0x3FF | 0xdc00) - } else fail('UTF-8 decode: code point 0x' + c.toString(16) + ' exceeds UTF-16 reach'); + } } return s; }