Skip to content

Commit

Permalink
Clarify text validation.
Browse files Browse the repository at this point in the history
  • Loading branch information
pascaldekloe committed Jul 1, 2017
1 parent cf39c49 commit cf320a6
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 30 deletions.
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,11 @@ EXAMPLES
BUGS
Report bugs at https://github.com/pascaldekloe/colfer/issues
Text validation is not part of the marshalling and unmarshalling
process. C and Go pass malformed UTF-8 sequences through unchanged.
Java and JavaScript replace unmappable content with the '?'
character (ASCII 63).
SEE ALSO
protoc(1)
```
Expand Down
4 changes: 4 additions & 0 deletions cmd/colf/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,10 @@ func init() {
tail += "\t\t" + cmd + " -p com/example -x com/example/Parent Java api\n"
tail += "\n" + bold + "BUGS" + clear + "\n"
tail += "\tReport bugs at https://github.com/pascaldekloe/colfer/issues\n\n"
tail += "\tText validation is not part of the marshalling and unmarshalling\n"
tail += "\tprocess. C and Go just pass any malformed UTF-8 characters. Java\n"
tail += "\tand JavaScript replace unmappable content with the '?' character\n"
tail += "\t(ASCII 63).\n\n"
tail += bold + "SEE ALSO\n\t" + clear + "protoc(1)\n"

flag.Usage = func() {
Expand Down
32 changes: 17 additions & 15 deletions ecma.go
Original file line number Diff line number Diff line change
Expand Up @@ -136,15 +136,20 @@ var {{.NameNative}} = new function() {
bytes[i++] = c >> 6 | 192;
} else {
if (c > 0xd7ff && c < 0xdc00) {
if (++ci == s.length) fail('UTF-8 encode: incomplete surrogate pair');
if (++ci == s.length) {
bytes[i++] = 63;
continue;
}
var c2 = s.charCodeAt(ci);
if (c2 < 0xdc00 || c2 > 0xdfff) fail('UTF-8 encode: second char code 0x' + c2.toString(16) + ' at index ' + ci + ' in surrogate pair out of range');
if (c2 < 0xdc00 || c2 > 0xdfff) {
bytes[i++] = 63;
--ci;
continue;
}
c = 0x10000 + ((c & 0x03ff) << 10) + (c2 & 0x03ff);
bytes[i++] = c >> 18 | 240;
bytes[i++] = c>> 12 & 63 | 128;
} else { // c <= 0xffff
bytes[i++] = c >> 12 | 224;
}
} else bytes[i++] = c >> 12 | 224;
bytes[i++] = c >> 6 & 63 | 128;
}
bytes[i++] = c & 63 | 128;
Expand All @@ -159,24 +164,21 @@ var {{.NameNative}} = new function() {
var c = bytes[i++];
if (c > 127) {
if (c > 191 && c < 224) {
if (i >= bytes.length) fail('UTF-8 decode: incomplete 2-byte sequence');
c = (c & 31) << 6 | bytes[i] & 63;
c = (i >= bytes.length) ? 63 : (c & 31) << 6 | bytes[i++] & 63;
} else if (c > 223 && c < 240) {
if (i + 1 >= bytes.length) fail('UTF-8 decode: incomplete 3-byte sequence');
c = (c & 15) << 12 | (bytes[i] & 63) << 6 | bytes[++i] & 63;
c = (i + 1 >= bytes.length) ? 63 : (c & 15) << 12 | (bytes[i++] & 63) << 6 | bytes[i++] & 63;
} else if (c > 239 && c < 248) {
if (i+2 >= bytes.length) fail('UTF-8 decode: incomplete 4-byte sequence');
c = (c & 7) << 18 | (bytes[i] & 63) << 12 | (bytes[++i] & 63) << 6 | bytes[++i] & 63;
} else fail('UTF-8 decode: unknown multibyte start 0x' + c.toString(16) + ' at index ' + (i - 1));
++i;
c = (i+2 >= bytes.length) ? 63 : (c & 7) << 18 | (bytes[i++] & 63) << 12 | (bytes[i++] & 63) << 6 | bytes[i++] & 63;
} else c = 63;
}
if (c <= 0xffff) s += String.fromCharCode(c);
else if (c <= 0x10ffff) {
else if (c > 0x10ffff) s += '?';
else {
c -= 0x10000;
s += String.fromCharCode(c >> 10 | 0xd800)
s += String.fromCharCode(c & 0x3FF | 0xdc00)
} else fail('UTF-8 decode: code point 0x' + c.toString(16) + ' exceeds UTF-16 reach');
}
}
return s;
}
Expand Down
32 changes: 17 additions & 15 deletions ecma/gen/Colfer.js
Original file line number Diff line number Diff line change
Expand Up @@ -649,15 +649,20 @@ var gen = new function() {
bytes[i++] = c >> 6 | 192;
} else {
if (c > 0xd7ff && c < 0xdc00) {
if (++ci == s.length) fail('UTF-8 encode: incomplete surrogate pair');
if (++ci == s.length) {
bytes[i++] = 63;
continue;
}
var c2 = s.charCodeAt(ci);
if (c2 < 0xdc00 || c2 > 0xdfff) fail('UTF-8 encode: second char code 0x' + c2.toString(16) + ' at index ' + ci + ' in surrogate pair out of range');
if (c2 < 0xdc00 || c2 > 0xdfff) {
bytes[i++] = 63;
--ci;
continue;
}
c = 0x10000 + ((c & 0x03ff) << 10) + (c2 & 0x03ff);
bytes[i++] = c >> 18 | 240;
bytes[i++] = c>> 12 & 63 | 128;
} else { // c <= 0xffff
bytes[i++] = c >> 12 | 224;
}
} else bytes[i++] = c >> 12 | 224;
bytes[i++] = c >> 6 & 63 | 128;
}
bytes[i++] = c & 63 | 128;
Expand All @@ -672,24 +677,21 @@ var gen = new function() {
var c = bytes[i++];
if (c > 127) {
if (c > 191 && c < 224) {
if (i >= bytes.length) fail('UTF-8 decode: incomplete 2-byte sequence');
c = (c & 31) << 6 | bytes[i] & 63;
c = (i >= bytes.length) ? 63 : (c & 31) << 6 | bytes[i++] & 63;
} else if (c > 223 && c < 240) {
if (i + 1 >= bytes.length) fail('UTF-8 decode: incomplete 3-byte sequence');
c = (c & 15) << 12 | (bytes[i] & 63) << 6 | bytes[++i] & 63;
c = (i + 1 >= bytes.length) ? 63 : (c & 15) << 12 | (bytes[i++] & 63) << 6 | bytes[i++] & 63;
} else if (c > 239 && c < 248) {
if (i+2 >= bytes.length) fail('UTF-8 decode: incomplete 4-byte sequence');
c = (c & 7) << 18 | (bytes[i] & 63) << 12 | (bytes[++i] & 63) << 6 | bytes[++i] & 63;
} else fail('UTF-8 decode: unknown multibyte start 0x' + c.toString(16) + ' at index ' + (i - 1));
++i;
c = (i+2 >= bytes.length) ? 63 : (c & 7) << 18 | (bytes[i++] & 63) << 12 | (bytes[i++] & 63) << 6 | bytes[i++] & 63;
} else c = 63;
}

if (c <= 0xffff) s += String.fromCharCode(c);
else if (c <= 0x10ffff) {
else if (c > 0x10ffff) s += '?';
else {
c -= 0x10000;
s += String.fromCharCode(c >> 10 | 0xd800)
s += String.fromCharCode(c & 0x3FF | 0xdc00)
} else fail('UTF-8 decode: code point 0x' + c.toString(16) + ' exceeds UTF-16 reach');
}
}
return s;
}
Expand Down

0 comments on commit cf320a6

Please sign in to comment.