Skip to content

Commit

Permalink
Modify Utf8 and Utf16 decode_code_point to make them safer
Browse files Browse the repository at this point in the history
  • Loading branch information
mandel59 committed Jun 6, 2014
1 parent e81dd67 commit 698bd0b
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 60 deletions.
39 changes: 18 additions & 21 deletions unifill/Utf16.hx
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ class Utf16 {
`this`.
**/
public function codePointAt(index : Int) : Int {
return Utf16Impl.decode_code_point(codeUnitAt, index);
return Utf16Impl.decode_code_point(length, codeUnitAt, index);
}

/**
Expand Down Expand Up @@ -116,7 +116,7 @@ class Utf16 {
var accessor = codeUnitAt;
var i = 0;
while (i < len) {
Utf16Impl.validate_sequence(len, accessor, i);
Utf16Impl.decode_code_point(len, accessor, i);
i += codePointWidthAt(i);
}
}
Expand Down Expand Up @@ -171,16 +171,6 @@ private class Utf16Impl {
return (!Unicode.isLowSurrogate(c)) ? 1 : 2;
}

public static function decode_code_point(accessor : Int -> Int, index : Int) : Int {
var hi = accessor(index);
if (Unicode.isHighSurrogate(hi)) {
var lo = accessor(index + 1);
return Unicode.decodeSurrogate(hi, lo);
} else {
return hi;
}
}

public static function encode_code_point(addUnit : Int -> Void, codePoint : Int) : Void {
if (codePoint <= 0xFFFF) {
addUnit(codePoint);
Expand All @@ -190,18 +180,25 @@ private class Utf16Impl {
}
}

public static inline function validate_sequence(len : Int, accessor : Int -> Int, index : Int) : Void {
if (index >= len)
public static function decode_code_point(len : Int, accessor : Int -> Int, index : Int) : Int {
if (index < 0 || len <= index)
throw Exception.InvalidCodeUnitSequence(index);
var c = accessor(index);
if (Unicode.isHighSurrogate(c)) {
if (index >= len - 1 || !Unicode.isLowSurrogate(accessor(index + 1)))
var hi = accessor(index);
if (Unicode.isHighSurrogate(hi)) {
if (index + 1 < 0 || len <= index + 1) {
throw Exception.InvalidCodeUnitSequence(index);
}
if (Unicode.isLowSurrogate(c)) {
}
var lo = accessor(index + 1);
if (Unicode.isLowSurrogate(lo)) {
return Unicode.decodeSurrogate(hi, lo);
} else {
throw Exception.InvalidCodeUnitSequence(index);
}
} else if (Unicode.isLowSurrogate(hi)) {
throw Exception.InvalidCodeUnitSequence(index);
} else {
return hi;
}
return;
}

}
Expand Down Expand Up @@ -332,7 +329,7 @@ private abstract StringU16(Array<Int>) {
var len = this.length;
var cua = function (i) return this[i];
while (i < len) {
var u = Utf16Impl.decode_code_point(cua, i);
var u = Utf16Impl.decode_code_point(len, cua, i);
buf.add(InternalEncoding.fromCodePoint(u));
i += Utf16Impl.code_point_width(codeUnitAt(i));
}
Expand Down
59 changes: 20 additions & 39 deletions unifill/Utf8.hx
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ class Utf8 {
`this`.
**/
public function codePointAt(index : Int) : Int {
return Utf8Impl.decode_code_point(codeUnitAt, index);
return Utf8Impl.decode_code_point(length, codeUnitAt, index);
}

/**
Expand Down Expand Up @@ -119,7 +119,7 @@ class Utf8 {
var accessor = codeUnitAt;
var i = 0;
while (i < len) {
Utf8Impl.validate_sequence(len, accessor, i);
Utf8Impl.decode_code_point(len, accessor, i);
i += codePointWidthAt(i);
}
}
Expand Down Expand Up @@ -178,29 +178,6 @@ private class Utf8Impl {
: 1;
}

/**
   Decodes the UTF-8 code point whose lead byte is at `index`.

   `accessor` maps a code-unit index to the byte stored there. This
   overload receives no length, so it cannot bounds-check `index` itself;
   it relies on `accessor` for whatever happens on an out-of-range read.

   Returns the decoded code point.

   Throws `Exception.InvalidCodeUnitSequence(index)` when the lead byte is
   a continuation byte (0x80..0xBF) or is 0xF8 or above, when any trail
   byte is not of the form 10xxxxxx, when the sequence is an overlong
   encoding, when it encodes a surrogate (U+D800..U+DFFF), or when it
   encodes a value above U+10FFFF.
**/
public static function decode_code_point(accessor : Int -> Int, index : Int) : Int {
    var c1 = accessor(index);
    if (c1 < 0x80) {
        // Single-byte (ASCII) sequence.
        return c1;
    }
    if (c1 < 0xC0 || c1 >= 0xF8) {
        // Stray continuation byte, or a lead byte that starts no valid sequence.
        throw Exception.InvalidCodeUnitSequence(index);
    }

    var c2 = accessor(index + 1);
    if ((c2 & 0xC0) != 0x80) {
        throw Exception.InvalidCodeUnitSequence(index);
    }
    if (c1 < 0xE0) {
        // Two-byte sequence; lead bytes 0xC0 and 0xC1 can only form overlong encodings.
        if (c1 < 0xC2) {
            throw Exception.InvalidCodeUnitSequence(index);
        }
        return ((c1 & 0x1F) << 6) | (c2 & 0x3F);
    }

    var c3 = accessor(index + 2);
    if ((c3 & 0xC0) != 0x80) {
        throw Exception.InvalidCodeUnitSequence(index);
    }
    if (c1 < 0xF0) {
        // Three-byte sequence: reject overlong forms and encoded surrogates.
        var cp3 = ((c1 & 0x0F) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
        if (cp3 < 0x800 || (0xD800 <= cp3 && cp3 <= 0xDFFF)) {
            throw Exception.InvalidCodeUnitSequence(index);
        }
        return cp3;
    }

    var c4 = accessor(index + 3);
    if ((c4 & 0xC0) != 0x80) {
        throw Exception.InvalidCodeUnitSequence(index);
    }
    // Four-byte sequence: reject overlong forms and values beyond U+10FFFF.
    var cp4 = ((c1 & 0x07) << 18) | ((c2 & 0x3F) << 12) | ((c3 & 0x3F) << 6) | (c4 & 0x3F);
    if (cp4 < 0x10000 || cp4 > 0x10FFFF) {
        throw Exception.InvalidCodeUnitSequence(index);
    }
    return cp4;
}

public static function encode_code_point(addUnit : Int -> Void, codePoint : Int) : Void {
if (codePoint <= 0x7F) {
addUnit(codePoint);
Expand All @@ -221,42 +198,46 @@ private class Utf8Impl {
}
}

public static inline function validate_sequence(len : Int, accessor : Int -> Int, index : Int) : Void {
if (index >= len)
public static function decode_code_point(len : Int, accessor : Int -> Int, index : Int) : Int {
var i = index;
if (i < 0 || len <= i)
throw Exception.InvalidCodeUnitSequence(index);
var c1 = accessor(index);
var c1 = accessor(i);
if (c1 < 0x80) {
return;
return c1;
}
if (c1 < 0xC0) {
throw Exception.InvalidCodeUnitSequence(index);
}
if (index >= len - 1)
++i;
if (i < 0 || len <= i)
throw Exception.InvalidCodeUnitSequence(index);
var c2 = accessor(index + 1);
var c2 = accessor(i);
if (c1 < 0xE0) {
if ((c1 & 0x1E != 0) && (c2 & 0xC0 == 0x80))
return;
return ((c1 & 0x3F) << 6) | (c2 & 0x7F);
else
throw Exception.InvalidCodeUnitSequence(index);
}
if (index >= len - 2)
++i;
if (i < 0 || len <= i)
throw Exception.InvalidCodeUnitSequence(index);
var c3 = accessor(index + 2);
var c3 = accessor(i);
if (c1 < 0xF0) {
if (((c1 & 0x0F != 0) || (c2 & 0x20 != 0)) && (c2 & 0xC0 == 0x80) && (c3 & 0xC0 == 0x80)
&& !(c1 == 0xED && 0xA0 <= c2 && c2 <= 0xBF))
return;
return ((c1 & 0x1F) << 12) | ((c2 & 0x7F) << 6) | (c3 & 0x7F);
else
throw Exception.InvalidCodeUnitSequence(index);
}
if (index >= len - 3)
++i;
if (i < 0 || len <= i)
throw Exception.InvalidCodeUnitSequence(index);
var c4 = accessor(index + 3);
var c4 = accessor(i);
if (c1 < 0xF8) {
if (((c1 & 0x07 != 0) || (c2 & 0x30 != 0)) && (c2 & 0xC0 == 0x80) && (c3 & 0xC0 == 0x80) && (c4 & 0xC0 == 0x80)
&& !((c1 == 0xF4 && c2 > 0x8F) || c1 > 0xF4))
return;
return ((c1 & 0x0F) << 18) | ((c2 & 0x7F) << 12) | ((c3 & 0x7F) << 6) | (c4 & 0x7F);
else
throw Exception.InvalidCodeUnitSequence(index);
}
Expand Down Expand Up @@ -354,7 +335,7 @@ private abstract StringU8(Bytes) {
var len = this.length;
var cua = function (i) return this.get(i);
while (i < len) {
var u = Utf8Impl.decode_code_point(cua, i);
var u = Utf8Impl.decode_code_point(len, cua, i);
buf.add(InternalEncoding.fromCodePoint(u));
i += Utf8Impl.code_point_width(codeUnitAt(i));
}
Expand Down

0 comments on commit 698bd0b

Please sign in to comment.