From 698bd0b29f8756eeaef9f107ee5528a13fc3e531 Mon Sep 17 00:00:00 2001 From: Ryusei Yamaguchi Date: Fri, 6 Jun 2014 11:14:37 +0900 Subject: [PATCH] modify Utf8 and Utf16 decode_code_point to make them safer --- unifill/Utf16.hx | 39 +++++++++++++++----------------- unifill/Utf8.hx | 59 ++++++++++++++++-------------------------------- 2 files changed, 38 insertions(+), 60 deletions(-) diff --git a/unifill/Utf16.hx b/unifill/Utf16.hx index be1b926..ff41ea8 100644 --- a/unifill/Utf16.hx +++ b/unifill/Utf16.hx @@ -44,7 +44,7 @@ class Utf16 { `this`. **/ public function codePointAt(index : Int) : Int { - return Utf16Impl.decode_code_point(codeUnitAt, index); + return Utf16Impl.decode_code_point(length, codeUnitAt, index); } /** @@ -116,7 +116,7 @@ class Utf16 { var accessor = codeUnitAt; var i = 0; while (i < len) { - Utf16Impl.validate_sequence(len, accessor, i); + Utf16Impl.decode_code_point(len, accessor, i); i += codePointWidthAt(i); } } @@ -171,16 +171,6 @@ private class Utf16Impl { return (!Unicode.isLowSurrogate(c)) ? 
1 : 2; } - public static function decode_code_point(accessor : Int -> Int, index : Int) : Int { - var hi = accessor(index); - if (Unicode.isHighSurrogate(hi)) { - var lo = accessor(index + 1); - return Unicode.decodeSurrogate(hi, lo); - } else { - return hi; - } - } - public static function encode_code_point(addUnit : Int -> Void, codePoint : Int) : Void { if (codePoint <= 0xFFFF) { addUnit(codePoint); @@ -190,18 +180,25 @@ private class Utf16Impl { } } - public static inline function validate_sequence(len : Int, accessor : Int -> Int, index : Int) : Void { - if (index >= len) + public static function decode_code_point(len : Int, accessor : Int -> Int, index : Int) : Int { + if (index < 0 || len <= index) throw Exception.InvalidCodeUnitSequence(index); - var c = accessor(index); - if (Unicode.isHighSurrogate(c)) { - if (index >= len - 1 || !Unicode.isLowSurrogate(accessor(index + 1))) + var hi = accessor(index); + if (Unicode.isHighSurrogate(hi)) { + if (index + 1 < 0 || len <= index + 1) { throw Exception.InvalidCodeUnitSequence(index); - } - if (Unicode.isLowSurrogate(c)) { + } + var lo = accessor(index + 1); + if (Unicode.isLowSurrogate(lo)) { + return Unicode.decodeSurrogate(hi, lo); + } else { + throw Exception.InvalidCodeUnitSequence(index); + } + } else if (Unicode.isLowSurrogate(hi)) { throw Exception.InvalidCodeUnitSequence(index); + } else { + return hi; } - return; } } @@ -332,7 +329,7 @@ private abstract StringU16(Array) { var len = this.length; var cua = function (i) return this[i]; while (i < len) { - var u = Utf16Impl.decode_code_point(cua, i); + var u = Utf16Impl.decode_code_point(len, cua, i); buf.add(InternalEncoding.fromCodePoint(u)); i += Utf16Impl.code_point_width(codeUnitAt(i)); } diff --git a/unifill/Utf8.hx b/unifill/Utf8.hx index 0baa53d..5a2eb53 100644 --- a/unifill/Utf8.hx +++ b/unifill/Utf8.hx @@ -47,7 +47,7 @@ class Utf8 { `this`. 
**/ public function codePointAt(index : Int) : Int { - return Utf8Impl.decode_code_point(codeUnitAt, index); + return Utf8Impl.decode_code_point(length, codeUnitAt, index); } /** @@ -119,7 +119,7 @@ class Utf8 { var accessor = codeUnitAt; var i = 0; while (i < len) { - Utf8Impl.validate_sequence(len, accessor, i); + Utf8Impl.decode_code_point(len, accessor, i); i += codePointWidthAt(i); } } @@ -178,29 +178,6 @@ private class Utf8Impl { : 1; } - public static function decode_code_point(accessor : Int -> Int, index : Int) : Int { - var c1 = accessor(index); - if (c1 < 0x80) { - return c1; - } else if (c1 < 0xC0) { - throw Exception.InvalidCodeUnitSequence(index); - } else if (c1 < 0xE0) { - var c2 = accessor(index + 1); - return ((c1 & 0x3F) << 6) | (c2 & 0x7F); - } else if (c1 < 0xF0) { - var c2 = accessor(index + 1); - var c3 = accessor(index + 2); - return ((c1 & 0x1F) << 12) | ((c2 & 0x7F) << 6) | (c3 & 0x7F); - } else if (c1 < 0xF8) { - var c2 = accessor(index + 1); - var c3 = accessor(index + 2); - var c4 = accessor(index + 3); - return ((c1 & 0x0F) << 18) | ((c2 & 0x7F) << 12) | ((c3 & 0x7F) << 6) | (c4 & 0x7F); - } else { - throw Exception.InvalidCodeUnitSequence(index); - } - } - public static function encode_code_point(addUnit : Int -> Void, codePoint : Int) : Void { if (codePoint <= 0x7F) { addUnit(codePoint); @@ -221,42 +198,46 @@ private class Utf8Impl { } } - public static inline function validate_sequence(len : Int, accessor : Int -> Int, index : Int) : Void { - if (index >= len) + public static function decode_code_point(len : Int, accessor : Int -> Int, index : Int) : Int { + var i = index; + if (i < 0 || len <= i) throw Exception.InvalidCodeUnitSequence(index); - var c1 = accessor(index); + var c1 = accessor(i); if (c1 < 0x80) { - return; + return c1; } if (c1 < 0xC0) { throw Exception.InvalidCodeUnitSequence(index); } - if (index >= len - 1) + ++i; + if (i < 0 || len <= i) throw Exception.InvalidCodeUnitSequence(index); - var c2 = accessor(index + 
1); + var c2 = accessor(i); if (c1 < 0xE0) { if ((c1 & 0x1E != 0) && (c2 & 0xC0 == 0x80)) - return; + return ((c1 & 0x3F) << 6) | (c2 & 0x7F); else throw Exception.InvalidCodeUnitSequence(index); } - if (index >= len - 2) + ++i; + if (i < 0 || len <= i) throw Exception.InvalidCodeUnitSequence(index); - var c3 = accessor(index + 2); + var c3 = accessor(i); if (c1 < 0xF0) { if (((c1 & 0x0F != 0) || (c2 & 0x20 != 0)) && (c2 & 0xC0 == 0x80) && (c3 & 0xC0 == 0x80) && !(c1 == 0xED && 0xA0 <= c2 && c2 <= 0xBF)) - return; + return ((c1 & 0x1F) << 12) | ((c2 & 0x7F) << 6) | (c3 & 0x7F); else throw Exception.InvalidCodeUnitSequence(index); } - if (index >= len - 3) + ++i; + if (i < 0 || len <= i) throw Exception.InvalidCodeUnitSequence(index); - var c4 = accessor(index + 3); + var c4 = accessor(i); if (c1 < 0xF8) { if (((c1 & 0x07 != 0) || (c2 & 0x30 != 0)) && (c2 & 0xC0 == 0x80) && (c3 & 0xC0 == 0x80) && (c4 & 0xC0 == 0x80) && !((c1 == 0xF4 && c2 > 0x8F) || c1 > 0xF4)) - return; + return ((c1 & 0x0F) << 18) | ((c2 & 0x7F) << 12) | ((c3 & 0x7F) << 6) | (c4 & 0x7F); else throw Exception.InvalidCodeUnitSequence(index); } @@ -354,7 +335,7 @@ private abstract StringU8(Bytes) { var len = this.length; var cua = function (i) return this.get(i); while (i < len) { - var u = Utf8Impl.decode_code_point(cua, i); + var u = Utf8Impl.decode_code_point(len, cua, i); buf.add(InternalEncoding.fromCodePoint(u)); i += Utf8Impl.code_point_width(codeUnitAt(i)); }