Skip to content

Commit

Permalink
Fix escaped Unicode hex literals parsing (#49)
Browse files Browse the repository at this point in the history
* Update package set

* Fix escaped Unicode hex literals parsing

The Char type represents a single UTF-16 code unit, while a String can
contain any Unicode code point. The same logic for decoding escaped
hexadecimal literals was shared by the String and Char literal parsers,
so both were limited to values that fit in one UTF-16 code unit. This
caused a bug that prevented PureScript source code from having string
literals containing hexadecimal escapes for code points above U+FFFF
(astral code points, which need more than one UTF-16 code unit).

* Add tests for Unicode hex literals

* Update bench and parse-package-set dependencies

* Update CONTRIBUTORS.md

* Update parse-package-set's test package set.

---------

Co-authored-by: Nathan Faubion <nathan@n-son.com>
  • Loading branch information
turlando and natefaubion authored Apr 1, 2023
1 parent 13a2437 commit 7a0a448
Show file tree
Hide file tree
Showing 9 changed files with 59 additions and 12 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ jobs:

- uses: purescript-contrib/setup-purescript@main
with:
purescript: "0.15.0"
purescript: "0.15.8"
spago: "0.20.9"
psa: "0.8.2"
purs-tidy: "latest"
Expand Down
1 change: 1 addition & 0 deletions CONTRIBUTORS.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,4 @@ By adding your name to the list below, you agree to license your contributions u
| [@rintcius](https://github.com/rintcius) | Rintcius Blok
| [@i-am-the-slime](https://github.com/i-am-the-slime) | Mark Eibes
| [@monoidmusician](https://github.com/MonoidMusician) | Verity Scheel
| [@turlando](https://github.com/turlando) | Tancredi Orlando
1 change: 1 addition & 0 deletions bench/bench.dhall
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ in conf // {
, "control"
, "effect"
, "either"
, "enums"
, "foldable-traversable"
, "free"
, "functions"
Expand Down
4 changes: 2 additions & 2 deletions packages.dhall
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
let upstream =
https://github.com/purescript/package-sets/releases/download/psc-0.15.0-20220507/packages.dhall
sha256:cf54330f3bc1b25a093b69bff8489180c954b43668c81288901a2ec29a08cc64
https://github.com/purescript/package-sets/releases/download/psc-0.15.7-20230401/packages.dhall
sha256:d385eeee6ca160c32d7389a1f4f4ee6a05aff95e81373cdc50670b436efa1060

in upstream
2 changes: 1 addition & 1 deletion parse-package-set/Main.purs
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ defaultSpagoDhall :: String
defaultSpagoDhall = Array.intercalate "\n"
[ "{ name = \"test-parser\""
, ", dependencies = [] : List Text"
, ", packages = https://github.com/purescript/package-sets/releases/download/psc-0.15.0-20220507/packages.dhall sha256:cf54330f3bc1b25a093b69bff8489180c954b43668c81288901a2ec29a08cc64"
, ", packages = https://github.com/purescript/package-sets/releases/download/psc-0.15.7-20230401/packages.dhall sha256:d385eeee6ca160c32d7389a1f4f4ee6a05aff95e81373cdc50670b436efa1060"
, ", sources = [] : List Text"
, "}"
]
Expand Down
1 change: 1 addition & 0 deletions parse-package-set/parse-package-set.dhall
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ in conf // {
, "datetime"
, "effect"
, "either"
, "enums"
, "exceptions"
, "filterable"
, "foldable-traversable"
Expand Down
1 change: 1 addition & 0 deletions spago.dhall
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
, "control"
, "effect"
, "either"
, "enums"
, "foldable-traversable"
, "free"
, "functions"
Expand Down
39 changes: 31 additions & 8 deletions src/PureScript/CST/Lexer.purs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import Data.Array.NonEmpty as NonEmptyArray
import Data.Array.ST as STArray
import Data.Char as Char
import Data.Either (Either(..))
import Data.Enum (toEnum)
import Data.Foldable (fold, foldl, foldMap)
import Data.Int (hexadecimal)
import Data.Int as Int
Expand All @@ -23,6 +24,8 @@ import Data.Newtype (unwrap)
import Data.Number as Number
import Data.String (Pattern(..), Replacement(..))
import Data.String as String
import Data.String.CodePoints (CodePoint)
import Data.String.CodePoints as SCP
import Data.String.CodeUnits as SCU
import Data.String.Regex as Regex
import Data.String.Regex.Flags (unicode)
Expand All @@ -36,6 +39,18 @@ import PureScript.CST.Types (Comment(..), IntValue(..), LineFeed(..), ModuleName

infixr 3 alt as <|>

-- | Character-like result types that an escape sequence can decode to.
-- |
-- | `parseEscape` and `parseHexEscape` are polymorphic in this class, so the
-- | same escape logic can yield either a `Char` or a `CodePoint`, depending
-- | on what the caller asks for.
class IsChar a where
  -- Embed an already-decoded `Char` (used for the fixed escapes such as
  -- `\t` or `\n`).
  fromChar :: Char -> a
  -- Build a value from a numeric code (used for `\x…` escapes); `Nothing`
  -- when the target type rejects the code.
  fromCharCode :: Int -> Maybe a

-- | `Char` results: the character is used as-is and numeric codes go through
-- | `Char.fromCharCode`.
-- | NOTE(review): presumably this rejects codes that do not fit a single
-- | UTF-16 code unit — confirm against `Data.Char`.
instance IsChar Char where
  fromChar ch = ch
  fromCharCode code = Char.fromCharCode code

-- | `CodePoint` results: characters are widened with `SCP.codePointFromChar`
-- | and numeric codes go through `toEnum`.
-- | NOTE(review): presumably this accepts the full Unicode code point range —
-- | confirm against the `BoundedEnum CodePoint` instance.
instance IsChar CodePoint where
  fromChar ch = SCP.codePointFromChar ch
  fromCharCode code = toEnum code

data LexResult e a
= LexFail e String
| LexSucc a String
Expand Down Expand Up @@ -502,29 +517,37 @@ token =
_ ->
pure { raw: SCU.singleton ch, char: ch }

-- Decodes the character that follows a backslash inside a literal.
--
-- The result is polymorphic in `IsChar`, so the caller decides what type the
-- decoded `char` field has. `raw` carries the escape exactly as it is spelled
-- in the source (backslash included). An 'x' hands over to `parseHexEscape`;
-- any character not listed below fails with `LexInvalidCharEscape`.
--
-- NOTE(review): each simple-escape branch below shows two `pure` lines, one
-- with a bare `Char` literal and one wrapped in `fromChar`. This looks like a
-- diff rendering with the removed and added line interleaved; only the
-- `fromChar` form agrees with the `char :: a` signature — confirm against the
-- real file before relying on this listing.
parseEscape
:: forall a
. IsChar a
=> Lex (Unit -> ParseError) { raw :: String, char :: a }
parseEscape = do
ch <- charAny
case ch of
't' ->
pure { raw: "\\t", char: '\t' }
pure { raw: "\\t", char: fromChar '\t' }
'r' ->
pure { raw: "\\r", char: '\r' }
pure { raw: "\\r", char: fromChar '\r' }
'n' ->
pure { raw: "\\n", char: '\n' }
pure { raw: "\\n", char: fromChar '\n' }
'"' ->
pure { raw: "\\\"", char: '"' }
pure { raw: "\\\"", char: fromChar '"' }
'\'' ->
pure { raw: "\\'", char: '\'' }
pure { raw: "\\'", char: fromChar '\'' }
'\\' ->
pure { raw: "\\\\", char: '\\' }
pure { raw: "\\\\", char: fromChar '\\' }
'x' ->
-- Hexadecimal escapes are decoded by the dedicated parser.
parseHexEscape
_ ->
fail $ LexInvalidCharEscape $ SCU.singleton ch

parseHexEscape
:: forall a
. IsChar a
=> Lex (Unit -> ParseError) { raw :: String, char :: a }
parseHexEscape = do
esc <- hexEscapeRegex
case Char.fromCharCode =<< Int.fromStringAs hexadecimal esc of
case fromCharCode =<< Int.fromStringAs hexadecimal esc of
Just ch ->
pure { raw: "\\x" <> esc, char: ch }
Nothing ->
Expand Down Expand Up @@ -552,7 +575,7 @@ token =

-- Parses one backslash escape inside a string literal: consumes the
-- backslash, delegates to `parseEscape`, and returns the escape's source text
-- in `raw` together with the decoded character as a one-element `string`.
--
-- NOTE(review): two alternative `in` lines are shown (`SCU.singleton` and
-- `SCP.singleton`); this looks like a diff rendering with the removed and
-- added line interleaved. Presumably the `SCP.singleton` form is the current
-- one, so that `res.char` is a `CodePoint` — confirm against the real file.
parseStringEscape = ado
res <- charBackslash *> parseEscape
in { raw: res.raw, string: SCU.singleton res.char }
in { raw: res.raw, string: SCP.singleton res.char }

parseStringChars = ado
raw <- stringCharsRegex
Expand Down
20 changes: 20 additions & 0 deletions test/Main.purs
Original file line number Diff line number Diff line change
Expand Up @@ -232,3 +232,23 @@ main = do
true
_ ->
false

assertParse "String with Unicode astral code point hex literal"
"""
"\x10ffff"
"""
case _ of
ParseSucceeded (ExprString _ _) ->
true
_ ->
false

assertParse "Unicode astral code point Char hex literal"
"""
'\x10ffff'
"""
case _ of
(ParseFailed _ :: RecoveredParserResult Expr) ->
true
_ ->
false

0 comments on commit 7a0a448

Please sign in to comment.