From 7a0a4484f8abd91d688d297461209a75667e0da6 Mon Sep 17 00:00:00 2001 From: turlando Date: Sat, 1 Apr 2023 22:25:59 +0200 Subject: [PATCH] Fix escaped Unicode hex literals parsing (#49) * Update package set * Fix escaped Unicode hex literals parsing The Char type represents a single UTF-16 code unit, while a String can contain any Unicode code point. The same logic for parsing escaped Unicode hexadecimal literals was used by both the String and Char literal parsers. This caused a bug that prevented PureScript source code from having string literals containing hexadecimal escapes for code points above U+FFFF, which do not fit in a single UTF-16 code unit. * Add tests for Unicode hex literals * Update bench and parse-package-set dependencies * Update CONTRIBUTORS.md * Update parse-package-set's test package set. --------- Co-authored-by: Nathan Faubion --- .github/workflows/ci.yml | 2 +- CONTRIBUTORS.md | 1 + bench/bench.dhall | 1 + packages.dhall | 4 +-- parse-package-set/Main.purs | 2 +- parse-package-set/parse-package-set.dhall | 1 + spago.dhall | 1 + src/PureScript/CST/Lexer.purs | 39 ++++++++++++++++++----- test/Main.purs | 20 ++++++++++++ 9 files changed, 59 insertions(+), 12 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 992a1d6..8542370 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,7 +14,7 @@ jobs: - uses: purescript-contrib/setup-purescript@main with: - purescript: "0.15.0" + purescript: "0.15.8" spago: "0.20.9" psa: "0.8.2" purs-tidy: "latest" diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 24cdc44..4752dba 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -19,3 +19,4 @@ By adding your name to the list below, you agree to license your contributions u | [@rintcius](https://github.com/rintcius) | Rintcius Blok | [@i-am-the-slime](https://github.com/i-am-the-slime) | Mark Eibes | [@monoidmusician](https://github.com/MonoidMusician) | Verity Scheel +| [@turlando](https://github.com/turlando) | Tancredi Orlando diff --git
a/bench/bench.dhall b/bench/bench.dhall index a5f86bf..78eec12 100644 --- a/bench/bench.dhall +++ b/bench/bench.dhall @@ -10,6 +10,7 @@ in conf // { , "control" , "effect" , "either" + , "enums" , "foldable-traversable" , "free" , "functions" diff --git a/packages.dhall b/packages.dhall index 8c11748..41eca28 100644 --- a/packages.dhall +++ b/packages.dhall @@ -1,5 +1,5 @@ let upstream = - https://github.com/purescript/package-sets/releases/download/psc-0.15.0-20220507/packages.dhall - sha256:cf54330f3bc1b25a093b69bff8489180c954b43668c81288901a2ec29a08cc64 + https://github.com/purescript/package-sets/releases/download/psc-0.15.7-20230401/packages.dhall + sha256:d385eeee6ca160c32d7389a1f4f4ee6a05aff95e81373cdc50670b436efa1060 in upstream diff --git a/parse-package-set/Main.purs b/parse-package-set/Main.purs index e50ba88..6bd2ec9 100644 --- a/parse-package-set/Main.purs +++ b/parse-package-set/Main.purs @@ -158,7 +158,7 @@ defaultSpagoDhall :: String defaultSpagoDhall = Array.intercalate "\n" [ "{ name = \"test-parser\"" , ", dependencies = [] : List Text" - , ", packages = https://github.com/purescript/package-sets/releases/download/psc-0.15.0-20220507/packages.dhall sha256:cf54330f3bc1b25a093b69bff8489180c954b43668c81288901a2ec29a08cc64" + , ", packages = https://github.com/purescript/package-sets/releases/download/psc-0.15.7-20230401/packages.dhall sha256:d385eeee6ca160c32d7389a1f4f4ee6a05aff95e81373cdc50670b436efa1060" , ", sources = [] : List Text" , "}" ] diff --git a/parse-package-set/parse-package-set.dhall b/parse-package-set/parse-package-set.dhall index 3ae5b80..24a1470 100644 --- a/parse-package-set/parse-package-set.dhall +++ b/parse-package-set/parse-package-set.dhall @@ -12,6 +12,7 @@ in conf // { , "datetime" , "effect" , "either" + , "enums" , "exceptions" , "filterable" , "foldable-traversable" diff --git a/spago.dhall b/spago.dhall index edff0d7..6e14541 100644 --- a/spago.dhall +++ b/spago.dhall @@ -8,6 +8,7 @@ , "control" , "effect" , "either" + 
, "enums" , "foldable-traversable" , "free" , "functions" diff --git a/src/PureScript/CST/Lexer.purs b/src/PureScript/CST/Lexer.purs index 2d84305..c0ebbe2 100644 --- a/src/PureScript/CST/Lexer.purs +++ b/src/PureScript/CST/Lexer.purs @@ -13,6 +13,7 @@ import Data.Array.NonEmpty as NonEmptyArray import Data.Array.ST as STArray import Data.Char as Char import Data.Either (Either(..)) +import Data.Enum (toEnum) import Data.Foldable (fold, foldl, foldMap) import Data.Int (hexadecimal) import Data.Int as Int @@ -23,6 +24,8 @@ import Data.Newtype (unwrap) import Data.Number as Number import Data.String (Pattern(..), Replacement(..)) import Data.String as String +import Data.String.CodePoints (CodePoint) +import Data.String.CodePoints as SCP import Data.String.CodeUnits as SCU import Data.String.Regex as Regex import Data.String.Regex.Flags (unicode) @@ -36,6 +39,18 @@ import PureScript.CST.Types (Comment(..), IntValue(..), LineFeed(..), ModuleName infixr 3 alt as <|> +class IsChar a where + fromChar :: Char -> a + fromCharCode :: Int -> Maybe a + +instance IsChar Char where + fromChar = identity + fromCharCode = Char.fromCharCode + +instance IsChar CodePoint where + fromChar = SCP.codePointFromChar + fromCharCode = toEnum + data LexResult e a = LexFail e String | LexSucc a String @@ -502,29 +517,37 @@ token = _ -> pure { raw: SCU.singleton ch, char: ch } + parseEscape + :: forall a + . 
IsChar a + => Lex (Unit -> ParseError) { raw :: String, char :: a } parseEscape = do ch <- charAny case ch of 't' -> - pure { raw: "\\t", char: '\t' } + pure { raw: "\\t", char: fromChar '\t' } 'r' -> - pure { raw: "\\r", char: '\r' } + pure { raw: "\\r", char: fromChar '\r' } 'n' -> - pure { raw: "\\n", char: '\n' } + pure { raw: "\\n", char: fromChar '\n' } '"' -> - pure { raw: "\\\"", char: '"' } + pure { raw: "\\\"", char: fromChar '"' } '\'' -> - pure { raw: "\\'", char: '\'' } + pure { raw: "\\'", char: fromChar '\'' } '\\' -> - pure { raw: "\\\\", char: '\\' } + pure { raw: "\\\\", char: fromChar '\\' } 'x' -> parseHexEscape _ -> fail $ LexInvalidCharEscape $ SCU.singleton ch + parseHexEscape + :: forall a + . IsChar a + => Lex (Unit -> ParseError) { raw :: String, char :: a } parseHexEscape = do esc <- hexEscapeRegex - case Char.fromCharCode =<< Int.fromStringAs hexadecimal esc of + case fromCharCode =<< Int.fromStringAs hexadecimal esc of Just ch -> pure { raw: "\\x" <> esc, char: ch } Nothing -> @@ -552,7 +575,7 @@ token = parseStringEscape = ado res <- charBackslash *> parseEscape - in { raw: res.raw, string: SCU.singleton res.char } + in { raw: res.raw, string: SCP.singleton res.char } parseStringChars = ado raw <- stringCharsRegex diff --git a/test/Main.purs b/test/Main.purs index 7e3eda3..d1eecc0 100644 --- a/test/Main.purs +++ b/test/Main.purs @@ -232,3 +232,23 @@ main = do true _ -> false + + assertParse "String with Unicode astral code point hex literal" + """ + "\x10ffff" + """ + case _ of + ParseSucceeded (ExprString _ _) -> + true + _ -> + false + + assertParse "Unicode astral code point Char hex literal" + """ + '\x10ffff' + """ + case _ of + (ParseFailed _ :: RecoveredParserResult Expr) -> + true + _ -> + false