From b27a8cc662bfaa019de25ba8f6f66867e28e87b9 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Wed, 18 Dec 2024 16:42:21 -0800 Subject: [PATCH] Further improvements to base64 data URI parsing. Text.Pandoc.URI: export `pBase64DataURI`. Modify `isURI` to use this and avoid calling network-uri's inefficient `parseURI` for data URIs. Markdown reader: use T.P.URI's `pBase64DataURI` in parsing data URIs. Partially addresses #10075. Obsoletes #10434 (borrowing most of its ideas). Co-authored-by: Evan Silberman --- src/Text/Pandoc/Readers/Markdown.hs | 49 +++++++---------------------- src/Text/Pandoc/URI.hs | 39 ++++++++++++++++++++--- 2 files changed, 46 insertions(+), 42 deletions(-) diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index 8035bf16381f..139b853dcf25 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -52,7 +52,7 @@ import Text.Pandoc.Readers.HTML (htmlInBalanced, htmlTag, isBlockTag, isCommentTag, isInlineTag, isTextTag) import Text.Pandoc.Readers.LaTeX (applyMacros, rawLaTeXBlock, rawLaTeXInline) import Text.Pandoc.Shared -import Text.Pandoc.URI (escapeURI, isURI) +import Text.Pandoc.URI (escapeURI, isURI, pBase64DataURI) import Text.Pandoc.XML (fromEntities) import Text.Pandoc.Readers.Metadata (yamlBsToMeta, yamlBsToRefs, yamlMetaBlock) -- import Debug.Trace (traceShowId) @@ -1835,47 +1835,22 @@ source = do let sourceURL = T.unwords . T.words . T.concat <$> many urlChunk let betweenAngles = try $ char '<' >> mconcat <$> (manyTill litChar (char '>')) - src <- try betweenAngles <|> try pBase64DataURI <|> sourceURL + src <- try betweenAngles <|> try base64DataURI <|> sourceURL tit <- option "" $ try $ spnl >> linkTitle skipSpaces char ')' return (escapeURI $ trimr src, tit) -pBase64DataURI :: PandocMonad m => ParsecT Sources s m Text -pBase64DataURI = mconcat <$> sequence - [ textStr "data:" - , T.singleton <$> alphaNum - , restrictedName - , T.singleton <$> char '/' - , restrictedName - , textStr ";" - , mconcat <$> many (try mediaParam) - , textStr "base64," - , pBase64Data - ] - where - restrictedName = manyChar (satisfy (A.inClass "A-Za-z0-9!#$&^_.+-")) - mediaParam = mconcat <$> sequence - [ restrictedName - , textStr "=" - , manyChar (noneOf ";") - , textStr ";" - ] - -pBase64Data :: PandocMonad m => ParsecT Sources s m Text -pBase64Data = do - Sources inps <- getInput - case inps of - [] -> mzero - (pos,t):rest -> do - satisfy (A.inClass "A-Za-z0-9+/") -- parse one character or parsec won't know - -- we have consumed input - let (a,r) = T.span (A.inClass "A-Za-z0-9+/") t - let (b, trest) = T.span (=='=') r - let b64 = a <> b - let pos' = incSourceColumn pos (T.length b64) - setInput $ Sources ((pos',trest):rest) - return b64 +base64DataURI :: PandocMonad m => ParsecT Sources s m Text +base64DataURI = do + Sources ((pos, txt):rest) <- getInput + let r = A.parse pBase64DataURI txt + case r of + A.Done remaining consumed -> do + let pos' = incSourceColumn pos (T.length consumed) + setInput $ Sources ((pos', remaining):rest) + return consumed + _ -> mzero linkTitle :: PandocMonad m => MarkdownParser m Text linkTitle = quotedTitle '"' <|> quotedTitle '\'' diff --git a/src/Text/Pandoc/URI.hs b/src/Text/Pandoc/URI.hs index a0b47d259cd5..53b7288b2211 100644 --- a/src/Text/Pandoc/URI.hs +++ b/src/Text/Pandoc/URI.hs @@ -15,6 +15,7 @@ module Text.Pandoc.URI ( urlEncode , isURI , schemes , uriPathToPath + , pBase64DataURI ) where import qualified Network.HTTP.Types as HTTP import qualified Text.Pandoc.UTF8 as UTF8 @@ -22,6 +23,8 @@ import qualified Data.Text as T import qualified Data.Set as Set import Data.Char (isSpace, isAscii) import Network.URI (URI (uriScheme), parseURI, escapeURIString) +import qualified Data.Attoparsec.Text as A +import Control.Applicative (many) urlEncode :: T.Text -> T.Text urlEncode = UTF8.toText . HTTP.urlEncode True . UTF8.fromText @@ -90,12 +93,16 @@ schemes = Set.fromList -- | Check if the string is a valid URL with a IANA or frequently used but -- unofficial scheme (see @schemes@). isURI :: T.Text -> Bool -isURI = - -- we URI-escape non-ASCII characters because otherwise parseURI will choke: - maybe False hasKnownScheme . parseURI . escapeURIString isAscii . T.unpack +isURI t = + -- If it's a base 64 data: URI, avoid the expensive call to parseURI: + case A.parseOnly (pBase64DataURI *> A.endOfInput) t of + Right () -> True + Left _ -> + -- we URI-escape non-ASCII characters because otherwise parseURI will choke: + maybe False hasKnownScheme . parseURI . escapeURIString isAscii . T.unpack $ t where - hasKnownScheme = (`Set.member` schemes) . T.toLower . - T.filter (/= ':') . T.pack . uriScheme + hasKnownScheme = + (`Set.member` schemes) . T.toLower . T.filter (/= ':') . T.pack . uriScheme -- | Converts the path part of a file: URI to a regular path. -- On windows, @/c:/foo@ should be @c:/foo@. @@ -109,3 +116,25 @@ uriPathToPath (T.unpack -> path) = #else path #endif + +pBase64DataURI :: A.Parser T.Text +pBase64DataURI = fst <$> A.match base64uri + where + base64uri = do + A.string "data:" + restrictedName + A.char '/' + restrictedName + A.char ';' + many mediaParam + A.string "base64," + A.skipWhile (A.inClass "A-Za-z0-9+/") + A.skipWhile (== '=') + restrictedName = do + A.satisfy (A.inClass "A-Za-z0-9") + A.skipWhile (A.inClass "A-Za-z0-9!#$&^_.+-") + mediaParam = do + restrictedName + A.char '=' + A.skipWhile (/=';') + A.char ';'