From b27a8cc662bfaa019de25ba8f6f66867e28e87b9 Mon Sep 17 00:00:00 2001
From: John MacFarlane <jgm@berkeley.edu>
Date: Wed, 18 Dec 2024 16:42:21 -0800
Subject: [PATCH] Further improvements to base64 data URI parsing.

Text.Pandoc.URI: export `pBase64DataURI`.  Modify `isURI` to use this
and avoid calling network-uri's inefficient `parseURI` for data URIs.

Markdown reader: use T.P.URI's `pBase64DataURI` in parsing data
URIs.

Partially addresses #10075.

Obsoletes #10434 (borrowing most of its ideas).

Co-authored-by: Evan Silberman <evan@jklol.net>
---
 src/Text/Pandoc/Readers/Markdown.hs | 49 +++++++----------------------
 src/Text/Pandoc/URI.hs              | 39 ++++++++++++++++++++---
 2 files changed, 46 insertions(+), 42 deletions(-)

diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs
index 8035bf16381f..139b853dcf25 100644
--- a/src/Text/Pandoc/Readers/Markdown.hs
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@@ -52,7 +52,7 @@ import Text.Pandoc.Readers.HTML (htmlInBalanced, htmlTag, isBlockTag,
                                  isCommentTag, isInlineTag, isTextTag)
 import Text.Pandoc.Readers.LaTeX (applyMacros, rawLaTeXBlock, rawLaTeXInline)
 import Text.Pandoc.Shared
-import Text.Pandoc.URI (escapeURI, isURI)
+import Text.Pandoc.URI (escapeURI, isURI, pBase64DataURI)
 import Text.Pandoc.XML (fromEntities)
 import Text.Pandoc.Readers.Metadata (yamlBsToMeta, yamlBsToRefs, yamlMetaBlock)
 -- import Debug.Trace (traceShowId)
@@ -1835,47 +1835,22 @@ source = do
   let sourceURL = T.unwords . T.words . T.concat <$> many urlChunk
   let betweenAngles = try $
          char '<' >> mconcat <$> (manyTill litChar (char '>'))
-  src <- try betweenAngles <|> try pBase64DataURI <|> sourceURL
+  src <- try betweenAngles <|> try base64DataURI <|> sourceURL
   tit <- option "" $ try $ spnl >> linkTitle
   skipSpaces
   char ')'
   return (escapeURI $ trimr src, tit)
 
-pBase64DataURI :: PandocMonad m => ParsecT Sources s m Text
-pBase64DataURI = mconcat <$> sequence
-  [ textStr "data:"
-  , T.singleton <$> alphaNum
-  , restrictedName
-  , T.singleton <$> char '/'
-  , restrictedName
-  , textStr ";"
-  , mconcat <$> many (try mediaParam)
-  , textStr "base64,"
-  , pBase64Data
-  ]
- where
-    restrictedName = manyChar (satisfy (A.inClass "A-Za-z0-9!#$&^_.+-"))
-    mediaParam = mconcat <$> sequence
-      [ restrictedName
-      , textStr "="
-      , manyChar (noneOf ";")
-      , textStr ";"
-      ]
-
-pBase64Data :: PandocMonad m => ParsecT Sources s m Text
-pBase64Data = do
-  Sources inps <- getInput
-  case inps of
-    [] -> mzero
-    (pos,t):rest -> do
-      satisfy (A.inClass "A-Za-z0-9+/") -- parse one character or parsec won't know
-                                        -- we have consumed input
-      let (a,r) = T.span (A.inClass "A-Za-z0-9+/") t
-      let (b, trest) = T.span (=='=') r
-      let b64 = a <> b
-      let pos' = incSourceColumn pos (T.length b64)
-      setInput $ Sources ((pos',trest):rest)
-      return b64
+base64DataURI :: PandocMonad m => ParsecT Sources s m Text
+base64DataURI = do
+  Sources ((pos, txt):rest) <- getInput
+  let r = A.parse pBase64DataURI txt
+  case r of
+    A.Done remaining consumed -> do
+      let pos' = incSourceColumn pos (T.length consumed)
+      setInput $ Sources ((pos', remaining):rest)
+      return consumed
+    _ -> mzero
 
 linkTitle :: PandocMonad m => MarkdownParser m Text
 linkTitle = quotedTitle '"' <|> quotedTitle '\''
diff --git a/src/Text/Pandoc/URI.hs b/src/Text/Pandoc/URI.hs
index a0b47d259cd5..53b7288b2211 100644
--- a/src/Text/Pandoc/URI.hs
+++ b/src/Text/Pandoc/URI.hs
@@ -15,6 +15,7 @@ module Text.Pandoc.URI ( urlEncode
                        , isURI
                        , schemes
                        , uriPathToPath
+                       , pBase64DataURI
                        ) where
 import qualified Network.HTTP.Types as HTTP
 import qualified Text.Pandoc.UTF8 as UTF8
@@ -22,6 +23,8 @@ import qualified Data.Text as T
 import qualified Data.Set as Set
 import Data.Char (isSpace, isAscii)
 import Network.URI (URI (uriScheme), parseURI, escapeURIString)
+import qualified Data.Attoparsec.Text as A
+import Control.Applicative (many)
 
 urlEncode :: T.Text -> T.Text
 urlEncode = UTF8.toText . HTTP.urlEncode True . UTF8.fromText
@@ -90,12 +93,16 @@ schemes = Set.fromList
 -- | Check if the string is a valid URL with a IANA or frequently used but
 -- unofficial scheme (see @schemes@).
 isURI :: T.Text -> Bool
-isURI =
-  -- we URI-escape non-ASCII characters because otherwise parseURI will choke:
-  maybe False hasKnownScheme . parseURI . escapeURIString isAscii . T.unpack
+isURI t =
+  -- If it's a base 64 data: URI, avoid the expensive call to parseURI:
+  case A.parseOnly (pBase64DataURI *> A.endOfInput) t of
+    Right () -> True
+    Left _ ->
+      -- we URI-escape non-ASCII characters because otherwise parseURI will choke:
+      maybe False hasKnownScheme . parseURI . escapeURIString isAscii . T.unpack $ t
   where
-    hasKnownScheme = (`Set.member` schemes) . T.toLower .
-                     T.filter (/= ':') . T.pack . uriScheme
+    hasKnownScheme =
+      (`Set.member` schemes) . T.toLower . T.filter (/= ':') . T.pack . uriScheme
 
 -- | Converts the path part of a file: URI to a regular path.
 -- On windows, @/c:/foo@ should be @c:/foo@.
@@ -109,3 +116,25 @@ uriPathToPath (T.unpack -> path) =
 #else
   path
 #endif
+
+pBase64DataURI :: A.Parser T.Text
+pBase64DataURI = fst <$> A.match base64uri
+ where
+  base64uri = do
+    A.string "data:"
+    restrictedName
+    A.char '/'
+    restrictedName
+    A.char ';'
+    many mediaParam
+    A.string "base64,"
+    A.skipWhile (A.inClass "A-Za-z0-9+/")
+    A.skipWhile (== '=')
+  restrictedName = do
+    A.satisfy (A.inClass "A-Za-z0-9")
+    A.skipWhile (A.inClass "A-Za-z0-9!#$&^_.+-")
+  mediaParam = do
+    restrictedName
+    A.char '='
+    A.skipWhile (/=';')
+    A.char ';'