From 126e3330b9805f1ed632939821998afadcd10744 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Wed, 4 Dec 2024 12:18:13 -0800
Subject: [PATCH] Parameterize Roff escaping

The existing lexRoff does some stuff I don't want to deal with in mdoc
just yet, like lexing tbl, and some stuff I won't do at all, like
handling macro and text string definitions and switching between modes.
Uses a typeclass with associated type families to reuse most of the
escaping code between Roff (i.e. man) and Mdoc.

Future work could improve on this so that more lexing code could be
shared between Man and Mdoc. Mdoc inherits Roff's surface syntax so
hypothetically it makes sense to lex it into tokens that make sense for
roff. But it happens that the Mdoc parser is much easier to build with
an Mdoc specific token stream. Some discussion in jgm/pandoc#10225 about
the rationale.

Adds a test for the roff \A escape, which I accidentally dropped support
for in an earlier iteration without anything complaining.
---
 pandoc.cabal                           |   1 +
 src/Text/Pandoc/Readers/Roff.hs        | 278 +++++--------------------
 src/Text/Pandoc/Readers/Roff/Escape.hs | 222 ++++++++++++++++++++
 test/command/man-defines.md            |  15 ++
 4 files changed, 295 insertions(+), 221 deletions(-)
 create mode 100644 src/Text/Pandoc/Readers/Roff/Escape.hs
 create mode 100644 test/command/man-defines.md

diff --git a/pandoc.cabal b/pandoc.cabal
index 32ebcd43a6aa..2896d1ee513f 100644
--- a/pandoc.cabal
+++ b/pandoc.cabal
@@ -731,6 +731,7 @@ library
                    Text.Pandoc.Readers.Org.Shared,
                    Text.Pandoc.Readers.Metadata,
                    Text.Pandoc.Readers.Roff,
+                   Text.Pandoc.Readers.Roff.Escape,
                    Text.Pandoc.Writers.Docx.OpenXML,
                    Text.Pandoc.Writers.Docx.StyleMap,
                    Text.Pandoc.Writers.Docx.Table,
diff --git a/src/Text/Pandoc/Readers/Roff.hs b/src/Text/Pandoc/Readers/Roff.hs
index 429b90ce16ca..607b7c92239c 100644
--- a/src/Text/Pandoc/Readers/Roff.hs
+++ b/src/Text/Pandoc/Readers/Roff.hs
@@ -2,6 +2,7 @@
 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
 {-# LANGUAGE OverloadedStrings #-}
 {-# LANGUAGE ViewPatterns #-}
+{-# LANGUAGE TypeFamilies #-}
 {- |
    Module      : Text.Pandoc.Readers.Roff
    Copyright   : Copyright (C) 2018-2020 Yan Pashkovsky and John MacFarlane
@@ -29,11 +30,11 @@ module Text.Pandoc.Readers.Roff
 where
 
 import Safe (lastDef)
-import Control.Monad (void, mzero, mplus, guard)
+import Control.Monad (void, guard)
 import Control.Monad.Except (throwError)
 import Text.Pandoc.Class.PandocMonad
        (getResourcePath, readFileFromDirs, PandocMonad(..), report)
-import Data.Char (isLower, toLower, toUpper, chr, isAscii, isAlphaNum)
+import Data.Char (isLower, toLower, toUpper, isAlphaNum)
 import Data.Default (Default)
 import qualified Data.Map as M
 import Data.List (intercalate)
@@ -42,10 +43,9 @@ import Text.Pandoc.Logging (LogMessage(..))
 import Text.Pandoc.Options
 import Text.Pandoc.Parsing
 import Text.Pandoc.Shared (safeRead)
-import Text.Pandoc.RoffChar (characterCodes, combiningAccents)
+import Text.Pandoc.Readers.Roff.Escape
 import qualified Data.Sequence as Seq
 import qualified Data.Foldable as Foldable
-import qualified Data.Text.Normalize as Normalize
 
 -- import Debug.Trace (traceShowId)
 
@@ -65,6 +65,59 @@ data LinePart = RoffStr T.Text
               | MacroArg Int
               deriving Show
 
+instance RoffLikeLexer RoffTokens where
+  -- The token stream is a list of 'LinePart's
+  type Token RoffTokens = [LinePart]
+  type State RoffTokens = RoffState
+  emit t = [RoffStr t]
+  expandString = try $ do
+    pos <- getPosition
+    char '\\'
+    char '*'
+    cs <- escapeArg <|> countChar 1 anyChar
+    s <- linePartsToText <$> resolveText cs pos
+    addToInput s
+  escString = try $ do
+    pos <- getPosition
+    (do cs <- escapeArg <|> countChar 1 anyChar
+        resolveText cs pos)
+      <|> mempty <$ char 'S'
+  backslash = do
+    char '\\'
+    mode <- roffMode <$> getState
+    case mode of
+      -- experimentally, it seems you don't always need to double
+      -- the backslash in macro defs.  It's essential with \\$1,
+      -- but not with \\f[I].  So we make the second one optional.
+      CopyMode   -> optional $ char '\\'
+      NormalMode -> return ()
+  checkDefined name = do
+    macros <- customMacros <$> getState
+    case M.lookup name macros of
+      Just _  -> return [RoffStr "1"]
+      Nothing -> return [RoffStr "0"]
+  -- \E is ignored in copy mode
+  escE = do
+      mode <- roffMode <$> getState
+      case mode of
+        CopyMode   -> return mempty
+        NormalMode -> return [RoffStr "\\"]
+  escFont = do
+    font <- escapeArg <|> countChar 1 alphaNum
+    font' <- if T.null font || font == "P"
+                then prevFont <$> getState
+                else return $ foldr processFontLetter defaultFontSpec $ T.unpack font
+    updateState $ \st -> st{ prevFont = currentFont st
+                           , currentFont = font' }
+    return [Font font']
+    where
+      processFontLetter c fs
+                | isLower c    = processFontLetter (toUpper c) fs
+      processFontLetter 'B' fs = fs{ fontBold = True }
+      processFontLetter 'I' fs = fs{ fontItalic = True }
+      processFontLetter 'C' fs = fs{ fontMonospace = True }
+      processFontLetter _   fs = fs -- do nothing
+
 type Arg = [LinePart]
 
 type TableOption = (T.Text, T.Text)
@@ -133,198 +186,6 @@ eofline = void newline <|> eof <|> () <$ lookAhead (string "\\}")
 spacetab :: (Stream s m Char, UpdateSourcePos s Char) => ParsecT s u m Char
 spacetab = char ' ' <|> char '\t'
 
-characterCodeMap :: M.Map T.Text Char
-characterCodeMap =
-  M.fromList $ map (\(x,y) -> (y,x)) characterCodes
-
-combiningAccentsMap :: M.Map T.Text Char
-combiningAccentsMap =
-  M.fromList $ map (\(x,y) -> (y,x)) combiningAccents
-
-escape :: PandocMonad m => RoffLexer m [LinePart]
-escape = try $ do
-  backslash
-  escapeGlyph <|> escapeNormal
-
-escapeGlyph :: PandocMonad m => RoffLexer m [LinePart]
-escapeGlyph = do
-  c <- lookAhead (oneOf ['[','('])
-  escapeArg >>= resolveGlyph c
-
-resolveGlyph :: PandocMonad m => Char -> T.Text -> RoffLexer m [LinePart]
-resolveGlyph delimChar glyph = do
-  let cs = T.replace "_u" " u" glyph -- unicode glyphs separated by _
-  (case T.words cs of
-      []  -> mzero
-      [s] -> case M.lookup s characterCodeMap `mplus` readUnicodeChar s of
-               Nothing -> mzero
-               Just c  -> return [RoffStr $ T.singleton c]
-      (s:ss) -> do
-        basechar <- case M.lookup s characterCodeMap `mplus`
-                         readUnicodeChar s of
-                      Nothing ->
-                        case T.unpack s of
-                          [ch] | isAscii ch && isAlphaNum ch ->
-                                 return ch
-                          _ -> mzero
-                      Just c  -> return c
-        let addAccents [] xs = return $ Normalize.normalize Normalize.NFC $
-                                        T.reverse xs
-            addAccents (a:as) xs =
-              case M.lookup a combiningAccentsMap `mplus` readUnicodeChar a of
-                Just x  -> addAccents as $ T.cons x xs
-                Nothing -> mzero
-        addAccents ss (T.singleton basechar) >>= \xs -> return [RoffStr xs])
-      <|> case delimChar of
-            '['  -> escUnknown ("\\[" <> glyph <> "]")
-            '('  -> escUnknown ("\\(" <> glyph)
-            '\'' -> escUnknown ("\\C'" <> glyph <> "'")
-            _    -> Prelude.fail "resolveGlyph: unknown glyph delimiter"
-
-readUnicodeChar :: T.Text -> Maybe Char
-readUnicodeChar t = case T.uncons t of
-  Just ('u', cs) | T.length cs > 3 -> chr <$> safeRead ("0x" <> cs)
-  _ -> Nothing
-
-escapeNormal :: PandocMonad m => RoffLexer m [LinePart]
-escapeNormal = do
-  c <- noneOf "{}"
-  optional expandString
-  case c of
-    ' ' -> return [RoffStr " "]
-    '"' -> mempty <$ skipMany (satisfy (/='\n')) -- line comment
-    '#' -> mempty <$ manyTill anyChar newline
-    '%' -> return mempty  -- optional hyphenation
-    '&' -> return mempty  -- nonprintable zero-width
-    ')' -> return mempty  -- nonprintable zero-width
-    '*' -> escString
-    ',' -> return mempty  -- to fix spacing after roman
-    '-' -> return [RoffStr "-"]
-    '.' -> return [RoffStr "."]
-    '/' -> return mempty  -- to fix spacing before roman
-    '0' -> return [RoffStr "\x2007"] -- digit-width space
-    ':' -> return mempty  -- zero-width break
-    'A' -> quoteArg >>= checkDefined
-    'B' -> escIgnore 'B' [quoteArg]
-    'C' -> quoteArg >>= resolveGlyph '\''
-    'D' -> escIgnore 'D' [quoteArg]
-    'E' -> do
-      mode <- roffMode <$> getState
-      case mode of
-        CopyMode   -> return mempty
-        NormalMode -> return [RoffStr "\\"]
-    'H' -> escIgnore 'H' [quoteArg]
-    'L' -> escIgnore 'L' [quoteArg]
-    'M' -> escIgnore 'M' [escapeArg, countChar 1 (satisfy (/='\n'))]
-    'N' -> escIgnore 'N' [quoteArg]
-    'O' -> escIgnore 'O' [countChar 1 (oneOf ['0','1'])]
-    'R' -> escIgnore 'R' [quoteArg]
-    'S' -> escIgnore 'S' [quoteArg]
-    'V' -> escIgnore 'V' [escapeArg, countChar 1 alphaNum]
-    'X' -> escIgnore 'X' [quoteArg]
-    'Y' -> escIgnore 'Y' [escapeArg, countChar 1 (satisfy (/='\n'))]
-    'Z' -> escIgnore 'Z' [quoteArg]
-    '\'' -> return [RoffStr "'"]
-    '\n' -> return mempty  -- line continuation
-    '^' -> return [RoffStr "\x200A"] -- 1/12 em space
-    '_' -> return [RoffStr "_"]
-    '`' -> return [RoffStr "`"]
-    'a' -> return mempty  -- "non-interpreted leader character"
-    'b' -> escIgnore 'b' [quoteArg]
-    'c' -> return mempty  -- interrupt text processing
-    'd' -> escIgnore 'd' [] -- forward down 1/2em
-    'e' -> return [RoffStr "\\"]
-    'f' -> escFont
-    'g' -> escIgnore 'g' [escapeArg, countChar 1 (satisfy (/='\n'))]
-    'h' -> escIgnore 'h' [quoteArg]
-    'k' -> escIgnore 'k' [escapeArg, countChar 1 (satisfy (/='\n'))]
-    'l' -> escIgnore 'l' [quoteArg]
-    'm' -> escIgnore 'm' [escapeArg, countChar 1 (satisfy (/='\n'))]
-    'n' -> escIgnore 'm' [escapeArg, countChar 1 (satisfy (/='\n'))]
-    'o' -> escIgnore 'o' [quoteArg]
-    'p' -> escIgnore 'p' []
-    'r' -> escIgnore 'r' []
-    's' -> escIgnore 's' [escapeArg, signedNumber]
-    't' -> return [RoffStr "\t"]
-    'u' -> escIgnore 'u' []
-    'v' -> escIgnore 'v' [quoteArg]
-    'w' -> escIgnore 'w' [quoteArg]
-    'x' -> escIgnore 'x' [quoteArg]
-    'z' -> escIgnore 'z' [countChar 1 anyChar]
-    '|' -> return [RoffStr "\x2006"] --1/6 em space
-    '~' -> return [RoffStr "\160"] -- nonbreaking space
-    '\\' -> do
-      mode <- roffMode <$> getState
-      case mode of
-        CopyMode   -> char '\\'
-        NormalMode -> return '\\'
-      return [RoffStr "\\"]
-    _   -> return [RoffStr $ T.singleton c]
-    -- man 7 groff: "If  a  backslash  is followed by a character that
-    -- does not constitute a defined escape sequence, the backslash
-    -- is  silently  ignored  and  the character maps to itself."
-
-escIgnore :: PandocMonad m
-          => Char
-          -> [RoffLexer m T.Text]
-          -> RoffLexer m [LinePart]
-escIgnore c argparsers = do
-  pos <- getPosition
-  arg <- snd <$> withRaw (choice argparsers) <|> return ""
-  report $ SkippedContent ("\\" <> T.cons c arg) pos
-  return mempty
-
-escUnknown :: PandocMonad m => T.Text -> RoffLexer m [LinePart]
-escUnknown s = do
-  pos <- getPosition
-  report $ SkippedContent s pos
-  return [RoffStr "\xFFFD"]
-
-signedNumber :: PandocMonad m => RoffLexer m T.Text
-signedNumber = try $ do
-  sign <- option "" ("-" <$ char '-' <|> "" <$ char '+')
-  ds <- many1Char digit
-  return (sign <> ds)
-
--- Parses: [..] or (..
-escapeArg :: PandocMonad m => RoffLexer m T.Text
-escapeArg = choice
-    [ char '[' *> optional expandString *>
-                  manyTillChar (noneOf ['\n',']']) (char ']')
-    , char '(' *> optional expandString *>
-                  countChar 2 (satisfy (/='\n'))
-    ]
-
-expandString :: PandocMonad m => RoffLexer m ()
-expandString = try $ do
-  pos <- getPosition
-  char '\\'
-  char '*'
-  cs <- escapeArg <|> countChar 1 anyChar
-  s <- linePartsToText <$> resolveText cs pos
-  addToInput s
-
--- Parses: '..'
-quoteArg :: PandocMonad m => RoffLexer m T.Text
-quoteArg = char '\'' *> manyTillChar (noneOf ['\n','\'']) (char '\'')
-
-escFont :: PandocMonad m => RoffLexer m [LinePart]
-escFont = do
-  font <- escapeArg <|> countChar 1 alphaNum
-  font' <- if T.null font || font == "P"
-              then prevFont <$> getState
-              else return $ foldr processFontLetter defaultFontSpec $ T.unpack font
-  updateState $ \st -> st{ prevFont = currentFont st
-                         , currentFont = font' }
-  return [Font font']
-  where
-    processFontLetter c fs
-              | isLower c    = processFontLetter (toUpper c) fs
-    processFontLetter 'B' fs = fs{ fontBold = True }
-    processFontLetter 'I' fs = fs{ fontItalic = True }
-    processFontLetter 'C' fs = fs{ fontMonospace = True }
-    processFontLetter _   fs = fs -- do nothing
-
 -- separate function from lexMacro since real man files sometimes do not
 -- follow the rules
 lexComment :: PandocMonad m => RoffLexer m RoffTokens
@@ -624,20 +485,6 @@ lexArgs = do
         char '"'
         return [RoffStr "\""]
 
-checkDefined :: PandocMonad m => T.Text -> RoffLexer m [LinePart]
-checkDefined name = do
-  macros <- customMacros <$> getState
-  case M.lookup name macros of
-    Just _  -> return [RoffStr "1"]
-    Nothing -> return [RoffStr "0"]
-
-escString :: PandocMonad m => RoffLexer m [LinePart]
-escString = try $ do
-  pos <- getPosition
-  (do cs <- escapeArg <|> countChar 1 anyChar
-      resolveText cs pos)
-    <|> mempty <$ char 'S'
-
 -- strings and macros share namespace
 resolveText :: PandocMonad m
               => T.Text -> SourcePos -> RoffLexer m [LinePart]
@@ -668,17 +515,6 @@ linePart :: PandocMonad m => RoffLexer m [LinePart]
 linePart = macroArg <|> escape <|>
            regularText <|> quoteChar <|> spaceTabChar
 
-backslash :: PandocMonad m => RoffLexer m ()
-backslash = do
-  char '\\'
-  mode <- roffMode <$> getState
-  case mode of
-    -- experimentally, it seems you don't always need to double
-    -- the backslash in macro defs.  It's essential with \\$1,
-    -- but not with \\f[I].  So we make the second one optional.
-    CopyMode   -> optional $ char '\\'
-    NormalMode -> return ()
-
 macroArg :: PandocMonad m => RoffLexer m [LinePart]
 macroArg = try $ do
   pos <- getPosition
diff --git a/src/Text/Pandoc/Readers/Roff/Escape.hs b/src/Text/Pandoc/Readers/Roff/Escape.hs
new file mode 100644
index 000000000000..f797ab165155
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Roff/Escape.hs
@@ -0,0 +1,222 @@
+{-# LANGUAGE FlexibleContexts #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE MultiParamTypeClasses #-}
+{-# LANGUAGE TypeFamilyDependencies #-}
+module Text.Pandoc.Readers.Roff.Escape
+  ( escape,
+    escapeArg,
+    RoffLikeLexer(..),
+  )
+where
+import Text.Pandoc.Class.PandocMonad
+    ( PandocMonad(..), report, PandocMonad(..), report )
+import Control.Monad
+    ( mzero, mplus, mzero, mplus )
+import Data.Char (chr, isAscii, isAlphaNum)
+import qualified Data.Map as M
+import qualified Data.Text as T
+import Text.Pandoc.Logging (LogMessage(..))
+import Text.Pandoc.Parsing
+import Text.Pandoc.Shared (safeRead)
+import qualified Data.Text.Normalize as Normalize
+import Text.Pandoc.RoffChar (characterCodes, combiningAccents)
+
+-- | Functions and typeclass for escaping special characters in languages
+-- that inherit the Roff syntax.
+--
+-- For various reasons, the mdoc reader doesn't directly reuse the previously
+-- existing roff lexer. The main one is to make it possible to simultaneously
+-- process roff escapes and to tokenize mdoc macros correctly based on the
+-- control line contents. The extracted interface here allows the same `escape`
+-- function to work with lexers that target different token types and support
+-- different subsets of the original roff language.
+
+type Lexer m x = ParsecT Sources (State x) m
+
+-- | Lexers for Roff macro
+class (Monoid (Token x)) => RoffLikeLexer x where
+  -- | Type family for the lexer state
+  type State x = a | a -> x
+  -- | Type family for the token type being lexed
+  type Token x = a | a -> x
+  -- | Turn a `T.Text` into a token of the output
+  emit :: T.Text -> Token x
+  -- | Attempt to parse a roff predefined string sequence and push its expansion
+  -- onto the input stream.
+  expandString :: PandocMonad m => Lexer m x ()
+  -- | Parse the name of a defined string and return its expansion as a `Token`
+  escString :: PandocMonad m => Lexer m x (Token x)
+  -- | Parse the escape character
+  backslash :: PandocMonad m => Lexer m x ()
+  -- | If the given custom macro is defined in this document, emit a
+  -- tokenized "1", otherwise emit a tokenized "0", implementing the roff
+  -- escape @\A@. The default case assumes an implementation that doesn't
+  -- support macro definition and always emits null.
+  checkDefined :: PandocMonad m => T.Text -> Lexer m x (Token x)
+  -- | Output an appropriate token for the @\E@ escape sequence. In roff, @\E@
+  -- is an "escape character intended to not be interpreted in copy mode".
+  -- If the lexer being defined doesn't implement copy mode, @\E@ can just be
+  -- lexed by 'backslash'
+  escE :: PandocMonad m => Lexer m x (Token x)
+  -- | Lex the low-level roff font selection escape @\f@.
+  escFont :: PandocMonad m => Lexer m x (Token x)
+
+
+characterCodeMap :: M.Map T.Text Char
+characterCodeMap =
+  M.fromList $ map (\(x,y) -> (y,x)) characterCodes
+
+combiningAccentsMap :: M.Map T.Text Char
+combiningAccentsMap =
+  M.fromList $ map (\(x,y) -> (y,x)) combiningAccents
+
+escape ::  (PandocMonad m, RoffLikeLexer x) => Lexer m x (Token x)
+escape = try $ do
+  backslash
+  escapeGlyph <|> escapeNormal
+
+escapeGlyph :: (PandocMonad m, RoffLikeLexer x) => Lexer m x (Token x)
+escapeGlyph = do
+  c <- lookAhead (oneOf ['[','('])
+  escapeArg >>= resolveGlyph c
+
+resolveGlyph :: (PandocMonad m, RoffLikeLexer x) => Char -> T.Text -> Lexer m x (Token x)
+resolveGlyph delimChar glyph = do
+  let cs = T.replace "_u" " u" glyph -- unicode glyphs separated by _
+  (case T.words cs of
+      []  -> mzero
+      [s] -> case M.lookup s characterCodeMap `mplus` readUnicodeChar s of
+               Nothing -> mzero
+               Just c  -> return $ emit $ T.singleton c
+      (s:ss) -> do
+        basechar <- case M.lookup s characterCodeMap `mplus`
+                         readUnicodeChar s of
+                      Nothing ->
+                        case T.unpack s of
+                          [ch] | isAscii ch && isAlphaNum ch ->
+                                 return ch
+                          _ -> mzero
+                      Just c  -> return c
+        let addAccents [] xs = return $ Normalize.normalize Normalize.NFC $
+                                        T.reverse xs
+            addAccents (a:as) xs =
+              case M.lookup a combiningAccentsMap `mplus` readUnicodeChar a of
+                Just x  -> addAccents as $ T.cons x xs
+                Nothing -> mzero
+        addAccents ss (T.singleton basechar) >>= \xs -> return $ emit xs)
+      <|> case delimChar of
+            '['  -> escUnknown ("\\[" <> glyph <> "]")
+            '('  -> escUnknown ("\\(" <> glyph)
+            '\'' -> escUnknown ("\\C'" <> glyph <> "'")
+            _    -> Prelude.fail "resolveGlyph: unknown glyph delimiter"
+
+readUnicodeChar :: T.Text -> Maybe Char
+readUnicodeChar t = case T.uncons t of
+  Just ('u', cs) | T.length cs > 3 -> chr <$> safeRead ("0x" <> cs)
+  _ -> Nothing
+
+escapeNormal :: (PandocMonad m, RoffLikeLexer x) => Lexer m x (Token x)
+escapeNormal = do
+  c <- noneOf "{}"
+  optional expandString
+  let groffSkip = [escapeArg, countChar 1 (satisfy (/='\n'))]
+  case c of
+    ' ' -> return $ emit " " -- mandoc_char(7) says this should be a nonbreaking space
+    '"' -> mempty <$ skipMany (satisfy (/='\n')) -- line comment
+    '#' -> mempty <$ manyTill anyChar newline
+    '%' -> return mempty  -- optional hyphenation
+    '&' -> return mempty  -- nonprintable zero-width
+    ')' -> return mempty  -- nonprintable zero-width
+    '*' -> escString
+    ',' -> return mempty  -- to fix spacing after roman
+    '-' -> return $ emit "-"
+    '.' -> return $ emit "."
+    '/' -> return mempty  -- to fix spacing before roman
+    '0' -> return $ emit "\x2007" -- digit-width space
+    ':' -> return mempty  -- zero-width break
+    'A' -> quoteArg >>= checkDefined
+    'B' -> escIgnore 'B' [quoteArg]
+    'C' -> quoteArg >>= resolveGlyph '\''
+    'D' -> escIgnore 'D' [quoteArg]
+    'E' -> escE
+    'F' -> escIgnore 'F' groffSkip
+    'H' -> escIgnore 'H' [quoteArg]
+    'L' -> escIgnore 'L' [quoteArg]
+    'M' -> escIgnore 'M' groffSkip
+    'N' -> escIgnore 'N' [quoteArg]
+    'O' -> escIgnore 'O' [countChar 1 (oneOf ['0','1'])]
+    'R' -> escIgnore 'R' [quoteArg]
+    'S' -> escIgnore 'S' [quoteArg]
+    'V' -> escIgnore 'V' groffSkip
+    'X' -> escIgnore 'X' [quoteArg]
+    'Y' -> escIgnore 'Y' groffSkip
+    'Z' -> escIgnore 'Z' [quoteArg]
+    '\'' -> return $ emit "'"
+    '\n' -> return mempty  -- line continuation
+    '^' -> return $ emit "\x200A" -- 1/12 em space
+    '_' -> return $ emit "_"
+    '`' -> return $ emit "`"
+    'a' -> return mempty  -- "non-interpreted leader character"
+    'b' -> escIgnore 'b' [quoteArg]
+    'c' -> return mempty  -- interrupt text processing
+    'd' -> escIgnore 'd' [] -- forward down 1/2em
+    'e' -> return $ emit "\\"
+    'f' -> escFont
+    'g' -> escIgnore 'g' groffSkip
+    'h' -> escIgnore 'h' [quoteArg]
+    'k' -> escIgnore 'k' groffSkip
+    'l' -> escIgnore 'l' [quoteArg]
+    'm' -> escIgnore 'm' groffSkip
+    'n' -> escIgnore 'm' groffSkip
+    'o' -> escIgnore 'o' [quoteArg]
+    'p' -> escIgnore 'p' []
+    'r' -> escIgnore 'r' []
+    's' -> escIgnore 's' [escapeArg, signedNumber]
+    't' -> return $ emit "\t"
+    'u' -> escIgnore 'u' []
+    'v' -> escIgnore 'v' [quoteArg]
+    'w' -> escIgnore 'w' [quoteArg]
+    'x' -> escIgnore 'x' [quoteArg]
+    'z' -> escIgnore 'z' [countChar 1 anyChar]
+    '|' -> return $ emit "\x2006" --1/6 em space
+    '~' -> return $ emit "\160" -- nonbreaking space
+    '\\' -> return $ emit "\\"
+    _   -> return $ emit $ T.singleton c
+    -- man 7 groff: "If  a  backslash  is followed by a character that
+    -- does not constitute a defined escape sequence, the backslash
+    -- is  silently  ignored  and  the character maps to itself."
+
+escIgnore :: (PandocMonad m, RoffLikeLexer x)
+          => Char
+          -> [Lexer m x T.Text]
+          -> Lexer m x (Token x)
+escIgnore c argparsers = do
+  pos <- getPosition
+  arg <- snd <$> withRaw (choice argparsers) <|> return ""
+  report $ SkippedContent ("\\" <> T.cons c arg) pos
+  return mempty
+
+escUnknown :: (PandocMonad m, RoffLikeLexer x) => T.Text -> Lexer m x (Token x)
+escUnknown s = do
+  pos <- getPosition
+  report $ SkippedContent s pos
+  return $ emit "\xFFFD"
+
+signedNumber :: (PandocMonad m, RoffLikeLexer x) => Lexer m x T.Text
+signedNumber = try $ do
+  sign <- option "" ("-" <$ char '-' <|> "" <$ char '+')
+  ds <- many1Char digit
+  return (sign <> ds)
+
+-- Parses: [..] or (..
+escapeArg :: (PandocMonad m, RoffLikeLexer x) => Lexer m x T.Text
+escapeArg = choice
+    [ char '[' *> optional expandString *>
+                  manyTillChar (noneOf ['\n',']']) (char ']')
+    , char '(' *> optional expandString *>
+                  countChar 2 (satisfy (/='\n'))
+    ]
+
+-- Parses: '..'
+quoteArg :: (PandocMonad m, RoffLikeLexer x) => Lexer m x T.Text
+quoteArg = char '\'' *> manyTillChar (noneOf ['\n','\'']) (char '\'')
diff --git a/test/command/man-defines.md b/test/command/man-defines.md
new file mode 100644
index 000000000000..87103a8824e1
--- /dev/null
+++ b/test/command/man-defines.md
@@ -0,0 +1,15 @@
+```
+% pandoc -f man -t plain
+.de test
+ok
+..
+.test
+.br
+\A'test'
+.br
+\A'xyz'
+^D
+ok
+1
+0
+```