From 3998da767b1d52409cb604ec608b58d9a1c8f59f Mon Sep 17 00:00:00 2001
From: Jason Manuel <jama.indo@hotmail.com>
Date: Wed, 6 Nov 2024 00:28:06 -0700
Subject: [PATCH 1/4] Display nice errors from Python tokenizer exceptions

---
 concat/__main__.py                            | 147 +++++++++++-------
 concat/error_reporting.py                     |  21 ++-
 concat/lex.py                                 |  85 +++++++---
 concat/lsp/__init__.py                        |  57 ++++---
 concat/stdlib/repl.py                         |  11 +-
 .../stdlib/test_python_concat_interface.py    |   4 +-
 concat/tests/test_lex.py                      |  19 ++-
 concat/tests/test_typecheck.py                |   2 +-
 concat/typecheck/__init__.py                  |  24 ++-
 9 files changed, 264 insertions(+), 106 deletions(-)
diff --git a/concat/__main__.py b/concat/__main__.py
index 3d71f3a..3c3a848 100644
--- a/concat/__main__.py
+++ b/concat/__main__.py
@@ -2,7 +2,12 @@
 
 import argparse
 from concat.transpile import parse, transpile_ast, typecheck
-from concat.error_reporting import get_line_at, create_parsing_failure_message
+from concat.error_reporting import (
+    get_line_at,
+    create_indentation_error_message,
+    create_lexical_error_message,
+    create_parsing_failure_message,
+)
 import concat.execute
 import concat.lex
 import concat.parser_combinators
@@ -11,7 +16,7 @@
 import json
 import os.path
 import sys
-from typing import Callable, IO, AnyStr
+from typing import Callable, IO, AnyStr, assert_never
 
 
 filename = '<stdin>'
@@ -55,69 +60,99 @@ def func(name: str) -> IO[AnyStr]:
     help='tokenize input from the given file and print the tokens as a JSON array',
 )
 
+
+def main():
+    # interactive mode
+    if args.file.isatty():
+        concat.stdlib.repl.repl([], [], args.debug)
+    else:
+        try:
+            token_results = concat.lex.tokenize(args.file.read())
+            tokens = list[concat.lex.Token]()
+            for r in token_results:
+                if r.type == 'token':
+                    tokens.append(r.token)
+                elif r.type == 'indent-err':
+                    position = (r.err.lineno or 1, r.err.offset or 0)
+                    message = r.err.msg
+                    print('Indentation error:')
+                    print(
+                        create_indentation_error_message(
+                            args.file, position, message
+                        )
+                    )
+                elif r.type == 'token-err':
+                    position = r.location
+                    message = str(r.err)
+                    print('Lexical error:')
+                    print(
+                        create_lexical_error_message(
+                            args.file, position, message
+                        )
+                    )
+                else:
+                    assert_never(r)
+            concat_ast = parse(tokens)
+            recovered_parsing_failures = concat_ast.parsing_failures
+            for failure in recovered_parsing_failures:
+                print('Parse Error:')
+                print(
+                    create_parsing_failure_message(args.file, tokens, failure)
+                )
+            source_dir = os.path.dirname(filename)
+            typecheck(concat_ast, source_dir)
+            python_ast = transpile_ast(concat_ast)
+        except concat.typecheck.StaticAnalysisError as e:
+            if e.path is None:
+                in_path = ''
+            else:
+                in_path = ' in file ' + str(e.path)
+            print(f'Static Analysis Error{in_path}:\n')
+            print(e, 'in line:')
+            if e.location:
+                if e.path is not None:
+                    with e.path.open() as f:
+                        print(get_line_at(f, e.location), end='')
+                else:
+                    print(get_line_at(args.file, e.location), end='')
+                print(' ' * e.location[1] + '^')
+            if args.verbose:
+                raise
+        except concat.parser_combinators.ParseError as e:
+            print('Parse Error:')
+            print(
+                create_parsing_failure_message(
+                    args.file, tokens, e.args[0].failures
+                )
+            )
+        except Exception:
+            print('An internal error has occurred.')
+            print('This is a bug in Concat.')
+            raise
+        else:
+            concat.execute.execute(
+                filename,
+                python_ast,
+                {},
+                should_log_stacks=args.debug,
+                import_resolution_start_directory=source_dir,
+            )
+            if list(concat_ast.parsing_failures):
+                sys.exit(1)
+        finally:
+            args.file.close()
+
+
 # We should pass any unknown args onto the program we're about to run.
 # FIXME: There might be a better way to go about this, but I think this is fine
 # for now.
 args, rest = arg_parser.parse_known_args()
 sys.argv = [sys.argv[0], *rest]
 
-
 if args.tokenize:
     code = args.file.read()
     tokens = concat.lex.tokenize(code, should_preserve_comments=True)
     json.dump(tokens, sys.stdout, cls=concat.lex.TokenEncoder)
     sys.exit()
 
-# interactive mode
-if args.file.isatty():
-    concat.stdlib.repl.repl([], [], args.debug)
-else:
-    try:
-        tokens = concat.lex.tokenize(args.file.read())
-        concat_ast = parse(tokens)
-        recovered_parsing_failures = concat_ast.parsing_failures
-        for failure in recovered_parsing_failures:
-            print('Parse Error:')
-            print(create_parsing_failure_message(args.file, tokens, failure))
-        source_dir = os.path.dirname(filename)
-        typecheck(concat_ast, source_dir)
-        python_ast = transpile_ast(concat_ast)
-    except concat.typecheck.StaticAnalysisError as e:
-        if e.path is None:
-            in_path = ''
-        else:
-            in_path = ' in file ' + str(e.path)
-        print(f'Static Analysis Error{in_path}:\n')
-        print(e, 'in line:')
-        if e.location:
-            if e.path is not None:
-                with e.path.open() as f:
-                    print(get_line_at(f, e.location), end='')
-            else:
-                print(get_line_at(args.file, e.location), end='')
-            print(' ' * e.location[1] + '^')
-        if args.verbose:
-            raise
-    except concat.parser_combinators.ParseError as e:
-        print('Parse Error:')
-        print(
-            create_parsing_failure_message(
-                args.file, tokens, e.args[0].failures
-            )
-        )
-    except Exception:
-        print('An internal error has occurred.')
-        print('This is a bug in Concat.')
-        raise
-    else:
-        concat.execute.execute(
-            filename,
-            python_ast,
-            {},
-            should_log_stacks=args.debug,
-            import_resolution_start_directory=source_dir,
-        )
-        if list(concat_ast.parsing_failures):
-            sys.exit(1)
-    finally:
-        args.file.close()
+main()
diff --git a/concat/error_reporting.py b/concat/error_reporting.py
index f44b75f..5c6c996 100644
--- a/concat/error_reporting.py
+++ b/concat/error_reporting.py
@@ -16,7 +16,10 @@ def create_parsing_failure_message(
     stream: Sequence[concat.lex.Token],
     failure: concat.parser_combinators.FailureTree,
 ) -> str:
-    location = stream[failure.furthest_index].start
+    if failure.furthest_index < len(stream):
+        location = stream[failure.furthest_index].start
+    else:
+        location = stream[-1].start
     line = get_line_at(file, location)
     message = f'Expected {failure.expected} at line {location[0]}, column {location[1] + 1}:\n{line.rstrip()}\n{" " * location[1] + "^"}'
     if failure.children:
@@ -26,3 +29,19 @@ def create_parsing_failure_message(
                 create_parsing_failure_message(file, stream, f), '  '
             )
     return message
+
+
+def create_lexical_error_message(
+    file: TextIO, location: concat.astutils.Location, message: str
+) -> str:
+    line = get_line_at(file, location)
+    message = f'Cannot tokenize file at line {location[0]}, column {location[1] + 1}:\n{line.rstrip()}\n{' ' * location[1] + '^'}\n'
+    return message
+
+
+def create_indentation_error_message(
+    file: TextIO, location: concat.astutils.Location, message: str
+) -> str:
+    line = get_line_at(file, location)
+    message = f'Malformed indentation at line {location[0]}, column {location[1] + 1}:\n{line.rstrip()}\n'
+    return message
diff --git a/concat/lex.py b/concat/lex.py
index e7887c6..4a79fdb 100644
--- a/concat/lex.py
+++ b/concat/lex.py
@@ -1,9 +1,10 @@
-import concat.astutils
+from __future__ import annotations
+from concat.astutils import Location, are_on_same_line_and_offset_by
 import dataclasses
 import io
 import json
 import tokenize as py_tokenize
-from typing import Iterator, List, Optional, Tuple, Union
+from typing import Iterator, List, Literal, Optional, Tuple, Union
 
 
 @dataclasses.dataclass
@@ -19,8 +20,8 @@ class Token:
 
     type: str = ''
     value: str = ''
-    start: 'concat.astutils.Location' = (0, 0)
-    end: 'concat.astutils.Location' = (0, 0)
+    start: Location = (0, 0)
+    end: Location = (0, 0)
     is_keyword: bool = False
 
 
@@ -33,7 +34,10 @@ def default(self, obj):
         return super().default(obj)
 
 
-def tokenize(code: str, should_preserve_comments: bool = False) -> List[Token]:
+def tokenize(
+    code: str,
+    should_preserve_comments: bool = False,
+) -> List[Result]:
     lexer = Lexer()
     lexer.input(code, should_preserve_comments)
     tokens = []
@@ -46,10 +50,8 @@ def tokenize(code: str, should_preserve_comments: bool = False) -> List[Token]:
 
 
 TokenTuple = Union[
-    Tuple[str, str, 'concat.astutils.Location', 'concat.astutils.Location'],
-    Tuple[
-        str, str, 'concat.astutils.Location', 'concat.astutils.Location', bool
-    ],
+    Tuple[str, str, Location, Location],
+    Tuple[str, str, Location, Location, bool],
 ]
 
 
@@ -64,7 +66,7 @@ def __init__(self) -> None:
         self.tokens: Optional[Iterator[py_tokenize.TokenInfo]]
         self.lineno: int
         self.lexpos: int
-        self._concat_token_iterator: Iterator['Token']
+        self._concat_token_iterator: Iterator[Result]
         self._should_preserve_comments: bool
 
     def input(self, data: str, should_preserve_comments: bool = False) -> None:
@@ -76,11 +78,11 @@ def input(self, data: str, should_preserve_comments: bool = False) -> None:
         self._concat_token_iterator = self._tokens()
         self._should_preserve_comments = should_preserve_comments
 
-    def token(self) -> Optional['Token']:
+    def token(self) -> Optional[Result]:
         """Return the next token as a Token object."""
         return next(self._concat_token_iterator, None)
 
-    def _tokens(self) -> Iterator['Token']:
+    def _tokens(self) -> Iterator[Result]:
         import token
 
         if self.tokens is None:
@@ -88,8 +90,16 @@ def _tokens(self) -> Iterator['Token']:
                 io.BytesIO(self.data.encode('utf-8')).readline
             )
 
-        glued_token_prefix = None
-        for token_ in self.tokens:
+        glued_token_prefix: Token | None = None
+        while True:
+            try:
+                token_ = next(self.tokens)
+            except StopIteration:
+                return
+            except IndentationError as e:
+                yield IndentationErrorResult(e)
+            except py_tokenize.TokenError as e:
+                yield TokenErrorResult(e, (self.lineno, self.lexpos))
             tok = Token()
             _, tok.value, tok.start, tok.end, _ = token_
             tok.type = token.tok_name[token_.exact_type]
@@ -98,7 +108,7 @@ def _tokens(self) -> Iterator['Token']:
                 if (
                     glued_token_prefix.value == '-'
                     and tok.value == '-'
-                    and concat.astutils.are_on_same_line_and_offset_by(
+                    and are_on_same_line_and_offset_by(
                         glued_token_prefix.start, tok.start, 1
                     )
                 ):
@@ -106,7 +116,7 @@ def _tokens(self) -> Iterator['Token']:
                     glued_token_prefix.type = 'MINUSMINUS'
                     glued_token_prefix.end = tok.end
                     self._update_position(glued_token_prefix)
-                    yield glued_token_prefix
+                    yield TokenResult(glued_token_prefix)
                     glued_token_prefix = None
                     continue
                 else:
@@ -119,7 +129,7 @@ def _tokens(self) -> Iterator['Token']:
                         self._should_preserve_comments
                         and tok.type == 'COMMENT'
                     ):
-                        yield tok
+                        yield TokenResult(tok)
                     continue
                 elif tok.type == 'ERRORTOKEN':
                     if tok.value == ' ':
@@ -182,16 +192,49 @@ def _tokens(self) -> Iterator['Token']:
                 elif tok.type == 'EXCLAMATION':
                     tok.type = 'EXCLAMATIONMARK'
 
-                yield tok
+                yield TokenResult(tok)
 
     def _update_position(self, tok: 'Token') -> None:
-        self.lexpos += len(tok.value)
-        if tok.type in {'NEWLINE', 'NL'}:
-            self.lineno += 1
+        self.lineno, self.lexpos = tok.start
 
     def __is_bytes_literal(self, literal: str) -> bool:
         return isinstance(eval(literal), bytes)
 
 
+@dataclasses.dataclass
+class TokenResult:
+    type: Literal['token']
+    token: Token
+
+    def __init__(self, token: Token) -> None:
+        self.type = 'token'
+        self.token = token
+
+
+@dataclasses.dataclass
+class IndentationErrorResult:
+    type: Literal['indent-err']
+    err: IndentationError
+
+    def __init__(self, err: IndentationError) -> None:
+        self.type = 'indent-err'
+        self.err = err
+
+
+@dataclasses.dataclass
+class TokenErrorResult:
+    type: Literal['token-err']
+    err: py_tokenize.TokenError
+    location: Location
+
+    def __init__(self, err: py_tokenize.TokenError, loc: Location) -> None:
+        self.type = 'token-err'
+        self.err = err
+        self.location = loc
+
+
+type Result = TokenResult | IndentationErrorResult | TokenErrorResult
+
+
 def to_tokens(*tokTuples: TokenTuple) -> List[Token]:
     return [Token(*tuple) for tuple in tokTuples]
diff --git a/concat/lsp/__init__.py b/concat/lsp/__init__.py
index 2bd3642..cd0e8ec 100644
--- a/concat/lsp/__init__.py
+++ b/concat/lsp/__init__.py
@@ -1,8 +1,8 @@
 from concat.astutils import Location
 import concat.jsonrpc
-from concat.lex import tokenize
+from concat.lex import Token, tokenize
 from concat.logging import ConcatLogger
-from concat.parse import ParseError
+from concat.parser_combinators import ParseError
 from concat.transpile import parse, typecheck
 from concat.typecheck import StaticAnalysisError
 from enum import Enum, IntEnum
@@ -10,7 +10,6 @@
 import logging
 from pathlib import Path
 import re
-import tokenize as py_tokenize
 from typing import (
     BinaryIO,
     Callable,
@@ -411,14 +410,26 @@ def diagnose(self) -> None:
 
     def _diagnose(self) -> List[_Diagnostic]:
         text_lines = self._text.splitlines(keepends=True)
-        try:
-            tokens = tokenize(self._text)
-        except py_tokenize.TokenError as e:
-            message = e.args[0]
-            position = _Position.from_tokenizer_location(text_lines, e.args[1])
-            range_ = _Range(position, position)
-            return [_Diagnostic(range_, message)]
+        token_results = tokenize(self._text)
         diagnostics = []
+        tokens = list[Token]()
+        for r in token_results:
+            if r.type == 'token':
+                tokens.append(r.token)
+            elif r.type == 'indent-err':
+                position = _Position.from_tokenizer_location(
+                    text_lines, (r.err.lineno or 1, r.err.offset or 0)
+                )
+                range_ = _Range(position, position)
+                message = r.err.msg
+                diagnostics.append(_Diagnostic(range_, message))
+            elif r.type == 'token-err':
+                position = _Position.from_tokenizer_location(
+                    text_lines, r.location
+                )
+                range_ = _Range(position, position)
+                message = str(r.err)
+                diagnostics.append(_Diagnostic(range_, message))
         for token in tokens:
             if token.type == 'ERRORTOKEN':
                 _logger.debug('error token: {token!r}', token=token)
@@ -436,19 +447,21 @@ def _diagnose(self) -> List[_Diagnostic]:
                 diagnostics.append(_Diagnostic(range_, message))
         try:
             ast = parse(tokens)
+            ast.assert_no_parse_errors()
         except ParseError as e:
-            parser_start_position = e.get_start_position()
-            parser_end_position = e.get_end_position()
-            range_ = _Range(
-                _Position.from_tokenizer_location(
-                    text_lines, parser_start_position
-                ),
-                _Position.from_tokenizer_location(
-                    text_lines, parser_end_position
-                ),
-            )
-            message = f'Expected one of: {", ".join(e.expected)}'
-            diagnostics.append(_Diagnostic(range_, message))
+            for failure in e.args[0].failures:
+                parser_start_position = tokens[failure.furthest_index].start
+                parser_end_position = parser_start_position
+                range_ = _Range(
+                    _Position.from_tokenizer_location(
+                        text_lines, parser_start_position
+                    ),
+                    _Position.from_tokenizer_location(
+                        text_lines, parser_end_position
+                    ),
+                )
+                message = f'Expected one of: {failure.expected}'
+                diagnostics.append(_Diagnostic(range_, message))
             return diagnostics
         try:
             # https://stackoverflow.com/questions/5977576/is-there-a-convenient-way-to-map-a-file-uri-to-os-path
diff --git a/concat/stdlib/repl.py b/concat/stdlib/repl.py
index e4d139e..70d8b93 100644
--- a/concat/stdlib/repl.py
+++ b/concat/stdlib/repl.py
@@ -23,6 +23,10 @@
 sys.modules[__name__].__class__ = concat.stdlib.importlib.Module
 
 
+class _REPLTokenizeError(Exception):
+    pass
+
+
 def _tokenize(code: str) -> List[concat.lex.Token]:
     lexer = concat.lex.Lexer()
     lexer.input(code)
@@ -31,7 +35,9 @@ def _tokenize(code: str) -> List[concat.lex.Token]:
         token = lexer.token()
         if token is None:
             break
-        tokens.append(token)
+        if token.type != 'token':
+            raise _REPLTokenizeError from token.err
+        tokens.append(token.token)
     return tokens
 
 
@@ -192,6 +198,9 @@ def show_var(stack: List[object], stash: List[object]):
             except concat.execute.ConcatRuntimeError as e:
                 print('Runtime error:\n')
                 print(e)
+            except _REPLTokenizeError as e:
+                print('Lexical error:\n')
+                print(e)
             except EOFError:
                 break
             else:
diff --git a/concat/tests/stdlib/test_python_concat_interface.py b/concat/tests/stdlib/test_python_concat_interface.py
index 7abbb43..50ad8a8 100644
--- a/concat/tests/stdlib/test_python_concat_interface.py
+++ b/concat/tests/stdlib/test_python_concat_interface.py
@@ -731,7 +731,9 @@ def test_modules_are_callable(self) -> None:
             token = lexer.token()
             if token is None:
                 break
-            tokens.append(token)
+            if token.type != 'token':
+                raise token.err
+            tokens.append(token.token)
         parser = concat.parse.ParserDict()
         parser.extend_with(concat.parse.extension)
         concat_ast = parser.parse(tokens)
diff --git a/concat/tests/test_lex.py b/concat/tests/test_lex.py
index 161f0e9..5e69e61 100644
--- a/concat/tests/test_lex.py
+++ b/concat/tests/test_lex.py
@@ -1,5 +1,6 @@
 import concat.lex as lex
 from concat.tests.small_example_programs import examples
+import textwrap
 import unittest
 
 
@@ -32,6 +33,22 @@ def test_examples(self) -> None:
                     tokens.append(token)
 
                 self.assertEqual(len(tokens), len(expected_tokens))
-                expectationPairs = zip(tokens, expected_tokens)
+                expectationPairs = zip(
+                    tokens, map(lambda t: lex.TokenResult(t), expected_tokens)
+                )
                 for actual_token, expected_token in expectationPairs:
                     self.assertEqual(actual_token, expected_token)
+
+    def test_indentation_error(self) -> None:
+        code = textwrap.dedent("""\
+            def remove_stack_polymorphism(
+                f:forall `t *s. (*s i:`t -- *s) -- g:forall `t. (i:`t -- )
+            ):
+              ()
+             dfbfdbff""")
+        lexer = lex.Lexer()
+        lexer.input(code)
+        while True:
+            token = lexer.token()
+            if token is None:
+                break
diff --git a/concat/tests/test_typecheck.py b/concat/tests/test_typecheck.py
index 8aee6f8..4a1ccd6 100644
--- a/concat/tests/test_typecheck.py
+++ b/concat/tests/test_typecheck.py
@@ -44,7 +44,7 @@
 
 
 def lex_string(string: str) -> List[concat.lex.Token]:
-    return lex.tokenize(string)
+    return [r.token for r in lex.tokenize(string) if r.type == 'token']
 
 
 def parse(string: str) -> concat.parse.TopLevelNode:
diff --git a/concat/typecheck/__init__.py b/concat/typecheck/__init__.py
index dcb6b99..52679b7 100644
--- a/concat/typecheck/__init__.py
+++ b/concat/typecheck/__init__.py
@@ -28,6 +28,7 @@
     TYPE_CHECKING,
     Tuple,
     Union,
+    assert_never,
     cast,
 )
 from concat.typecheck.types import (
@@ -58,7 +59,11 @@
     no_return_type,
 )
 import abc
-from concat.error_reporting import create_parsing_failure_message
+from concat.error_reporting import (
+    create_indentation_error_message,
+    create_lexical_error_message,
+    create_parsing_failure_message,
+)
 from concat.lex import Token
 import itertools
 import pathlib
@@ -658,7 +663,22 @@ def _check_stub_resolved_path(
         raise TypeError(f'Type stubs at {path} do not exist') from e
     except IOError as e:
         raise TypeError(f'Failed to read type stubs at {path}') from e
-    tokens = concat.lex.tokenize(source)
+    token_results = concat.lex.tokenize(source)
+    tokens = list[Token]()
+    with path.open() as f:
+        for r in token_results:
+            if r.type == 'token':
+                tokens.append(r.token)
+            elif r.type == 'indent-err':
+                print(
+                    create_indentation_error_message(
+                        f, (r.err.lineno or 1, r.err.offset or 0), r.err.msg
+                    )
+                )
+            elif r.type == 'token-err':
+                print(create_lexical_error_message(f, r.location, str(r.err)))
+            else:
+                assert_never(r)
     env = initial_env or Environment()
     from concat.transpile import parse
 

From a8edcc0a91de3175578e4267e47cd300dbbd79f3 Mon Sep 17 00:00:00 2001
From: Jason Manuel <jama.indo@hotmail.com>
Date: Wed, 6 Nov 2024 00:31:21 -0700
Subject: [PATCH 2/4] Add tokenizer error prefaces to stub typechecker to match
 __main__

---
 concat/typecheck/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/concat/typecheck/__init__.py b/concat/typecheck/__init__.py
index 52679b7..d009499 100644
--- a/concat/typecheck/__init__.py
+++ b/concat/typecheck/__init__.py
@@ -670,12 +670,14 @@ def _check_stub_resolved_path(
             if r.type == 'token':
                 tokens.append(r.token)
             elif r.type == 'indent-err':
+                print('Indentation error:')
                 print(
                     create_indentation_error_message(
                         f, (r.err.lineno or 1, r.err.offset or 0), r.err.msg
                     )
                 )
             elif r.type == 'token-err':
+                print('Lexical error:')
                 print(create_lexical_error_message(f, r.location, str(r.err)))
             else:
                 assert_never(r)

From cb6670ffd0efe28f16bf9c21316d655915df14bc Mon Sep 17 00:00:00 2001
From: Jason Manuel <jama.indo@hotmail.com>
Date: Fri, 8 Nov 2024 17:29:34 -0700
Subject: [PATCH 3/4] Address DeepSource suggestions

---
 concat/__main__.py        | 156 +++++++++++++++--------------
 concat/error_reporting.py |  13 ++-
 concat/lex.py             | 203 +++++++++++++++++++-------------------
 concat/tests/test_lex.py  |   2 +-
 4 files changed, 196 insertions(+), 178 deletions(-)

diff --git a/concat/__main__.py b/concat/__main__.py
index 3c3a848..077aa14 100644
--- a/concat/__main__.py
+++ b/concat/__main__.py
@@ -57,90 +57,94 @@ def func(name: str) -> IO[AnyStr]:
     '--tokenize',
     action='store_true',
     default=False,
-    help='tokenize input from the given file and print the tokens as a JSON array',
+    help=(
+        'tokenize input from the given file and print the tokens as a JSON '
+        'array'
+    ),
 )
 
 
+def tokenize_printing_errors() -> list[concat.lex.Token]:
+    token_results = concat.lex.tokenize(args.file.read())
+    tokens = list[concat.lex.Token]()
+    for r in token_results:
+        if r.type == 'token':
+            tokens.append(r.token)
+        elif r.type == 'indent-err':
+            position = (r.err.lineno or 1, r.err.offset or 0)
+            message = r.err.msg
+            print('Indentation error:')
+            print(
+                create_indentation_error_message(args.file, position, message)
+            )
+        elif r.type == 'token-err':
+            position = r.location
+            message = str(r.err)
+            print('Lexical error:')
+            print(create_lexical_error_message(args.file, position, message))
+        else:
+            assert_never(r)
+    return tokens
+
+
+def batch_main():
+    try:
+        tokens = tokenize_printing_errors()
+        concat_ast = parse(tokens)
+        recovered_parsing_failures = concat_ast.parsing_failures
+        for failure in recovered_parsing_failures:
+            print('Parse Error:')
+            print(create_parsing_failure_message(args.file, tokens, failure))
+        source_dir = os.path.dirname(filename)
+        typecheck(concat_ast, source_dir)
+        python_ast = transpile_ast(concat_ast)
+    except concat.typecheck.StaticAnalysisError as e:
+        if e.path is None:
+            in_path = ''
+        else:
+            in_path = ' in file ' + str(e.path)
+        print(f'Static Analysis Error{in_path}:\n')
+        print(e, 'in line:')
+        if e.location:
+            if e.path is not None:
+                with e.path.open() as f:
+                    print(get_line_at(f, e.location), end='')
+            else:
+                print(get_line_at(args.file, e.location), end='')
+            print(' ' * e.location[1] + '^')
+        if args.verbose:
+            raise
+    except concat.parser_combinators.ParseError as e:
+        print('Parse Error:')
+        print(
+            create_parsing_failure_message(
+                args.file, tokens, e.args[0].failures
+            )
+        )
+    except Exception:
+        print('An internal error has occurred.')
+        print('This is a bug in Concat.')
+        raise
+    else:
+        concat.execute.execute(
+            filename,
+            python_ast,
+            {},
+            should_log_stacks=args.debug,
+            import_resolution_start_directory=source_dir,
+        )
+        if list(concat_ast.parsing_failures):
+            sys.exit(1)
+    finally:
+        args.file.close()
+
+
 def main():
     # interactive mode
     if args.file.isatty():
         concat.stdlib.repl.repl([], [], args.debug)
     else:
-        try:
-            token_results = concat.lex.tokenize(args.file.read())
-            tokens = list[concat.lex.Token]()
-            for r in token_results:
-                if r.type == 'token':
-                    tokens.append(r.token)
-                elif r.type == 'indent-err':
-                    position = (r.err.lineno or 1, r.err.offset or 0)
-                    message = r.err.msg
-                    print('Indentation error:')
-                    print(
-                        create_indentation_error_message(
-                            args.file, position, message
-                        )
-                    )
-                elif r.type == 'token-err':
-                    position = r.location
-                    message = str(r.err)
-                    print('Lexical error:')
-                    print(
-                        create_lexical_error_message(
-                            args.file, position, message
-                        )
-                    )
-                else:
-                    assert_never(r)
-            concat_ast = parse(tokens)
-            recovered_parsing_failures = concat_ast.parsing_failures
-            for failure in recovered_parsing_failures:
-                print('Parse Error:')
-                print(
-                    create_parsing_failure_message(args.file, tokens, failure)
-                )
-            source_dir = os.path.dirname(filename)
-            typecheck(concat_ast, source_dir)
-            python_ast = transpile_ast(concat_ast)
-        except concat.typecheck.StaticAnalysisError as e:
-            if e.path is None:
-                in_path = ''
-            else:
-                in_path = ' in file ' + str(e.path)
-            print(f'Static Analysis Error{in_path}:\n')
-            print(e, 'in line:')
-            if e.location:
-                if e.path is not None:
-                    with e.path.open() as f:
-                        print(get_line_at(f, e.location), end='')
-                else:
-                    print(get_line_at(args.file, e.location), end='')
-                print(' ' * e.location[1] + '^')
-            if args.verbose:
-                raise
-        except concat.parser_combinators.ParseError as e:
-            print('Parse Error:')
-            print(
-                create_parsing_failure_message(
-                    args.file, tokens, e.args[0].failures
-                )
-            )
-        except Exception:
-            print('An internal error has occurred.')
-            print('This is a bug in Concat.')
-            raise
-        else:
-            concat.execute.execute(
-                filename,
-                python_ast,
-                {},
-                should_log_stacks=args.debug,
-                import_resolution_start_directory=source_dir,
-            )
-            if list(concat_ast.parsing_failures):
-                sys.exit(1)
-        finally:
-            args.file.close()
+        batch_main()
 
 
 # We should pass any unknown args onto the program we're about to run.
diff --git a/concat/error_reporting.py b/concat/error_reporting.py
index 5c6c996..6f5855a 100644
--- a/concat/error_reporting.py
+++ b/concat/error_reporting.py
@@ -35,7 +35,12 @@ def create_lexical_error_message(
     file: TextIO, location: concat.astutils.Location, message: str
 ) -> str:
     line = get_line_at(file, location)
-    message = f'Cannot tokenize file at line {location[0]}, column {location[1] + 1}:\n{line.rstrip()}\n{' ' * location[1] + '^'}\n'
+    message = (
+        f'Cannot tokenize file at line {location[0]}, '
+        f'column {location[1] + 1}:\n'
+        f'{line.rstrip()}\n'
+        f'{' ' * location[1] + '^'}\n'
+    )
     return message
 
 
@@ -43,5 +48,9 @@ def create_indentation_error_message(
     file: TextIO, location: concat.astutils.Location, message: str
 ) -> str:
     line = get_line_at(file, location)
-    message = f'Malformed indentation at line {location[0]}, column {location[1] + 1}:\n{line.rstrip()}\n'
+    message = (
+        f'Malformed indentation at line {location[0]}, '
+        f'column {location[1] + 1}:\n'
+        f'{line.rstrip()}\n'
+    )
     return message
diff --git a/concat/lex.py b/concat/lex.py
index 4a79fdb..9fd5118 100644
--- a/concat/lex.py
+++ b/concat/lex.py
@@ -4,6 +4,7 @@
 import io
 import json
 import tokenize as py_tokenize
+import token
 from typing import Iterator, List, Literal, Optional, Tuple, Union
 
 
@@ -63,7 +64,9 @@ class Lexer:
 
     def __init__(self) -> None:
         self.data: str
-        self.tokens: Optional[Iterator[py_tokenize.TokenInfo]]
+        self.tokens: Iterator[
+            py_tokenize.TokenInfo | IndentationErrorResult | TokenErrorResult
+        ]
         self.lineno: int
         self.lexpos: int
         self._concat_token_iterator: Iterator[Result]
@@ -72,127 +75,129 @@ def __init__(self) -> None:
     def input(self, data: str, should_preserve_comments: bool = False) -> None:
         """Initialize the Lexer object with the data to tokenize."""
         self.data = data
-        self.tokens = None
+        self.tokens = self._py_tokens_handling_errors(
+            py_tokenize.tokenize(
+                io.BytesIO(self.data.encode('utf-8')).readline
+            )
+        )
         self.lineno = 1
         self.lexpos = 0
-        self._concat_token_iterator = self._tokens()
+        self._concat_token_iterator = self._tokens_glued(self._tokens())
         self._should_preserve_comments = should_preserve_comments
 
     def token(self) -> Optional[Result]:
         """Return the next token as a Token object."""
         return next(self._concat_token_iterator, None)
 
-    def _tokens(self) -> Iterator[Result]:
-        import token
-
-        if self.tokens is None:
-            self.tokens = py_tokenize.tokenize(
-                io.BytesIO(self.data.encode('utf-8')).readline
-            )
-
-        glued_token_prefix: Token | None = None
+    def _py_tokens_handling_errors(
+        self, tokens: Iterator[py_tokenize.TokenInfo]
+    ) -> Iterator[
+        py_tokenize.TokenInfo | IndentationErrorResult | TokenErrorResult
+    ]:
         while True:
             try:
-                token_ = next(self.tokens)
+                tok = next(tokens)
+                yield tok
             except StopIteration:
                 return
             except IndentationError as e:
                 yield IndentationErrorResult(e)
             except py_tokenize.TokenError as e:
                 yield TokenErrorResult(e, (self.lineno, self.lexpos))
-            tok = Token()
-            _, tok.value, tok.start, tok.end, _ = token_
-            tok.type = token.tok_name[token_.exact_type]
-            tokens_to_massage = [tok]
-            if glued_token_prefix:
-                if (
-                    glued_token_prefix.value == '-'
-                    and tok.value == '-'
-                    and are_on_same_line_and_offset_by(
-                        glued_token_prefix.start, tok.start, 1
-                    )
-                ):
-                    glued_token_prefix.value = '--'
-                    glued_token_prefix.type = 'MINUSMINUS'
-                    glued_token_prefix.end = tok.end
+
+    def _tokens_glued(self, tokens: Iterator[Result]) -> Iterator[Result]:
+        glued_token_prefix: Token | None = None
+        for r in tokens:
+            if r.type == 'token':
+                tok = r.token
+                if glued_token_prefix:
                     self._update_position(glued_token_prefix)
+                    if tok.value == '-' and are_on_same_line_and_offset_by(
+                        glued_token_prefix.start, tok.start, 1
+                    ):
+                        glued_token_prefix.value = '--'
+                        glued_token_prefix.type = 'MINUSMINUS'
+                        glued_token_prefix.end = tok.end
+                        yield TokenResult(glued_token_prefix)
+                        glued_token_prefix = None
+                        continue
                     yield TokenResult(glued_token_prefix)
                     glued_token_prefix = None
-                    continue
+                if tok.value == '-':
+                    glued_token_prefix = tok
                 else:
-                    tokens_to_massage[:0] = [glued_token_prefix]
-                    glued_token_prefix = None
-            for tok in tokens_to_massage:
-                if tok.type in {'NL', 'COMMENT'}:
                     self._update_position(tok)
-                    if (
-                        self._should_preserve_comments
-                        and tok.type == 'COMMENT'
-                    ):
-                        yield TokenResult(tok)
-                    continue
-                elif tok.type == 'ERRORTOKEN':
-                    if tok.value == ' ':
-                        self._update_position(tok)
-                        continue
-                    elif tok.value == '!':
-                        tok.type = 'EXCLAMATIONMARK'
-                elif tok.value in {'def', 'import', 'from'}:
-                    tok.type = tok.value.upper()
-                    tok.is_keyword = True
-                elif tok.value == '$':
-                    tok.type = 'DOLLARSIGN'
-                elif tok.type != 'NAME' and tok.value in {
-                    '...',
-                    '-',
-                    '**',
-                    '~',
-                    '*',
-                    '*=',
-                    '//',
-                    '/',
-                    '%',
-                    '+',
-                    '<<',
-                    '>>',
-                    '&',
-                    '^',
-                    '|',
-                    '<',
-                    '>',
-                    '==',
-                    '>=',
-                    '<=',
-                    '!=',
-                    'is',
-                    'in',
-                    'or',
-                    'and',
-                    'not',
-                    '@',
-                }:
-                    tok.type = 'NAME'
-                    if tok.value == '-':
-                        glued_token_prefix = tok
-                        continue
+                    yield r
+            else:
+                yield r
+        if glued_token_prefix:
+            self._update_position(glued_token_prefix)
+            yield TokenResult(glued_token_prefix)
 
+    def _tokens(self) -> Iterator[Result]:
+        for token_or_error in self.tokens:
+            if isinstance(
+                token_or_error, (IndentationErrorResult, TokenErrorResult)
+            ):
+                yield token_or_error
+                continue
+            tok = Token()
+            _, tok.value, tok.start, tok.end, _ = token_or_error
+            tok.type = token.tok_name[token_or_error.exact_type]
+            if tok.type in {'NL', 'COMMENT'}:
                 self._update_position(tok)
-
-                if tok.type == 'NAME':
-                    type_map = {'as': 'AS', 'class': 'CLASS', 'cast': 'CAST'}
-                    if tok.value in type_map:
-                        tok.type = type_map[tok.value]
-                        tok.is_keyword = True
-                elif tok.type == 'STRING' and self.__is_bytes_literal(
-                    tok.value
-                ):
-                    tok.type = 'BYTES'
-                elif tok.value == '`':
-                    tok.type = 'BACKTICK'
-                elif tok.type == 'EXCLAMATION':
-                    tok.type = 'EXCLAMATIONMARK'
-
-                yield TokenResult(tok)
+                if self._should_preserve_comments and tok.type == 'COMMENT':
+                    yield TokenResult(tok)
+                continue
+            elif tok.type == 'ERRORTOKEN' and tok.value == ' ':
+                self._update_position(tok)
+                continue
+            elif tok.value in {'def', 'import', 'from', 'as', 'class', 'cast'}:
+                tok.type = tok.value.upper()
+                tok.is_keyword = True
+            elif tok.value == '$':
+                tok.type = 'DOLLARSIGN'
+            elif tok.type != 'NAME' and tok.value in {
+                '...',
+                '-',
+                '**',
+                '~',
+                '*',
+                '*=',
+                '//',
+                '/',
+                '%',
+                '+',
+                '<<',
+                '>>',
+                '&',
+                '^',
+                '|',
+                '<',
+                '>',
+                '==',
+                '>=',
+                '<=',
+                '!=',
+                'is',
+                'in',
+                'or',
+                'and',
+                'not',
+                '@',
+            }:
+                tok.type = 'NAME'
+
+            self._update_position(tok)
+
+            if tok.type == 'STRING' and self.__is_bytes_literal(tok.value):
+                tok.type = 'BYTES'
+            elif tok.value == '`':
+                tok.type = 'BACKTICK'
+            elif tok.value == '!':
+                tok.type = 'EXCLAMATIONMARK'
+
+            yield TokenResult(tok)
 
     def _update_position(self, tok: 'Token') -> None:
         self.lineno, self.lexpos = tok.start
diff --git a/concat/tests/test_lex.py b/concat/tests/test_lex.py
index 5e69e61..618ae04 100644
--- a/concat/tests/test_lex.py
+++ b/concat/tests/test_lex.py
@@ -34,7 +34,7 @@ def test_examples(self) -> None:
 
                 self.assertEqual(len(tokens), len(expected_tokens))
                 expectationPairs = zip(
-                    tokens, map(lambda t: lex.TokenResult(t), expected_tokens)
+                    tokens, map(lex.TokenResult, expected_tokens)
                 )
                 for actual_token, expected_token in expectationPairs:
                     self.assertEqual(actual_token, expected_token)

From 20e4ede33eb0cb93d1f6bc3514f06e728ad5dae1 Mon Sep 17 00:00:00 2001
From: Jason Manuel <jama.indo@hotmail.com>
Date: Fri, 8 Nov 2024 21:05:23 -0700
Subject: [PATCH 4/4] Address DeepSource suggestions

---
 concat/error_reporting.py |  4 +++-
 concat/lex.py             | 31 +++++++++++++++++++++++--------
 concat/tests/test_lex.py  |  3 ++-
 3 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/concat/error_reporting.py b/concat/error_reporting.py
index 6f5855a..8b1287f 100644
--- a/concat/error_reporting.py
+++ b/concat/error_reporting.py
@@ -18,8 +18,10 @@ def create_parsing_failure_message(
 ) -> str:
     if failure.furthest_index < len(stream):
         location = stream[failure.furthest_index].start
-    else:
+    elif stream:
         location = stream[-1].start
+    else:
+        location = (1, 0)
     line = get_line_at(file, location)
     message = f'Expected {failure.expected} at line {location[0]}, column {location[1] + 1}:\n{line.rstrip()}\n{" " * location[1] + "^"}'
     if failure.children:
diff --git a/concat/lex.py b/concat/lex.py
index 9fd5118..849435e 100644
--- a/concat/lex.py
+++ b/concat/lex.py
@@ -82,7 +82,9 @@ def input(self, data: str, should_preserve_comments: bool = False) -> None:
         )
         self.lineno = 1
         self.lexpos = 0
-        self._concat_token_iterator = self._tokens_glued(self._tokens())
+        self._concat_token_iterator = self._tokens_filtering_nl_and_comments(
+            self._tokens_glued(self._tokens())
+        )
         self._should_preserve_comments = should_preserve_comments
 
     def token(self) -> Optional[Result]:
@@ -134,6 +136,18 @@ def _tokens_glued(self, tokens: Iterator[Result]) -> Iterator[Result]:
             self._update_position(glued_token_prefix)
             yield TokenResult(glued_token_prefix)
 
+    def _tokens_filtering_nl_and_comments(
+        self, tokens: Iterator[Result]
+    ) -> Iterator[Result]:
+        for r in tokens:
+            if r.type != 'token' or r.token.type not in ['NL', 'COMMENT']:
+                yield r
+                continue
+            tok = r.token
+            self._update_position(tok)
+            if self._should_preserve_comments and tok.type == 'COMMENT':
+                yield r
+
     def _tokens(self) -> Iterator[Result]:
         for token_or_error in self.tokens:
             if isinstance(
@@ -144,15 +158,10 @@ def _tokens(self) -> Iterator[Result]:
             tok = Token()
             _, tok.value, tok.start, tok.end, _ = token_or_error
             tok.type = token.tok_name[token_or_error.exact_type]
-            if tok.type in {'NL', 'COMMENT'}:
-                self._update_position(tok)
-                if self._should_preserve_comments and tok.type == 'COMMENT':
-                    yield TokenResult(tok)
-                continue
-            elif tok.type == 'ERRORTOKEN' and tok.value == ' ':
+            if tok.type == 'ERRORTOKEN' and tok.value == ' ':
                 self._update_position(tok)
                 continue
-            elif tok.value in {'def', 'import', 'from', 'as', 'class', 'cast'}:
+            if tok.value in {'def', 'import', 'from', 'as', 'class', 'cast'}:
                 tok.type = tok.value.upper()
                 tok.is_keyword = True
             elif tok.value == '$':
@@ -208,6 +217,8 @@ def __is_bytes_literal(self, literal: str) -> bool:
 
 @dataclasses.dataclass
 class TokenResult:
+    """Result class for successfully generated tokens."""
+
     type: Literal['token']
     token: Token
 
@@ -218,6 +229,8 @@ def __init__(self, token: Token) -> None:
 
 @dataclasses.dataclass
 class IndentationErrorResult:
+    """Result class for IndentationErrors raised by the Python tokenizer."""
+
     type: Literal['indent-err']
     err: IndentationError
 
@@ -228,6 +241,8 @@ def __init__(self, err: IndentationError) -> None:
 
 @dataclasses.dataclass
 class TokenErrorResult:
+    """Result class for TokenErrors raised by the Python tokenizer."""
+
     type: Literal['token-err']
     err: py_tokenize.TokenError
     location: Location
diff --git a/concat/tests/test_lex.py b/concat/tests/test_lex.py
index 618ae04..f2b5633 100644
--- a/concat/tests/test_lex.py
+++ b/concat/tests/test_lex.py
@@ -39,7 +39,8 @@ def test_examples(self) -> None:
                 for actual_token, expected_token in expectationPairs:
                     self.assertEqual(actual_token, expected_token)
 
-    def test_indentation_error(self) -> None:
+    @staticmethod
+    def test_indentation_error() -> None:
         code = textwrap.dedent("""\
             def remove_stack_polymorphism(
                 f:forall `t *s. (*s i:`t -- *s) -- g:forall `t. (i:`t -- )