From 3998da767b1d52409cb604ec608b58d9a1c8f59f Mon Sep 17 00:00:00 2001 From: Jason Manuel Date: Wed, 6 Nov 2024 00:28:06 -0700 Subject: [PATCH 1/4] Display nice errors from Python tokenizer exceptions --- concat/__main__.py | 147 +++++++++++------- concat/error_reporting.py | 21 ++- concat/lex.py | 85 +++++++--- concat/lsp/__init__.py | 57 ++++--- concat/stdlib/repl.py | 11 +- .../stdlib/test_python_concat_interface.py | 4 +- concat/tests/test_lex.py | 19 ++- concat/tests/test_typecheck.py | 2 +- concat/typecheck/__init__.py | 24 ++- 9 files changed, 264 insertions(+), 106 deletions(-) diff --git a/concat/__main__.py b/concat/__main__.py index 3d71f3a..3c3a848 100644 --- a/concat/__main__.py +++ b/concat/__main__.py @@ -2,7 +2,12 @@ import argparse from concat.transpile import parse, transpile_ast, typecheck -from concat.error_reporting import get_line_at, create_parsing_failure_message +from concat.error_reporting import ( + get_line_at, + create_indentation_error_message, + create_lexical_error_message, + create_parsing_failure_message, +) import concat.execute import concat.lex import concat.parser_combinators @@ -11,7 +16,7 @@ import json import os.path import sys -from typing import Callable, IO, AnyStr +from typing import Callable, IO, AnyStr, assert_never filename = '' @@ -55,69 +60,99 @@ def func(name: str) -> IO[AnyStr]: help='tokenize input from the given file and print the tokens as a JSON array', ) + +def main(): + # interactive mode + if args.file.isatty(): + concat.stdlib.repl.repl([], [], args.debug) + else: + try: + token_results = concat.lex.tokenize(args.file.read()) + tokens = list[concat.lex.Token]() + for r in token_results: + if r.type == 'token': + tokens.append(r.token) + elif r.type == 'indent-err': + position = (r.err.lineno or 1, r.err.offset or 0) + message = r.err.msg + print('Indentation error:') + print( + create_indentation_error_message( + args.file, position, message + ) + ) + elif r.type == 'token-err': + position = r.location + message = str(r.err) + print('Lexical error:') + print( + create_lexical_error_message( + args.file, position, message + ) + ) + else: + assert_never(r) + concat_ast = parse(tokens) + recovered_parsing_failures = concat_ast.parsing_failures + for failure in recovered_parsing_failures: + print('Parse Error:') + print( + create_parsing_failure_message(args.file, tokens, failure) + ) + source_dir = os.path.dirname(filename) + typecheck(concat_ast, source_dir) + python_ast = transpile_ast(concat_ast) + except concat.typecheck.StaticAnalysisError as e: + if e.path is None: + in_path = '' + else: + in_path = ' in file ' + str(e.path) + print(f'Static Analysis Error{in_path}:\n') + print(e, 'in line:') + if e.location: + if e.path is not None: + with e.path.open() as f: + print(get_line_at(f, e.location), end='') + else: + print(get_line_at(args.file, e.location), end='') + print(' ' * e.location[1] + '^') + if args.verbose: + raise + except concat.parser_combinators.ParseError as e: + print('Parse Error:') + print( + create_parsing_failure_message( + args.file, tokens, e.args[0].failures + ) + ) + except Exception: + print('An internal error has occurred.') + print('This is a bug in Concat.') + raise + else: + concat.execute.execute( + filename, + python_ast, + {}, + should_log_stacks=args.debug, + import_resolution_start_directory=source_dir, + ) + if list(concat_ast.parsing_failures): + sys.exit(1) + finally: + args.file.close() + + # We should pass any unknown args onto the program we're about to run. # FIXME: There might be a better way to go about this, but I think this is fine # for now. args, rest = arg_parser.parse_known_args() sys.argv = [sys.argv[0], *rest] - if args.tokenize: code = args.file.read() tokens = concat.lex.tokenize(code, should_preserve_comments=True) json.dump(tokens, sys.stdout, cls=concat.lex.TokenEncoder) sys.exit() -# interactive mode -if args.file.isatty(): - concat.stdlib.repl.repl([], [], args.debug) -else: - try: - tokens = concat.lex.tokenize(args.file.read()) - concat_ast = parse(tokens) - recovered_parsing_failures = concat_ast.parsing_failures - for failure in recovered_parsing_failures: - print('Parse Error:') - print(create_parsing_failure_message(args.file, tokens, failure)) - source_dir = os.path.dirname(filename) - typecheck(concat_ast, source_dir) - python_ast = transpile_ast(concat_ast) - except concat.typecheck.StaticAnalysisError as e: - if e.path is None: - in_path = '' - else: - in_path = ' in file ' + str(e.path) - print(f'Static Analysis Error{in_path}:\n') - print(e, 'in line:') - if e.location: - if e.path is not None: - with e.path.open() as f: - print(get_line_at(f, e.location), end='') - else: - print(get_line_at(args.file, e.location), end='') - print(' ' * e.location[1] + '^') - if args.verbose: - raise - except concat.parser_combinators.ParseError as e: - print('Parse Error:') - print( - create_parsing_failure_message( - args.file, tokens, e.args[0].failures - ) - ) - except Exception: - print('An internal error has occurred.') - print('This is a bug in Concat.') - raise - else: - concat.execute.execute( - filename, - python_ast, - {}, - should_log_stacks=args.debug, - import_resolution_start_directory=source_dir, - ) - if list(concat_ast.parsing_failures): - sys.exit(1) - finally: - args.file.close() +main() diff --git a/concat/error_reporting.py b/concat/error_reporting.py index f44b75f..5c6c996 100644 --- a/concat/error_reporting.py +++ b/concat/error_reporting.py @@ -16,7 +16,10 @@ def create_parsing_failure_message( stream: Sequence[concat.lex.Token], failure: concat.parser_combinators.FailureTree, ) -> str: - location = stream[failure.furthest_index].start + if failure.furthest_index < len(stream): + location = stream[failure.furthest_index].start + else: + location = stream[-1].start line = get_line_at(file, location) message = f'Expected {failure.expected} at line {location[0]}, column {location[1] + 1}:\n{line.rstrip()}\n{" " * location[1] + "^"}' if failure.children: @@ -26,3 +29,19 @@ def create_parsing_failure_message( create_parsing_failure_message(file, stream, f), ' ' ) return message + + +def create_lexical_error_message( + file: TextIO, location: concat.astutils.Location, message: str +) -> str: + line = get_line_at(file, location) + message = f'Cannot tokenize file at line {location[0]}, column {location[1] + 1}:\n{line.rstrip()}\n{' ' * location[1] + '^'}\n' + return message + + +def create_indentation_error_message( + file: TextIO, location: concat.astutils.Location, message: str +) -> str: + line = get_line_at(file, location) + message = f'Malformed indentation at line {location[0]}, column {location[1] + 1}:\n{line.rstrip()}\n' + return message diff --git a/concat/lex.py b/concat/lex.py index e7887c6..4a79fdb 100644 --- a/concat/lex.py +++ b/concat/lex.py @@ -1,9 +1,10 @@ -import concat.astutils +from __future__ import annotations +from concat.astutils import Location, are_on_same_line_and_offset_by import dataclasses import io import json import tokenize as py_tokenize -from typing import Iterator, List, Optional, Tuple, Union +from typing import Iterator, List, Literal, Optional, Tuple, Union @dataclasses.dataclass @@ -19,8 +20,8 @@ class Token: type: str = '' value: str = '' - start: 'concat.astutils.Location' = (0, 0) - end: 'concat.astutils.Location' = (0, 0) + start: Location = (0, 0) + end: Location = (0, 0) is_keyword: bool = False @@ -33,7 +34,10 @@ def default(self, obj): return super().default(obj) -def tokenize(code: str, should_preserve_comments: bool = False) -> List[Token]: +def tokenize( + code: str, + should_preserve_comments: bool = False, +) -> List[Result]: lexer = Lexer() lexer.input(code, should_preserve_comments) tokens = [] @@ -46,10 +50,8 @@ def tokenize(code: str, should_preserve_comments: bool = False) -> List[Token]: TokenTuple = Union[ - Tuple[str, str, 'concat.astutils.Location', 'concat.astutils.Location'], - Tuple[ - str, str, 'concat.astutils.Location', 'concat.astutils.Location', bool - ], + Tuple[str, str, Location, Location], + Tuple[str, str, Location, Location, bool], ] @@ -64,7 +66,7 @@ def __init__(self) -> None: self.tokens: Optional[Iterator[py_tokenize.TokenInfo]] self.lineno: int self.lexpos: int - self._concat_token_iterator: Iterator['Token'] + self._concat_token_iterator: Iterator[Result] self._should_preserve_comments: bool def input(self, data: str, should_preserve_comments: bool = False) -> None: @@ -76,11 +78,11 @@ def input(self, data: str, should_preserve_comments: bool = False) -> None: self._concat_token_iterator = self._tokens() self._should_preserve_comments = should_preserve_comments - def token(self) -> Optional['Token']: + def token(self) -> Optional[Result]: """Return the next token as a Token object.""" return next(self._concat_token_iterator, None) - def _tokens(self) -> Iterator['Token']: + def _tokens(self) -> Iterator[Result]: import token if self.tokens is None: @@ -88,8 +90,16 @@ def _tokens(self) -> Iterator['Token']: io.BytesIO(self.data.encode('utf-8')).readline ) - glued_token_prefix = None - for token_ in self.tokens: + glued_token_prefix: Token | None = None + while True: + try: + token_ = next(self.tokens) + except StopIteration: + return + except IndentationError as e: + yield IndentationErrorResult(e) + except py_tokenize.TokenError as e: + yield TokenErrorResult(e, (self.lineno, self.lexpos)) tok = Token() _, tok.value, tok.start, tok.end, _ = token_ tok.type = token.tok_name[token_.exact_type] @@ -98,7 +108,7 @@ def _tokens(self) -> Iterator['Token']: if ( glued_token_prefix.value == '-' and tok.value == '-' - and concat.astutils.are_on_same_line_and_offset_by( + and are_on_same_line_and_offset_by( glued_token_prefix.start, tok.start, 1 ) ): @@ -106,7 +116,7 @@ def _tokens(self) -> Iterator['Token']: glued_token_prefix.type = 'MINUSMINUS' glued_token_prefix.end = tok.end self._update_position(glued_token_prefix) - yield glued_token_prefix + yield TokenResult(glued_token_prefix) glued_token_prefix = None continue else: @@ -119,7 +129,7 @@ def _tokens(self) -> Iterator['Token']: self._should_preserve_comments and tok.type == 'COMMENT' ): - yield tok + yield TokenResult(tok) continue elif tok.type == 'ERRORTOKEN': if tok.value == ' ': @@ -182,16 +192,49 @@ def _tokens(self) -> Iterator['Token']: elif tok.type == 'EXCLAMATION': tok.type = 'EXCLAMATIONMARK' - yield tok + yield TokenResult(tok) def _update_position(self, tok: 'Token') -> None: - self.lexpos += len(tok.value) - if tok.type in {'NEWLINE', 'NL'}: - self.lineno += 1 + self.lineno, self.lexpos = tok.start def __is_bytes_literal(self, literal: str) -> bool: return isinstance(eval(literal), bytes) +@dataclasses.dataclass +class TokenResult: + type: Literal['token'] + token: Token + + def __init__(self, token: Token) -> None: + self.type = 'token' + self.token = token + + +@dataclasses.dataclass +class IndentationErrorResult: + type: Literal['indent-err'] + err: IndentationError + + def __init__(self, err: IndentationError) -> None: + self.type = 'indent-err' + self.err = err + + +@dataclasses.dataclass +class TokenErrorResult: + type: Literal['token-err'] + err: py_tokenize.TokenError + location: Location + + def __init__(self, err: py_tokenize.TokenError, loc: Location) -> None: + self.type = 'token-err' + self.err = err + self.location = loc + + +type Result = TokenResult | IndentationErrorResult | TokenErrorResult + + def to_tokens(*tokTuples: TokenTuple) -> List[Token]: return [Token(*tuple) for tuple in tokTuples] diff --git a/concat/lsp/__init__.py b/concat/lsp/__init__.py index 2bd3642..cd0e8ec 100644 --- a/concat/lsp/__init__.py +++ b/concat/lsp/__init__.py @@ -1,8 +1,8 @@ from concat.astutils import Location import concat.jsonrpc -from concat.lex import tokenize +from concat.lex import Token, tokenize from concat.logging import ConcatLogger -from concat.parse import ParseError +from concat.parser_combinators import ParseError from concat.transpile import parse, typecheck from concat.typecheck import StaticAnalysisError from enum import Enum, IntEnum @@ -10,7 +10,6 @@ import logging from pathlib import Path import re -import tokenize as py_tokenize from typing import ( BinaryIO, Callable, @@ -411,14 +410,26 @@ def diagnose(self) -> None: def _diagnose(self) -> List[_Diagnostic]: text_lines = self._text.splitlines(keepends=True) - try: - tokens = tokenize(self._text) - except py_tokenize.TokenError as e: - message = e.args[0] - position = _Position.from_tokenizer_location(text_lines, e.args[1]) - range_ = _Range(position, position) - return [_Diagnostic(range_, message)] + token_results = tokenize(self._text) diagnostics = [] + tokens = list[Token]() + for r in token_results: + if r.type == 'token': + tokens.append(r.token) + elif r.type == 'indent-err': + position = _Position.from_tokenizer_location( + text_lines, (r.err.lineno or 1, r.err.offset or 0) + ) + range_ = _Range(position, position) + message = r.err.msg + diagnostics.append(_Diagnostic(range_, message)) + elif r.type == 'token-err': + position = _Position.from_tokenizer_location( + text_lines, r.location + ) + range_ = _Range(position, position) + message = str(r.err) + diagnostics.append(_Diagnostic(range_, message)) for token in tokens: if token.type == 'ERRORTOKEN': _logger.debug('error token: {token!r}', token=token) @@ -436,19 +447,21 @@ def _diagnose(self) -> List[_Diagnostic]: diagnostics.append(_Diagnostic(range_, message)) try: ast = parse(tokens) + ast.assert_no_parse_errors() except ParseError as e: - parser_start_position = e.get_start_position() - parser_end_position = e.get_end_position() - range_ = _Range( - _Position.from_tokenizer_location( - text_lines, parser_start_position - ), - _Position.from_tokenizer_location( - text_lines, parser_end_position - ), - ) - message = f'Expected one of: {", ".join(e.expected)}' - diagnostics.append(_Diagnostic(range_, message)) + for failure in e.args[0].failures: + parser_start_position = tokens[failure.furthest_index].start + parser_end_position = parser_start_position + range_ = _Range( + _Position.from_tokenizer_location( + text_lines, parser_start_position + ), + _Position.from_tokenizer_location( + text_lines, parser_end_position + ), + ) + message = f'Expected one of: {failure.expected}' + diagnostics.append(_Diagnostic(range_, message)) return diagnostics try: # https://stackoverflow.com/questions/5977576/is-there-a-convenient-way-to-map-a-file-uri-to-os-path diff --git a/concat/stdlib/repl.py b/concat/stdlib/repl.py index e4d139e..70d8b93 100644 --- a/concat/stdlib/repl.py +++ b/concat/stdlib/repl.py @@ -23,6 +23,10 @@ sys.modules[__name__].__class__ = concat.stdlib.importlib.Module +class _REPLTokenizeError(Exception): + pass + + def _tokenize(code: str) -> List[concat.lex.Token]: lexer = concat.lex.Lexer() lexer.input(code) @@ -31,7 +35,9 @@ def _tokenize(code: str) -> List[concat.lex.Token]: token = lexer.token() if token is None: break - tokens.append(token) + if token.type != 'token': + raise _REPLTokenizeError from token.err + tokens.append(token.token) return tokens @@ -192,6 +198,9 @@ def show_var(stack: List[object], stash: List[object]): except concat.execute.ConcatRuntimeError as e: print('Runtime error:\n') print(e) + except _REPLTokenizeError as e: + print('Lexical error:\n') + print(e) except EOFError: break else: diff --git a/concat/tests/stdlib/test_python_concat_interface.py b/concat/tests/stdlib/test_python_concat_interface.py index 7abbb43..50ad8a8 100644 --- a/concat/tests/stdlib/test_python_concat_interface.py +++ b/concat/tests/stdlib/test_python_concat_interface.py @@ -731,7 +731,9 @@ def test_modules_are_callable(self) -> None: token = lexer.token() if token is None: break - tokens.append(token) + if token.type != 'token': + raise token.err + tokens.append(token.token) parser = concat.parse.ParserDict() parser.extend_with(concat.parse.extension) concat_ast = parser.parse(tokens) diff --git a/concat/tests/test_lex.py b/concat/tests/test_lex.py index 161f0e9..5e69e61 100644 --- a/concat/tests/test_lex.py +++ b/concat/tests/test_lex.py @@ -1,5 +1,6 @@ import concat.lex as lex from concat.tests.small_example_programs import examples +import textwrap import unittest @@ -32,6 +33,22 @@ def test_examples(self) -> None: tokens.append(token) self.assertEqual(len(tokens), len(expected_tokens)) - expectationPairs = zip(tokens, expected_tokens) + expectationPairs = zip( + tokens, map(lambda t: lex.TokenResult(t), expected_tokens) + ) for actual_token, expected_token in expectationPairs: self.assertEqual(actual_token, expected_token) + + def test_indentation_error(self) -> None: + code = textwrap.dedent("""\ + def remove_stack_polymorphism( + f:forall `t *s. (*s i:`t -- *s) -- g:forall `t. (i:`t -- ) + ): + () + dfbfdbff""") + lexer = lex.Lexer() + lexer.input(code) + while True: + token = lexer.token() + if token is None: + break diff --git a/concat/tests/test_typecheck.py b/concat/tests/test_typecheck.py index 8aee6f8..4a1ccd6 100644 --- a/concat/tests/test_typecheck.py +++ b/concat/tests/test_typecheck.py @@ -44,7 +44,7 @@ def lex_string(string: str) -> List[concat.lex.Token]: - return lex.tokenize(string) + return [r.token for r in lex.tokenize(string) if r.type == 'token'] def parse(string: str) -> concat.parse.TopLevelNode: diff --git a/concat/typecheck/__init__.py b/concat/typecheck/__init__.py index dcb6b99..52679b7 100644 --- a/concat/typecheck/__init__.py +++ b/concat/typecheck/__init__.py @@ -28,6 +28,7 @@ TYPE_CHECKING, Tuple, Union, + assert_never, cast, ) from concat.typecheck.types import ( @@ -58,7 +59,11 @@ no_return_type, ) import abc -from concat.error_reporting import create_parsing_failure_message +from concat.error_reporting import ( + create_indentation_error_message, + create_lexical_error_message, + create_parsing_failure_message, +) from concat.lex import Token import itertools import pathlib @@ -658,7 +663,22 @@ def _check_stub_resolved_path( raise TypeError(f'Type stubs at {path} do not exist') from e except IOError as e: raise TypeError(f'Failed to read type stubs at {path}') from e - tokens = concat.lex.tokenize(source) + token_results = concat.lex.tokenize(source) + tokens = list[Token]() + with path.open() as f: + for r in token_results: + if r.type == 'token': + tokens.append(r.token) + elif r.type == 'indent-err': + print( + create_indentation_error_message( + f, (r.err.lineno or 1, r.err.offset or 0), r.err.msg + ) + ) + elif r.type == 'token-err': + print(create_lexical_error_message(f, r.location, str(r.err))) + else: + assert_never(r) env = initial_env or Environment() from concat.transpile import parse From a8edcc0a91de3175578e4267e47cd300dbbd79f3 Mon Sep 17 00:00:00 2001 From: Jason Manuel Date: Wed, 6 Nov 2024 00:31:21 -0700 Subject: [PATCH 2/4] Add tokenizer error prefaces to stub typechecker to match __main__ --- concat/typecheck/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/concat/typecheck/__init__.py b/concat/typecheck/__init__.py index 52679b7..d009499 100644 --- a/concat/typecheck/__init__.py +++ b/concat/typecheck/__init__.py @@ -670,12 +670,14 @@ def _check_stub_resolved_path( if r.type == 'token': tokens.append(r.token) elif r.type == 'indent-err': + print('Indentation error:') print( create_indentation_error_message( f, (r.err.lineno or 1, r.err.offset or 0), r.err.msg ) ) elif r.type == 'token-err': + print('Lexical error:') print(create_lexical_error_message(f, r.location, str(r.err))) else: assert_never(r) From cb6670ffd0efe28f16bf9c21316d655915df14bc Mon Sep 17 00:00:00 2001 From: Jason Manuel Date: Fri, 8 Nov 2024 17:29:34 -0700 Subject: [PATCH 3/4] Address Deepsource suggestions --- concat/__main__.py | 156 +++++++++++++++-------------- concat/error_reporting.py | 13 ++- concat/lex.py | 203 +++++++++++++++++++------------------- concat/tests/test_lex.py | 2 +- 4 files changed, 196 insertions(+), 178 deletions(-) diff --git a/concat/__main__.py b/concat/__main__.py index 3c3a848..077aa14 100644 --- a/concat/__main__.py +++ b/concat/__main__.py @@ -57,90 +57,94 @@ def func(name: str) -> IO[AnyStr]: '--tokenize', action='store_true', default=False, - help='tokenize input from the given file and print the tokens as a JSON array', + help=( + 'tokenize input from the given file and print the tokens as a JSON ' + 'array' + ), ) +def tokenize_printing_errors() -> list[concat.lex.Token]: + token_results = concat.lex.tokenize(args.file.read()) + tokens = list[concat.lex.Token]() + for r in token_results: + if r.type == 'token': + tokens.append(r.token) + elif r.type == 'indent-err': + position = (r.err.lineno or 1, r.err.offset or 0) + message = r.err.msg + print('Indentation error:') + print( + create_indentation_error_message(args.file, position, message) + ) + elif r.type == 'token-err': + position = r.location + message = str(r.err) + print('Lexical error:') + print(create_lexical_error_message(args.file, position, message)) + else: + assert_never(r) + return tokens + + +def batch_main(): + try: + tokens = tokenize_printing_errors() + concat_ast = parse(tokens) + recovered_parsing_failures = concat_ast.parsing_failures + for failure in recovered_parsing_failures: + print('Parse Error:') + print(create_parsing_failure_message(args.file, tokens, failure)) + source_dir = os.path.dirname(filename) + typecheck(concat_ast, source_dir) + python_ast = transpile_ast(concat_ast) + except concat.typecheck.StaticAnalysisError as e: + if e.path is None: + in_path = '' + else: + in_path = ' in file ' + str(e.path) + print(f'Static Analysis Error{in_path}:\n') + print(e, 'in line:') + if e.location: + if e.path is not None: + with e.path.open() as f: + print(get_line_at(f, e.location), end='') + else: + print(get_line_at(args.file, e.location), end='') + print(' ' * e.location[1] + '^') + if args.verbose: + raise + except concat.parser_combinators.ParseError as e: + print('Parse Error:') + print( + create_parsing_failure_message( + args.file, tokens, e.args[0].failures + ) + ) + except Exception: + print('An internal error has occurred.') + print('This is a bug in Concat.') + raise + else: + concat.execute.execute( + filename, + python_ast, + {}, + should_log_stacks=args.debug, + import_resolution_start_directory=source_dir, + ) + if list(concat_ast.parsing_failures): + sys.exit(1) + finally: + args.file.close() + + def main(): # interactive mode if args.file.isatty(): concat.stdlib.repl.repl([], [], args.debug) else: - try: - token_results = concat.lex.tokenize(args.file.read()) - tokens = list[concat.lex.Token]() - for r in token_results: - if r.type == 'token': - tokens.append(r.token) - elif r.type == 'indent-err': - position = (r.err.lineno or 1, r.err.offset or 0) - message = r.err.msg - print('Indentation error:') - print( - create_indentation_error_message( - args.file, position, message - ) - ) - elif r.type == 'token-err': - position = r.location - message = str(r.err) - print('Lexical error:') - print( - create_lexical_error_message( - args.file, position, message - ) - ) - else: - assert_never(r) - concat_ast = parse(tokens) - recovered_parsing_failures = concat_ast.parsing_failures - for failure in recovered_parsing_failures: - print('Parse Error:') - print( - create_parsing_failure_message(args.file, tokens, failure) - ) - source_dir = os.path.dirname(filename) - typecheck(concat_ast, source_dir) - python_ast = transpile_ast(concat_ast) - except concat.typecheck.StaticAnalysisError as e: - if e.path is None: - in_path = '' - else: - in_path = ' in file ' + str(e.path) - print(f'Static Analysis Error{in_path}:\n') - print(e, 'in line:') - if e.location: - if e.path is not None: - with e.path.open() as f: - print(get_line_at(f, e.location), end='') - else: - print(get_line_at(args.file, e.location), end='') - print(' ' * e.location[1] + '^') - if args.verbose: - raise - except concat.parser_combinators.ParseError as e: - print('Parse Error:') - print( - create_parsing_failure_message( - args.file, tokens, e.args[0].failures - ) - ) - except Exception: - print('An internal error has occurred.') - print('This is a bug in Concat.') - raise - else: - concat.execute.execute( - filename, - python_ast, - {}, - should_log_stacks=args.debug, - import_resolution_start_directory=source_dir, - ) - if list(concat_ast.parsing_failures): - sys.exit(1) - finally: - args.file.close() + batch_main() # We should pass any unknown args onto the program we're about to run. diff --git a/concat/error_reporting.py b/concat/error_reporting.py index 5c6c996..6f5855a 100644 --- a/concat/error_reporting.py +++ b/concat/error_reporting.py @@ -35,7 +35,12 @@ def create_lexical_error_message( file: TextIO, location: concat.astutils.Location, message: str ) -> str: line = get_line_at(file, location) - message = f'Cannot tokenize file at line {location[0]}, column {location[1] + 1}:\n{line.rstrip()}\n{' ' * location[1] + '^'}\n' + message = ( + f'Cannot tokenize file at line {location[0]}, ' + f'column {location[1] + 1}:\n' + f'{line.rstrip()}\n' + f'{' ' * location[1] + '^'}\n' + ) return message @@ -43,5 +48,9 @@ def create_indentation_error_message( file: TextIO, location: concat.astutils.Location, message: str ) -> str: line = get_line_at(file, location) - message = f'Malformed indentation at line {location[0]}, column {location[1] + 1}:\n{line.rstrip()}\n' + message = ( + f'Malformed indentation at line {location[0]}, ' + f'column {location[1] + 1}:\n' + f'{line.rstrip()}\n' + ) return message diff --git a/concat/lex.py b/concat/lex.py index 4a79fdb..9fd5118 100644 --- a/concat/lex.py +++ b/concat/lex.py @@ -4,6 +4,7 @@ import io import json import tokenize as py_tokenize +import token from typing import Iterator, List, Literal, Optional, Tuple, Union @@ -63,7 +64,9 @@ class Lexer: def __init__(self) -> None: self.data: str - self.tokens: Optional[Iterator[py_tokenize.TokenInfo]] + self.tokens: Iterator[ + py_tokenize.TokenInfo | IndentationErrorResult | TokenErrorResult + ] self.lineno: int self.lexpos: int self._concat_token_iterator: Iterator[Result] @@ -72,127 +75,129 @@ def __init__(self) -> None: def input(self, data: str, should_preserve_comments: bool = False) -> None: """Initialize the Lexer object with the data to tokenize.""" self.data = data - self.tokens = None + self.tokens = self._py_tokens_handling_errors( + py_tokenize.tokenize( + io.BytesIO(self.data.encode('utf-8')).readline + ) + ) self.lineno = 1 self.lexpos = 0 - self._concat_token_iterator = self._tokens() + self._concat_token_iterator = self._tokens_glued(self._tokens()) self._should_preserve_comments = should_preserve_comments def token(self) -> Optional[Result]: """Return the next token as a Token object.""" return next(self._concat_token_iterator, None) - def _tokens(self) -> Iterator[Result]: - import token - - if self.tokens is None: - self.tokens = py_tokenize.tokenize( - io.BytesIO(self.data.encode('utf-8')).readline - ) - - glued_token_prefix: Token | None = None + def _py_tokens_handling_errors( + self, tokens: Iterator[py_tokenize.TokenInfo] + ) -> Iterator[ + py_tokenize.TokenInfo | IndentationErrorResult | TokenErrorResult + ]: while True: try: - token_ = next(self.tokens) + tok = next(tokens) + yield tok except StopIteration: return except IndentationError as e: yield IndentationErrorResult(e) except py_tokenize.TokenError as e: yield TokenErrorResult(e, (self.lineno, self.lexpos)) - tok = Token() - _, tok.value, tok.start, tok.end, _ = token_ - tok.type = token.tok_name[token_.exact_type] - tokens_to_massage = [tok] - if glued_token_prefix: - if ( - glued_token_prefix.value == '-' - and tok.value == '-' - and are_on_same_line_and_offset_by( - glued_token_prefix.start, tok.start, 1 - ) - ): - glued_token_prefix.value = '--' - glued_token_prefix.type = 'MINUSMINUS' - glued_token_prefix.end = tok.end + + def _tokens_glued(self, tokens: Iterator[Result]) -> Iterator[Result]: + glued_token_prefix: Token | None = None + for r in tokens: + if r.type == 'token': + tok = r.token + if glued_token_prefix: self._update_position(glued_token_prefix) + if tok.value == '-' and are_on_same_line_and_offset_by( + glued_token_prefix.start, tok.start, 1 + ): + glued_token_prefix.value = '--' + glued_token_prefix.type = 'MINUSMINUS' + glued_token_prefix.end = tok.end + yield TokenResult(glued_token_prefix) + glued_token_prefix = None + continue yield TokenResult(glued_token_prefix) glued_token_prefix = None - continue + if tok.value == '-': + glued_token_prefix = tok else: - tokens_to_massage[:0] = [glued_token_prefix] - glued_token_prefix = None - for tok in tokens_to_massage: - if tok.type in {'NL', 'COMMENT'}: self._update_position(tok) - if ( - self._should_preserve_comments - and tok.type == 'COMMENT' - ): - yield TokenResult(tok) - continue - elif tok.type == 'ERRORTOKEN': - if tok.value == ' ': - self._update_position(tok) - continue - elif tok.value == '!': - tok.type = 'EXCLAMATIONMARK' - elif tok.value in {'def', 'import', 'from'}: - tok.type = tok.value.upper() - tok.is_keyword = True - elif tok.value == '$': - tok.type = 'DOLLARSIGN' - elif tok.type != 'NAME' and tok.value in { - '...', - '-', - '**', - '~', - '*', - '*=', - '//', - '/', - '%', - '+', - '<<', - '>>', - '&', - '^', - '|', - '<', - '>', - '==', - '>=', - '<=', - '!=', - 'is', - 'in', - 'or', - 'and', - 'not', - '@', - }: - tok.type = 'NAME' - if tok.value == '-': - glued_token_prefix = tok - continue + yield r + else: + yield r + if glued_token_prefix: + self._update_position(glued_token_prefix) + yield TokenResult(glued_token_prefix) + def _tokens(self) -> Iterator[Result]: + for token_or_error in self.tokens: + if isinstance( + token_or_error, (IndentationErrorResult, TokenErrorResult) + ): + yield token_or_error + continue + tok = Token() + _, tok.value, tok.start, tok.end, _ = token_or_error + tok.type = token.tok_name[token_or_error.exact_type] + if tok.type in {'NL', 'COMMENT'}: self._update_position(tok) - - if tok.type == 'NAME': - type_map = {'as': 'AS', 'class': 'CLASS', 'cast': 'CAST'} - if tok.value in type_map: - tok.type = type_map[tok.value] - tok.is_keyword = True - elif tok.type == 'STRING' and self.__is_bytes_literal( - tok.value - ): - tok.type = 'BYTES' - elif tok.value == '`': - tok.type = 'BACKTICK' - elif tok.type == 'EXCLAMATION': - tok.type = 'EXCLAMATIONMARK' - - yield TokenResult(tok) + if self._should_preserve_comments and tok.type == 'COMMENT': + yield TokenResult(tok) + continue + elif tok.type == 'ERRORTOKEN' and tok.value == ' ': + self._update_position(tok) + continue + elif tok.value in {'def', 'import', 'from', 'as', 'class', 'cast'}: + tok.type = tok.value.upper() + tok.is_keyword = True + elif tok.value == '$': + tok.type = 'DOLLARSIGN' + elif tok.type != 'NAME' and tok.value in { + '...', + '-', + '**', + '~', + '*', + '*=', + '//', + '/', + '%', + '+', + '<<', + '>>', + '&', + '^', + '|', + '<', + '>', + '==', + '>=', + '<=', + '!=', + 'is', + 'in', + 'or', + 'and', + 'not', + '@', + }: + tok.type = 'NAME' + + self._update_position(tok) + + if tok.type == 'STRING' and self.__is_bytes_literal(tok.value): + tok.type = 'BYTES' + elif tok.value == '`': + tok.type = 'BACKTICK' + elif tok.value == '!': + tok.type = 'EXCLAMATIONMARK' + + yield TokenResult(tok) def _update_position(self, tok: 'Token') -> None: self.lineno, self.lexpos = tok.start diff --git a/concat/tests/test_lex.py b/concat/tests/test_lex.py index 5e69e61..618ae04 100644 --- a/concat/tests/test_lex.py +++ b/concat/tests/test_lex.py @@ -34,7 +34,7 @@ def test_examples(self) -> None: self.assertEqual(len(tokens), len(expected_tokens)) expectationPairs = zip( - tokens, map(lambda t: lex.TokenResult(t), expected_tokens) + tokens, map(lex.TokenResult, expected_tokens) ) for actual_token, expected_token in expectationPairs: self.assertEqual(actual_token, expected_token) From 20e4ede33eb0cb93d1f6bc3514f06e728ad5dae1 Mon Sep 17 00:00:00 2001 From: Jason Manuel Date: Fri, 8 Nov 2024 21:05:23 -0700 Subject: [PATCH 4/4] Address Deepsource suggestions --- concat/error_reporting.py | 4 +++- concat/lex.py | 31 +++++++++++++++++++++++-------- concat/tests/test_lex.py | 3 ++- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/concat/error_reporting.py b/concat/error_reporting.py index 6f5855a..8b1287f 100644 --- a/concat/error_reporting.py +++ b/concat/error_reporting.py @@ -18,8 +18,10 @@ def create_parsing_failure_message( ) -> str: if failure.furthest_index < len(stream): location = stream[failure.furthest_index].start - else: + elif stream: location = stream[-1].start + else: + location = (1, 0) line = get_line_at(file, location) message = f'Expected {failure.expected} at line {location[0]}, column {location[1] + 1}:\n{line.rstrip()}\n{" " * location[1] + "^"}' if failure.children: diff --git a/concat/lex.py b/concat/lex.py index 9fd5118..849435e 100644 --- a/concat/lex.py +++ b/concat/lex.py @@ -82,7 +82,9 @@ def input(self, data: str, should_preserve_comments: bool = False) -> None: ) self.lineno = 1 self.lexpos = 0 - self._concat_token_iterator = self._tokens_glued(self._tokens()) + self._concat_token_iterator = self._tokens_filtering_nl_and_comments( + self._tokens_glued(self._tokens()) + ) self._should_preserve_comments = should_preserve_comments def token(self) -> Optional[Result]: @@ -134,6 +136,18 @@ def _tokens_glued(self, tokens: Iterator[Result]) -> Iterator[Result]: self._update_position(glued_token_prefix) yield TokenResult(glued_token_prefix) + def _tokens_filtering_nl_and_comments( + self, tokens: Iterator[Result] + ) -> Iterator[Result]: + for r in tokens: + if r.type != 'token' or r.token.type not in ['NL', 'COMMENT']: + yield r + continue + tok = r.token + self._update_position(tok) + if self._should_preserve_comments and tok.type == 'COMMENT': + yield r + def _tokens(self) -> Iterator[Result]: for token_or_error in self.tokens: if isinstance( @@ -144,15 +158,10 @@ def _tokens(self) -> Iterator[Result]: tok = Token() _, tok.value, tok.start, tok.end, _ = token_or_error tok.type = token.tok_name[token_or_error.exact_type] - if tok.type in {'NL', 'COMMENT'}: - self._update_position(tok) - if self._should_preserve_comments and tok.type == 'COMMENT': - yield TokenResult(tok) - continue - elif tok.type == 'ERRORTOKEN' and tok.value == ' ': + if tok.type == 'ERRORTOKEN' and tok.value == ' ': self._update_position(tok) continue - elif tok.value in {'def', 'import', 'from', 'as', 'class', 'cast'}: + if tok.value in {'def', 'import', 'from', 'as', 'class', 'cast'}: tok.type = tok.value.upper() tok.is_keyword = True elif tok.value == '$': @@ -208,6 +217,8 @@ def __is_bytes_literal(self, literal: str) -> bool: @dataclasses.dataclass class TokenResult: + """Result class for successfully generated tokens.""" + type: Literal['token'] token: Token @@ -218,6 +229,8 @@ def __init__(self, token: Token) -> None: @dataclasses.dataclass class IndentationErrorResult: + """Result class for IndentationErrors raised by the Python tokenizer.""" + type: Literal['indent-err'] err: IndentationError @@ -228,6 +241,8 @@ def __init__(self, err: IndentationError) -> None: @dataclasses.dataclass class TokenErrorResult: + """Result class for TokenErrors raised by the Python tokenizer.""" + type: Literal['token-err'] err: py_tokenize.TokenError location: Location diff --git a/concat/tests/test_lex.py b/concat/tests/test_lex.py index 618ae04..f2b5633 100644 --- a/concat/tests/test_lex.py +++ b/concat/tests/test_lex.py @@ -39,7 +39,8 @@ def test_examples(self) -> None: for actual_token, expected_token in expectationPairs: self.assertEqual(actual_token, expected_token) - def test_indentation_error(self) -> None: + @staticmethod + def test_indentation_error() -> None: code = textwrap.dedent("""\ def remove_stack_polymorphism( f:forall `t *s. (*s i:`t -- *s) -- g:forall `t. (i:`t -- )