diff --git a/concat/__main__.py b/concat/__main__.py index 3d71f3a..077aa14 100644 --- a/concat/__main__.py +++ b/concat/__main__.py @@ -2,7 +2,12 @@ import argparse from concat.transpile import parse, transpile_ast, typecheck -from concat.error_reporting import get_line_at, create_parsing_failure_message +from concat.error_reporting import ( + get_line_at, + create_indentation_error_message, + create_lexical_error_message, + create_parsing_failure_message, +) import concat.execute import concat.lex import concat.parser_combinators @@ -11,7 +16,7 @@ import json import os.path import sys -from typing import Callable, IO, AnyStr +from typing import Callable, IO, AnyStr, assert_never filename = '' @@ -52,28 +57,39 @@ def func(name: str) -> IO[AnyStr]: '--tokenize', action='store_true', default=False, - help='tokenize input from the given file and print the tokens as a JSON array', + help=( + 'tokenize input from the given file and print the tokens as a JSON ' + 'array' + ), ) -# We should pass any unknown args onto the program we're about to run. -# FIXME: There might be a better way to go about this, but I think this is fine -# for now. -args, rest = arg_parser.parse_known_args() -sys.argv = [sys.argv[0], *rest] +def tokenize_printing_errors() -> list[concat.lex.Token]: + token_results = concat.lex.tokenize(args.file.read()) + tokens = list[concat.lex.Token]() + for r in token_results: + if r.type == 'token': + tokens.append(r.token) + elif r.type == 'indent-err': + position = (r.err.lineno or 1, r.err.offset or 0) + message = r.err.msg + print('Indentation error:') + print( + create_indentation_error_message(args.file, position, message) + ) + elif r.type == 'token-err': + position = r.location + message = str(r.err) + print('Lexical error:') + print(create_lexical_error_message(args.file, position, message)) + else: + assert_never(r) + return tokens -if args.tokenize: - code = args.file.read() - tokens = concat.lex.tokenize(code, should_preserve_comments=True) - json.dump(tokens, sys.stdout, cls=concat.lex.TokenEncoder) - sys.exit() -# interactive mode -if args.file.isatty(): - concat.stdlib.repl.repl([], [], args.debug) -else: +def batch_main(): try: - tokens = concat.lex.tokenize(args.file.read()) + tokens = tokenize_printing_errors() concat_ast = parse(tokens) recovered_parsing_failures = concat_ast.parsing_failures for failure in recovered_parsing_failures: @@ -121,3 +137,26 @@ def func(name: str) -> IO[AnyStr]: sys.exit(1) finally: args.file.close() + + +def main(): + # interactive mode + if args.file.isatty(): + concat.stdlib.repl.repl([], [], args.debug) + else: + batch_main() + + +# We should pass any unknown args onto the program we're about to run. +# FIXME: There might be a better way to go about this, but I think this is fine +# for now. 
+args, rest = arg_parser.parse_known_args() +sys.argv = [sys.argv[0], *rest] + +if args.tokenize: + code = args.file.read() + tokens = concat.lex.tokenize(code, should_preserve_comments=True) + json.dump(tokens, sys.stdout, cls=concat.lex.TokenEncoder) + sys.exit() + +main() diff --git a/concat/error_reporting.py b/concat/error_reporting.py index f44b75f..8b1287f 100644 --- a/concat/error_reporting.py +++ b/concat/error_reporting.py @@ -16,7 +16,12 @@ def create_parsing_failure_message( stream: Sequence[concat.lex.Token], failure: concat.parser_combinators.FailureTree, ) -> str: - location = stream[failure.furthest_index].start + if failure.furthest_index < len(stream): + location = stream[failure.furthest_index].start + elif stream: + location = stream[-1].start + else: + location = (1, 0) line = get_line_at(file, location) message = f'Expected {failure.expected} at line {location[0]}, column {location[1] + 1}:\n{line.rstrip()}\n{" " * location[1] + "^"}' if failure.children: @@ -26,3 +31,28 @@ def create_parsing_failure_message( create_parsing_failure_message(file, stream, f), ' ' ) return message + + +def create_lexical_error_message( + file: TextIO, location: concat.astutils.Location, message: str +) -> str: + line = get_line_at(file, location) + message = ( + f'Cannot tokenize file at line {location[0]}, ' + f'column {location[1] + 1}:\n' + f'{line.rstrip()}\n' + f'{' ' * location[1] + '^'}\n' + ) + return message + + +def create_indentation_error_message( + file: TextIO, location: concat.astutils.Location, message: str +) -> str: + line = get_line_at(file, location) + message = ( + f'Malformed indentation at line {location[0]}, ' + f'column {location[1] + 1}:\n' + f'{line.rstrip()}\n' + ) + return message diff --git a/concat/lex.py b/concat/lex.py index e7887c6..849435e 100644 --- a/concat/lex.py +++ b/concat/lex.py @@ -1,9 +1,11 @@ -import concat.astutils +from __future__ import annotations +from concat.astutils import Location, are_on_same_line_and_offset_by import dataclasses import io import json import tokenize as py_tokenize -from typing import Iterator, List, Optional, Tuple, Union +import token +from typing import Iterator, List, Literal, Optional, Tuple, Union @dataclasses.dataclass @@ -19,8 +21,8 @@ class Token: type: str = '' value: str = '' - start: 'concat.astutils.Location' = (0, 0) - end: 'concat.astutils.Location' = (0, 0) + start: Location = (0, 0) + end: Location = (0, 0) is_keyword: bool = False @@ -33,7 +35,10 @@ def default(self, obj): return super().default(obj) -def tokenize(code: str, should_preserve_comments: bool = False) -> List[Token]: +def tokenize( + code: str, + should_preserve_comments: bool = False, +) -> List[Result]: lexer = Lexer() lexer.input(code, should_preserve_comments) tokens = [] @@ -46,10 +51,8 @@ def tokenize(code: str, should_preserve_comments: bool = False) -> List[Token]: TokenTuple = Union[ - Tuple[str, str, 'concat.astutils.Location', 'concat.astutils.Location'], - Tuple[ - str, str, 'concat.astutils.Location', 'concat.astutils.Location', bool - ], + Tuple[str, str, Location, Location], + Tuple[str, str, Location, Location, bool], ] @@ -61,137 +64,197 @@ class Lexer: def __init__(self) -> None: self.data: str - self.tokens: Optional[Iterator[py_tokenize.TokenInfo]] + self.tokens: Iterator[ + py_tokenize.TokenInfo | IndentationErrorResult | TokenErrorResult + ] self.lineno: int self.lexpos: int - self._concat_token_iterator: Iterator['Token'] + self._concat_token_iterator: Iterator[Result] self._should_preserve_comments: bool def 
input(self, data: str, should_preserve_comments: bool = False) -> None: """Initialize the Lexer object with the data to tokenize.""" self.data = data - self.tokens = None + self.tokens = self._py_tokens_handling_errors( + py_tokenize.tokenize( + io.BytesIO(self.data.encode('utf-8')).readline + ) + ) self.lineno = 1 self.lexpos = 0 - self._concat_token_iterator = self._tokens() + self._concat_token_iterator = self._tokens_filtering_nl_and_comments( + self._tokens_glued(self._tokens()) + ) self._should_preserve_comments = should_preserve_comments - def token(self) -> Optional['Token']: + def token(self) -> Optional[Result]: """Return the next token as a Token object.""" return next(self._concat_token_iterator, None) - def _tokens(self) -> Iterator['Token']: - import token - - if self.tokens is None: - self.tokens = py_tokenize.tokenize( - io.BytesIO(self.data.encode('utf-8')).readline - ) + def _py_tokens_handling_errors( + self, tokens: Iterator[py_tokenize.TokenInfo] + ) -> Iterator[ + py_tokenize.TokenInfo | IndentationErrorResult | TokenErrorResult + ]: + while True: + try: + tok = next(tokens) + yield tok + except StopIteration: + return + except IndentationError as e: + yield IndentationErrorResult(e) + except py_tokenize.TokenError as e: + yield TokenErrorResult(e, (self.lineno, self.lexpos)) - glued_token_prefix = None - for token_ in self.tokens: - tok = Token() - _, tok.value, tok.start, tok.end, _ = token_ - tok.type = token.tok_name[token_.exact_type] - tokens_to_massage = [tok] - if glued_token_prefix: - if ( - glued_token_prefix.value == '-' - and tok.value == '-' - and concat.astutils.are_on_same_line_and_offset_by( - glued_token_prefix.start, tok.start, 1 - ) - ): - glued_token_prefix.value = '--' - glued_token_prefix.type = 'MINUSMINUS' - glued_token_prefix.end = tok.end + def _tokens_glued(self, tokens: Iterator[Result]) -> Iterator[Result]: + glued_token_prefix: Token | None = None + for r in tokens: + if r.type == 'token': + tok = r.token + if glued_token_prefix: self._update_position(glued_token_prefix) - yield glued_token_prefix + if tok.value == '-' and are_on_same_line_and_offset_by( + glued_token_prefix.start, tok.start, 1 + ): + glued_token_prefix.value = '--' + glued_token_prefix.type = 'MINUSMINUS' + glued_token_prefix.end = tok.end + yield TokenResult(glued_token_prefix) + glued_token_prefix = None + continue + yield TokenResult(glued_token_prefix) glued_token_prefix = None - continue + if tok.value == '-': + glued_token_prefix = tok else: - tokens_to_massage[:0] = [glued_token_prefix] - glued_token_prefix = None - for tok in tokens_to_massage: - if tok.type in {'NL', 'COMMENT'}: self._update_position(tok) - if ( - self._should_preserve_comments - and tok.type == 'COMMENT' - ): - yield tok - continue - elif tok.type == 'ERRORTOKEN': - if tok.value == ' ': - self._update_position(tok) - continue - elif tok.value == '!': - tok.type = 'EXCLAMATIONMARK' - elif tok.value in {'def', 'import', 'from'}: - tok.type = tok.value.upper() - tok.is_keyword = True - elif tok.value == '$': - tok.type = 'DOLLARSIGN' - elif tok.type != 'NAME' and tok.value in { - '...', - '-', - '**', - '~', - '*', - '*=', - '//', - '/', - '%', - '+', - '<<', - '>>', - '&', - '^', - '|', - '<', - '>', - '==', - '>=', - '<=', - '!=', - 'is', - 'in', - 'or', - 'and', - 'not', - '@', - }: - tok.type = 'NAME' - if tok.value == '-': - glued_token_prefix = tok - continue + yield r + else: + yield r + if glued_token_prefix: + self._update_position(glued_token_prefix) + yield 
TokenResult(glued_token_prefix) + + def _tokens_filtering_nl_and_comments( + self, tokens: Iterator[Result] + ) -> Iterator[Result]: + for r in tokens: + if r.type != 'token' or r.token.type not in ['NL', 'COMMENT']: + yield r + continue + tok = r.token + self._update_position(tok) + if self._should_preserve_comments and tok.type == 'COMMENT': + yield r + def _tokens(self) -> Iterator[Result]: + for token_or_error in self.tokens: + if isinstance( + token_or_error, (IndentationErrorResult, TokenErrorResult) + ): + yield token_or_error + continue + tok = Token() + _, tok.value, tok.start, tok.end, _ = token_or_error + tok.type = token.tok_name[token_or_error.exact_type] + if tok.type == 'ERRORTOKEN' and tok.value == ' ': self._update_position(tok) + continue + if tok.value in {'def', 'import', 'from', 'as', 'class', 'cast'}: + tok.type = tok.value.upper() + tok.is_keyword = True + elif tok.value == '$': + tok.type = 'DOLLARSIGN' + elif tok.type != 'NAME' and tok.value in { + '...', + '-', + '**', + '~', + '*', + '*=', + '//', + '/', + '%', + '+', + '<<', + '>>', + '&', + '^', + '|', + '<', + '>', + '==', + '>=', + '<=', + '!=', + 'is', + 'in', + 'or', + 'and', + 'not', + '@', + }: + tok.type = 'NAME' - if tok.type == 'NAME': - type_map = {'as': 'AS', 'class': 'CLASS', 'cast': 'CAST'} - if tok.value in type_map: - tok.type = type_map[tok.value] - tok.is_keyword = True - elif tok.type == 'STRING' and self.__is_bytes_literal( - tok.value - ): - tok.type = 'BYTES' - elif tok.value == '`': - tok.type = 'BACKTICK' - elif tok.type == 'EXCLAMATION': - tok.type = 'EXCLAMATIONMARK' + self._update_position(tok) - yield tok + if tok.type == 'STRING' and self.__is_bytes_literal(tok.value): + tok.type = 'BYTES' + elif tok.value == '`': + tok.type = 'BACKTICK' + elif tok.value == '!': + tok.type = 'EXCLAMATIONMARK' + + yield TokenResult(tok) def _update_position(self, tok: 'Token') -> None: - self.lexpos += len(tok.value) - if tok.type in {'NEWLINE', 'NL'}: - self.lineno += 1 + self.lineno, self.lexpos = tok.start def __is_bytes_literal(self, literal: str) -> bool: return isinstance(eval(literal), bytes) +@dataclasses.dataclass +class TokenResult: + """Result class for successfully generated tokens.""" + + type: Literal['token'] + token: Token + + def __init__(self, token: Token) -> None: + self.type = 'token' + self.token = token + + +@dataclasses.dataclass +class IndentationErrorResult: + """Result class for IndentationErrors raised by the Python tokenizer.""" + + type: Literal['indent-err'] + err: IndentationError + + def __init__(self, err: IndentationError) -> None: + self.type = 'indent-err' + self.err = err + + +@dataclasses.dataclass +class TokenErrorResult: + """Result class for TokenErrors raised by the Python tokenizer.""" + + type: Literal['token-err'] + err: py_tokenize.TokenError + location: Location + + def __init__(self, err: py_tokenize.TokenError, loc: Location) -> None: + self.type = 'token-err' + self.err = err + self.location = loc + + +type Result = TokenResult | IndentationErrorResult | TokenErrorResult + + def to_tokens(*tokTuples: TokenTuple) -> List[Token]: return [Token(*tuple) for tuple in tokTuples] diff --git a/concat/lsp/__init__.py b/concat/lsp/__init__.py index 2bd3642..cd0e8ec 100644 --- a/concat/lsp/__init__.py +++ b/concat/lsp/__init__.py @@ -1,8 +1,8 @@ from concat.astutils import Location import concat.jsonrpc -from concat.lex import tokenize +from concat.lex import Token, tokenize from concat.logging import ConcatLogger -from concat.parse import ParseError +from 
concat.parser_combinators import ParseError from concat.transpile import parse, typecheck from concat.typecheck import StaticAnalysisError from enum import Enum, IntEnum @@ -10,7 +10,6 @@ import logging from pathlib import Path import re -import tokenize as py_tokenize from typing import ( BinaryIO, Callable, @@ -411,14 +410,26 @@ def diagnose(self) -> None: def _diagnose(self) -> List[_Diagnostic]: text_lines = self._text.splitlines(keepends=True) - try: - tokens = tokenize(self._text) - except py_tokenize.TokenError as e: - message = e.args[0] - position = _Position.from_tokenizer_location(text_lines, e.args[1]) - range_ = _Range(position, position) - return [_Diagnostic(range_, message)] + token_results = tokenize(self._text) diagnostics = [] + tokens = list[Token]() + for r in token_results: + if r.type == 'token': + tokens.append(r.token) + elif r.type == 'indent-err': + position = _Position.from_tokenizer_location( + text_lines, (r.err.lineno or 1, r.err.offset or 0) + ) + range_ = _Range(position, position) + message = r.err.msg + diagnostics.append(_Diagnostic(range_, message)) + elif r.type == 'token-err': + position = _Position.from_tokenizer_location( + text_lines, r.location + ) + range_ = _Range(position, position) + message = str(r.err) + diagnostics.append(_Diagnostic(range_, message)) for token in tokens: if token.type == 'ERRORTOKEN': _logger.debug('error token: {token!r}', token=token) @@ -436,19 +447,21 @@ def _diagnose(self) -> List[_Diagnostic]: diagnostics.append(_Diagnostic(range_, message)) try: ast = parse(tokens) + ast.assert_no_parse_errors() except ParseError as e: - parser_start_position = e.get_start_position() - parser_end_position = e.get_end_position() - range_ = _Range( - _Position.from_tokenizer_location( - text_lines, parser_start_position - ), - _Position.from_tokenizer_location( - text_lines, parser_end_position - ), - ) - message = f'Expected one of: {", ".join(e.expected)}' - diagnostics.append(_Diagnostic(range_, message)) + for failure in e.args[0].failures: + parser_start_position = tokens[failure.furthest_index].start + parser_end_position = parser_start_position + range_ = _Range( + _Position.from_tokenizer_location( + text_lines, parser_start_position + ), + _Position.from_tokenizer_location( + text_lines, parser_end_position + ), + ) + message = f'Expected one of: {failure.expected}' + diagnostics.append(_Diagnostic(range_, message)) return diagnostics try: # https://stackoverflow.com/questions/5977576/is-there-a-convenient-way-to-map-a-file-uri-to-os-path diff --git a/concat/stdlib/repl.py b/concat/stdlib/repl.py index e4d139e..70d8b93 100644 --- a/concat/stdlib/repl.py +++ b/concat/stdlib/repl.py @@ -23,6 +23,10 @@ sys.modules[__name__].__class__ = concat.stdlib.importlib.Module +class _REPLTokenizeError(Exception): + pass + + def _tokenize(code: str) -> List[concat.lex.Token]: lexer = concat.lex.Lexer() lexer.input(code) @@ -31,7 +35,9 @@ def _tokenize(code: str) -> List[concat.lex.Token]: token = lexer.token() if token is None: break - tokens.append(token) + if token.type != 'token': + raise _REPLTokenizeError from token.err + tokens.append(token.token) return tokens @@ -192,6 +198,9 @@ def show_var(stack: List[object], stash: List[object]): except concat.execute.ConcatRuntimeError as e: print('Runtime error:\n') print(e) + except _REPLTokenizeError as e: + print('Lexical error:\n') + print(e) except EOFError: break else: diff --git a/concat/tests/stdlib/test_python_concat_interface.py b/concat/tests/stdlib/test_python_concat_interface.py 
index 7abbb43..50ad8a8 100644 --- a/concat/tests/stdlib/test_python_concat_interface.py +++ b/concat/tests/stdlib/test_python_concat_interface.py @@ -731,7 +731,9 @@ def test_modules_are_callable(self) -> None: token = lexer.token() if token is None: break - tokens.append(token) + if token.type != 'token': + raise token.err + tokens.append(token.token) parser = concat.parse.ParserDict() parser.extend_with(concat.parse.extension) concat_ast = parser.parse(tokens) diff --git a/concat/tests/test_lex.py b/concat/tests/test_lex.py index 161f0e9..f2b5633 100644 --- a/concat/tests/test_lex.py +++ b/concat/tests/test_lex.py @@ -1,5 +1,6 @@ import concat.lex as lex from concat.tests.small_example_programs import examples +import textwrap import unittest @@ -32,6 +33,23 @@ def test_examples(self) -> None: tokens.append(token) self.assertEqual(len(tokens), len(expected_tokens)) - expectationPairs = zip(tokens, expected_tokens) + expectationPairs = zip( + tokens, map(lex.TokenResult, expected_tokens) + ) for actual_token, expected_token in expectationPairs: self.assertEqual(actual_token, expected_token) + + @staticmethod + def test_indentation_error() -> None: + code = textwrap.dedent("""\ + def remove_stack_polymorphism( + f:forall `t *s. (*s i:`t -- *s) -- g:forall `t. (i:`t -- ) + ): + () + dfbfdbff""") + lexer = lex.Lexer() + lexer.input(code) + while True: + token = lexer.token() + if token is None: + break diff --git a/concat/tests/test_typecheck.py b/concat/tests/test_typecheck.py index 8aee6f8..4a1ccd6 100644 --- a/concat/tests/test_typecheck.py +++ b/concat/tests/test_typecheck.py @@ -44,7 +44,7 @@ def lex_string(string: str) -> List[concat.lex.Token]: - return lex.tokenize(string) + return [r.token for r in lex.tokenize(string) if r.type == 'token'] def parse(string: str) -> concat.parse.TopLevelNode: diff --git a/concat/typecheck/__init__.py b/concat/typecheck/__init__.py index dcb6b99..d009499 100644 --- a/concat/typecheck/__init__.py +++ b/concat/typecheck/__init__.py @@ -28,6 +28,7 @@ TYPE_CHECKING, Tuple, Union, + assert_never, cast, ) from concat.typecheck.types import ( @@ -58,7 +59,11 @@ no_return_type, ) import abc -from concat.error_reporting import create_parsing_failure_message +from concat.error_reporting import ( + create_indentation_error_message, + create_lexical_error_message, + create_parsing_failure_message, +) from concat.lex import Token import itertools import pathlib @@ -658,7 +663,24 @@ def _check_stub_resolved_path( raise TypeError(f'Type stubs at {path} do not exist') from e except IOError as e: raise TypeError(f'Failed to read type stubs at {path}') from e - tokens = concat.lex.tokenize(source) + token_results = concat.lex.tokenize(source) + tokens = list[Token]() + with path.open() as f: + for r in token_results: + if r.type == 'token': + tokens.append(r.token) + elif r.type == 'indent-err': + print('Indentation error:') + print( + create_indentation_error_message( + f, (r.err.lineno or 1, r.err.offset or 0), r.err.msg + ) + ) + elif r.type == 'token-err': + print('Lexical error:') + print(create_lexical_error_message(f, r.location, str(r.err))) + else: + assert_never(r) env = initial_env or Environment() from concat.transpile import parse
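
For reference, a minimal sketch (not part of the patch) of how a downstream caller might consume the new Result-based concat.lex.tokenize() API introduced above. Only the identifiers that appear in this diff are assumed (TokenResult and friends, r.type values 'token'/'indent-err'/'token-err', r.token, r.err, r.location); the helper name and the print-based error handling are illustrative only.

    # Sketch, assuming the Result union added in concat/lex.py by this diff.
    from typing import assert_never

    import concat.lex


    def collect_tokens(source: str) -> list[concat.lex.Token]:
        """Collect successful tokens, reporting lexer errors as they occur."""
        tokens: list[concat.lex.Token] = []
        for r in concat.lex.tokenize(source):
            if r.type == 'token':
                tokens.append(r.token)
            elif r.type == 'indent-err':
                # IndentationError carries optional position info.
                position = (r.err.lineno or 1, r.err.offset or 0)
                print(f'indentation error at {position}: {r.err.msg}')
            elif r.type == 'token-err':
                print(f'lexical error at {r.location}: {r.err}')
            else:
                assert_never(r)  # exhaustiveness check over the Result union
        return tokens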