Skip to content

Commit

Permalink
Merge pull request #25 from watermarkhu/development
Browse files Browse the repository at this point in the history
Add find/findall + mypy fixes
  • Loading branch information
watermarkhu authored Feb 19, 2024
2 parents acbaf61 + 042d255 commit 2575a83
Show file tree
Hide file tree
Showing 21 changed files with 143 additions and 88 deletions.
3 changes: 0 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -159,8 +159,5 @@ cython_debug/
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# VSCode settings
.vscode/

# Ruff cache
.ruff_cache/
6 changes: 1 addition & 5 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,14 +1,10 @@
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.2.1
rev: v0.2.2
hooks:
# Run the linter.
- id: ruff
args: [--fix]
# Run the formatter.
- id: ruff-format
- repo: https://github.com/pre-commit/mirrors-mypy
rev: 'v1.8.0' # Use the sha / tag you want to point at
hooks:
- id: mypy
14 changes: 14 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"python.testing.pytestArgs": [
"test"
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true,
"files.exclude": {
"**/__pycache__": true,
"**/.mypy_cache": true,
"**/.pytest_cache": true,
"**/.ruff_cache": true,
"**/.tox": true,
},
}
8 changes: 4 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
##################################### poetry #####################################
[tool.poetry]
name = "textmate-grammar-python"
version = "0.1.2"
version = "0.2.0"
description = "An interpreter for grammar files as defined by TextMate and used in VSCode, implemented in Python. TextMate grammars use the oniguruma dialect (https://github.com/kkos/oniguruma). Supports loading grammar files from JSON, PLIST, or YAML format."
authors = ["Mark Shui Hu <watermarkhu@gmail.com>"]
license = "MIT"
readme = "README.md"
repository = "https://github.com/watermarkhu/textmate-grammar-python"
keywords = ["textmate", "tokenization"]
packages = [{include = "textmate_grammar"}]
packages = [{include = "textmate_grammar", from = "src"}]

[tool.poetry.dependencies]
python = "^3.11"
Expand All @@ -30,8 +30,8 @@ types-pyyaml = "^6.0.12.12"
##################################### ruff #####################################
ruff = "^0.2.1"
[tool.ruff]
include = ["pyproject.toml", "textmate_grammar/**/*.py"]
exclude = ["textmate_grammar/grammars/"]
include = ["pyproject.toml", "src/textmate_grammar/**/*.py"]
exclude = ["src/textmate_grammar/grammars/"]
line-length = 100
indent-width = 4

Expand Down
File renamed without changes.
101 changes: 79 additions & 22 deletions textmate_grammar/elements.py → src/textmate_grammar/elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from collections import defaultdict
from itertools import groupby
from pprint import pprint
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Generator

from .handler import POS, ContentHandler, Match, Pattern
from .logger import LOGGER
Expand All @@ -16,13 +16,7 @@
TOKEN_DICT = dict[POS, list[str]]


class Element:
def _token_by_index(self, *args, **kwargs):
# Stub for Mypy
return


class Capture(Element):
class Capture:
"""A captured matching group.
After matching, any pattern can have a number of capture groups for which subsequent parsers can be defined.
Expand Down Expand Up @@ -62,7 +56,7 @@ def __eq__(self, other: object) -> bool:
def __repr__(self) -> str:
return f"@capture<{self.key}>"

def dispatch(self) -> list[Element]:
def dispatch(self) -> list[Capture | ContentElement]:
"""Dispatches the remaining parse of the capture group."""
elements = []
for group_id, parser in self.parsers.items():
Expand Down Expand Up @@ -107,20 +101,20 @@ def dispatch(self) -> list[Element]:


def dispatch_list(
pending_elements: list[Element], parent: ContentElement | None = None
) -> list[Element]:
pending_elements: list[Capture | ContentElement], parent: ContentElement | None = None
) -> list[ContentElement]:
"""Dispatches all captured parsers in the list."""
elements = []
for item in pending_elements:
if isinstance(item, Capture):
captured_elements = dispatch_list(item.dispatch())
captured_elements: list[ContentElement] = dispatch_list(item.dispatch())
elements.extend(captured_elements)
elif item != parent:
elements.append(item)
return elements


class ContentElement(Element):
class ContentElement:
"""The base grammar element object."""

def __init__(
Expand All @@ -129,7 +123,7 @@ def __init__(
grammar: dict,
content: str,
characters: dict[POS, str],
children: list[Element] | None = None,
children: list[Capture | ContentElement] | None = None,
) -> None:
if children is None:
children = []
Expand All @@ -138,11 +132,15 @@ def __init__(
self.content = content
self.characters = characters
self._children_pending = children
self._children_dispached: list[Element] = []
self._children_dispached: list[ContentElement] = []
self._dispatched_children: bool = False

@property
def children(self) -> list[Element]:
def _subelements(self) -> list[ContentElement]:
return self.children

@property
def children(self) -> list[ContentElement]:
"Children elements"
if self._children_pending:
if not self._dispatched_children:
Expand Down Expand Up @@ -172,6 +170,61 @@ def to_dict(self, verbosity: int = -1, all_content: bool = False, **kwargs) -> d
)
return out_dict

def find(
self,
tokens: str | list[str],
stop_tokens: str | list[str] = "",
verbosity: int = -1,
stack: list[str] | None = None,
attribute: str = "_subelements",
) -> Generator[tuple[ContentElement, list[str]], None, None]:
"""Find the next subelement that matches the input token(s).

Returns a generator that walks through the element tree, yielding
``(element, stack)`` tuples, where ``stack`` is a snapshot of the
ancestor tokens (including this element's own token) leading to the
match.

:param tokens: token(s) to match; the single value ``"*"`` matches any token.
:param stop_tokens: token(s) that abort the search when encountered among
    the children; ``["*"]`` stops on any token not in ``tokens``.
:param verbosity: recursion-depth budget; a negative value (default -1)
    never reaches zero and so effectively means unlimited depth.
:param stack: ancestor-token stack accumulated by recursive calls;
    callers normally leave this as ``None``.
:param attribute: name of the attribute holding the child elements to
    search (defaults to ``_subelements``).
:raises ValueError: if ``tokens`` and ``stop_tokens`` overlap.
"""
# Normalize both token arguments to lists; an empty stop_tokens string
# means "no stop condition".
if isinstance(tokens, str):
tokens = [tokens]
if isinstance(stop_tokens, str):
stop_tokens = [stop_tokens] if stop_tokens else []
if not set(tokens).isdisjoint(set(stop_tokens)):
raise ValueError("Input tokens and stop_tokens must be disjoint")

if stack is None:
stack = []
# NOTE(review): `+=` mutates a caller-supplied stack list in place;
# internal recursive calls always pass a copy, so this is only a hazard
# for external callers — confirm intended.
stack += [self.token]

# NOTE(review): verbosity is decremented here AND passed as
# `verbosity - 1` in the recursive call below, reducing depth by two per
# level — confirm the intended depth semantics.
if verbosity:
verbosity -= 1
children: list[ContentElement] = getattr(self, attribute, self._subelements)
for child in children:
# Stop the entire search (not just skip) once a stop token is seen.
if stop_tokens and (
child.token in stop_tokens
or (stop_tokens == ["*"] and child.token not in tokens)
):
return None

if child.token in tokens or tokens == ["*"]:
# Yield a copy of the stack so later mutation cannot affect
# previously yielded results.
yield child, [e for e in stack]
if verbosity:
nested_generator = child.find(
tokens, verbosity=verbosity - 1, stack=[e for e in stack]
)
yield from nested_generator
return None

def findall(
    self,
    tokens: str | list[str],
    stop_tokens: str | list[str] = "",
    verbosity: int = -1,
    attribute: str = "_subelements",
) -> list[tuple[ContentElement, list[str]]]:
    """Collect every subelement matching the input token(s).

    Exhausts the generator produced by ``find`` — called with identical
    arguments — and returns its results as a list of
    ``(element, ancestor-token-stack)`` tuples.
    """
    matches: list[tuple[ContentElement, list[str]]] = []
    for found in self.find(
        tokens,
        stop_tokens=stop_tokens,
        verbosity=verbosity,
        attribute=attribute,
    ):
        matches.append(found)
    return matches

def flatten(self) -> list[tuple[tuple[int, int], str, list[str]]]:
"""Converts the object to a flattened array of tokens per index."""
token_dict = self._token_by_index(defaultdict(list))
Expand Down Expand Up @@ -238,8 +291,8 @@ class ContentBlockElement(ContentElement):

def __init__(
self,
begin: list[Element] | None = None,
end: list[Element] | None = None,
begin: list[Capture | ContentElement] | None = None,
end: list[Capture | ContentElement] | None = None,
**kwargs,
) -> None:
if end is None:
Expand All @@ -249,13 +302,17 @@ def __init__(
super().__init__(**kwargs)
self._begin_pending = begin
self._end_pending = end
self._begin_dispached: list[Element] = []
self._end_dispached: list[Element] = []
self._begin_dispached: list[ContentElement] = []
self._end_dispached: list[ContentElement] = []
self._dispatched_begin: bool = False
self._dispatched_end: bool = False

@property
def begin(self) -> list[Element]:
def _subelements(self) -> list[ContentElement]:
return self.begin + self.children + self.end

@property
def begin(self) -> list[ContentElement]:
"Begin elements"
if self._begin_pending:
if not self._dispatched_begin:
Expand All @@ -266,7 +323,7 @@ def begin(self) -> list[Element]:
return []

@property
def end(self) -> list[Element]:
def end(self) -> list[ContentElement]:
"End elements"
if self._end_pending:
if not self._dispatched_end:
Expand Down
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
@@ -1,21 +1,18 @@
from pathlib import Path
import shutil
import yaml
from pathlib import Path

import yaml

tmLanguageFile = (
Path(__file__).parents[3]
/ "syntaxes"
/ "markdown"
/ "markdown.tmLanguage.base.yaml"
Path(__file__).parents[3] / "syntaxes" / "markdown" / "markdown.tmLanguage.base.yaml"
)
tmLanguageYAML = Path(__file__).parent / "grammar.yaml"


if tmLanguageFile.exists():
shutil.copyfile(tmLanguageFile, tmLanguageYAML)

with open(tmLanguageYAML, "r") as file:
with open(tmLanguageYAML) as file:
try:
GRAMMAR = yaml.load(file.read(), Loader=yaml.CLoader)
except ImportError:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from pathlib import Path
import plistlib
import yaml
from pathlib import Path

import yaml

tmLanguageFile = (
Path(__file__).parents[3]
Expand All @@ -20,7 +20,7 @@
with open(tmLanguageYAML, "w") as f:
f.write(yaml.dump(GRAMMAR, indent=2))
else:
with open(tmLanguageYAML, "r") as file:
with open(tmLanguageYAML) as file:
try:
GRAMMAR = yaml.load(file.read(), Loader=yaml.CLoader)
except ImportError:
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from pathlib import Path

from .elements import Element
from .elements import Capture, ContentElement
from .exceptions import IncompatibleFileType
from .handler import POS, ContentHandler
from .logger import LOGGER
Expand Down Expand Up @@ -80,7 +80,7 @@ def _initialize_repository(self):

super()._initialize_repository()

def parse_file(self, filePath: str | Path, **kwargs) -> Element | None:
def parse_file(self, filePath: str | Path, **kwargs) -> Capture | ContentElement | None:
"""Parses an entire file with the current grammar"""
if type(filePath) != Path:
filePath = Path(filePath)
Expand All @@ -89,6 +89,8 @@ def parse_file(self, filePath: str | Path, **kwargs) -> Element | None:
raise IncompatibleFileType(extensions=self.file_types)

handler = ContentHandler.from_path(filePath)
if handler.source == "":
return None

# Configure logger
LOGGER.configure(self, height=len(handler.lines), width=max(handler.line_lengths))
Expand All @@ -102,15 +104,15 @@ def parse_string(self, input: str, **kwargs):
LOGGER.configure(self, height=len(handler.lines), width=max(handler.line_lengths))
return self._parse_language(handler, **kwargs)

def _parse_language(self, handler: ContentHandler, **kwargs) -> Element | None:
def _parse_language(self, handler: ContentHandler, **kwargs) -> Capture | ContentElement | None:
"""Parses the current stream with the language scope."""

parsed, elements, _ = self.parse(handler, (0, 0), **kwargs)
return elements[0] if parsed else None

def _parse(
self, handler: ContentHandler, starting: POS, **kwargs
) -> tuple[bool, list[Element], tuple[int, int]]:
) -> tuple[bool, list[Capture | ContentElement], tuple[int, int]]:
kwargs.pop("find_one", None)
return super()._parse(handler, starting, find_one=False, **kwargs)

Expand Down
File renamed without changes.
Loading

0 comments on commit 2575a83

Please sign in to comment.