Handle comments with annotations

PiperOrigin-RevId: 679548102
google · Oct 2, 2024 · d7701ec · d7701ec
1 parent d61d957
commit d7701ec
Show file tree

Hide file tree

Showing 11 changed files with 475 additions and 184 deletions.
diff --git a/patches/pyink.patch b/patches/pyink.patch
diff --git a/src/pyink/__init__.py b/src/pyink/__init__.py
@@ -69,7 +69,13 @@
 from pyink.lines import EmptyLineTracker, LinesBlock
 from pyink.mode import FUTURE_FLAG_TO_FEATURE, Feature, VERSION_TO_FEATURES
 from pyink.mode import Mode as Mode  # re-exported
-from pyink.mode import Preview, QuoteStyle, TargetVersion, supports_feature
+from pyink.mode import (
+    DEFAULT_ANNOTATION_PRAGMAS,
+    Preview,
+    QuoteStyle,
+    TargetVersion,
+    supports_feature,
+)
 from pyink.nodes import STARS, is_number_token, is_simple_decorator_expression, syms
 from pyink.output import color_diff, diff, dump_to_file, err, ipynb_diff, out
 from pyink.parsing import (  # noqa F401
@@ -368,6 +374,18 @@ def validate_regex(
         " notebook."
     ),
 )
+@click.option(
+    "--pyink-annotation-pragmas",
+    type=str,
+    multiple=True,
+    help=(
+        "Pyink won't split too long lines if they contain a comment with any of"
+        " the given annotation pragmas because it doesn't know to which part of"
+        " the line the annotation applies. The default annotation pragmas are:"
+        f" {', '.join(DEFAULT_ANNOTATION_PRAGMAS)}."
+    ),
+    default=[],
+)
 @click.option(
     "--pyink-use-majority-quotes",
     is_flag=True,
@@ -580,6 +598,7 @@ def main(  # noqa: C901
     pyink: bool,
     pyink_indentation: str,
     pyink_ipynb_indentation: str,
+    pyink_annotation_pragmas: List[str],
     pyink_use_majority_quotes: bool,
     quiet: bool,
     verbose: bool,
@@ -681,6 +700,9 @@ def main(  # noqa: C901
         is_pyink=pyink,
         pyink_indentation=int(pyink_indentation),
         pyink_ipynb_indentation=int(pyink_ipynb_indentation),
+        pyink_annotation_pragmas=(
+            tuple(pyink_annotation_pragmas) or DEFAULT_ANNOTATION_PRAGMAS
+        ),
         quote_style=(
             QuoteStyle.MAJORITY if pyink_use_majority_quotes else QuoteStyle.DOUBLE
         ),

diff --git a/src/pyink/comments.py b/src/pyink/comments.py
@@ -3,6 +3,7 @@
 from functools import lru_cache
 from typing import Collection, Final, Iterator, List, Optional, Tuple, Union
 
+from pyink import ink_comments
 from pyink.mode import Mode, Preview
 from pyink.nodes import (
     CLOSING_BRACKETS,
@@ -376,15 +377,15 @@ def children_contains_fmt_on(container: LN) -> bool:
     return False
 
 
-def contains_pragma_comment(comment_list: List[Leaf]) -> bool:
+def contains_pragma_comment(comment_list: List[Leaf], mode: Mode) -> bool:
     """
     Returns:
         True iff one of the comments in @comment_list is a pragma used by one
         of the more common static analysis tools for python (e.g. mypy, flake8,
         pylint).
     """
     for comment in comment_list:
-        if comment.value.startswith(("# type:", "# noqa", "# pylint:")):
+        if ink_comments.comment_contains_pragma(comment.value, mode):
             return True
 
     return False

diff --git a/src/pyink/ink_comments.py b/src/pyink/ink_comments.py
@@ -0,0 +1,30 @@
+"""Module that provides utilities related to comments.
+
+This is separate from pyink.ink to avoid circular dependencies.
+"""
+
+import re
+
+from pyink.mode import Mode
+
+
+def comment_contains_pragma(comment: str, mode: Mode) -> bool:
+    """Check if the given string contains one of the pragma forms.
+
+    A pragma form can appear at the beginning of a comment:
+      # pytype: disable=attribute-error
+    or somewhere in the middle:
+      # some comment # type: ignore # another comment
+    or the comments can even be separated by a semicolon:
+      # some comment; noqa: E111; another comment
+
+    Args:
+      comment: The comment to check.
+      mode: The mode that defines which pragma forms to check for.
+
+    Returns:
+      True if the comment contains one of the pragma forms.
+    """
+    joined_pragma_expression = "|".join(mode.pyink_annotation_pragmas)
+    pragma_regex = re.compile(rf"([#|;] ?(?:{joined_pragma_expression}))")
+    return pragma_regex.search(comment) is not None
diff --git a/src/pyink/linegen.py b/src/pyink/linegen.py
@@ -14,6 +14,7 @@
 else:
     from typing import Final, Literal
 
+from pyink import ink_comments
 from pyink.brackets import (
     COMMA_PRIORITY,
     DOT_PRIORITY,
@@ -61,7 +62,6 @@
     is_stub_body,
     is_stub_suite,
     is_tuple_containing_walrus,
-    is_type_ignore_comment_string,
     is_vararg,
     is_walrus_assignment,
     is_yield,
@@ -272,6 +272,7 @@ def visit_dictsetmaker(self, node: Node) -> Iterator[Line]:
                         maybe_make_parens_invisible_in_atom(
                             child,
                             parent=node,
+                            mode=self.mode,
                             remove_brackets_around_comma=False,
                         )
                     else:
@@ -292,6 +293,7 @@ def visit_funcdef(self, node: Node) -> Iterator[Line]:
                     if maybe_make_parens_invisible_in_atom(
                         child,
                         parent=node,
+                        mode=self.mode,
                         remove_brackets_around_comma=False,
                     ):
                         wrap_in_parentheses(node, child, visible=False)
@@ -395,7 +397,7 @@ def visit_power(self, node: Node) -> Iterator[Line]:
             ):
                 wrap_in_parentheses(node, leaf)
 
-        remove_await_parens(node)
+        remove_await_parens(node, mode=self.mode)
 
         yield from self.visit_default(node)
 
@@ -442,7 +444,9 @@ def foo(a: int, b: float = 7): ...
         def foo(a: (int), b: (float) = 7): ...
         """
         assert len(node.children) == 3
-        if maybe_make_parens_invisible_in_atom(node.children[2], parent=node):
+        if maybe_make_parens_invisible_in_atom(
+            node.children[2], parent=node, mode=self.mode
+        ):
             wrap_in_parentheses(node, node.children[2], visible=False)
 
         yield from self.visit_default(node)
@@ -688,7 +692,7 @@ def transform_line(
 
     transformers: List[Transformer]
     if (
-        not line.contains_uncollapsable_type_comments()
+        not line.contains_uncollapsable_pragma_comments()
         and not line.should_split_rhs
         and not line.magic_trailing_comma
         and (
@@ -1451,15 +1455,17 @@ def normalize_invisible_parens(  # noqa: C901
                 if maybe_make_parens_invisible_in_atom(
                     child,
                     parent=node,
+                    mode=mode,
                     remove_brackets_around_comma=True,
                 ):
                     wrap_in_parentheses(node, child, visible=False)
             elif isinstance(child, Node) and node.type == syms.with_stmt:
-                remove_with_parens(child, node)
+                remove_with_parens(child, node, mode=mode)
             elif child.type == syms.atom:
                 if maybe_make_parens_invisible_in_atom(
                     child,
                     parent=node,
+                    mode=mode,
                 ):
                     wrap_in_parentheses(node, child, visible=False)
             elif is_one_tuple(child):
@@ -1511,7 +1517,7 @@ def _normalize_import_from(parent: Node, child: LN, index: int) -> None:
         parent.append_child(Leaf(token.RPAR, ""))
 
 
-def remove_await_parens(node: Node) -> None:
+def remove_await_parens(node: Node, mode: Mode) -> None:
     if node.children[0].type == token.AWAIT and len(node.children) > 1:
         if (
             node.children[1].type == syms.atom
@@ -1520,6 +1526,7 @@ def remove_await_parens(node: Node) -> None:
             if maybe_make_parens_invisible_in_atom(
                 node.children[1],
                 parent=node,
+                mode=mode,
                 remove_brackets_around_comma=True,
             ):
                 wrap_in_parentheses(node, node.children[1], visible=False)
@@ -1588,7 +1595,7 @@ def _maybe_wrap_cms_in_parens(
         node.insert_child(1, new_child)
 
 
-def remove_with_parens(node: Node, parent: Node) -> None:
+def remove_with_parens(node: Node, parent: Node, mode: Mode) -> None:
     """Recursively hide optional parens in `with` statements."""
     # Removing all unnecessary parentheses in with statements in one pass is a tad
     # complex as different variations of bracketed statements result in pretty
@@ -1610,21 +1617,23 @@ def remove_with_parens(node: Node, parent: Node) -> None:
         if maybe_make_parens_invisible_in_atom(
             node,
             parent=parent,
+            mode=mode,
             remove_brackets_around_comma=True,
         ):
             wrap_in_parentheses(parent, node, visible=False)
         if isinstance(node.children[1], Node):
-            remove_with_parens(node.children[1], node)
+            remove_with_parens(node.children[1], node, mode=mode)
     elif node.type == syms.testlist_gexp:
         for child in node.children:
             if isinstance(child, Node):
-                remove_with_parens(child, node)
+                remove_with_parens(child, node, mode=mode)
     elif node.type == syms.asexpr_test and not any(
         leaf.type == token.COLONEQUAL for leaf in node.leaves()
     ):
         if maybe_make_parens_invisible_in_atom(
             node.children[0],
             parent=node,
+            mode=mode,
             remove_brackets_around_comma=True,
         ):
             wrap_in_parentheses(node, node.children[0], visible=False)
@@ -1633,6 +1642,7 @@ def remove_with_parens(node: Node, parent: Node) -> None:
 def maybe_make_parens_invisible_in_atom(
     node: LN,
     parent: LN,
+    mode: Mode,
     remove_brackets_around_comma: bool = False,
 ) -> bool:
     """If it's safe, make the parens in the atom `node` invisible, recursively.
@@ -1682,7 +1692,7 @@ def maybe_make_parens_invisible_in_atom(
         if (
             # If the prefix of `middle` includes a type comment with
             # ignore annotation, then we do not remove the parentheses
-            not is_type_ignore_comment_string(middle.prefix.strip())
+            not ink_comments.comment_contains_pragma(middle.prefix.strip(), mode)
         ):
             first.value = ""
             if first.prefix.strip():
@@ -1692,6 +1702,7 @@ def maybe_make_parens_invisible_in_atom(
         maybe_make_parens_invisible_in_atom(
             middle,
             parent=parent,
+            mode=mode,
             remove_brackets_around_comma=remove_brackets_around_comma,
         )
 
@@ -1835,7 +1846,7 @@ def run_transformer(
         or not line.bracket_tracker.invisible
         or any(bracket.value for bracket in line.bracket_tracker.invisible)
         or line.contains_multiline_strings()
-        or result[0].contains_uncollapsable_type_comments()
+        or result[0].contains_uncollapsable_pragma_comments()
         or result[0].contains_unsplittable_type_ignore()
         or is_line_short_enough(result[0], mode=mode)
         # If any leaves have no parents (which _can_ occur since

diff --git a/src/pyink/lines.py b/src/pyink/lines.py
@@ -1,7 +1,6 @@
 from enum import Enum, auto
 import itertools
 import math
-import re
 from dataclasses import dataclass, field
 from typing import (
     Callable,
@@ -30,7 +29,7 @@
     is_multiline_string,
     is_one_sequence_between,
     is_type_comment,
-    is_type_ignore_comment,
+    is_pragma_comment,
     is_with_or_async_with_stmt,
     make_simple_prefix,
     replace_child,
@@ -47,9 +46,6 @@
 LeafID = int
 LN = Union[Leaf, Node]
 
-# This regex should contain a single capture group capturing the entire match.
-_PRAGMA_REGEX = re.compile("( *# (?:pylint|pytype):)")
-
 
 class Indentation(Enum):
     SCOPE = auto()  # Scope indentation.
@@ -293,7 +289,7 @@ def contains_implicit_multiline_string_with_comments(self) -> bool:
                     return True
         return False
 
-    def contains_uncollapsable_type_comments(self) -> bool:
+    def contains_uncollapsable_pragma_comments(self) -> bool:
         ignored_ids = set()
         try:
             last_leaf = self.leaves[-1]
@@ -318,11 +314,9 @@ def contains_uncollapsable_type_comments(self) -> bool:
         comment_seen = False
         for leaf_id, comments in self.comments.items():
             for comment in comments:
-                if is_type_comment(comment):
-                    if comment_seen or (
-                        not is_type_ignore_comment(comment)
-                        and leaf_id not in ignored_ids
-                    ):
+                is_pragma = is_pragma_comment(comment, self.mode)
+                if is_type_comment(comment) or is_pragma:
+                    if comment_seen or (not is_pragma and leaf_id not in ignored_ids):
                         return True
 
                 comment_seen = True
@@ -357,34 +351,11 @@ def contains_unsplittable_type_ignore(self) -> bool:
             # line.
             for node in self.leaves[-2:]:
                 for comment in self.comments.get(id(node), []):
-                    if is_type_ignore_comment(comment):
+                    if is_pragma_comment(comment, self.mode):
                         return True
 
         return False
 
-    def trailing_pragma_comment_length(self) -> int:
-        if not self.leaves:
-            return 0
-
-        trailing_comments = self.comments.get(id(self.leaves[-1]), [])
-        if (
-            not trailing_comments
-            and len(self.leaves) > 1
-            and self.leaves[-1].type == token.RPAR
-            and not self.leaves[-1].value
-        ):
-            # When last leaf is an invisible paren, the trailing comment is
-            # attached to the leaf before.
-            trailing_comments = self.comments.get(id(self.leaves[-2]), [])
-        length = 0
-        for comment in trailing_comments:
-            # str(comment) contains the whitespace preceding the `#`
-            comment_str = str(comment)
-            parts = _PRAGMA_REGEX.split(comment_str, maxsplit=1)
-            if len(parts) == 3:
-                length += len(parts[1]) + len(parts[2])
-        return length
-
     def contains_multiline_strings(self) -> bool:
         return any(is_multiline_string(leaf) for leaf in self.leaves)
 
@@ -730,7 +701,7 @@ def _maybe_empty_lines(self, current_line: Line) -> Tuple[int, int]:  # noqa: C9
                 or self.previous_line.is_fmt_pass_converted(
                     first_leaf_matches=is_import
                 )
-             )
+            )
             and not current_line.is_import
             and not (
                 # Should not add empty lines before a STANDALONE_COMMENT.
@@ -878,14 +849,9 @@ def is_line_short_enough(  # noqa: C901
     if not line_str:
         line_str = line_to_string(line)
 
-    if line.mode.is_pyink:
-        effective_length = str_width(line_str) - line.trailing_pragma_comment_length()
-    else:
-        effective_length = str_width(line_str)
-
     if Preview.multiline_string_handling not in mode:
         return (
-            effective_length <= mode.line_length
+            str_width(line_str) <= mode.line_length
             and "\n" not in line_str  # multiline strings
             and not line.contains_standalone_comments()
         )
@@ -894,7 +860,7 @@ def is_line_short_enough(  # noqa: C901
         return False
     if "\n" not in line_str:
         # No multiline strings (MLS) present
-        return effective_length <= mode.line_length
+        return str_width(line_str) <= mode.line_length
 
     first, *_, last = line_str.split("\n")
     if str_width(first) > mode.line_length or str_width(last) > mode.line_length: