From fca5bdcedb5574f143127eee0bf7976208c2989d Mon Sep 17 00:00:00 2001 From: Andrew Paseltiner Date: Sat, 8 Jun 2024 09:16:17 -0400 Subject: [PATCH] Avoid keeping all files in memory for find-unusual-characters Each file can be processed independently, so processing them one at a time reduces total memory consumption. --- se/commands/find_unusual_characters.py | 60 ++++++++++++-------------- 1 file changed, 28 insertions(+), 32 deletions(-) diff --git a/se/commands/find_unusual_characters.py b/se/commands/find_unusual_characters.py index fe8a049d..aeb1f669 100644 --- a/se/commands/find_unusual_characters.py +++ b/se/commands/find_unusual_characters.py @@ -26,32 +26,6 @@ def find_unusual_characters(plain_output: bool) -> int: return_code = 0 unusual_characters: Dict[str, int] = {} # key: word; value: count target_filenames = se.get_target_filenames(args.targets, ".xhtml") - files_xhtml = [] - - # Read files and cache for later - for filename in target_filenames: - try: - with open(filename, "r", encoding="utf-8") as file: - xhtml = file.read() - dom = se.easy_xml.EasyXmlTree(xhtml) - - # Save any `alt` and `title` attributes because we may be interested in their contents - for node in dom.xpath("//*[@alt or @title]"): - for _, value in node.attrs.items(): - xhtml = xhtml + f" {value} " - - # Strip tags - xhtml = regex.sub(r"<[^>]+?>", " ", xhtml) - - files_xhtml.append(xhtml) - - except FileNotFoundError: - se.print_error(f"Couldn’t open file: [path][link=file://{filename}]{filename}[/][/].", plain_output=plain_output) - return_code = se.InvalidInputException.code - - except se.SeException as ex: - se.print_error(str(ex) + f" File: [path][link=file://{filename}]{filename}[/][/].", plain_output=plain_output) - return_code = ex.code # Create a regex for unusual characters. # The result is a series of Unicode ranges that cover the characters @@ -118,12 +92,34 @@ def find_unusual_characters(plain_output: bool) -> int: unusual_character_set += "\u2e3c-\ufefe" unusual_character_set += "]" - for xhtml in files_xhtml: - for character in regex.findall(unusual_character_set, xhtml): - if character in unusual_characters: - unusual_characters[character] = unusual_characters[character] + len(character) - else: - unusual_characters[character] = len(character) + # Read files and process one at a time + for filename in target_filenames: + try: + with open(filename, "r", encoding="utf-8") as file: + xhtml = file.read() + dom = se.easy_xml.EasyXmlTree(xhtml) + + # Save any `alt` and `title` attributes because we may be interested in their contents + for node in dom.xpath("//*[@alt or @title]"): + for _, value in node.attrs.items(): + xhtml = xhtml + f" {value} " + + # Strip tags + xhtml = regex.sub(r"<[^>]+?>", " ", xhtml) + + for character in regex.findall(unusual_character_set, xhtml): + if character in unusual_characters: + unusual_characters[character] = unusual_characters[character] + len(character) + else: + unusual_characters[character] = len(character) + + except FileNotFoundError: + se.print_error(f"Couldn’t open file: [path][link=file://{filename}]{filename}[/][/].", plain_output=plain_output) + return_code = se.InvalidInputException.code + + except se.SeException as ex: + se.print_error(str(ex) + f" File: [path][link=file://{filename}]{filename}[/][/].", plain_output=plain_output) + return_code = ex.code # Sort and prepare the output lines = []