diff --git a/se/commands/find_unusual_characters.py b/se/commands/find_unusual_characters.py index fe8a049d..aeb1f669 100644 --- a/se/commands/find_unusual_characters.py +++ b/se/commands/find_unusual_characters.py @@ -26,32 +26,6 @@ def find_unusual_characters(plain_output: bool) -> int: return_code = 0 unusual_characters: Dict[str, int] = {} # key: word; value: count target_filenames = se.get_target_filenames(args.targets, ".xhtml") - files_xhtml = [] - - # Read files and cache for later - for filename in target_filenames: - try: - with open(filename, "r", encoding="utf-8") as file: - xhtml = file.read() - dom = se.easy_xml.EasyXmlTree(xhtml) - - # Save any `alt` and `title` attributes because we may be interested in their contents - for node in dom.xpath("//*[@alt or @title]"): - for _, value in node.attrs.items(): - xhtml = xhtml + f" {value} " - - # Strip tags - xhtml = regex.sub(r"<[^>]+?>", " ", xhtml) - - files_xhtml.append(xhtml) - - except FileNotFoundError: - se.print_error(f"Couldn’t open file: [path][link=file://{filename}]{filename}[/][/].", plain_output=plain_output) - return_code = se.InvalidInputException.code - - except se.SeException as ex: - se.print_error(str(ex) + f" File: [path][link=file://{filename}]{filename}[/][/].", plain_output=plain_output) - return_code = ex.code # Create a regex for unusual characters. # The result is a series of Unicode ranges that cover the characters @@ -118,12 +92,34 @@ def find_unusual_characters(plain_output: bool) -> int: unusual_character_set += "\u2e3c-\ufefe" unusual_character_set += "]" - for xhtml in files_xhtml: - for character in regex.findall(unusual_character_set, xhtml): - if character in unusual_characters: - unusual_characters[character] = unusual_characters[character] + len(character) - else: - unusual_characters[character] = len(character) + # Read files and process one at a time + for filename in target_filenames: + try: + with open(filename, "r", encoding="utf-8") as file: + xhtml = file.read() + dom = se.easy_xml.EasyXmlTree(xhtml) + + # Save any `alt` and `title` attributes because we may be interested in their contents + for node in dom.xpath("//*[@alt or @title]"): + for _, value in node.attrs.items(): + xhtml = xhtml + f" {value} " + + # Strip tags + xhtml = regex.sub(r"<[^>]+?>", " ", xhtml) + + for character in regex.findall(unusual_character_set, xhtml): + if character in unusual_characters: + unusual_characters[character] = unusual_characters[character] + len(character) + else: + unusual_characters[character] = len(character) + + except FileNotFoundError: + se.print_error(f"Couldn’t open file: [path][link=file://{filename}]{filename}[/][/].", plain_output=plain_output) + return_code = se.InvalidInputException.code + + except se.SeException as ex: + se.print_error(str(ex) + f" File: [path][link=file://{filename}]{filename}[/][/].", plain_output=plain_output) + return_code = ex.code # Sort and prepare the output lines = []