diff --git a/se/se_epub_lint.py b/se/se_epub_lint.py index d380de70..867d403c 100644 --- a/se/se_epub_lint.py +++ b/se/se_epub_lint.py @@ -1293,7 +1293,7 @@ def _lint_svg_checks(filename: str, file_contents: str, svg_dom: se.easy_xml.Eas return messages -def _lint_special_file_checks(filename: str, dom: se.easy_xml.EasyXmlTree, file_contents: str, ebook_info: dict, special_file: str, self) -> list: +def _lint_special_file_checks(filename: str, dom: se.easy_xml.EasyXmlTree, file_contents: str, ebook_flags: dict, special_file: str, self) -> list: """ Process error checks in “special” .xhtml files @@ -1301,7 +1301,7 @@ def _lint_special_file_checks(filename: str, dom: se.easy_xml.EasyXmlTree, file_ filename: The name of the file being checked dom: The dom of the file being checked file_contents: The contents of the file being checked - ebook_info: A dictionary containing ebook information + ebook_flags: A dictionary containing ebook information special_file: A string identifying the type of special file being checked self @@ -1354,14 +1354,14 @@ def _lint_special_file_checks(filename: str, dom: se.easy_xml.EasyXmlTree, file_ if self.metadata_dom.xpath("/package/metadata/meta[@property='role' and text()='trl']") and "translated from" not in file_contents: messages.append(LintMessage("m-025", "Translator found in metadata, but no [text]translated from LANG[/] block in colophon.", se.MESSAGE_TYPE_ERROR, filename)) - if ebook_info["has_multiple_transcriptions"] and not dom.xpath("/html/body//a[contains(@href, '#transcriptions')]"): + if ebook_flags["has_multiple_transcriptions"] and not dom.xpath("/html/body//a[contains(@href, '#transcriptions')]"): messages.append(LintMessage("m-074", "Multiple transcriptions found in metadata, but no link to [text]EBOOK_URL#transcriptions[/].", se.MESSAGE_TYPE_ERROR, filename)) - if ebook_info["has_multiple_page_scans"] and not dom.xpath("/html/body//a[contains(@href, '#page-scans')]"): + if ebook_flags["has_multiple_page_scans"] and not dom.xpath("/html/body//a[contains(@href, '#page-scans')]"): messages.append(LintMessage("m-075", "Multiple page scans found in metadata, but no link to [text]EBOOK_URL#page-scans[/].", se.MESSAGE_TYPE_ERROR, filename)) # Check that the formula changed from the default if we added 'various sources' - if ebook_info["has_multiple_transcriptions"] or ebook_info["has_multiple_page_scans"]: + if ebook_flags["has_multiple_transcriptions"] or ebook_flags["has_multiple_page_scans"]: nodes = dom.xpath("/html/body//a[text() = 'various sources' and not(re:test(preceding-sibling::br[1]/preceding-sibling::node()[1], '(digital scans|transcriptions) from\\s*$'))]") if nodes: messages.append(LintMessage("t-072", "[text]various sources[/] link not preceded by [text]from[/].", se.MESSAGE_TYPE_ERROR, filename)) @@ -1407,12 +1407,12 @@ def _lint_special_file_checks(filename: str, dom: se.easy_xml.EasyXmlTree, file_ # Are the sources represented correctly? # We don't have a standard yet for more than two sources (transcription and scan) so just ignore that case for now. # We can't merge this with the imprint check because imprint doesn't have `
` between `the` - if not ebook_info["has_multiple_transcriptions"] and not ebook_info["has_other_sources"]: + if not ebook_flags["has_multiple_transcriptions"] and not ebook_flags["has_other_sources"]: for link in source_links: if "gutenberg.org" in link and f"Project Gutenberg" not in file_contents: messages.append(LintMessage("m-037", f"Transcription/page scan source link not found. Expected: [xhtml]Project Gutenberg[/].", se.MESSAGE_TYPE_ERROR, filename)) - if not ebook_info["has_multiple_page_scans"] and not ebook_info["has_other_sources"]: + if not ebook_flags["has_multiple_page_scans"] and not ebook_flags["has_other_sources"]: for link in source_links: if "hathitrust.org" in link and f"the
\n\t\t\tHathiTrust Digital Library" not in file_contents: messages.append(LintMessage("m-037", f"Transcription/page scan source link not found. Expected: [xhtml]the
HathiTrust Digital Library[/].", se.MESSAGE_TYPE_ERROR, filename)) @@ -1436,25 +1436,25 @@ def _lint_special_file_checks(filename: str, dom: se.easy_xml.EasyXmlTree, file_ if missing_imprint_vars: messages.append(LintMessage("m-036", "Variable not replaced with value.", se.MESSAGE_TYPE_ERROR, filename, missing_imprint_vars)) - if ebook_info["has_multiple_transcriptions"] and not dom.xpath("/html/body//a[contains(@href, '#transcriptions')]"): + if ebook_flags["has_multiple_transcriptions"] and not dom.xpath("/html/body//a[contains(@href, '#transcriptions')]"): messages.append(LintMessage("m-074", "Multiple transcriptions found in metadata, but no link to [text]EBOOK_URL#transcriptions[/].", se.MESSAGE_TYPE_ERROR, filename)) - if ebook_info["has_multiple_page_scans"] and not dom.xpath("/html/body//a[contains(@href, '#page-scans')]"): + if ebook_flags["has_multiple_page_scans"] and not dom.xpath("/html/body//a[contains(@href, '#page-scans')]"): messages.append(LintMessage("m-075", "Multiple page scans found in metadata, but no link to [text]EBOOK_URL#page-scans[/].", se.MESSAGE_TYPE_ERROR, filename)) # Check that the formula changed from the default if we added 'various sources' - if ebook_info["has_multiple_transcriptions"] or ebook_info["has_multiple_page_scans"]: + if ebook_flags["has_multiple_transcriptions"] or ebook_flags["has_multiple_page_scans"]: nodes = dom.xpath("/html/body//a[text() = 'various sources' and not(re:test(preceding-sibling::node()[1], '(digital scans|transcriptions) from\\s*$'))]") if nodes: messages.append(LintMessage("t-072", "[text]various sources[/] link not preceded by [text]from[/].", se.MESSAGE_TYPE_ERROR, filename)) # Check for correctly named links. We can't merge this with the colophon check because the colophon breaks `the` with `
` - if not ebook_info["has_multiple_transcriptions"] and not ebook_info["has_other_sources"]: + if not ebook_flags["has_multiple_transcriptions"] and not ebook_flags["has_other_sources"]: for link in source_links: if "gutenberg.org" in link and f"Project Gutenberg" not in file_contents: messages.append(LintMessage("m-037", f"Transcription/page scan source link not found. Expected: [xhtml]Project Gutenberg[/].", se.MESSAGE_TYPE_ERROR, filename)) - if not ebook_info["has_multiple_page_scans"] and not ebook_info["has_other_sources"]: + if not ebook_flags["has_multiple_page_scans"] and not ebook_flags["has_other_sources"]: for link in source_links: if "hathitrust.org" in link and f"the HathiTrust Digital Library" not in file_contents: messages.append(LintMessage("m-037", f"Transcription/page scan source link not found. Expected: the [xhtml]HathiTrust Digital Library[/].", se.MESSAGE_TYPE_ERROR, filename)) @@ -1667,7 +1667,7 @@ def _lint_xhtml_metadata_checks(filename: str, dom: se.easy_xml.EasyXmlTree) -> return messages -def _lint_xhtml_syntax_checks(filename: str, dom: se.easy_xml.EasyXmlTree, self, file_contents: str, ebook_info: dict, language: str) -> list: +def _lint_xhtml_syntax_checks(filename: str, dom: se.easy_xml.EasyXmlTree, self, file_contents: str, ebook_flags: dict, language: str) -> list: """ Helper function used in self.lint() Process syntax checks on an .xhtml file @@ -1676,7 +1676,7 @@ def _lint_xhtml_syntax_checks(filename: str, dom: se.easy_xml.EasyXmlTree, self, filename: The name of the file being checked dom: A dom tree to check file_contents: The contents of the file being checked - ebook_info: A dictionary containing several pieces of information about an ebook + ebook_flags: A dictionary containing several pieces of information about an ebook OUTPUTS A list of LintMessages representing syntax errors found in the file @@ -2181,7 +2181,7 @@ def _lint_xhtml_syntax_checks(filename: str, dom: se.easy_xml.EasyXmlTree, self, messages.append(LintMessage("s-086", "[text]Op. Cit.[/] or [text]Loc. Cit.[/] in endnote. Hint: [text]Op. Cit.[/] and [text]Loc. Cit.[/] mean [text]the previous reference[/], which usually doesn’t make sense in a popup endnote. Such references should be expanded.", se.MESSAGE_TYPE_WARNING, filename, [node.to_tag_string() for node in nodes])) # Check for half title pages missing subtitles - if ebook_info["has_subtitle"]: + if ebook_flags["has_subtitle"]: # Make sure we exclude because that appears in the ToC landmarks nodes = dom.xpath("/html/body//*[name()!='a' and contains(@epub:type, 'halftitlepage') and not(.//*[contains(@epub:type, 'subtitle')])]") if nodes: @@ -2252,7 +2252,7 @@ def _lint_xhtml_syntax_checks(filename: str, dom: se.easy_xml.EasyXmlTree, self, return messages -def _lint_xhtml_typography_checks(filename: str, dom: se.easy_xml.EasyXmlTree, file_contents: str, special_file: str) -> list: +def _lint_xhtml_typography_checks(filename: str, dom: se.easy_xml.EasyXmlTree, file_contents: str, special_file: str, ebook_flags: dict) -> list: """ Helper function used in self.lint() Process typography checks on an .xhtml file @@ -2267,7 +2267,6 @@ def _lint_xhtml_typography_checks(filename: str, dom: se.easy_xml.EasyXmlTree, f A list of LintMessages representing typography errors found in the file """ - has_images = False; messages = []; # Check for punctuation outside quotes. We don't check single quotes because contractions are too common. @@ -2447,7 +2446,7 @@ def _lint_xhtml_typography_checks(filename: str, dom: se.easy_xml.EasyXmlTree, f img_alt_lacking_punctuation = [] for node in nodes: if "titlepage.svg" not in node.get_attr("src"): - has_images = True # Save for a later check + ebook_flags["has_images"] = True # Save for a later check alt = node.get_attr("alt") @@ -2735,7 +2734,7 @@ def _lint_xhtml_typography_checks(filename: str, dom: se.easy_xml.EasyXmlTree, f if node_text != expected_text: messages.append(LintMessage("t-073", f"Possible transcription error in Greek. Found: [text]{node_text}[/], but expected [text]{expected_text}[/text]. Hint: Use [bash]se unicode-names[/] to see differences in Unicode characters.", se.MESSAGE_TYPE_WARNING, filename)) - return (messages, has_images) + return (messages) def _lint_xhtml_xhtml_checks(filename: str, dom: se.easy_xml.EasyXmlTree, file_contents: str) -> list: """ @@ -3151,7 +3150,8 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N Check this ebook for some common SE style errors. INPUTS - None + self + skip_lint_ignore: Flag indicating whether ignore file should be used OUTPUTS A list of LintMessage objects. @@ -3159,10 +3159,6 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N local_css_path = self.content_path / "css/local.css" messages: List[LintMessage] = [] - is_titlepage = False - has_halftitle = False - has_frontmatter = False - has_cover_source = False cover_svg_title = "" titlepage_svg_title = "" xhtml_css_classes: Dict[str, int] = {} @@ -3171,9 +3167,32 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N unused_selectors: List[str] = [] id_attrs: List[str] = [] abbr_elements_requiring_css: List[se.easy_xml.EasyXmlElement] = [] - has_glossary_search_key_map = False glossary_usage = [] - has_images = False + short_story_count = 0 + missing_styles: List[str] = [] + directories_not_url_safe = [] + files_not_url_safe = [] + id_values = {} + duplicate_id_values = [] + local_css = { + "has_poem_stye": False, + "has_verse_style": False, + "has_song_style": False, + "has_hymn_style": False, + "has_lyrics_style": False, + "has_elision_style": False + } + ebook_flags = { + "has_cover_source": False, + "has_frontmatter": False, + "has_glossary_search_key_map": False, + "has_halftitle": False, + "has_subtitle": bool(self.metadata_dom.xpath("/package/metadata/meta[@property='title-type' and text()='subtitle']")), + "has_images": False, + "has_multiple_transcriptions": False, + "has_multiple_page_scans": False, + "has_other_sources": False + } # Cache the browser default stylesheet for later use with importlib_resources.open_text("se.data", "browser.css", encoding="utf-8") as css: @@ -3213,27 +3232,6 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N local_css_selectors = [regex.sub(r"::[\p{Lowercase_Letter}\-]+", "", selector) for selector in local_css_rules] unused_selectors = local_css_selectors.copy() - local_css = { - "has_poem_stye": False, - "has_verse_style": False, - "has_song_style": False, - "has_hymn_style": False, - "has_lyrics_style": False, - "has_elision_style": False - } - ebook_info = { - "has_subtitle": bool(self.metadata_dom.xpath("/package/metadata/meta[@property='title-type' and text()='subtitle']")), - "has_multiple_transcriptions": False, - "has_multiple_page_scans": False, - "has_other_sources": False - } - short_story_count = 0 - missing_styles: List[str] = [] - directories_not_url_safe = [] - files_not_url_safe = [] - id_values = {} - duplicate_id_values = [] - (css_messages, local_css) = _lint_css_checks(self, local_css, local_css_path, local_css_rules) if css_messages: messages = messages + css_messages @@ -3278,9 +3276,9 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N else: other_source_count = other_source_count + 1 - ebook_info["has_multiple_transcriptions"] = transcription_source_count >= 2 - ebook_info["has_multiple_page_scans"] = page_scan_source_count >= 2 - ebook_info["has_other_sources"] = other_source_count > 0 + ebook_flags["has_multiple_transcriptions"] = transcription_source_count >= 2 + ebook_flags["has_multiple_page_scans"] = page_scan_source_count >= 2 + ebook_flags["has_other_sources"] = other_source_count > 0 messages = messages + _lint_metadata_checks(self) @@ -3343,7 +3341,7 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N if filename.stem != "LICENSE": if filename.stem == "cover.source": - has_cover_source = True + ebook_flags["has_cover_source"] = True else: url_safe_filename = se.formatting.make_url_safe(filename.stem) + filename.suffix if filename.name != url_safe_filename: @@ -3394,7 +3392,7 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N # Make sure that everything in glossaries are in the rest of the text # We’ll check the files later, and log any errors at the end if filename.name == "glossary-search-key-map.xml": - has_glossary_search_key_map = True + ebook_flags["has_glossary_search_key_map"] = True # Map the glossary to tuples of the values and whether they’re used (initially false) glossary_usage = list(map(lambda node: (node.get_attr("value"), False), xml_dom.xpath(".//*[@value]"))) @@ -3439,12 +3437,12 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N # Check if this is a frontmatter file, but exclude the titlepage, imprint, and toc if dom.xpath("/html//*[contains(@epub:type, 'frontmatter') and not(descendant-or-self::*[re:test(@epub:type, '\\b(titlepage|imprint|toc)\\b')])]"): - has_frontmatter = True + ebook_flags["has_frontmatter"] = True # Do we have a half title? # Sometimes the half title might not be a section, like in Cane by Jean Toomer if dom.xpath("/html/body//*[contains(@epub:type, 'halftitlepage')]"): - has_halftitle = True + ebook_flags["has_halftitle"] = True # Add new CSS classes to global list if filename.name not in IGNORED_FILENAMES: @@ -3484,7 +3482,7 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N abbr_elements_requiring_css += dom.xpath("/html/body//abbr[re:test(@epub:type, '\\b(se:temperature|se:era|z3998:acronym)\\b')]") # Check and log missing glossary keys - if has_glossary_search_key_map and filename.name not in IGNORED_FILENAMES: + if ebook_flags["has_glossary_search_key_map"] and filename.name not in IGNORED_FILENAMES: source_text = dom.xpath("/html/body")[0].inner_text() if dom.xpath("/html/body//section[contains(@epub:type, 'glossary')]"): nodes = dom.xpath("/html/body//dd[contains(@epub:type, 'glossdef')]") @@ -3513,7 +3511,7 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N special_file = None if special_file in SPECIAL_FILES: - messages = messages + _lint_special_file_checks(filename, dom, file_contents, ebook_info, special_file, self) + messages = messages + _lint_special_file_checks(filename, dom, file_contents, ebook_flags, special_file, self) missing_styles = missing_styles + _update_missing_styles(filename, dom, local_css) @@ -3521,9 +3519,9 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N messages = messages + _lint_xhtml_metadata_checks(filename, dom) - messages = messages + _lint_xhtml_syntax_checks(filename, dom, self, file_contents, ebook_info, language) + messages = messages + _lint_xhtml_syntax_checks(filename, dom, self, file_contents, ebook_flags, language) - (typography_messages, has_images) = _lint_xhtml_typography_checks(filename, dom, file_contents, special_file) + (typography_messages) = _lint_xhtml_typography_checks(filename, dom, file_contents, special_file, ebook_flags) if typography_messages: messages = messages + typography_messages @@ -3534,10 +3532,10 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N if self.cover_path and cover_svg_title != titlepage_svg_title: messages.append(LintMessage("s-028", f"[path][link=file://{self.cover_path}]{self.cover_path.name}[/][/] and [path][link=file://{self.path / 'images/titlepage.svg'}]titlepage.svg[/][/] [xhtml][/] elements don’t match.", se.MESSAGE_TYPE_ERROR, self.cover_path)) - if has_frontmatter and not has_halftitle: + if ebook_flags["has_frontmatter"] and not ebook_flags["has_halftitle"]: messages.append(LintMessage("s-020", "Frontmatter found, but no half title page. Half title page is required when frontmatter is present.", se.MESSAGE_TYPE_ERROR, self.metadata_file_path)) - if self.is_se_ebook and not has_cover_source: + if self.is_se_ebook and not ebook_flags["has_cover_source"]: missing_files.append("images/cover.source.jpg") missing_selectors = [] @@ -3659,7 +3657,7 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N if f"[epub|type~=\"{value}\"]" not in self.local_css: missing_styles.append(element.to_tag_string()) - messages = messages + _lint_image_metadata_checks(self, has_images) + messages = messages + _lint_image_metadata_checks(self, ebook_flags["has_images"]) if missing_styles: messages.append(LintMessage("c-006", f"Semantic found, but missing corresponding style in [path][link=file://{local_css_path}]local.css[/][/].", se.MESSAGE_TYPE_ERROR, local_css_path, set(missing_styles))) @@ -3676,7 +3674,7 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N if short_story_count and not self.metadata_dom.xpath("//meta[@property='se:subject' and text() = 'Shorts']"): messages.append(LintMessage("m-027", "[val]se:short-story[/] semantic inflection found, but no [val]se:subject[/] with the value of [text]Shorts[/].", se.MESSAGE_TYPE_ERROR, self.metadata_file_path)) - if has_glossary_search_key_map: + if ebook_flags["has_glossary_search_key_map"]: entries = [] for glossary_value in glossary_usage: if glossary_value[1] is False: