Skip to content

Commit

Permalink
Rename ebook_info to ebook_flags, consolidate several other flags into it
Browse files Browse the repository at this point in the history
  • Loading branch information
Vince committed Sep 5, 2023
1 parent 4896285 commit c0091af
Showing 1 changed file with 60 additions and 62 deletions.
122 changes: 60 additions & 62 deletions se/se_epub_lint.py
Original file line number Diff line number Diff line change
Expand Up @@ -1293,15 +1293,15 @@ def _lint_svg_checks(filename: str, file_contents: str, svg_dom: se.easy_xml.Eas

return messages

def _lint_special_file_checks(filename: str, dom: se.easy_xml.EasyXmlTree, file_contents: str, ebook_info: dict, special_file: str, self) -> list:
def _lint_special_file_checks(filename: str, dom: se.easy_xml.EasyXmlTree, file_contents: str, ebook_flags: dict, special_file: str, self) -> list:
"""
Process error checks in “special” .xhtml files
INPUTS
filename: The name of the file being checked
dom: The dom of the file being checked
file_contents: The contents of the file being checked
ebook_info: A dictionary containing ebook information
ebook_flags: A dictionary containing ebook information
special_file: A string identifying the type of special file being checked
self
Expand Down Expand Up @@ -1354,14 +1354,14 @@ def _lint_special_file_checks(filename: str, dom: se.easy_xml.EasyXmlTree, file_
if self.metadata_dom.xpath("/package/metadata/meta[@property='role' and text()='trl']") and "translated from" not in file_contents:
messages.append(LintMessage("m-025", "Translator found in metadata, but no [text]translated from LANG[/] block in colophon.", se.MESSAGE_TYPE_ERROR, filename))

if ebook_info["has_multiple_transcriptions"] and not dom.xpath("/html/body//a[contains(@href, '#transcriptions')]"):
if ebook_flags["has_multiple_transcriptions"] and not dom.xpath("/html/body//a[contains(@href, '#transcriptions')]"):
messages.append(LintMessage("m-074", "Multiple transcriptions found in metadata, but no link to [text]EBOOK_URL#transcriptions[/].", se.MESSAGE_TYPE_ERROR, filename))

if ebook_info["has_multiple_page_scans"] and not dom.xpath("/html/body//a[contains(@href, '#page-scans')]"):
if ebook_flags["has_multiple_page_scans"] and not dom.xpath("/html/body//a[contains(@href, '#page-scans')]"):
messages.append(LintMessage("m-075", "Multiple page scans found in metadata, but no link to [text]EBOOK_URL#page-scans[/].", se.MESSAGE_TYPE_ERROR, filename))

# Check that the formula changed from the default if we added 'various sources'
if ebook_info["has_multiple_transcriptions"] or ebook_info["has_multiple_page_scans"]:
if ebook_flags["has_multiple_transcriptions"] or ebook_flags["has_multiple_page_scans"]:
nodes = dom.xpath("/html/body//a[text() = 'various sources' and not(re:test(preceding-sibling::br[1]/preceding-sibling::node()[1], '(digital scans|transcriptions) from\\s*$'))]")
if nodes:
messages.append(LintMessage("t-072", "[text]various sources[/] link not preceded by [text]from[/].", se.MESSAGE_TYPE_ERROR, filename))
Expand Down Expand Up @@ -1407,12 +1407,12 @@ def _lint_special_file_checks(filename: str, dom: se.easy_xml.EasyXmlTree, file_
# Are the sources represented correctly?
# We don't have a standard yet for more than two sources (transcription and scan) so just ignore that case for now.
# We can't merge this with the imprint check because the imprint doesn't have `<br/>` between `the` and the source link
if not ebook_info["has_multiple_transcriptions"] and not ebook_info["has_other_sources"]:
if not ebook_flags["has_multiple_transcriptions"] and not ebook_flags["has_other_sources"]:
for link in source_links:
if "gutenberg.org" in link and f"<a href=\"{link}\">Project Gutenberg</a>" not in file_contents:
messages.append(LintMessage("m-037", f"Transcription/page scan source link not found. Expected: [xhtml]<a href=\"{link}\">Project Gutenberg</a>[/].", se.MESSAGE_TYPE_ERROR, filename))

if not ebook_info["has_multiple_page_scans"] and not ebook_info["has_other_sources"]:
if not ebook_flags["has_multiple_page_scans"] and not ebook_flags["has_other_sources"]:
for link in source_links:
if "hathitrust.org" in link and f"the<br/>\n\t\t\t<a href=\"{link}\">HathiTrust Digital Library</a>" not in file_contents:
messages.append(LintMessage("m-037", f"Transcription/page scan source link not found. Expected: [xhtml]the<br/> <a href=\"{link}\">HathiTrust Digital Library</a>[/].", se.MESSAGE_TYPE_ERROR, filename))
Expand All @@ -1436,25 +1436,25 @@ def _lint_special_file_checks(filename: str, dom: se.easy_xml.EasyXmlTree, file_
if missing_imprint_vars:
messages.append(LintMessage("m-036", "Variable not replaced with value.", se.MESSAGE_TYPE_ERROR, filename, missing_imprint_vars))

if ebook_info["has_multiple_transcriptions"] and not dom.xpath("/html/body//a[contains(@href, '#transcriptions')]"):
if ebook_flags["has_multiple_transcriptions"] and not dom.xpath("/html/body//a[contains(@href, '#transcriptions')]"):
messages.append(LintMessage("m-074", "Multiple transcriptions found in metadata, but no link to [text]EBOOK_URL#transcriptions[/].", se.MESSAGE_TYPE_ERROR, filename))

if ebook_info["has_multiple_page_scans"] and not dom.xpath("/html/body//a[contains(@href, '#page-scans')]"):
if ebook_flags["has_multiple_page_scans"] and not dom.xpath("/html/body//a[contains(@href, '#page-scans')]"):
messages.append(LintMessage("m-075", "Multiple page scans found in metadata, but no link to [text]EBOOK_URL#page-scans[/].", se.MESSAGE_TYPE_ERROR, filename))

# Check that the formula changed from the default if we added 'various sources'
if ebook_info["has_multiple_transcriptions"] or ebook_info["has_multiple_page_scans"]:
if ebook_flags["has_multiple_transcriptions"] or ebook_flags["has_multiple_page_scans"]:
nodes = dom.xpath("/html/body//a[text() = 'various sources' and not(re:test(preceding-sibling::node()[1], '(digital scans|transcriptions) from\\s*$'))]")
if nodes:
messages.append(LintMessage("t-072", "[text]various sources[/] link not preceded by [text]from[/].", se.MESSAGE_TYPE_ERROR, filename))

# Check for correctly named links. We can't merge this with the colophon check because the colophon breaks `the` with `<br/>`
if not ebook_info["has_multiple_transcriptions"] and not ebook_info["has_other_sources"]:
if not ebook_flags["has_multiple_transcriptions"] and not ebook_flags["has_other_sources"]:
for link in source_links:
if "gutenberg.org" in link and f"<a href=\"{link}\">Project Gutenberg</a>" not in file_contents:
messages.append(LintMessage("m-037", f"Transcription/page scan source link not found. Expected: [xhtml]<a href=\"{link}\">Project Gutenberg</a>[/].", se.MESSAGE_TYPE_ERROR, filename))

if not ebook_info["has_multiple_page_scans"] and not ebook_info["has_other_sources"]:
if not ebook_flags["has_multiple_page_scans"] and not ebook_flags["has_other_sources"]:
for link in source_links:
if "hathitrust.org" in link and f"the <a href=\"{link}\">HathiTrust Digital Library</a>" not in file_contents:
messages.append(LintMessage("m-037", f"Transcription/page scan source link not found. Expected: the [xhtml]<a href=\"{link}\">HathiTrust Digital Library</a>[/].", se.MESSAGE_TYPE_ERROR, filename))
Expand Down Expand Up @@ -1667,7 +1667,7 @@ def _lint_xhtml_metadata_checks(filename: str, dom: se.easy_xml.EasyXmlTree) ->

return messages

def _lint_xhtml_syntax_checks(filename: str, dom: se.easy_xml.EasyXmlTree, self, file_contents: str, ebook_info: dict, language: str) -> list:
def _lint_xhtml_syntax_checks(filename: str, dom: se.easy_xml.EasyXmlTree, self, file_contents: str, ebook_flags: dict, language: str) -> list:
"""
Helper function used in self.lint()
Process syntax checks on an .xhtml file
Expand All @@ -1676,7 +1676,7 @@ def _lint_xhtml_syntax_checks(filename: str, dom: se.easy_xml.EasyXmlTree, self,
filename: The name of the file being checked
dom: A dom tree to check
file_contents: The contents of the file being checked
ebook_info: A dictionary containing several pieces of information about an ebook
ebook_flags: A dictionary containing several pieces of information about an ebook
OUTPUTS
A list of LintMessages representing syntax errors found in the file
Expand Down Expand Up @@ -2181,7 +2181,7 @@ def _lint_xhtml_syntax_checks(filename: str, dom: se.easy_xml.EasyXmlTree, self,
messages.append(LintMessage("s-086", "[text]Op. Cit.[/] or [text]Loc. Cit.[/] in endnote. Hint: [text]Op. Cit.[/] and [text]Loc. Cit.[/] mean [text]the previous reference[/], which usually doesn’t make sense in a popup endnote. Such references should be expanded.", se.MESSAGE_TYPE_WARNING, filename, [node.to_tag_string() for node in nodes]))

# Check for half title pages missing subtitles
if ebook_info["has_subtitle"]:
if ebook_flags["has_subtitle"]:
# Make sure we exclude <a> because that appears in the ToC landmarks
nodes = dom.xpath("/html/body//*[name()!='a' and contains(@epub:type, 'halftitlepage') and not(.//*[contains(@epub:type, 'subtitle')])]")
if nodes:
Expand Down Expand Up @@ -2252,7 +2252,7 @@ def _lint_xhtml_syntax_checks(filename: str, dom: se.easy_xml.EasyXmlTree, self,

return messages

def _lint_xhtml_typography_checks(filename: str, dom: se.easy_xml.EasyXmlTree, file_contents: str, special_file: str) -> list:
def _lint_xhtml_typography_checks(filename: str, dom: se.easy_xml.EasyXmlTree, file_contents: str, special_file: str, ebook_flags: dict) -> list:
"""
Helper function used in self.lint()
Process typography checks on an .xhtml file
Expand All @@ -2267,7 +2267,6 @@ def _lint_xhtml_typography_checks(filename: str, dom: se.easy_xml.EasyXmlTree, f
A list of LintMessages representing typography errors found in the file
"""

has_images = False;
messages = [];

# Check for punctuation outside quotes. We don't check single quotes because contractions are too common.
Expand Down Expand Up @@ -2447,7 +2446,7 @@ def _lint_xhtml_typography_checks(filename: str, dom: se.easy_xml.EasyXmlTree, f
img_alt_lacking_punctuation = []
for node in nodes:
if "titlepage.svg" not in node.get_attr("src"):
has_images = True # Save for a later check
ebook_flags["has_images"] = True # Save for a later check

alt = node.get_attr("alt")

Expand Down Expand Up @@ -2735,7 +2734,7 @@ def _lint_xhtml_typography_checks(filename: str, dom: se.easy_xml.EasyXmlTree, f
if node_text != expected_text:
messages.append(LintMessage("t-073", f"Possible transcription error in Greek. Found: [text]{node_text}[/], but expected [text]{expected_text}[/text]. Hint: Use [bash]se unicode-names[/] to see differences in Unicode characters.", se.MESSAGE_TYPE_WARNING, filename))

return (messages, has_images)
return (messages)

def _lint_xhtml_xhtml_checks(filename: str, dom: se.easy_xml.EasyXmlTree, file_contents: str) -> list:
"""
Expand Down Expand Up @@ -3151,18 +3150,15 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N
Check this ebook for some common SE style errors.
INPUTS
None
self
skip_lint_ignore: Flag indicating whether ignore file should be used
OUTPUTS
A list of LintMessage objects.
"""

local_css_path = self.content_path / "css/local.css"
messages: List[LintMessage] = []
is_titlepage = False
has_halftitle = False
has_frontmatter = False
has_cover_source = False
cover_svg_title = ""
titlepage_svg_title = ""
xhtml_css_classes: Dict[str, int] = {}
Expand All @@ -3171,9 +3167,32 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N
unused_selectors: List[str] = []
id_attrs: List[str] = []
abbr_elements_requiring_css: List[se.easy_xml.EasyXmlElement] = []
has_glossary_search_key_map = False
glossary_usage = []
has_images = False
short_story_count = 0
missing_styles: List[str] = []
directories_not_url_safe = []
files_not_url_safe = []
id_values = {}
duplicate_id_values = []
local_css = {
"has_poem_stye": False,
"has_verse_style": False,
"has_song_style": False,
"has_hymn_style": False,
"has_lyrics_style": False,
"has_elision_style": False
}
ebook_flags = {
"has_cover_source": False,
"has_frontmatter": False,
"has_glossary_search_key_map": False,
"has_halftitle": False,
"has_subtitle": bool(self.metadata_dom.xpath("/package/metadata/meta[@property='title-type' and text()='subtitle']")),
"has_images": False,
"has_multiple_transcriptions": False,
"has_multiple_page_scans": False,
"has_other_sources": False
}

# Cache the browser default stylesheet for later use
with importlib_resources.open_text("se.data", "browser.css", encoding="utf-8") as css:
Expand Down Expand Up @@ -3213,27 +3232,6 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N
local_css_selectors = [regex.sub(r"::[\p{Lowercase_Letter}\-]+", "", selector) for selector in local_css_rules]
unused_selectors = local_css_selectors.copy()

local_css = {
"has_poem_stye": False,
"has_verse_style": False,
"has_song_style": False,
"has_hymn_style": False,
"has_lyrics_style": False,
"has_elision_style": False
}
ebook_info = {
"has_subtitle": bool(self.metadata_dom.xpath("/package/metadata/meta[@property='title-type' and text()='subtitle']")),
"has_multiple_transcriptions": False,
"has_multiple_page_scans": False,
"has_other_sources": False
}
short_story_count = 0
missing_styles: List[str] = []
directories_not_url_safe = []
files_not_url_safe = []
id_values = {}
duplicate_id_values = []

(css_messages, local_css) = _lint_css_checks(self, local_css, local_css_path, local_css_rules)
if css_messages:
messages = messages + css_messages
Expand Down Expand Up @@ -3278,9 +3276,9 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N
else:
other_source_count = other_source_count + 1

ebook_info["has_multiple_transcriptions"] = transcription_source_count >= 2
ebook_info["has_multiple_page_scans"] = page_scan_source_count >= 2
ebook_info["has_other_sources"] = other_source_count > 0
ebook_flags["has_multiple_transcriptions"] = transcription_source_count >= 2
ebook_flags["has_multiple_page_scans"] = page_scan_source_count >= 2
ebook_flags["has_other_sources"] = other_source_count > 0

messages = messages + _lint_metadata_checks(self)

Expand Down Expand Up @@ -3343,7 +3341,7 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N

if filename.stem != "LICENSE":
if filename.stem == "cover.source":
has_cover_source = True
ebook_flags["has_cover_source"] = True
else:
url_safe_filename = se.formatting.make_url_safe(filename.stem) + filename.suffix
if filename.name != url_safe_filename:
Expand Down Expand Up @@ -3394,7 +3392,7 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N
# Make sure that everything in the glossaries is in the rest of the text
# We’ll check the files later, and log any errors at the end
if filename.name == "glossary-search-key-map.xml":
has_glossary_search_key_map = True
ebook_flags["has_glossary_search_key_map"] = True
# Map the glossary to tuples of the values and whether they’re used (initially false)
glossary_usage = list(map(lambda node: (node.get_attr("value"), False), xml_dom.xpath(".//*[@value]")))

Expand Down Expand Up @@ -3439,12 +3437,12 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N

# Check if this is a frontmatter file, but exclude the titlepage, imprint, and toc
if dom.xpath("/html//*[contains(@epub:type, 'frontmatter') and not(descendant-or-self::*[re:test(@epub:type, '\\b(titlepage|imprint|toc)\\b')])]"):
has_frontmatter = True
ebook_flags["has_frontmatter"] = True

# Do we have a half title?
# Sometimes the half title might not be a section, like in Cane by Jean Toomer
if dom.xpath("/html/body//*[contains(@epub:type, 'halftitlepage')]"):
has_halftitle = True
ebook_flags["has_halftitle"] = True

# Add new CSS classes to global list
if filename.name not in IGNORED_FILENAMES:
Expand Down Expand Up @@ -3484,7 +3482,7 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N
abbr_elements_requiring_css += dom.xpath("/html/body//abbr[re:test(@epub:type, '\\b(se:temperature|se:era|z3998:acronym)\\b')]")

# Check and log missing glossary keys
if has_glossary_search_key_map and filename.name not in IGNORED_FILENAMES:
if ebook_flags["has_glossary_search_key_map"] and filename.name not in IGNORED_FILENAMES:
source_text = dom.xpath("/html/body")[0].inner_text()
if dom.xpath("/html/body//section[contains(@epub:type, 'glossary')]"):
nodes = dom.xpath("/html/body//dd[contains(@epub:type, 'glossdef')]")
Expand Down Expand Up @@ -3513,17 +3511,17 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N
special_file = None

if special_file in SPECIAL_FILES:
messages = messages + _lint_special_file_checks(filename, dom, file_contents, ebook_info, special_file, self)
messages = messages + _lint_special_file_checks(filename, dom, file_contents, ebook_flags, special_file, self)

missing_styles = missing_styles + _update_missing_styles(filename, dom, local_css)

messages = messages + _lint_xhtml_css_checks(filename, dom, local_css_path)

messages = messages + _lint_xhtml_metadata_checks(filename, dom)

messages = messages + _lint_xhtml_syntax_checks(filename, dom, self, file_contents, ebook_info, language)
messages = messages + _lint_xhtml_syntax_checks(filename, dom, self, file_contents, ebook_flags, language)

(typography_messages, has_images) = _lint_xhtml_typography_checks(filename, dom, file_contents, special_file)
(typography_messages) = _lint_xhtml_typography_checks(filename, dom, file_contents, special_file, ebook_flags)
if typography_messages:
messages = messages + typography_messages

Expand All @@ -3534,10 +3532,10 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N
if self.cover_path and cover_svg_title != titlepage_svg_title:
messages.append(LintMessage("s-028", f"[path][link=file://{self.cover_path}]{self.cover_path.name}[/][/] and [path][link=file://{self.path / 'images/titlepage.svg'}]titlepage.svg[/][/] [xhtml]<title>[/] elements don’t match.", se.MESSAGE_TYPE_ERROR, self.cover_path))

if has_frontmatter and not has_halftitle:
if ebook_flags["has_frontmatter"] and not ebook_flags["has_halftitle"]:
messages.append(LintMessage("s-020", "Frontmatter found, but no half title page. Half title page is required when frontmatter is present.", se.MESSAGE_TYPE_ERROR, self.metadata_file_path))

if self.is_se_ebook and not has_cover_source:
if self.is_se_ebook and not ebook_flags["has_cover_source"]:
missing_files.append("images/cover.source.jpg")

missing_selectors = []
Expand Down Expand Up @@ -3659,7 +3657,7 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N
if f"[epub|type~=\"{value}\"]" not in self.local_css:
missing_styles.append(element.to_tag_string())

messages = messages + _lint_image_metadata_checks(self, has_images)
messages = messages + _lint_image_metadata_checks(self, ebook_flags["has_images"])

if missing_styles:
messages.append(LintMessage("c-006", f"Semantic found, but missing corresponding style in [path][link=file://{local_css_path}]local.css[/][/].", se.MESSAGE_TYPE_ERROR, local_css_path, set(missing_styles)))
Expand All @@ -3676,7 +3674,7 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N
if short_story_count and not self.metadata_dom.xpath("//meta[@property='se:subject' and text() = 'Shorts']"):
messages.append(LintMessage("m-027", "[val]se:short-story[/] semantic inflection found, but no [val]se:subject[/] with the value of [text]Shorts[/].", se.MESSAGE_TYPE_ERROR, self.metadata_file_path))

if has_glossary_search_key_map:
if ebook_flags["has_glossary_search_key_map"]:
entries = []
for glossary_value in glossary_usage:
if glossary_value[1] is False:
Expand Down

0 comments on commit c0091af

Please sign in to comment.