diff --git a/se/se_epub_lint.py b/se/se_epub_lint.py index 89b66c8e..61754699 100644 --- a/se/se_epub_lint.py +++ b/se/se_epub_lint.py @@ -3515,7 +3515,15 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N if filename.name == "glossary-search-key-map.xml": ebook_flags["has_glossary_search_key_map"] = True # Map the glossary to tuples of the values and whether they’re used (initially false) - glossary_usage = list(map(lambda node: (node.get_attr("value"), False), xml_dom.xpath(".//*[@value]"))) + # If a <match> has no <value> children, its @value must appear in the text + # Otherwise, each of its <value> children's @value must appear + for match in xml_dom.xpath("/search-key-map/search-key-group/match[@value]"): + values = match.xpath("./value[@value]") + if not values: + glossary_usage.append((match.get_attr("value"), False)) + else: + for value in values: + glossary_usage.append((value.get_attr("value"), False)) if filename.suffix == ".xhtml": # Read file contents into a DOM for querying @@ -3614,12 +3622,17 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N # Check and log missing glossary keys if ebook_flags["has_glossary_search_key_map"] and filename.name not in IGNORED_FILENAMES: - source_text = dom.xpath("/html/body")[0].inner_text() + # Remove all noterefs, as their anchor text will otherwise immediately follow a potential glossary term, defeating the below regex. 
+ dom_copy = deepcopy(dom) + for node in dom_copy.xpath(".//a[contains(@epub:type, 'noteref')]"): + node.remove() + + source_text = dom_copy.xpath("/html/body")[0].inner_text() if dom.xpath("/html/body//section[contains(@epub:type, 'glossary')]"): - nodes = dom.xpath("/html/body//dd[contains(@epub:type, 'glossdef')]") + nodes = dom_copy.xpath("/html/body//dd[contains(@epub:type, 'glossdef')]") source_text = " ".join([node.inner_text() for node in nodes]) for glossary_index, glossary_value in enumerate(glossary_usage): - if glossary_value[1] is False and regex.search(glossary_value[0], source_text, flags=regex.IGNORECASE): + if glossary_value[1] is False and regex.search(r"(?<!\w)\L<val>(?!\w)", source_text, flags=regex.IGNORECASE, val=[glossary_value[0]]): glossary_usage[glossary_index] = (glossary_value[0], True) # Test against word boundaries to not match `halftitlepage` diff --git a/tests/lint/metadata/m-070/golden/m-070-out.txt b/tests/lint/metadata/m-070/golden/m-070-out.txt new file mode 100644 index 00000000..22cec68c --- /dev/null +++ b/tests/lint/metadata/m-070/golden/m-070-out.txt @@ -0,0 +1,3 @@ +m-070 [Error] glossary-search-key-map.xml Glossary entry not found in the text. + foo + pariahsss diff --git a/tests/lint/metadata/m-070/in/src/epub/glossary-search-key-map.xml b/tests/lint/metadata/m-070/in/src/epub/glossary-search-key-map.xml new file mode 100644 index 00000000..aa902fa7 --- /dev/null +++ b/tests/lint/metadata/m-070/in/src/epub/glossary-search-key-map.xml @@ -0,0 +1,24 @@ + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/lint/metadata/m-070/in/src/epub/text/chapter-1.xhtml b/tests/lint/metadata/m-070/in/src/epub/text/chapter-1.xhtml new file mode 100644 index 00000000..ed4fc696 --- /dev/null +++ b/tests/lint/metadata/m-070/in/src/epub/text/chapter-1.xhtml @@ -0,0 +1,19 @@ + + + + I + + + + +
+

I

+

He ate some food.

+

A common theory was R+L=J.

+

A ’versal truth.

+

An unknown M.O.

+

Unsiker1 is an unusual term.

+

Pariahs is plural, but should match.

+
+ +