From 21dd6d57552148bc6ccf70f72bbb58409463d570 Mon Sep 17 00:00:00 2001 From: Andrew Paseltiner Date: Sat, 13 Jul 2024 12:34:00 -0400 Subject: [PATCH 1/5] Add test demonstrating erroneous behavior of m-070 --- tests/lint/metadata/m-070/golden/m-070-out.txt | 2 ++ .../in/src/epub/glossary-search-key-map.xml | 9 +++++++++ .../m-070/in/src/epub/text/chapter-1.xhtml | 17 +++++++++++++++++ 3 files changed, 28 insertions(+) create mode 100644 tests/lint/metadata/m-070/golden/m-070-out.txt create mode 100644 tests/lint/metadata/m-070/in/src/epub/glossary-search-key-map.xml create mode 100644 tests/lint/metadata/m-070/in/src/epub/text/chapter-1.xhtml diff --git a/tests/lint/metadata/m-070/golden/m-070-out.txt b/tests/lint/metadata/m-070/golden/m-070-out.txt new file mode 100644 index 00000000..3992efad --- /dev/null +++ b/tests/lint/metadata/m-070/golden/m-070-out.txt @@ -0,0 +1,2 @@ +m-070 [Error] glossary-search-key-map.xml Glossary entry not found in the text. + R+L=J diff --git a/tests/lint/metadata/m-070/in/src/epub/glossary-search-key-map.xml b/tests/lint/metadata/m-070/in/src/epub/glossary-search-key-map.xml new file mode 100644 index 00000000..e980fc08 --- /dev/null +++ b/tests/lint/metadata/m-070/in/src/epub/glossary-search-key-map.xml @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/tests/lint/metadata/m-070/in/src/epub/text/chapter-1.xhtml b/tests/lint/metadata/m-070/in/src/epub/text/chapter-1.xhtml new file mode 100644 index 00000000..35ad0f62 --- /dev/null +++ b/tests/lint/metadata/m-070/in/src/epub/text/chapter-1.xhtml @@ -0,0 +1,17 @@ + + + + I + + + + +
+

I

+ +

He ate some food.

+ +

A common theory was R+L=J.

+
+ + From ab2b45e180398ad9fb6d894679848dba7baf0915 Mon Sep 17 00:00:00 2001 From: Andrew Paseltiner Date: Sat, 13 Jul 2024 12:43:09 -0400 Subject: [PATCH 2/5] Fix m-070 - Treat glossary values as string literals, not patterns - Search for glossary values surrounded by word boundaries --- se/se_epub_lint.py | 2 +- tests/lint/metadata/m-070/golden/m-070-out.txt | 2 +- tests/lint/metadata/m-070/in/src/epub/text/chapter-1.xhtml | 2 -- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/se/se_epub_lint.py b/se/se_epub_lint.py index 89b66c8e..dd9ca2a6 100644 --- a/se/se_epub_lint.py +++ b/se/se_epub_lint.py @@ -3619,7 +3619,7 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N nodes = dom.xpath("/html/body//dd[contains(@epub:type, 'glossdef')]") source_text = " ".join([node.inner_text() for node in nodes]) for glossary_index, glossary_value in enumerate(glossary_usage): - if glossary_value[1] is False and regex.search(glossary_value[0], source_text, flags=regex.IGNORECASE): + if glossary_value[1] is False and regex.search(r"\b\L<val>\b", source_text, flags=regex.IGNORECASE, val=[glossary_value[0]]): glossary_usage[glossary_index] = (glossary_value[0], True) # Test against word boundaries to not match `halftitlepage` diff --git a/tests/lint/metadata/m-070/golden/m-070-out.txt b/tests/lint/metadata/m-070/golden/m-070-out.txt index 3992efad..6034b8e7 100644 --- a/tests/lint/metadata/m-070/golden/m-070-out.txt +++ b/tests/lint/metadata/m-070/golden/m-070-out.txt @@ -1,2 +1,2 @@ m-070 [Error] glossary-search-key-map.xml Glossary entry not found in the text. - R+L=J + foo diff --git a/tests/lint/metadata/m-070/in/src/epub/text/chapter-1.xhtml b/tests/lint/metadata/m-070/in/src/epub/text/chapter-1.xhtml index 35ad0f62..c4c65576 100644 --- a/tests/lint/metadata/m-070/in/src/epub/text/chapter-1.xhtml +++ b/tests/lint/metadata/m-070/in/src/epub/text/chapter-1.xhtml @@ -8,9 +8,7 @@

I

-

He ate some food.

-

A common theory was R+L=J.

From 1f78ebb9f8dcd6289fc4fe666cbc68fa5cb6dde5 Mon Sep 17 00:00:00 2001 From: Andrew Paseltiner Date: Sat, 13 Jul 2024 17:25:47 -0400 Subject: [PATCH 3/5] Replace word boundaries with negative assertions --- se/se_epub_lint.py | 2 +- .../metadata/m-070/in/src/epub/glossary-search-key-map.xml | 6 ++++++ tests/lint/metadata/m-070/in/src/epub/text/chapter-1.xhtml | 2 ++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/se/se_epub_lint.py b/se/se_epub_lint.py index dd9ca2a6..79e2f2ea 100644 --- a/se/se_epub_lint.py +++ b/se/se_epub_lint.py @@ -3619,7 +3619,7 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N nodes = dom.xpath("/html/body//dd[contains(@epub:type, 'glossdef')]") source_text = " ".join([node.inner_text() for node in nodes]) for glossary_index, glossary_value in enumerate(glossary_usage): - if glossary_value[1] is False and regex.search(r"\b\L<val>\b", source_text, flags=regex.IGNORECASE, val=[glossary_value[0]]): + if glossary_value[1] is False and regex.search(r"(?<!\w)\L<val>(?!\w)", source_text, flags=regex.IGNORECASE, val=[glossary_value[0]]): glossary_usage[glossary_index] = (glossary_value[0], True) # Test against word boundaries to not match `halftitlepage` diff --git a/tests/lint/metadata/m-070/in/src/epub/glossary-search-key-map.xml b/tests/lint/metadata/m-070/in/src/epub/glossary-search-key-map.xml index e980fc08..d50708e4 100644 --- a/tests/lint/metadata/m-070/in/src/epub/glossary-search-key-map.xml +++ b/tests/lint/metadata/m-070/in/src/epub/glossary-search-key-map.xml @@ -6,4 +6,10 @@ + + + + + + diff --git a/tests/lint/metadata/m-070/in/src/epub/text/chapter-1.xhtml b/tests/lint/metadata/m-070/in/src/epub/text/chapter-1.xhtml index c4c65576..f290d151 100644 --- a/tests/lint/metadata/m-070/in/src/epub/text/chapter-1.xhtml +++ b/tests/lint/metadata/m-070/in/src/epub/text/chapter-1.xhtml @@ -10,6 +10,8 @@

I

He ate some food.

A common theory was R+L=J.

+

A ’versal truth.

+

An unknown M.O.

From 9545297d8e9b9f04083c9a15055dc5fe470e4ddf Mon Sep 17 00:00:00 2001 From: Andrew Paseltiner Date: Mon, 15 Jul 2024 16:22:41 -0400 Subject: [PATCH 4/5] Properly handle noterefs in m-070 --- se/se_epub_lint.py | 9 +++++++-- .../m-070/in/src/epub/glossary-search-key-map.xml | 3 +++ .../lint/metadata/m-070/in/src/epub/text/chapter-1.xhtml | 1 + 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/se/se_epub_lint.py b/se/se_epub_lint.py index 79e2f2ea..1b3a666c 100644 --- a/se/se_epub_lint.py +++ b/se/se_epub_lint.py @@ -3614,9 +3614,14 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N # Check and log missing glossary keys if ebook_flags["has_glossary_search_key_map"] and filename.name not in IGNORED_FILENAMES: - source_text = dom.xpath("/html/body")[0].inner_text() + # Remove all noterefs, as their anchor text will otherwise immediately follow a potential glossary term, defeating the below regex. + dom_copy = deepcopy(dom) + for node in dom_copy.xpath(".//a[contains(@epub:type, 'noteref')]"): + node.remove() + + source_text = dom_copy.xpath("/html/body")[0].inner_text() if dom.xpath("/html/body//section[contains(@epub:type, 'glossary')]"): - nodes = dom.xpath("/html/body//dd[contains(@epub:type, 'glossdef')]") + nodes = dom_copy.xpath("/html/body//dd[contains(@epub:type, 'glossdef')]") source_text = " ".join([node.inner_text() for node in nodes]) for glossary_index, glossary_value in enumerate(glossary_usage): if glossary_value[1] is False and regex.search(r"(?<!\w)\L<val>(?!\w)", source_text, flags=regex.IGNORECASE, val=[glossary_value[0]]): diff --git a/tests/lint/metadata/m-070/in/src/epub/glossary-search-key-map.xml b/tests/lint/metadata/m-070/in/src/epub/glossary-search-key-map.xml index d50708e4..266070a0 100644 --- a/tests/lint/metadata/m-070/in/src/epub/glossary-search-key-map.xml +++ b/tests/lint/metadata/m-070/in/src/epub/glossary-search-key-map.xml @@ -12,4 +12,7 @@ + + + diff --git 
a/tests/lint/metadata/m-070/in/src/epub/text/chapter-1.xhtml b/tests/lint/metadata/m-070/in/src/epub/text/chapter-1.xhtml index f290d151..8a99425f 100644 --- a/tests/lint/metadata/m-070/in/src/epub/text/chapter-1.xhtml +++ b/tests/lint/metadata/m-070/in/src/epub/text/chapter-1.xhtml @@ -12,6 +12,7 @@

A common theory was R+L=J.

A ’versal truth.

An unknown M.O.

+

Unsiker1 is an unusual term.

From b089973dadf2334612be0d5fb8611941034b91f3 Mon Sep 17 00:00:00 2001 From: Andrew Paseltiner Date: Mon, 15 Jul 2024 17:14:23 -0400 Subject: [PATCH 5/5] Vary m-070 searches based on whether <match> has children --- se/se_epub_lint.py | 10 +++++++++- tests/lint/metadata/m-070/golden/m-070-out.txt | 1 + .../m-070/in/src/epub/glossary-search-key-map.xml | 6 ++++++ .../metadata/m-070/in/src/epub/text/chapter-1.xhtml | 1 + 4 files changed, 17 insertions(+), 1 deletion(-) diff --git a/se/se_epub_lint.py b/se/se_epub_lint.py index 1b3a666c..61754699 100644 --- a/se/se_epub_lint.py +++ b/se/se_epub_lint.py @@ -3515,7 +3515,15 @@ def lint(self, skip_lint_ignore: bool, allowed_messages: Optional[List[str]] = N if filename.name == "glossary-search-key-map.xml": ebook_flags["has_glossary_search_key_map"] = True # Map the glossary to tuples of the values and whether they’re used (initially false) - glossary_usage = list(map(lambda node: (node.get_attr("value"), False), xml_dom.xpath(".//*[@value]"))) + # If a <match> has no children, its @value must appear in the text + # Otherwise, each of its children's @value must appear + for match in xml_dom.xpath("/search-key-map/search-key-group/match[@value]"): + values = match.xpath("./value[@value]") + if not values: + glossary_usage.append((match.get_attr("value"), False)) + else: + for value in values: + glossary_usage.append((value.get_attr("value"), False)) if filename.suffix == ".xhtml": # Read file contents into a DOM for querying diff --git a/tests/lint/metadata/m-070/golden/m-070-out.txt b/tests/lint/metadata/m-070/golden/m-070-out.txt index 6034b8e7..22cec68c 100644 --- a/tests/lint/metadata/m-070/golden/m-070-out.txt +++ b/tests/lint/metadata/m-070/golden/m-070-out.txt @@ -1,2 +1,3 @@ m-070 [Error] glossary-search-key-map.xml Glossary entry not found in the text. 
foo + pariahsss diff --git a/tests/lint/metadata/m-070/in/src/epub/glossary-search-key-map.xml b/tests/lint/metadata/m-070/in/src/epub/glossary-search-key-map.xml index 266070a0..aa902fa7 100644 --- a/tests/lint/metadata/m-070/in/src/epub/glossary-search-key-map.xml +++ b/tests/lint/metadata/m-070/in/src/epub/glossary-search-key-map.xml @@ -15,4 +15,10 @@ + + + + + + diff --git a/tests/lint/metadata/m-070/in/src/epub/text/chapter-1.xhtml b/tests/lint/metadata/m-070/in/src/epub/text/chapter-1.xhtml index 8a99425f..ed4fc696 100644 --- a/tests/lint/metadata/m-070/in/src/epub/text/chapter-1.xhtml +++ b/tests/lint/metadata/m-070/in/src/epub/text/chapter-1.xhtml @@ -13,6 +13,7 @@

A ’versal truth.

An unknown M.O.

Unsiker1 is an unusual term.

+

Pariahs is plural, but should match.