
Commit

lint: Add t-075, Word in verse with acute accent for scansion instead of grave accent
acabal committed May 10, 2024
1 parent 9c18670 commit 580e610
Showing 2 changed files with 43 additions and 5 deletions.
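For context, the new t-075 check targets verse in which a word carries an acute accent for scansion (e.g. wingéd) instead of the grave accent that is expected (wingèd). A minimal sketch of the markup involved, assuming the usual poem structure with each verse line wrapped in a span (the element names and text here are illustrative, not from the commit):

flagged_markup = '<div epub:type="z3998:poem"><p><span>her wingéd words took flight</span></p></div>'  # acute accent: reported as t-075
correct_markup = '<div epub:type="z3998:poem"><p><span>her wingèd words took flight</span></p></div>'  # grave accent: passes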
se/se_epub_lint.py: 34 changes (33 additions, 1 deletion)
@@ -15,6 +15,7 @@
from pathlib import Path
from typing import Dict, List, Set, Union, Optional
import importlib_resources
from unidecode import unidecode

import cssutils
import lxml.cssselect
@@ -27,6 +28,7 @@
import se.easy_xml
import se.formatting
import se.images
import se.spelling
import se.typography

SE_VARIABLES = [
@@ -426,6 +428,7 @@
"t-072", "[text]various sources[/] link not preceded by [text]from[/]."
"t-073", "Possible transcription error in Greek."
"t-074", "Extended sound using hyphen-minus [text]-[/] instead of non-breaking hyphen [text]‑[/]."
"t-075", "Word in verse with acute accent for scansion instead of grave accent."
XHTML
@@ -2741,10 +2744,39 @@ def _lint_xhtml_typography_checks(filename: Path, dom: se.easy_xml.EasyXmlTree,

# Check for hyphen-minus instead of non-breaking hyphen in sounds.
# Ignore very long <i> elements, as they are more likely to be a sentence containing a dash than a sound
nodes = dom.xpath("/html/body//*[(name() = 'i' or name() = 'em') and not(@epub:type) and not(xml:lang) and re:test(., '-[A-Za-z]-') and string-length(.) < 50]")
nodes = dom.xpath("/html/body//*[(name() = 'i' or name() = 'em') and not(@epub:type) and not(@xml:lang) and re:test(., '-[A-Za-z]-') and string-length(.) < 50]")
if nodes:
	messages.append(LintMessage("t-074", "Extended sound using hyphen-minus [text]-[/] instead of non-breaking hyphen [text]‑[/].", se.MESSAGE_TYPE_WARNING, filename, [node.to_string() for node in nodes]))

# Check if we have a word accented with an acute accent instead of a grave accent in verse scansion.
nodes = dom.xpath("/html/body//*[not(@xml:lang) and re:test(@epub:type, 'z3998:(poem|verse|hymn|song)')]//span[not(ancestor-or-self::*[not(name() = 'html') and @xml:lang]) and not(./span) and re:test(., '[A-Za-z][áéíóú][A-za-z]')]")
filtered_nodes = []

if nodes:
	# These words are English but have acute accents. Don't include the accent in this list because below we compare against the unaccented version.
	ignored_words = ["cafe", "cafes", "regime", "regimes", "reveille", "reveilles"]

	# Initialize our dictionary
	se.spelling.initialize_dictionary()

	for node in nodes:
		# Remove any child nodes that have a language specified
		for inner_node in node.xpath(".//*[@xml:lang]"):
			inner_node.remove()

		# Extract each accented word, then compare it against our dictionary.
		# If the word IS in the dictionary, add the node to the error list.
		# Words that are NOT in the dictionary are more likely to be proper names.
		# Note that this doesn't match words with two accent marks, like résumé. Such words are highly unlikely
		# to be altered for scansion anyway.
		for word in regex.findall(r"[A-Za-z]+[áéíóú]+[A-Za-z]+", node.inner_text()):
			unaccented_word = unidecode(word)
			if unaccented_word in se.spelling.DICTIONARY and unaccented_word not in ignored_words:
				filtered_nodes.append(node)

if filtered_nodes:
	messages.append(LintMessage("t-075", "Word in verse with acute accent for scansion instead of grave accent.", se.MESSAGE_TYPE_WARNING, filename, [node.to_string() for node in filtered_nodes]))

return (messages, missing_files)

def _lint_xhtml_xhtml_checks(filename: Path, dom: se.easy_xml.EasyXmlTree, file_contents: str, local_css_path: str) -> list:
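The core of the new check is the word filter: unidecode() strips the accent, and a word is flagged only when its unaccented form appears in the word list, since accented words that are not dictionary words are more likely proper names. A standalone sketch of that idea follows (the helper name and toy word list are illustrative, not the real se/data/words set):

import regex
from unidecode import unidecode

DICTIONARY = {"winged", "blessed", "learned"}  # toy stand-in for the word list
IGNORED_WORDS = ["cafe", "cafes", "regime", "regimes", "reveille", "reveilles"]

def accented_scansion_words(text: str) -> list:
	words = []
	for word in regex.findall(r"[A-Za-z]+[áéíóú]+[A-Za-z]+", text):
		unaccented_word = unidecode(word)
		if unaccented_word in DICTIONARY and unaccented_word not in IGNORED_WORDS:
			words.append(word)
	return words

print(accented_scansion_words("her wingéd course toward Ramón"))  # ['wingéd']; 'Ramón' is skipped as a likely proper name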
se/spelling.py: 14 changes (10 additions, 4 deletions)
@@ -29,6 +29,15 @@ def get_xhtml_language(xhtml: str) -> str:

return language

def initialize_dictionary():
	"""
	Initialize the spelling word list dictionary, if we haven't already.
	"""

	if not se.spelling.DICTIONARY:
		with importlib_resources.open_text("se.data", "words") as dictionary:
			se.spelling.DICTIONARY = {line.strip().lower() for line in dictionary}

def modernize_hyphenation(xhtml: str) -> str:
"""
Convert old-timey hyphenated compounds into single words based on the passed DICTIONARY.
@@ -40,10 +49,7 @@ def modernize_hyphenation(xhtml: str) -> str:
A string representing the XHTML with its hyphenation modernized
"""

-# First, initialize our dictionary if we haven't already
-if not se.spelling.DICTIONARY:
-	with importlib_resources.open_text("se.data", "words") as dictionary:
-		se.spelling.DICTIONARY = {line.strip().lower() for line in dictionary}
+initialize_dictionary()

# Easy fix for a common case
xhtml = regex.sub(r"\b([Nn])ow-a-days\b", r"\1owadays", xhtml) # now-a-days -> nowadays
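The spelling.py side of the commit extracts the lazy word-list load into initialize_dictionary() so that the new lint check, and any other caller, can share the cached set. A hypothetical usage sketch, assuming the word list ships as se/data/words:

import se.spelling

se.spelling.initialize_dictionary()        # loads the word list once and caches it in DICTIONARY
print("winged" in se.spelling.DICTIONARY)  # later membership tests are plain set lookups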
