
Commit

lint: Add t-075, Word in verse with acute accent for scansion instead of grave accent
acabal committed May 10, 2024
1 parent 9c18670 commit 580e610
Showing 2 changed files with 43 additions and 5 deletions.
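For context, the new t-075 check targets verse in which a word carries an acute accent for scansion (e.g. wingéd) instead of the grave accent that is expected (wingèd). A minimal sketch of the markup involved, assuming the usual poem structure with each verse line wrapped in a span (the element names and text here are illustrative, not from the commit):

flagged_markup = '<div epub:type="z3998:poem"><p><span>her wingéd words took flight</span></p></div>'  # acute accent: reported as t-075
correct_markup = '<div epub:type="z3998:poem"><p><span>her wingèd words took flight</span></p></div>'  # grave accent: passes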
se/se_epub_lint.py: 34 changes (33 additions, 1 deletion)
@@ -15,6 +15,7 @@
from pathlib import Path
from typing import Dict, List, Set, Union, Optional
import importlib_resources
from unidecode import unidecode

import cssutils
import lxml.cssselect
@@ -27,6 +28,7 @@
import se.easy_xml
import se.formatting
import se.images
import se.spelling
import se.typography

SE_VARIABLES = [
@@ -426,6 +428,7 @@
"t-072", "[text]various sources[/] link not preceded by [text]from[/]."
"t-073", "Possible transcription error in Greek."
"t-074", "Extended sound using hyphen-minus [text]-[/] instead of non-breaking hyphen [text]‑[/]."
"t-075", "Word in verse with acute accent for scansion instead of grave accent."
XHTML
@@ -2741,10 +2744,39 @@ def _lint_xhtml_typography_checks(filename: Path, dom: se.easy_xml.EasyXmlTree,

# Check for hyphen-minus instead of non-breaking hyphen in sounds.
# Ignore very long <i> elements, as they are more likely to be a sentence containing a dash than a sound
nodes = dom.xpath("/html/body//*[(name() = 'i' or name() = 'em') and not(@epub:type) and not(xml:lang) and re:test(., '-[A-Za-z]-') and string-length(.) < 50]")
nodes = dom.xpath("/html/body//*[(name() = 'i' or name() = 'em') and not(@epub:type) and not(@xml:lang) and re:test(., '-[A-Za-z]-') and string-length(.) < 50]")
if nodes:
	messages.append(LintMessage("t-074", "Extended sound using hyphen-minus [text]-[/] instead of non-breaking hyphen [text]‑[/].", se.MESSAGE_TYPE_WARNING, filename, [node.to_string() for node in nodes]))

# Check if we have a word accented with an acute accent instead of a grave accent in verse scansion.
nodes = dom.xpath("/html/body//*[not(@xml:lang) and re:test(@epub:type, 'z3998:(poem|verse|hymn|song)')]//span[not(ancestor-or-self::*[not(name() = 'html') and @xml:lang]) and not(./span) and re:test(., '[A-Za-z][áéíóú][A-za-z]')]")
filtered_nodes = []

if nodes:
	# These words are English but have acute accents. Don't include the accent in this list because below we compare against the unaccented version.
	ignored_words = ["cafe", "cafes", "regime", "regimes", "reveille", "reveilles"]

	# Initialize our dictionary
	se.spelling.initialize_dictionary()

	for node in nodes:
		# Remove any child nodes that have a language specified
		for inner_node in node.xpath(".//*[@xml:lang]"):
			inner_node.remove()

		# Extract each accented word, then compare it against our dictionary.
		# If the word IS in the dictionary, add the node to the error list.
		# Words that are NOT in the dictionary are more likely to be proper names.
		# Note that this doesn't match words with two accent marks, like résumé. Such words are highly unlikely
		# to be altered for scansion anyway.
		for word in regex.findall(r"[A-Za-z]+[áéíóú]+[A-Za-z]+", node.inner_text()):
			unaccented_word = unidecode(word)
			if unaccented_word in se.spelling.DICTIONARY and unaccented_word not in ignored_words:
				filtered_nodes.append(node)

if filtered_nodes:
	messages.append(LintMessage("t-075", "Word in verse with acute accent for scansion instead of grave accent.", se.MESSAGE_TYPE_WARNING, filename, [node.to_string() for node in filtered_nodes]))

return (messages, missing_files)

def _lint_xhtml_xhtml_checks(filename: Path, dom: se.easy_xml.EasyXmlTree, file_contents: str, local_css_path: str) -> list:
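The core of the new check is the word filter: unidecode() strips the accent, and a word is flagged only when its unaccented form appears in the word list, since accented words that are not dictionary words are more likely proper names. A standalone sketch of that idea follows (the helper name and toy word list are illustrative, not the real se/data/words set):

import regex
from unidecode import unidecode

DICTIONARY = {"winged", "blessed", "learned"}  # toy stand-in for the word list
IGNORED_WORDS = ["cafe", "cafes", "regime", "regimes", "reveille", "reveilles"]

def accented_scansion_words(text: str) -> list:
	words = []
	for word in regex.findall(r"[A-Za-z]+[áéíóú]+[A-Za-z]+", text):
		unaccented_word = unidecode(word)
		if unaccented_word in DICTIONARY and unaccented_word not in IGNORED_WORDS:
			words.append(word)
	return words

print(accented_scansion_words("her wingéd course toward Ramón"))  # ['wingéd']; 'Ramón' is skipped as a likely proper name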
se/spelling.py: 14 changes (10 additions, 4 deletions)
@@ -29,6 +29,15 @@ def get_xhtml_language(xhtml: str) -> str:

return language

def initialize_dictionary():
	"""
	Initialize the spelling word list dictionary, if we haven't already.
	"""

	if not se.spelling.DICTIONARY:
		with importlib_resources.open_text("se.data", "words") as dictionary:
			se.spelling.DICTIONARY = {line.strip().lower() for line in dictionary}

def modernize_hyphenation(xhtml: str) -> str:
"""
Convert old-timey hyphenated compounds into single words based on the passed DICTIONARY.
@@ -40,10 +49,7 @@ def modernize_hyphenation(xhtml: str) -> str:
A string representing the XHTML with its hyphenation modernized
"""

-# First, initialize our dictionary if we haven't already
-if not se.spelling.DICTIONARY:
-	with importlib_resources.open_text("se.data", "words") as dictionary:
-		se.spelling.DICTIONARY = {line.strip().lower() for line in dictionary}
+initialize_dictionary()

# Easy fix for a common case
xhtml = regex.sub(r"\b([Nn])ow-a-days\b", r"\1owadays", xhtml) # now-a-days -> nowadays
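The spelling.py side of the commit extracts the lazy word-list load into initialize_dictionary() so that the new lint check, and any other caller, can share the cached set. A hypothetical usage sketch, assuming the word list ships as se/data/words:

import se.spelling

se.spelling.initialize_dictionary()        # loads the word list once and caches it in DICTIONARY
print("winged" in se.spelling.DICTIONARY)  # later membership tests are plain set lookups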
