Skip to content

Commit

Permalink
Improve y-029/y-031/y-032, add y-031 thru y-033 tests
Browse files Browse the repository at this point in the history
  • Loading branch information
vr8hub authored and acabal committed May 28, 2024
1 parent efb01d3 commit 46acd2c
Show file tree
Hide file tree
Showing 7 changed files with 163 additions and 4 deletions.
8 changes: 4 additions & 4 deletions se/se_epub_lint.py
Original file line number Diff line number Diff line change
Expand Up @@ -3047,8 +3047,8 @@ def _lint_xhtml_typo_checks(filename: Path, dom: se.easy_xml.EasyXmlTree, file_c
if typos:
messages.append(LintMessage("y-028", "Possible typo: [xhtml]<abbr>[/] directly preceded or followed by letter.", se.MESSAGE_TYPE_WARNING, filename, typos))

# Check for misapplied italics. Ignore 's' because the plural is too common.
typos = [node.to_string() for node in dom.xpath("/html/body//*[(name() = 'i' or name() = 'em') and ./following-sibling::node()[1][re:test(., '^[a-z]\\b', 'i') and not(re:test(., '^s\\b'))]]")]
# Check for misapplied italics. Ignore 's' because the plural is too common. i with epub:type handled by y-032.
typos = [node.to_string() for node in dom.xpath("/html/body//*[(name() = 'em' or (name() = 'i' and not(@epub:type))) and ./following-sibling::node()[1][re:test(., '^[a-z]\\b', 'i') and not(re:test(., '^s\\b'))]]")]
if typos:
messages.append(LintMessage("y-029", "Possible typo: Italics followed by a letter.", se.MESSAGE_TYPE_WARNING, filename, typos))

Expand All @@ -3059,13 +3059,13 @@ def _lint_xhtml_typo_checks(filename: Path, dom: se.easy_xml.EasyXmlTree, file_c

# Check for missing punctuation in continued quotations
# ” said Bob “
nodes = dom.xpath("/html/body//p[re:test(., '”\\s(?:said|[A-Za-z]{2,}ed)\\s[A-Za-z]+?(?<!\\bthe)(?<!\\bto)(?<!\\bwith)(?<!\\bfrom)(?<!\\ba\\b)(?<!\\bis)\\s“') or re:test(., '[^\\.]”\\s(\\bhe\\b|\\bshe\\b|I|[A-Z][a-z]+?)\\s(?:said|[A-Za-z]{2,}ed)\\s“') or re:test(., ',” (?:said|[A-Za-z]{2,}ed) [A-Za-z]+? [A-Za-z]+?ly “')]")
nodes = dom.xpath("/html/body//p[re:test(., '”\\s(?:said|[A-Za-z]{2,}ed)\\s[A-Za-z]+?(?<!\\bthe)(?<!\\bto)(?<!\\bwith)(?<!\\bfrom)(?<!\\ba\\b)(?<!\\bis)\\s“') or re:test(., '[^.?!]”\\s(he\\b|she\\b|I\\b|[A-Z][a-z]+?)\\s(?:said|[A-Za-z]{2,}ed)\\s“') or re:test(., ',” (?:said|[A-Za-z]{2,}ed) [A-Za-z]+? [A-Za-z]+?ly “')]")
if nodes:
messages.append(LintMessage("y-031", "Possible typo: Dialog tag missing punctuation.", se.MESSAGE_TYPE_WARNING, filename, [node.to_string() for node in nodes]))

# Check for italics having epub:type that run into preceding or following characters
# Ignore things like <i>Newspaper</i>s
nodes = dom.xpath("/html/body//i[@epub:type and ( (following-sibling::node()[1][re:test(., '^[a-z]', 'i') and not(re:test(., '^(s|es|er)'))]) or preceding-sibling::node()[1][re:test(., '[a-z]$')]) ]")
nodes = dom.xpath("/html/body//i[@epub:type and ( (following-sibling::node()[1][re:test(., '^[a-z]', 'i') and not(re:test(., '^(s|es|er)\\b'))]) or preceding-sibling::node()[1][re:test(., '[a-z]$')]) ]")
if nodes:
messages.append(LintMessage("y-032", "Possible typo: Italics running into preceding or following characters.", se.MESSAGE_TYPE_WARNING, filename, [node.to_string() for node in nodes]))

Expand Down
26 changes: 26 additions & 0 deletions tests/lint/typos/y-031/golden/y-031-out.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
t-005 [Manual Review] chapter-1.xhtml Dialog without ending comma.
fay” she said
y-031 [Manual Review] chapter-1.xhtml Possible typo: Dialog tag missing
punctuation.
<p>“Good boy,” said old “Kiowa.” “You’d better go get some supper.”</p>
<p>He pronounced the inhibition lengthily and sonorously, so that the
“not” sounded like “n‑o‑o‑o‑t!”</p>
<p>“Ah well,” he said “then let’s take off this little frock.”</p>
<p>The sharp reprimand was not lost upon her, and in time it came to
pass that for “fay” she said “succeed.”</p>
<p>Catching, however, two words which sounded like the English “White”
and “Red,” I said “Yaw” after the last.</p>
<p>She hadn’t much reserve of patience, and at the end of the second
game, when Ella Stowbody sniffily asked her, “Are you going to send to
Minneapolis for your dress for the next soirée—heard you were,” Carol said
“Don’t know yet” with unnecessary sharpness.</p>
<p>“Keep away from women and horses and, and—” he stopped “—eagles,
Billy.”</p>
<p>When they told her to respond with “Amen,” she responded “Amen.”</p>
<p>When told to “look closely,” I said “why?”</p>
<p>“Forget that nonsense,” Andy exclaimed “forcefully.”</p>
<p>“A chary enquiry is a ghost of the mind,” said Billy adroitly
“unfortunately, that is wrong; on the contrary, their bathtub was, in this
moment, a petite underwear.”</p>
<p>“Liars are perjured quills,” whispered someone sweetly “a bathroom is
a lip's coal.”</p>
75 changes: 75 additions & 0 deletions tests/lint/typos/y-031/in/src/epub/text/chapter-1.xhtml
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
<?xml version="1.0" encoding="utf-8"?>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" epub:prefix="z3998: http://www.daisy.org/z3998/2012/vocab/structure/, se: https://standardebooks.org/vocab/1.0" xml:lang="en-GB">
<head>
<title>I</title>
<link href="../css/core.css" rel="stylesheet" type="text/css"/>
<link href="../css/local.css" rel="stylesheet" type="text/css"/>
</head>
<body epub:type="bodymatter z3998:fiction">
<section id="chapter-1" epub:type="chapter">
<h2 epub:type="ordinal z3998:roman">I</h2>
<!-- VALID 1, quotes separated by appropriate punctuation -->
<p>“Father is very much hurt,” she said, “that you have written nothing to him.”</p>
<!-- first re.test -->
<!-- EXCLUSION 1, ” said [A-Za-z]+? the “ -->
<p>“Their felony was, in this moment, an arcane spain,” “Nancy” said to the “paperboy.”</p>
<!-- EXCLUSION 2, ” said [A-Za-z]+? to “ -->
<p>“The first daimen summer is, in its own way, a shock,” said Joe to “Charlie.”</p>
<!-- EXCLUSION 3, ” said [A-Za-z]+? with “ -->
<p>“What we don't know for sure is whether or not the shaven bait comes from a filthy scarf,” said John with “sarcasm.”</p>
<!-- EXCLUSION 4, ” said [A-Za-z]+? from “ -->
<p>“Extending this logic, they were lost without the punchy field that composed their weather,” said Daniel from “compassion.”</p>
<!-- EXCLUSION 5, ” said [A-Za-z]+? a “ -->
<p>“Killjoy” said to a “friend,” “An earth sees a weather as an unstilled jacket.”</p>
<!-- EXCLUSION 6, ” said [A-Za-z]+? is “ -->
<p>“Nonsense,” said friend is “persona non grata.”</p>
<!-- EXCLUSION 7, ” [A-Za-z]{2,}ed the “ -->
<p>“Unless,” added the “mistress,” with a smile that half belied the severity of her speech ⁠…</p>
<!-- EXCLUSION 8, ” [A-Za-z]{2,}ed to “ -->
<p>“The Bottoms” succeeded to “Hell Row.”</p>
<!-- EXCLUSION 9, ” [A-Za-z]{2,}ed with “ -->
<p>Her plaintive dirges for “<i xml:lang="fr">notre malheureuse patrie</i>,” interpolated with “<i xml:lang="fr">charmant</i>” and “<i xml:lang="fr">mon prince</i>,” died away along the terrace.</p>
<!-- EXCLUSION 10, ” [A-Za-z]{2,}ed from “ -->
<p>Probably Smith had fresh in his recollection the passage in which Madox ridicules as a “piece of puerility” the use of the English word “misterie,” derived from “the Gallic word <i xml:lang="fr">mestera</i>.”</p>
<!-- EXCLUSION 11, ” [A-Za-z]{2,}ed a “ -->
<p>“Well, Mary,” answered a “dear-me-what-now” voice.</p>
<!-- EXCLUSION 12, ” [A-Za-z]{2,}ed is “ -->
<p>The order in council for the printing and publishing a declaration of war against Denmark is dated “Whitehall, <abbr>Sept.</abbr> 19, 1666”; annexed is “A True Declaration of all transactions⁠ ⁠…”</p>
<!-- FAIL 1, ” said [A-Za-z]+? “ -->
<p>“Good boy,” said old “Kiowa.” “You’d better go get some supper.”</p>
<!-- FAIL 2, ” [A-Za-z]{2,}ed [A-Za-z]+? “ -->
<p>He pronounced the inhibition lengthily and sonorously, so that the “not” sounded like “n‑o‑o‑o‑t!”</p>
<!-- second re.test -->
<!-- Note: [.!?]” he/she/I said “ will (validly) trigger y-014, so aren't shown here -->
<!-- EXCLUSION 13, [.!?]” I said “ -->
<p>You said “for a space.” I said “for a while.”</p>
<!-- EXCLUSION 14, [.!?]” [A-Z][a-z]+ said “ -->
<p>“Better’n a bleedin’ dipso!” Lips said “<i>S.h.h.h!</i></p>
<!-- EXCLUSION 15, [.!?]” I [A-Za-z]{2,}ed “ -->
<p>“What do you think you’re doing?” I said “Nothing.”</p>
<!-- EXCLUSION 16, [.!?]” [A-Z][a-z]+ [A-Za-z]{2,}ed “ -->
<p>“But he’s not here.” She added “He’s not been here with me.”</p>
<!-- FAIL 3, [^.!?]” he said “ -->
<p>“Ah well,” he said “then let’s take off this little frock.”</p>
<!-- FAIL 4, [^.!?]” she said “ (also triggers t-005) -->
<p>The sharp reprimand was not lost upon her, and in time it came to pass that for “fay” she said “succeed.”</p>
<!-- FAIL 5, [^.!?]” I said “ -->
<p>Catching, however, two words which sounded like the English “White” and “Red,” I said “Yaw” after the last.</p>
<!-- FAIL 6, [^.!?]” [A-Z][a-z]+ said “ -->
<p>She hadn’t much reserve of patience, and at the end of the second game, when Ella Stowbody sniffily asked her, “Are you going to send to Minneapolis for your dress for the next soirée—heard you were,” Carol said “Don’t know yet” with unnecessary sharpness.</p>
<!-- FAIL 7, [^.!?]” he [A-Za-z]{2,}ed “ -->
<p>“Keep away from women and horses and, and—” he stopped “—eagles, Billy.”</p>
<!-- FAIL 8, [^.!?]” she [A-Za-z]{2,}ed “ -->
<p>When they told her to respond with “Amen,” she responded “Amen.”</p>
<!-- FAIL 9, [^.!?]” I [A-Za-z]{2,}ed “ -->
<p>When told to “look closely,” I said “why?”</p>
<!-- FAIL 10, [^.!?]” [A-Z][a-z]+ [A-Za-z]{2,}ed “ -->
<p>“Forget that nonsense,” Andy exclaimed “forcefully.”</p>
<!-- third re.test -->
<!-- FAIL 11, ,” said [A-Za-z]+? [A-Za-z]+?ly “ -->
<p>“A chary enquiry is a ghost of the mind,” said Billy adroitly “unfortunately, that is wrong; on the contrary, their bathtub was, in this moment, a petite underwear.”</p>
<!-- FAIL 12, ,” [A-Za-z]{2,}ed [A-Za-z]+? [A-Za-z]+?ly “ -->
<p>“Liars are perjured quills,” whispered someone sweetly “a bathroom is a lip's coal.”</p>
</section>
</body>
</html>
4 changes: 4 additions & 0 deletions tests/lint/typos/y-032/golden/y-032-out.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
y-032 [Manual Review] chapter-1.xhtml Possible typo: Italics running into
preceding or following characters.
<i epub:type="se:name.publication.play">Hamlet</i>
<i epub:type="se:name.publication.book">Threatful Flood</i>
27 changes: 27 additions & 0 deletions tests/lint/typos/y-032/in/src/epub/text/chapter-1.xhtml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
<?xml version="1.0" encoding="utf-8"?>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" epub:prefix="z3998: http://www.daisy.org/z3998/2012/vocab/structure/, se: https://standardebooks.org/vocab/1.0" xml:lang="en-GB">
<head>
<title>I</title>
<link href="../css/core.css" rel="stylesheet" type="text/css"/>
<link href="../css/local.css" rel="stylesheet" type="text/css"/>
</head>
<body epub:type="bodymatter z3998:fiction">
<section id="chapter-1" epub:type="chapter">
<h2 epub:type="ordinal z3998:roman">I</h2>
<!-- VALID 1, italics with epub:type with whitespace on each side -->
<p>I will here produce a few extracts taken from our <i epub:type="se:name.publication.book">Protocol of Peace</i> for the year 1867.</p>
<!-- VALID 2, italics with epub:type with punctuation on one side -->
<p>It was <i epub:type="se:name.publication.book">Caesar’s Commentaries</i>, which La Ramee had lent him.</p>
<!-- EXCLUSION 1, italics with epub:type followed by 's' -->
<p>All my <i epub:type="se:name.publication.newspaper">Herald</i>s went off like hot cakes.</p>
<!-- EXCLUSION 2, italics with epub:type followed by 'es' -->
<p>Once there happened to be only two <i epub:type="z3998:taxonomy">Semper Augustus</i>es in all Holland, one in Haarlem and one in Amsterdam.</p>
<!-- EXCLUSION 3, italics with epub:type followed by 'er' -->
<p>A well-informed <i epub:type="se:name.publication.book">Critical Review</i>er would have amended the title thus: “Lucian reviv’d: or Gulliver Beat with his own Bow.”</p>
<!-- FAIL 1, italics with epub:type followed by something other than s/es/er -->
<p>This one was definitely the <i epub:type="se:name.publication.play">Hamlet</i>est of the fake <i epub:type="se:name.publication.play">Hamlet</i>s.</p>
<!-- FAIL 2, italics with epub:type immediately preceded by text -->
<p>Recent controversy aside, a<i epub:type="se:name.publication.book">Threatful Flood</i> is a litter of the mind.</p>
</section>
</body>
</html>
4 changes: 4 additions & 0 deletions tests/lint/typos/y-033/golden/y-033-out.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
y-033 [Manual Review] chapter-1.xhtml Possible typo: Three-em-dash obscuring an
entire word, but not preceded by a space.
<p>Joy to me! Come hither! Give me thy hand⸺ha! let be! aha!⸺Disgust,
disgust, disgust⸻alas to me!</p>
23 changes: 23 additions & 0 deletions tests/lint/typos/y-033/in/src/epub/text/chapter-1.xhtml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
<?xml version="1.0" encoding="utf-8"?>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" epub:prefix="z3998: http://www.daisy.org/z3998/2012/vocab/structure/, se: https://standardebooks.org/vocab/1.0" xml:lang="en-GB">
<head>
<title>I</title>
<link href="../css/core.css" rel="stylesheet" type="text/css"/>
<link href="../css/local.css" rel="stylesheet" type="text/css"/>
</head>
<body epub:type="bodymatter z3998:fiction">
<section id="chapter-1" epub:type="chapter">
<h2 epub:type="ordinal z3998:roman">I</h2>
<!-- EXCLUSION 1, 3em dash preceded by a > -->
<p><i>⸻ Odenheimer, restaurateur.</i> This witness volunteered his testimony.”</p>
<!-- EXCLUSION 2, 3em dash preceded by a ( -->
<p>Augustus seized one of the muskets lying on the floor and shot another mutineer (⸻ Wilson) through the breast.</p>
<!-- EXCLUSION 3, 3em dash preceded by a space -->
<p>At this time I did a little humble work for the ⸻, but was quite resolved to fly at higher game than that.</p>
<!-- EXCLUSION 4, 3em dash preceded by a word-joiner -->
<p>“⁠⸻ to you,” she flung scornfully at them over her shoulder.</p>
<!-- FAIL 1, 3em dash immediately preceded by something other than the above -->
<p>Joy to me! Come hither! Give me thy hand⸺ha! let be! aha!⸺Disgust, disgust, disgust⸻alas to me!</p>
</section>
</body>
</html>

0 comments on commit 46acd2c

Please sign in to comment.