diff --git a/se/commands/create_draft.py b/se/commands/create_draft.py
index 432054b5..78f2e516 100644
--- a/se/commands/create_draft.py
+++ b/se/commands/create_draft.py
@@ -727,8 +727,8 @@ def _create_draft(args: Namespace, plain_output: bool):
producers_text = regex.sub(r".+?Produced by (.+?)\s*$", "\\1", producers_text, flags=regex.DOTALL)
producers_text = regex.sub(r"\(.+?\)", "", producers_text, flags=regex.DOTALL)
- producers_text = regex.sub(r"(at )?https?://www\.pgdp\.net", "", producers_text, flags=regex.DOTALL)
- producers_text = regex.sub(r"[\r\n]+", " ", producers_text, flags=regex.DOTALL)
+ producers_text = regex.sub(r"(at )?https?://www\.pgdp\.net", "", producers_text)
+ producers_text = regex.sub(r"[\r\n]+", " ", producers_text)
producers_text = regex.sub(r",? and ", ", and ", producers_text)
producers_text = producers_text.replace(" and the Online", " and The Online")
producers_text = producers_text.replace(", and ", ", ").strip()
@@ -945,7 +945,7 @@ def _create_draft(args: Namespace, plain_output: bool):
i = i + 1
- metadata_xml = regex.sub(r"\t\tTRANSCRIBER\s*TRANSCRIBER_SORT\s*TRANSCRIBER_URL\s*trc", "\t\t" + producers_xhtml.strip(), metadata_xml, flags=regex.DOTALL)
+ metadata_xml = regex.sub(r"\t\tTRANSCRIBER\s*TRANSCRIBER_SORT\s*TRANSCRIBER_URL\s*trc", "\t\t" + producers_xhtml.strip(), metadata_xml)
if ebook_wiki_url:
metadata_xml = metadata_xml.replace(">EBOOK_WIKI_URL<", f">{ebook_wiki_url}<")
diff --git a/se/commands/word_count.py b/se/commands/word_count.py
index 233c92b9..da031163 100644
--- a/se/commands/word_count.py
+++ b/se/commands/word_count.py
@@ -66,7 +66,7 @@ def word_count(plain_output: bool) -> int:
else:
# We couldn't generate a dom, fall back to regex replacements
- xhtml = regex.sub(r"<(pre|div|p)[^>]*?>[^<]*Project Gutenberg[^<]+?\1>", "", xhtml, flags=regex.IGNORECASE|regex.DOTALL)
+ xhtml = regex.sub(r"<(pre|div|p)[^>]*?>[^<]*Project Gutenberg[^<]+?\1>", "", xhtml, flags=regex.IGNORECASE)
xhtml = regex.sub(r"<!--.+?-->", "", xhtml, flags=regex.IGNORECASE|regex.DOTALL)
total_word_count += se.formatting.get_word_count(xhtml)
diff --git a/se/formatting.py b/se/formatting.py
index 6a1677ef..06b8888f 100644
--- a/se/formatting.py
+++ b/se/formatting.py
@@ -362,16 +362,16 @@ def get_word_count(xhtml: str) -> int:
xhtml = regex.sub(r"<.+?>", " ", xhtml, flags=regex.DOTALL)
# Replace some formatting characters
- xhtml = regex.sub(r"[…–—― ‘’“”\{\}\(\)]", " ", xhtml, flags=regex.IGNORECASE | regex.DOTALL)
+ xhtml = regex.sub(r"[…–—― ‘’“”\{\}\(\)]", " ", xhtml)
# Remove word-connecting dashes, apostrophes, commas, and slashes (and/or), they count as a word boundary but they shouldn't
- xhtml = regex.sub(fr"[\p{{Letter}}0-9][\-\'\,\.\/{se.NO_BREAK_HYPHEN}{se.SHY_HYPHEN}][\p{{Letter}}0-9]", "aa", xhtml, flags=regex.IGNORECASE | regex.DOTALL)
+ xhtml = regex.sub(fr"[\p{{Letter}}0-9][\-\'\,\.\/{se.NO_BREAK_HYPHEN}{se.SHY_HYPHEN}][\p{{Letter}}0-9]", "aa", xhtml)
# Replace sequential spaces with one space
- xhtml = regex.sub(r"\s+", " ", xhtml, flags=regex.IGNORECASE | regex.DOTALL)
+ xhtml = regex.sub(r"\s+", " ", xhtml)
# Get the word count
- return len(regex.findall(r"\b\w+\b", xhtml, flags=regex.IGNORECASE | regex.DOTALL))
+ return len(regex.findall(r"\b\w+\b", xhtml))
def _replace_character_references(match_object) -> str:
"""Replace most XML character references with literal characters.
@@ -660,13 +660,13 @@ def format_xhtml(xhtml: str) -> str:
xhtml = regex.sub(r"&#?\w+;", _replace_character_references, xhtml)
# Remove unnecessary doctypes which can cause xmllint to hang
- xhtml = regex.sub(r"<!DOCTYPE[^>]+?>", "", xhtml, flags=regex.DOTALL)
+ xhtml = regex.sub(r"<!DOCTYPE[^>]+?>", "", xhtml)
# Remove white space between opening/closing tag and text nodes
# We do this first so that we can still format line breaks after
# Exclude comments
xhtml = regex.sub(r"(<(?:[^!/][^>]*?[^/]|[a-z])>)\s+([^\s<])", r"\1\2", xhtml, flags=regex.IGNORECASE)
- xhtml = regex.sub(r"([^\s>])\s+([^>]+?>)", r"\1\2", xhtml, flags=regex.IGNORECASE)
+ xhtml = regex.sub(r"([^\s>])\s+([^>]+?>)", r"\1\2", xhtml)
try:
tree = _format_xml_str(xhtml)
@@ -1088,7 +1088,7 @@ def format_css(css: str) -> str:
output = regex.sub(r"(@[\p{Letter}]+) \(", "\\1(", output)
# Remove empty rules
- output = regex.sub(r"^\t*[^\{\}]+?\{\s*\}\n", "", output, flags=regex.DOTALL|regex.MULTILINE)
+ output = regex.sub(r"^\t*[^\{\}]+?\{\s*\}\n", "", output, flags=regex.MULTILINE)
return output
@@ -1103,7 +1103,7 @@ def remove_tags(text: str) -> str:
A string with all HTML tags removed
"""
- return regex.sub(r"</?[\p{Letter}]+[^>]*?>", "", text, flags=regex.DOTALL)
+ return regex.sub(r"</?[\p{Letter}]+[^>]*?>", "", text)
def get_ordinal(number: str) -> str:
"""
@@ -1296,7 +1296,7 @@ def make_url_safe(text: str) -> str:
text = regex.sub(r"['‘’`]", "", text)
# 5. Convert any non-digit, non-letter character to a space
- text = regex.sub(r"[^0-9\p{Letter}]", " ", text, flags=regex.IGNORECASE)
+ text = regex.sub(r"[^0-9\p{Letter}]", " ", text)
# 6. Convert any instance of one or more space to a dash
text = regex.sub(r"\s+", "-", text)
diff --git a/se/se_epub.py b/se/se_epub.py
index b8e55ae0..a34780a3 100644
--- a/se/se_epub.py
+++ b/se/se_epub.py
@@ -699,7 +699,7 @@ def recompose(self, output_xhtml5: bool, extra_css_file: Union[Path,None] = None
output_xhtml = output_xhtml.replace("epub|type", "data-epub-type")
output_xhtml = output_xhtml.replace("xml|lang", "lang")
output_xhtml = regex.sub(r" xmlns.+?=\".+?\"", "", output_xhtml)
- output_xhtml = regex.sub(r"@namespace (epub|xml).+?\s+", "", output_xhtml, flags=regex.MULTILINE)
+ output_xhtml = regex.sub(r"@namespace (epub|xml).+?\s+", "", output_xhtml)
# The Nu HTML5 Validator barfs if non-void elements are self-closed (like
| )
# Try to un-self-close them for HTML5 output.
diff --git a/se/se_epub_generate_toc.py b/se/se_epub_generate_toc.py
index 2c5eacb4..542e515f 100644
--- a/se/se_epub_generate_toc.py
+++ b/se/se_epub_generate_toc.py
@@ -109,7 +109,7 @@ def toc_link(self) -> str:
out_string += f"{self.title}\n"
# Replace <br/> with a single space
- out_string = regex.sub(r"<br/>\s*", " ", out_string, flags=regex.DOTALL)
+ out_string = regex.sub(r"<br/>\s*", " ", out_string)
return out_string
diff --git a/se/se_epub_lint.py b/se/se_epub_lint.py
index 89b66c8e..b87c11cb 100644
--- a/se/se_epub_lint.py
+++ b/se/se_epub_lint.py
@@ -1827,7 +1827,7 @@ def _lint_xhtml_syntax_checks(self, filename: Path, dom: se.easy_xml.EasyXmlTree
title = regex.sub(r"^[\s\.\,\!\?\:\;]*", "", title)
# Normalize whitespace
- title = regex.sub(r"\s+", " ", title, flags=regex.DOTALL).strip()
+ title = regex.sub(r"\s+", " ", title).strip()
# Do we have a subtitle? If so the first letter of that must be capitalized, so we pull that out
subtitle_matches = regex.findall(r"(.*?)(.*?)(.*?)", title, flags=regex.DOTALL)
@@ -2313,7 +2313,7 @@ def _lint_xhtml_typography_checks(filename: Path, dom: se.easy_xml.EasyXmlTree,
# Check for repeated punctuation, but first remove `&` so we don't match `&,`
# Remove tds with repeated ” as they are probably ditto marks
- matches = regex.findall(r"[,;]{2,}.{0,20}", file_contents.replace("&", "")) + regex.findall(r"(?:“\s*“|”\s*”|’ ’|‘\s*‘).{0,20}", regex.sub(r"[”\s]+?(.+?)? | ", "", file_contents)) + regex.findall(r"[\p{Letter}][,\.:;]\s[,\.:;]\s?[\p{Letter}<].{0,20}", file_contents, flags=regex.IGNORECASE)
+ matches = regex.findall(r"[,;]{2,}.{0,20}", file_contents.replace("&", "")) + regex.findall(r"(?:“\s*“|”\s*”|’ ’|‘\s*‘).{0,20}", regex.sub(r"[”\s]+?(.+?)? | ", "", file_contents)) + regex.findall(r"[\p{Letter}][,\.:;]\s[,\.:;]\s?[\p{Letter}<].{0,20}", file_contents)
if matches:
messages.append(LintMessage("t-008", "Repeated punctuation.", se.MESSAGE_TYPE_WARNING, filename, matches))
@@ -2608,7 +2608,7 @@ def _lint_xhtml_typography_checks(filename: Path, dom: se.easy_xml.EasyXmlTree,
messages.append(LintMessage("t-048", "Chapter opening text in all-caps.", se.MESSAGE_TYPE_ERROR, filename, [node.to_string() for node in nodes]))
# Check for two-em-dashes used for elision instead of three-em-dashes
- matches = regex.findall(fr"[^{se.WORD_JOINER}\p{{Letter}}”]⸺[^“{se.WORD_JOINER}\p{{Letter}}].*", file_contents, flags=regex.MULTILINE)
+ matches = regex.findall(fr"[^{se.WORD_JOINER}\p{{Letter}}”]⸺[^“{se.WORD_JOINER}\p{{Letter}}].*", file_contents)
if matches:
messages.append(LintMessage("t-049", "Two-em-dash used for eliding an entire word. Use a three-em-dash instead.", se.MESSAGE_TYPE_WARNING, filename, matches))
@@ -2934,7 +2934,7 @@ def _lint_xhtml_typo_checks(filename: Path, dom: se.easy_xml.EasyXmlTree, file_c
# Exclude paragraphs in blockquotes, which may have special quoting rules, and "continued" paragraphs, which may be continued dialog without an “
for node in dom_copy.xpath("/html/body//p[not(ancestor::blockquote) and not(contains(@class, 'continued'))]"):
node.set_attr("id", "lint-" + str(node_number))
- temp_xhtml = temp_xhtml + f"" + regex.sub(r"[\s\n]+", " ", node.inner_text(), flags=regex.DOTALL) + "\n"
+ temp_xhtml = temp_xhtml + f"
" + regex.sub(r"\s+", " ", node.inner_text()) + "\n"
node_number = node_number + 1
replacement_count = 1
@@ -2943,12 +2943,12 @@ def _lint_xhtml_typo_checks(filename: Path, dom: se.easy_xml.EasyXmlTree, file_c
(temp_xhtml, replacement_count) = regex.subn(r"“[^“]+?”", " ", temp_xhtml) # Remove all regular quotes
# Remove contractions to reduce rsquo for next regex
- temp_xhtml = regex.sub(r"[\p{Letter}]’[\p{Letter}]", " ", temp_xhtml, flags=regex.MULTILINE)
+ temp_xhtml = regex.sub(r"[\p{Letter}]’[\p{Letter}]", " ", temp_xhtml)
# Remove all runs of ldquo that are likely to spill to the next
replacement_count = 1
while replacement_count > 0:
- (temp_xhtml, replacement_count) = regex.subn(r"“[^“”]+?$", " ", temp_xhtml, flags=regex.MULTILINE)
+ (temp_xhtml, replacement_count) = regex.subn(r"“[^“”]+?$", " ", temp_xhtml)
# Match problem `‘` using regex, and if found, get the actual node text from the dom to return.
typos = []
@@ -3009,7 +3009,7 @@ def _lint_xhtml_typo_checks(filename: Path, dom: se.easy_xml.EasyXmlTree, file_c
# Check for closing rdquo without opening ldquo.
# Remove tds in case rdquo means "ditto mark"
- typos = regex.findall(r"”[^“‘]+?”", regex.sub(r"
]*?>[”\s]+?(.+?)? | ", "", file_contents), flags=regex.DOTALL)
+ typos = regex.findall(r"”[^“‘]+?”", regex.sub(r"]*?>[”\s]+?(.+?)? | ", "", file_contents))
# We create a filter to try to exclude nested quotations
# Remove tags in case they're enclosing punctuation we want to match against at the end of a sentence.
diff --git a/se/typography.py b/se/typography.py
index 11f9dc91..f9d70264 100644
--- a/se/typography.py
+++ b/se/typography.py
@@ -103,10 +103,10 @@ def typogrify(xhtml: str, smart_quotes: bool = True) -> str:
xhtml = xhtml.replace("——", "⸺")
# Smartypants doesn't do well on em dashes followed by open quotes. Fix that here
- xhtml = regex.sub(r"—”([\p{Letter}])", r"—“\1", xhtml, flags=regex.IGNORECASE)
- xhtml = regex.sub(r"—’([\p{Letter}])", r"—‘\1", xhtml, flags=regex.IGNORECASE)
- xhtml = regex.sub(r"-“", r"—”", xhtml, flags=regex.IGNORECASE)
- xhtml = regex.sub(r"‘”", fr"’{se.HAIR_SPACE}”", xhtml, flags=regex.IGNORECASE)
+ xhtml = regex.sub(r"—”([\p{Letter}])", r"—“\1", xhtml)
+ xhtml = regex.sub(r"—’([\p{Letter}])", r"—‘\1", xhtml)
+ xhtml = regex.sub(r"-“", r"—”", xhtml)
+ xhtml = regex.sub(r"‘”", fr"’{se.HAIR_SPACE}”", xhtml)
# Now that we've fixed Smartypants' output, put our quotes back in
xhtml = xhtml.replace("!#se:rsquo#!", "’")
@@ -114,7 +114,7 @@ def typogrify(xhtml: str, smart_quotes: bool = True) -> str:
# Remove spaces between en and em dashes
# Note that we match at least one character before the dashes, so that we don't catch start-of-line em dashes like in poetry.
# We do a negative lookbehind for
s from being included
- xhtml = regex.sub(r"(? str:
xhtml = xhtml.replace(se.SHY_HYPHEN, "")
# Fix some common em-dash transcription errors
- xhtml = regex.sub(r"([:;])-([\p{Letter}])", r"\1—\2", xhtml, flags=regex.IGNORECASE)
- xhtml = regex.sub(r"([\p{Letter}])-“", r"\1—“", xhtml, flags=regex.IGNORECASE)
+ xhtml = regex.sub(r"([:;])-([\p{Letter}])", r"\1—\2", xhtml)
+ xhtml = regex.sub(r"([\p{Letter}])-“", r"\1—“", xhtml)
xhtml = regex.sub(r":-", fr":{se.WORD_JOINER}—", xhtml)
# Em dashes and two-em-dashes can be broken before, so add a word joiner between letters/punctuation and the following em dash
- xhtml = regex.sub(fr"([^\s{se.WORD_JOINER}{se.NO_BREAK_SPACE}{se.HAIR_SPACE}])([—⸻])", fr"\1{se.WORD_JOINER}\2", xhtml, flags=regex.IGNORECASE)
+ xhtml = regex.sub(fr"([^\s{se.WORD_JOINER}{se.NO_BREAK_SPACE}{se.HAIR_SPACE}])([—⸻])", fr"\1{se.WORD_JOINER}\2", xhtml)
# Add en dashes; don't replace match that is within an html tag, since ids and attrs often contain the pattern DIGIT-DIGIT
xhtml = regex.sub(r"(?]*)([0-9]+)\-([0-9]+)", r"\1–\2", xhtml)
@@ -146,7 +146,7 @@ def typogrify(xhtml: str, smart_quotes: bool = True) -> str:
xhtml = regex.sub(fr"([\p{{Lowercase_Letter}}]){se.WORD_JOINER}—th\b", r"\1 —th", xhtml)
# Remove word joiners from following opening tags--they're usually never correct
- xhtml = regex.sub(fr"<([\p{{Letter}}]+)([^>]*?)>{se.WORD_JOINER}", r"<\1\2>", xhtml, flags=regex.IGNORECASE)
+ xhtml = regex.sub(fr"<([\p{{Letter}}]+)([^>]*?)>{se.WORD_JOINER}", r"<\1\2>", xhtml)
# Add a word joiner after em dashes within elements
xhtml = regex.sub(r"]*?)>—", fr"—{se.WORD_JOINER}", xhtml)
@@ -213,7 +213,7 @@ def typogrify(xhtml: str, smart_quotes: bool = True) -> str:
xhtml = regex.sub(r"(\s)‘a’(\s)", r"\1’a’\2", xhtml, flags=regex.IGNORECASE)
# Years
- xhtml = regex.sub(r"‘([0-9]{2,}[^\p{Letter}0-9’])", r"’\1", xhtml, flags=regex.IGNORECASE)
+ xhtml = regex.sub(r"‘([0-9]{2,}[^\p{Letter}0-9’])", r"’\1", xhtml)
xhtml = regex.sub(r"‘([Aa]ve|[Oo]me|[Ii]m|[Mm]idst|[Gg]ainst|[Nn]eath|[Ee]m|[Cc]os|[Tt]is|[Tt]isn’t|[Tt]was|[Tt]ain’t|[Tt]wixt|[Tt]were|[Tt]would|[Tt]wouldn|[Tt]won|[Tt]ween|[Tt]will|[Rr]ound|[Pp]on|[Uu]ns?|[Uu]d|[Cc]ept|[Oo]w|[Aa]ppen|[Ee])\b", r"’\1", xhtml)
@@ -245,34 +245,34 @@ def typogrify(xhtml: str, smart_quotes: bool = True) -> str:
xhtml = regex.sub(r"(?]*?)>{se.HAIR_SPACE}…", r"…", xhtml, flags=regex.IGNORECASE)
+ xhtml = regex.sub(r"\s*\.\s*\.\s*\.\s*", r"…", xhtml)
+ xhtml = regex.sub(fr"[\s{se.NO_BREAK_SPACE}]?…[\s{se.NO_BREAK_SPACE}]?\.", fr".{se.HAIR_SPACE}…", xhtml)
+ xhtml = regex.sub(fr"[\s{se.NO_BREAK_SPACE}]?…[\s{se.NO_BREAK_SPACE}]?", fr"{se.HAIR_SPACE}… ", xhtml)
+ xhtml = regex.sub(fr"
]*?)>{se.HAIR_SPACE}…", r"
…", xhtml)
# Remove spaces between opening tags and ellipses
- xhtml = regex.sub(fr"(<[\p{{Letter}}0-9]+[^<]+?>)[\s{se.NO_BREAK_SPACE}]+?…", r"\1…", xhtml, flags=regex.IGNORECASE)
+ xhtml = regex.sub(fr"(<[\p{{Letter}}0-9]+[^<]+?>)[\s{se.NO_BREAK_SPACE}]+?…", r"\1…", xhtml)
# Remove spaces between closing tags and ellipses
- xhtml = regex.sub(fr"…[\s{se.NO_BREAK_SPACE}]?([\p{{Letter}}0-9]+>)", r"…\1", xhtml, flags=regex.IGNORECASE)
- xhtml = regex.sub(fr"…[\s{se.NO_BREAK_SPACE}]+([\)”’])(?![\p{{Letter}}])", r"…\1", xhtml, flags=regex.IGNORECASE) # If followed by a letter, the single quote is probably a leading elision
- xhtml = regex.sub(fr"([\(“‘])[\s{se.NO_BREAK_SPACE}]+…", r"\1…", xhtml, flags=regex.IGNORECASE)
- xhtml = regex.sub(fr"…[\s{se.NO_BREAK_SPACE}]?([\!\?\.\;\,])", fr"…{se.HAIR_SPACE}\1", xhtml, flags=regex.IGNORECASE)
- xhtml = regex.sub(fr"([\!\?\.\;”’])[\s{se.NO_BREAK_SPACE}]?…", fr"\1{se.HAIR_SPACE}…", xhtml, flags=regex.IGNORECASE)
- xhtml = regex.sub(fr"\,[\s{se.NO_BREAK_SPACE}]?…", fr",{se.HAIR_SPACE}…", xhtml, flags=regex.IGNORECASE)
+ xhtml = regex.sub(fr"…[\s{se.NO_BREAK_SPACE}]?([\p{{Letter}}0-9]+>)", r"…\1", xhtml)
+ xhtml = regex.sub(fr"…[\s{se.NO_BREAK_SPACE}]+([\)”’])(?![\p{{Letter}}])", r"…\1", xhtml) # If followed by a letter, the single quote is probably a leading elision
+ xhtml = regex.sub(fr"([\(“‘])[\s{se.NO_BREAK_SPACE}]+…", r"\1…", xhtml)
+ xhtml = regex.sub(fr"…[\s{se.NO_BREAK_SPACE}]?([\!\?\.\;\,])", fr"…{se.HAIR_SPACE}\1", xhtml)
+ xhtml = regex.sub(fr"([\!\?\.\;”’])[\s{se.NO_BREAK_SPACE}]?…", fr"\1{se.HAIR_SPACE}…", xhtml)
+ xhtml = regex.sub(fr"\,[\s{se.NO_BREAK_SPACE}]?…", fr",{se.HAIR_SPACE}…", xhtml)
# Add nbsp to ellipses that open dialog
- xhtml = regex.sub(r"([“‘])…\s([\p{Letter}0-9])", fr"\1…{se.NO_BREAK_SPACE}\2", xhtml, flags=regex.IGNORECASE)
+ xhtml = regex.sub(r"([“‘])…\s([\p{Letter}0-9])", fr"\1…{se.NO_BREAK_SPACE}\2", xhtml)
# Don't use . ... if within a clause
xhtml = regex.sub(r"\.(\s…\s[\p{Lowercase_Letter}])", r"\1", xhtml)
diff --git a/se/vendor/kobo_touch_extended/kobo.py b/se/vendor/kobo_touch_extended/kobo.py
index 860b71db..ca9032fc 100644
--- a/se/vendor/kobo_touch_extended/kobo.py
+++ b/se/vendor/kobo_touch_extended/kobo.py
@@ -32,7 +32,7 @@ def append_kobo_spans_from_text(node, text):
return False
else:
# Split text in sentences
- groups = regex.split(fr'(.*?[\.\!\?\:](?:{se.HAIR_SPACE}…)?[\'"\u201d\u2019]?(?:{se.HAIR_SPACE}\u201d)?\s*)', text, flags=regex.MULTILINE)
+ groups = regex.split(fr'(.*?[\.\!\?\:](?:{se.HAIR_SPACE}…)?[\'"\u201d\u2019]?(?:{se.HAIR_SPACE}\u201d)?\s*)', text)
# Remove empty strings resulting from split()
groups = [g for g in groups if g != ""]