diff --git a/se/formatting.py b/se/formatting.py index 50186ed5..966b839d 100644 --- a/se/formatting.py +++ b/se/formatting.py @@ -50,31 +50,34 @@ def semanticate(xhtml: str) -> str: """ # Some common abbreviations - xhtml = regex.sub(r"(?]*?\>))Mr\.", r"""Mr.""", xhtml) - xhtml = regex.sub(r"(?]*?\>))Mrs\.", r"""Mrs.""", xhtml) - xhtml = regex.sub(r"(?]*?\>))Ms\.", r"""Ms.""", xhtml) - xhtml = regex.sub(r"(?]*?\>))Dr\.", r"""Dr.""", xhtml) - xhtml = regex.sub(r"(?]*?\>))Drs\.", r"""Drs.""", xhtml) - xhtml = regex.sub(r"(?]*?\>))Prof\.", r"""Prof.""", xhtml) - xhtml = regex.sub(r"(?]*?\>))Rev\.", r"""Rev.""", xhtml) - xhtml = regex.sub(r"(?]*?\>))Hon\.", r"""Hon.""", xhtml) - xhtml = regex.sub(r"(?]*?\>))Lieut\.", r"""Lieut.""", xhtml) - xhtml = regex.sub(r"(?]*?\>))Fr\.", r"""Fr.""", xhtml) - xhtml = regex.sub(r"(?]*?\>))Lt\.", r"""Lt.""", xhtml) - xhtml = regex.sub(r"(?]*?\>))Capt\.", r"""Capt.""", xhtml) - xhtml = regex.sub(r"(?]*?\>))Pvt\.", r"""Pvt.""", xhtml) - xhtml = regex.sub(r"(?]*?\>))Esq\.", r"""Esq.""", xhtml) + xhtml = regex.sub(r"(?]*?\>))(\L\.)", r"""\1""", xhtml, titles=[ + "Capt", + "Col", + "Dr", + "Drs", + "Esq", + "Fr", + "Hon", + "Lieut", + "Lt", + "MM", + "Mdlle", + "Messers", + "Messrs", + "Mlle", + "Mlles", + "Mme", + "Mmes", + "Mon", + "Mr", + "Mrs", + "Ms", + "Prof", + "Pvt", + "Rev", + ]) xhtml = regex.sub(r"(?]*?\>))Bros\.", r"Bros.", xhtml) xhtml = regex.sub(r"(?]*?\>))Mt\.", r"Mt.", xhtml) - xhtml = regex.sub(r"(?]*?\>))MM\.", r"""MM.""", xhtml) - xhtml = regex.sub(r"(?]*?\>))Mme\.", r"""Mme.""", xhtml) - xhtml = regex.sub(r"(?]*?\>))Mmes\.", r"""Mmes.""", xhtml) - xhtml = regex.sub(r"(?]*?\>))Mon\.", r"""Mon.""", xhtml) - xhtml = regex.sub(r"(?]*?\>))Mlle\.", r"""Mlle.""", xhtml) - xhtml = regex.sub(r"(?]*?\>))Mdlle\.", r"""Mdlle.""", xhtml) - xhtml = regex.sub(r"(?]*?\>))Mlles\.", r"""Mlles.""", xhtml) - xhtml = regex.sub(r"(?]*?\>))Messrs\.", r"""Messrs.""", xhtml) - xhtml = regex.sub(r"(?]*?\>))Messers\.", r"""Messers.""", xhtml) xhtml = regex.sub(r"(?]*?\>))([Vv])ol(s?)\.", r"\1ol\2.", xhtml) xhtml = regex.sub(r"(?]*?\>))([Cc])hap\. ([0-9])", r"\1hap. \2", xhtml) # The number allows us to avoid phrases like `Hello, old chap.` xhtml = regex.sub(r"(?]*?\>)|\.)(P\.(?:P\.)?S\.(?:S\.)?\B)", r"""\1""", xhtml) @@ -83,7 +86,6 @@ def semanticate(xhtml: str) -> str: xhtml = regex.sub(r"(?]*?\>))Ltd\.", r"Ltd.", xhtml) xhtml = regex.sub(r"(?]*?\>))St\.", r"St.", xhtml) xhtml = regex.sub(r"(?]*?\>))([Gg])ov\.", r"\1ov.", xhtml) - xhtml = regex.sub(r"(?]*?\>))Col\.", r"""Col.""", xhtml) xhtml = regex.sub(r"(?]*?\>))MS(S?)\.", r"""MS\1.""", xhtml) xhtml = regex.sub(r"(?]*?\>))([Vv])iz\.", r"\1iz.", xhtml) xhtml = regex.sub(r"(?]*?\>))etc\.", r"etc.", xhtml)