From f33a86cf7a759fc0205cd8d5a66fb3588d6dd7d3 Mon Sep 17 00:00:00 2001 From: Alex Cabal Date: Sat, 11 May 2024 11:50:04 -0500 Subject: [PATCH] word-count: Don't count no-break-hyphens as a word boundary --- se/formatting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/se/formatting.py b/se/formatting.py index a5968e06..50186ed5 100644 --- a/se/formatting.py +++ b/se/formatting.py @@ -363,7 +363,7 @@ def get_word_count(xhtml: str) -> int: xhtml = regex.sub(r"[…–—― ‘’“”\{\}\(\)]", " ", xhtml, flags=regex.IGNORECASE | regex.DOTALL) # Remove word-connecting dashes, apostrophes, commas, and slashes (and/or), they count as a word boundry but they shouldn't - xhtml = regex.sub(r"[\p{Letter}0-9][\-\'\,\.\/][\p{Letter}0-9]", "aa", xhtml, flags=regex.IGNORECASE | regex.DOTALL) + xhtml = regex.sub(fr"[\p{{Letter}}0-9][\-\'\,\.\/{se.NO_BREAK_HYPHEN}{se.SHY_HYPHEN}][\p{{Letter}}0-9]", "aa", xhtml, flags=regex.IGNORECASE | regex.DOTALL) # Replace sequential spaces with one space xhtml = regex.sub(r"\s+", " ", xhtml, flags=regex.IGNORECASE | regex.DOTALL)