Skip to content

Commit

Permalink
word-count: Don't count no-break-hyphens as a word boundary
Browse files Browse the repository at this point in the history
  • Loading branch information
acabal committed May 11, 2024
1 parent 26bc937 commit f33a86c
Showing 1 changed file with 1 addition and 1 deletion.
2 changes: 1 addition & 1 deletion se/formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,7 +363,7 @@ def get_word_count(xhtml: str) -> int:
xhtml = regex.sub(r"[…–—― ‘’“”\{\}\(\)]", " ", xhtml, flags=regex.IGNORECASE | regex.DOTALL)

# Remove word-connecting dashes, apostrophes, commas, periods, and slashes (and/or); they count as a word boundary but they shouldn't
xhtml = regex.sub(r"[\p{Letter}0-9][\-\'\,\.\/][\p{Letter}0-9]", "aa", xhtml, flags=regex.IGNORECASE | regex.DOTALL)
xhtml = regex.sub(fr"[\p{{Letter}}0-9][\-\'\,\.\/{se.NO_BREAK_HYPHEN}{se.SHY_HYPHEN}][\p{{Letter}}0-9]", "aa", xhtml, flags=regex.IGNORECASE | regex.DOTALL)

# Replace sequential spaces with one space
xhtml = regex.sub(r"\s+", " ", xhtml, flags=regex.IGNORECASE | regex.DOTALL)
Expand Down

0 comments on commit f33a86c

Please sign in to comment.