-
Notifications
You must be signed in to change notification settings - Fork 2
/
modifytext.py
85 lines (57 loc) · 1.98 KB
/
modifytext.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import re
def modify_en_text(text, modify_mode='BASIC', keep_emoji=True):
    """ Reformat English text as one sentence per paragraph.

    The text is optionally stripped of emoji, split into sentences and
    then rendered by the formatter selected through ``modify_mode``.

    Args:
        text(str):
            target dirty text.
        modify_mode(str):
            formatter name handed to mode_factory; "BASIC" and
            "LIST-MARK" are the names it recognises.
        keep_emoji(bool):
            when False, remove_emoji is applied to the text before it
            is split; when True the text is left as-is.
    Returns:
        text_output(str):
            formatted output.
    """
    cleaned = text if keep_emoji else remove_emoji(text)
    sentences = split_text_to_sentences_en(cleaned)
    formatter = mode_factory(modify_mode)
    return formatter(sentences)
def remove_emoji(text):
    """ Remove emoji.

    Only code points inside the four ranges listed below are deleted;
    every other character is returned unchanged.

    Args:
        text(str):
            input string.
    Returns:
        str:
            ``text`` with each run of matching characters removed.
    """
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    )
    # One character class, repeated, so a run of emoji is removed in one go.
    return re.sub("[" + emoji_ranges + "]+", '', text, flags=re.UNICODE)
def split_text_to_sentences_en(text):
    """ Split English text into sentences.

    A split happens at a single whitespace character that directly
    follows ".", "?" or "!", unless the characters before it look like a
    dotted abbreviation ("e.g.", "i.e.") or a short title ("Mr.", "Dr.").

    Args:
        text(str):
            target dirty text.
    Returns:
        sents(list):
            sentences split from the text; the whitespace character at
            each split point is dropped.
    """
    # Collapse spaced-out (LaTeX-style) ellipses: otherwise each inner dot
    # is followed by a space and would be treated as a sentence end.
    text = text.replace(" . . . ", "...")
    # Lookbehinds, in order:
    #   (?<!\w\.\w.)       not right after a dotted abbreviation like "e.g."
    #   (?<![A-Z][a-z]\.)  not right after a title like "Mr."
    #   (?<=[.?!])         right after a sentence terminator
    sents = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!])\s', text)
    return sents
def mode_factory(modify_mode):
    """ Return the formatter function for a mode name.

    Args:
        modify_mode(str):
            "BASIC" or "LIST-MARK".
    Returns:
        callable:
            mode_basic for "BASIC", mode_listmark for "LIST-MARK".
    Raises:
        ValueError:
            if modify_mode is not one of the names above.
    """
    if modify_mode == "BASIC":
        return mode_basic
    if modify_mode == 'LIST-MARK':
        return mode_listmark
    # Fail here with the offending value instead of returning None, which
    # would only surface later as "'NoneType' object is not callable".
    raise ValueError(
        "unknown modify_mode %r; expected 'BASIC' or 'LIST-MARK'"
        % (modify_mode,)
    )
def mode_basic(sents):
    """ Render sentences as plain paragraphs.

    Whitespace inside each sentence (including newlines) is collapsed to
    single spaces, and every sentence is followed by a blank line.
    Sentences that are empty after this clean-up are skipped so they do
    not produce stray blank paragraphs.

    Args:
        sents(list):
            sentences to format.
    Returns:
        text_output(str):
            formatted output; "" when there is nothing to render.
    """
    normalized = (' '.join(s.split()) for s in sents)
    # Build the result with a single join rather than repeated "+=".
    return ''.join(s + "\n\n" for s in normalized if s)
def mode_listmark(sents):
    """ Render sentences as a "- " bulleted list.

    Whitespace inside each sentence (including newlines) is collapsed to
    single spaces; each sentence is prefixed with "- " and followed by a
    blank line. Sentences that are empty after this clean-up are skipped
    so they do not produce empty bullets.

    Args:
        sents(list):
            sentences to format.
    Returns:
        text_output(str):
            formatted output; "" when there is nothing to render.
    """
    normalized = (' '.join(s.split()) for s in sents)
    # Build the result with a single join rather than repeated "+=".
    return ''.join("- " + s + "\n\n" for s in normalized if s)