forked from eBrevia/CharSplit
-
Notifications
You must be signed in to change notification settings - Fork 0
/
doc_split.py
executable file
·163 lines (147 loc) · 5.22 KB
/
doc_split.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#!/usr/bin/python3
# Copyright eBrevia.com 2019
"""Document-level splitting module."""
import logging
import re
import sys
import doc_config
import char_split
# pylint: disable=invalid-name
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
DE_LETTER = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' +
'abcdefghijklmnopqrstuvwxyz' +
'ÄÖÜäöüß')
DE_WORD_PAT = re.compile(r'\b[%s]+\b' % (DE_LETTER))
GLUES = ['ens', 'en', 'er', 'es', 'e', 'n', 's']
MIDDLE_DOT = '\N{MIDDLE DOT}'
# Caches
KNOWN_WORDS = set()
ALREADY_SPLIT_WORDS = {}
def find_root(word):
"""Looks in KNOWN_WORDS; if that fails, peels off German compound glue.
If nothing works, return None.
"""
if word in KNOWN_WORDS:
return word
# might not actually be a noun
not_noun = word.lower()
if not_noun in KNOWN_WORDS:
return not_noun
# do glue processing
for glue in GLUES:
if word.endswith(glue):
root_length = len(word) - len(glue)
root = word[:root_length]
if root in KNOWN_WORDS:
return word
return None
def get_best_split(word):
"""First look up the word, and if found, return it.
Then get the best available split of a word into 2 words.
If score of best is less than 0, no split is available.
If no split is available, the two words are the same; return one.
"""
root = find_root(word)
if root:
return [root]
candidate = char_split.split_compound(word)[0]
if candidate[0] <= 0:
return [word]
if candidate[1] == candidate[2]:
return [candidate[1]]
return [candidate[1], candidate[2]]
def maximal_split(word, de_dict=doc_config.DEFAULT_DICT):
"""Recursively split a single word into a list of words.
Only the first result is split further, as CharSplit divides
compounds into non-final and final parts.
Try to avoid splitting words other than nouns,
as false positives are too likely.
"""
# This is an entry point, so load the dictionary just in case.
load_known_words(de_dict)
# Do not split a non-noun
if not word[0].isupper():
return [word]
word_list = get_best_split(word)
# Binary splitter was unable to split
if len(word_list) == 1:
return word_list
# If split product is too short, ignore the split
if len(word_list[0]) < 4 or len(word_list[1]) < 4:
return [word]
# Recursively split the non-head and prepend it to the head
return maximal_split(word_list[0]) + [word_list[1]]
def load_known_words(de_dict=doc_config.DEFAULT_DICT):
"""Load the dictionary into KNOWN_WORDS."""
if KNOWN_WORDS:
return # already loaded
if de_dict is None:
de_dict = doc_config.DEFAULT_DICT
with open(de_dict) as file:
for word in file:
if not (word == '' or word.startswith('#')):
KNOWN_WORDS.add(word.strip())
# print('%d known words loaded\n' % (len(KNOWN_WORDS)), file=sys.stderr)
logger.info('%d known words loaded', len(KNOWN_WORDS))
def maximal_split_str(word, de_dict=None):
"""Maximally split a word and return it with middle dots."""
# This is an entry point, so load the dictionary just in case.
load_known_words(de_dict)
# if memoized, don't split
try:
return ALREADY_SPLIT_WORDS[word]
except KeyError:
pass
upper_case = word[0].isupper()
result_list = [w.lower() for w in maximal_split(word)]
result_str = MIDDLE_DOT.join(result_list)
if upper_case:
result0 = result_str[0].upper()
else:
result0 = result_str[0]
result = result0 + result_str[1:]
ALREADY_SPLIT_WORDS[word] = result
return result
def doc_split(doc, de_dict=None, result_map=None):
"""Split a whole document (a string) using the specified dictionary.
Return the whole document with splitting dots.
"""
if doc == '':
return ''
# This is an entry point, so load the dictionary just in case.
load_known_words(de_dict)
result = []
if result_map is not None:
result_map.clear()
windexes = [(mobj.start(), mobj.end()) for mobj in DE_WORD_PAT.finditer(doc)]
# Non-word before the first word (OK if empty)
result.append(doc[:windexes[0][0]])
# pylint: disable=consider-using-enumerate
for i in range(0, len(windexes)):
# Add a split word and the following non-word (OK if empty)
(start, end) = windexes[i]
unsplit_word = doc[start:end]
split_word = maximal_split_str(unsplit_word)
result.append(split_word)
if result_map is not None and MIDDLE_DOT in split_word:
result_map[unsplit_word] = split_word
if i == len(windexes) - 1:
next_start = len(doc)
else:
next_start = windexes[i + 1][0]
result.append(doc[end:next_start])
return ''.join(result)
def main():
"""Read whole document from stdin, output in maximally split format.
Usage: ./doc_split.py dict
"""
if len(sys.argv) > 1:
de_dict = sys.argv[1]
else:
de_dict = doc_config.DEFAULT_DICT
input_str = sys.stdin.read()
output_str = doc_split(input_str, de_dict)
sys.stdout.write(output_str)
if __name__ == "__main__":
main()