-
Notifications
You must be signed in to change notification settings - Fork 2
/
tpyo_revealo.py
75 lines (63 loc) · 3 KB
/
tpyo_revealo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/usr/bin/env python3
import tempfile, zipfile
import glob, os, re, sys
import datetime
import xml.etree.ElementTree as ET
# Default locations, relative to the current working directory:
#   word_dir -> files holding the reference words
#   docm_dir -> documents to be checked
# Both are overridden together, and only when exactly two command-line
# arguments are supplied (reference word directory first, documents second).
word_dir, docm_dir = r'ref_words', r'input_doc'
if len(sys.argv) == 3:
    word_dir, docm_dir = sys.argv[1:]
def _load_ref_words(word_dir):
    """Return the set of lowercased reference words read from the files directly inside word_dir."""
    ref_words = set()
    for word_file in glob.glob(word_dir + os.sep + '*'):
        if os.path.isfile(word_file):
            # utf-8-sig reads plain UTF-8 as well, but drops a leading BOM that
            # would otherwise stay glued to the first word of the file
            with open(word_file, 'r', encoding='utf-8-sig') as wf:
                # surrounding whitespace is removed because document words are
                # split on whitespace and could never match a padded entry
                ref_words.update(line.strip().lower() for line in wf)
    return ref_words


def _collect_text_fragments(doc_file):
    """Return the text fragments of every parseable htm/xml member of the zip archive doc_file."""
    fragments = []
    try:
        archive = zipfile.ZipFile(doc_file, 'r')
    except zipfile.BadZipFile as err:
        # one damaged document must not discard the results of all the others
        print('skipping {}: {}'.format(doc_file, err), file=sys.stderr)
        return fragments
    with archive as zp, tempfile.TemporaryDirectory() as td:
        zp.extractall(td)
        rp = os.sep + '**' + os.sep
        all_files = glob.glob(td + rp + '*.*htm*', recursive=True)
        all_files.extend(glob.glob(td + rp + '*.*xml*', recursive=True))
        # dict.fromkeys removes files matched by both patterns (they would be
        # counted twice) while keeping the original processing order
        for ip_file in dict.fromkeys(all_files):
            if not os.path.isfile(ip_file):
                # a directory can match the patterns too; it has no text to parse
                continue
            try:
                xml_root = ET.parse(ip_file).getroot()
            except ET.ParseError as err:
                # html members are frequently not well-formed XML; report and move on
                print('skipping {} in {}: {}'.format(
                    os.path.relpath(ip_file, td), doc_file, err), file=sys.stderr)
                continue
            fragments.extend(ET.tostringlist(xml_root, encoding='unicode', method='text'))
    return fragments


def tpyos(word_dir, docm_dir):
    """
    Count the document words that are missing from the reference word lists.

    Reference list: each line of every regular file directly inside word_dir
    is considered a word (encoding assumed: UTF-8, a leading BOM is tolerated).
    All words are converted to lowercase, so the check is not suitable to
    detect capitalization defects.

    Documents: every regular file directly inside docm_dir whose name ends in
    .epub or .docx is unzipped into a temporary directory, then each extracted
    file whose name matches '*.*htm*' or '*.*xml*' is parsed as XML and its
    text is split into words on whitespace and em dashes. Members that are not
    well-formed XML, and archives that are not valid zip files, are reported
    on stderr and skipped. xhtml files are likely not processed correctly
    when text formatting splits a word across several elements.

    Returns a dict mapping each unknown word (as written in the document, with
    leading/trailing characters other than a-z/A-Z removed) to the number of
    times it occurs.
    """
    ref_words = _load_ref_words(word_dir)

    tpyo_words = dict()
    re_strip = re.compile(r'^[^a-zA-Z]*|[^a-zA-Z]*$')
    re_valid_words = re.compile(r'[a-zA-Z]')
    for doc_file in glob.glob(docm_dir + os.sep + '*'):
        if not (re.search(r'\.(epub|docx)$', doc_file) and os.path.isfile(doc_file)):
            continue
        for item in _collect_text_fragments(doc_file):
            for word in re.split(r'[\s—]', item):
                word = re_strip.sub('', word)
                # convert to ascii single quotes and lowercase the string for ref_words comparison
                word_l = word.translate({0x2019: 0x27}).lower()
                # words without any ascii letter (numbers, punctuation, empty) are ignored
                if re_valid_words.search(word_l) and word_l not in ref_words:
                    tpyo_words[word] = tpyo_words.get(word, 0) + 1
    return tpyo_words
# Run the scan, then write the findings into a freshly created directory
tpyo_words = tpyos(word_dir, docm_dir)

# the directory is named after the current timestamp, with spaces and colons
# replaced by underscores
log_dir = str(datetime.datetime.now()).translate(str.maketrans(' :', '__'))
os.mkdir(log_dir)
tpyo_log = log_dir + os.sep + r'tpyo_words.log'
hyph_log = log_dir + os.sep + r'hyphenated_words.log'

with open(tpyo_log, 'w', encoding='UTF-8') as tw, open(hyph_log, 'w', encoding='UTF-8') as hw:
    # one "word: count" line per entry, ordered case-insensitively;
    # words containing a hyphen are written to their own log
    for word in sorted(tpyo_words, key=str.lower):
        print('{}: {}'.format(word, tpyo_words[word]), file=hw if '-' in word else tw)