-
Notifications
You must be signed in to change notification settings - Fork 0
/
noname_db_gen.py
38 lines (31 loc) · 862 Bytes
/
noname_db_gen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import nltk, config, city_db
# Purpose: generate a word-list file by removing people's first names from the full English dictionary.
# Path of the generated word list: the full dictionary minus every entry
# that also appears in the male or female name lists.
OUTPUT = "nonames.txt"


def _load_words(path):
    """Return the set of stripped, lower-cased lines read from *path*."""
    with open(path) as fp:
        return {line.strip().lower() for line in fp}


# load names database
male = _load_words(config.MALE_NAME_FILE)
female = _load_words(config.FEMALE_NAME_FILE)

# load dictionary
dictionary = _load_words(config.FULL_EN_DICTIONARY)

# Drop every name that is also a dictionary word. A name present in both
# name lists is removed (and counted) only once, because the second lookup
# no longer finds it in the dictionary.
i = 0
for names in (male, female):
    for key in names:
        if key in dictionary:
            dictionary.remove(key)
            # %-formatting keeps the printed text identical on Python 2 and 3.
            print("removed %s" % key)
            i += 1

# Write the surviving words in sorted order, one per line. The with-block
# guarantees the file is flushed and closed even if a write fails.
with open(OUTPUT, "w") as fp:
    for w in sorted(dictionary):
        # Entries are already stripped, so an empty string is what a blank
        # input line turns into; skip it.
        if w:
            fp.write(w + "\n")

print("removed words total = %d" % i)
print("done.")