-
Notifications
You must be signed in to change notification settings - Fork 1
/
clean_wordlist.py
51 lines (36 loc) · 1.55 KB
/
clean_wordlist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import math
from tqdm import tqdm
def inv_tfidf(word_freq, doc_freq, total_words, total_articles):
    """Return the score of a word, currently its plain term frequency.

    Despite the name, only the term-frequency component is computed: the
    occurrence count divided by the total number of words.

    Args:
        word_freq: Number of occurrences of the word.
        doc_freq: Accepted for interface compatibility; currently unused.
        total_words: Total word count used as the denominator.
        total_articles: Accepted for interface compatibility; currently unused.

    Returns:
        ``word_freq / total_words``.
    """
    # The IDF factor, log((total_articles + 1) / (doc_freq + 1)), is
    # disabled, so the result is the term frequency alone.
    return word_freq / total_words
def term_freq(word_freq, total_words):
    """Return the term frequency of a word.

    Args:
        word_freq: Number of occurrences of the word.
        total_words: Total word count used as the denominator.

    Returns:
        ``word_freq / total_words``.
    """
    return word_freq / total_words
def clean_wordlist(wordlist, valid, total_words, total_articles):
    """Score every valid word, min-max normalise the scores, and rank them.

    Each word in ``valid`` is scored with ``inv_tfidf``. Words found in
    ``wordlist`` use ``wordlist[word][0]`` as the word frequency and
    ``wordlist[word][1]`` as the document frequency; words that are missing
    are scored as if both counts were 1.

    Args:
        wordlist: Mapping from word to an indexable whose first two items
            are passed to ``inv_tfidf`` as word and document frequency.
        valid: Iterable of candidate words to score.
        total_words: Total word count, forwarded to ``inv_tfidf``.
        total_articles: Total article count, forwarded to ``inv_tfidf``.

    Returns:
        A dict mapping each valid word to its score rescaled to [0, 1],
        ordered from highest to lowest score. Returns an empty dict when
        ``valid`` is empty. When every word has the same raw score, all
        normalised scores are 0.0.
    """
    scores = {}
    for word in tqdm(valid):
        if word in wordlist:
            entry = wordlist[word]
            word_freq, doc_freq = entry[0], entry[1]
        else:
            # Unknown words are treated as having been seen exactly once.
            word_freq, doc_freq = 1, 1
        scores[word] = inv_tfidf(word_freq, doc_freq, total_words, total_articles)

    # min()/max() raise ValueError on an empty sequence.
    if not scores:
        return {}

    lowest = min(scores.values())
    highest = max(scores.values())
    spread = highest - lowest

    if spread == 0:
        # All scores are identical; dividing by the zero range would raise
        # ZeroDivisionError, so map every word to the bottom of the scale.
        normalised = {word: 0.0 for word in scores}
    else:
        normalised = {
            word: (score - lowest) / spread for word, score in scores.items()
        }

    # sorted() is stable, so words with equal scores keep insertion order.
    ranked = sorted(normalised.items(), key=lambda item: item[1], reverse=True)
    return dict(ranked)
def clean_wordlist_alt(wordlist, valid, total_words):
    """Score every valid word by term frequency, normalise, and rank.

    Each word in ``valid`` is scored with ``term_freq``. Words found in
    ``wordlist`` use ``wordlist[word]`` directly as the word frequency;
    words that are missing are scored as if they occurred once.

    Args:
        wordlist: Mapping from word to its occurrence count.
        valid: Iterable of candidate words to score.
        total_words: Total word count, forwarded to ``term_freq``.

    Returns:
        A dict mapping each valid word to its score rescaled to [0, 1],
        ordered from highest to lowest score. Returns an empty dict when
        ``valid`` is empty. When every word has the same raw score, all
        normalised scores are 0.0.
    """
    scores = {}
    for word in tqdm(valid):
        # Unknown words are treated as having been seen exactly once.
        word_freq = wordlist[word] if word in wordlist else 1
        scores[word] = term_freq(word_freq, total_words)

    # min()/max() raise ValueError on an empty sequence.
    if not scores:
        return {}

    lowest = min(scores.values())
    highest = max(scores.values())
    spread = highest - lowest

    if spread == 0:
        # All scores are identical; dividing by the zero range would raise
        # ZeroDivisionError, so map every word to the bottom of the scale.
        normalised = {word: 0.0 for word in scores}
    else:
        normalised = {
            word: (score - lowest) / spread for word, score in scores.items()
        }

    # sorted() is stable, so words with equal scores keep insertion order.
    ranked = sorted(normalised.items(), key=lambda item: item[1], reverse=True)
    return dict(ranked)