TF-IDF.py
# TF-IDF keyword extraction and extractive summarisation.
# Reads a target document and a corpus of .txt files, scores stems by TF-IDF,
# then prints the top-10 keywords and a 5-sentence summary.
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter
import glob
import os
import math as mt
import sys

sys.stdout.reconfigure(encoding='utf-8')
def read_file(fname):
    """Return the full text of a UTF-8 encoded file."""
    with open(fname, "r", encoding='utf-8') as f:
        return f.read()
def calculate_stems_document(document, snow_stemmer):
    """Tokenise a document and return the stems of its alphanumeric tokens."""
    words = word_tokenize(document)
    return [snow_stemmer.stem(w) for w in words if w.isalnum()]
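
# For example, calculate_stems_document("Cats running quickly", snow_stemmer)
# should yield roughly ['cat', 'run', 'quick'] with the English Snowball
# stemmer (NLTK's stemmer lowercases each token before stemming).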
def calculate_Tf_Idf(term_freq, docum_freq, N):
    """Score each stem of the target document as tf * log(N / df).

    A stem the corpus has never seen (df == 0) gets a zero score instead of
    raising ZeroDivisionError, which can happen when the target document is
    not itself part of the corpus.
    """
    Tf_Idf = {}
    for t, tf in term_freq.items():
        df = docum_freq[t]
        Tf_Idf[t] = mt.log(N / df) * tf if df else 0.0
    return Tf_Idf
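
# Worked example (illustrative numbers, not from the code): with N = 4 corpus
# documents, a stem with tf = 3 in the target document and df = 2 in the
# corpus scores log(4/2) * 3 ≈ 2.08 (mt.log is the natural logarithm).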
def calculate_sentence_scores(document, Tf_Idf, snow_stemmer):
    """Score each sentence by the sum of its 10 highest-scoring stems."""
    scores = []
    for i, sent in enumerate(sent_tokenize(document)):
        stem_words = calculate_stems_document(sent, snow_stemmer)
        sent_scores = [Tf_Idf[w] for w in stem_words]
        scores.append((sent, i, sum(sorted(sent_scores, reverse=True)[:10])))
    return scores
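
# Note: the Tf_Idf lookup above cannot miss, because Tf_Idf and the sentence
# stems are both derived from the same document with the same tokeniser and
# stemmer.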
if __name__ == "__main__":
    # Two lines of input: the corpus directory, then the target document path.
    corpus_path = input()
    file_input = input()
    snow_stemmer = SnowballStemmer('english')

    # Term frequencies of the target document's stems.
    document = read_file(file_input)
    stem_words = calculate_stems_document(document, snow_stemmer)
    term_freq = Counter(stem_words)

    # Document frequencies: count each stem at most once per corpus file.
    corpus_file_paths = glob.glob(os.path.join(corpus_path, '**/*.txt'), recursive=True)
    docum_freq = Counter()
    for path in corpus_file_paths:
        docum_freq.update(set(calculate_stems_document(read_file(path), snow_stemmer)))
    N = len(corpus_file_paths)

    Tf_Idf = calculate_Tf_Idf(term_freq, docum_freq, N)

    # Top-10 keywords: highest TF-IDF first, ties broken alphabetically.
    sorted_tf = sorted(Tf_Idf.items(), key=lambda x: (-x[1], x[0]))[:10]
    keywords = [t for t, _ in sorted_tf]

    # Summary: the 5 best-scoring sentences, restored to document order.
    scores = calculate_sentence_scores(document, Tf_Idf, snow_stemmer)
    top_sentences = sorted(scores, key=lambda x: -x[2])[:5]
    top_sentences = sorted(top_sentences, key=lambda x: x[1])
    summary = [s for s, _, _ in top_sentences]

    print(*keywords, sep=", ")
    print(*summary, sep=" ")
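
# Usage sketch (assumed invocation; paths are hypothetical):
#   $ python TF-IDF.py
#   corpus/            <- first input line: directory searched for **/*.txt
#   corpus/target.txt  <- second input line: document to summarise
# Expected output: one line with the 10 top-TF-IDF stems separated by ", ",
# then one line with the 5 best sentences in document order separated by
# spaces.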