Indexer.py
import nltk  # NLTK for natural language processing tasks
from nltk.corpus import stopwords  # Predefined list of stopwords
from nltk.tokenize import word_tokenize  # To split text into word tokens
from nltk.stem import PorterStemmer  # Rule-based stemmer that reduces words to their stems
import ujson  # Fast drop-in replacement for the json module, used throughout below

# Preprocessing the data before indexing
with open('scraper_results.json', 'r') as doc:
    scraper_results = doc.read()
# Initialize empty lists to store publication name, URL, author, and date
pubName = []
pubURL = []
pubCUAuthor = []
pubDate = []
# Load the scraped results using ujson (a list of publication records)
data_dict = ujson.loads(scraper_results)
# Print the number of publications
array_length = len(data_dict)
print(array_length)
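# Based on the keys accessed below, each scraped record is expected to look like:
# {"name": ..., "pub_url": ..., "cu_author": ..., "date": ...}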
# Separate name, URL, author, and date into different files
for item in data_dict:
    pubName.append(item["name"])
    pubURL.append(item["pub_url"])
    pubCUAuthor.append(item["cu_author"])
    pubDate.append(item["date"])
with open('pub_name.json', 'w') as f:
    ujson.dump(pubName, f)
with open('pub_url.json', 'w') as f:
    ujson.dump(pubURL, f)
with open('pub_cu_author.json', 'w') as f:
    ujson.dump(pubCUAuthor, f)
with open('pub_date.json', 'w') as f:
    ujson.dump(pubDate, f)
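# Note: the four JSON files are parallel arrays; position i in each file refers
# to the same publication, and the inverted index built below stores these positions.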
# Reopen the publication names file in read mode
with open('pub_name.json', 'r') as f:
    publication = f.read()
# Load the JSON file
pubName = ujson.loads(publication)
# Download the NLTK resources used below (stopword list and punkt tokenizer models)
nltk.download('stopwords')
nltk.download('punkt')
# Use NLTK's predefined English stopwords
stop_words = stopwords.words('english')
stemmer = PorterStemmer()
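# For reference, the Porter stemmer maps inflected forms to a common stem,
# e.g. stemmer.stem('running') -> 'run' and stemmer.stem('indexing') -> 'index'.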
pub_list_first_stem = []
pub_list = []
pub_list_wo_sc = []
print(len(pubName))
for file in pubName:
    # Split the string into tokens (words)
    words = word_tokenize(file)
    stem_word = ""
    for i in words:
        if i.lower() not in stop_words:
            stem_word += stemmer.stem(i) + " "
    pub_list_first_stem.append(stem_word)
    pub_list.append(file)
# Replace all of the characters below with spaces
special_characters = '''!()-—[]{};:'"\\, <>./?@#$%^&*_~0123456789+=’‘'''
for file in pub_list:
    word_wo_sc = ""
    if len(file.split()) == 1:
        pub_list_wo_sc.append(file)
    else:
        for a in file:
            if a in special_characters:
                word_wo_sc += ' '
            else:
                word_wo_sc += a
        # print(word_wo_sc)
        pub_list_wo_sc.append(word_wo_sc)
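# For example, a (hypothetical) title like 'Deep Learning: A 2021 Survey!'
# has its colon, digits, and exclamation mark replaced by spaces, so the
# tokenizer below effectively sees 'Deep Learning A Survey'.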
# Stemming process: stem every token that is not a stopword
pub_list_stem_wo_sw = []
for name in pub_list_wo_sc:
    words = word_tokenize(name)
    stem_word = ""
    for a in words:
        if a.lower() not in stop_words:
            stem_word += stemmer.stem(a) + ' '
    pub_list_stem_wo_sw.append(stem_word.lower())
data_dict = {}  # Inverted index: maps each stemmed token to the row numbers it appears in
# Indexing process
for a in range(len(pub_list_stem_wo_sw)):
    for b in pub_list_stem_wo_sw[a].split():
        if b not in data_dict:
            data_dict[b] = [a]
        else:
            data_dict[b].append(a)
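# A minimal sketch (not part of the original pipeline) of how the index could
# be queried: stem the query term exactly as the titles were stemmed, then
# look up the matching row numbers. 'machine' is just an example query.
# query_term = stemmer.stem('machine')           # -> 'machin'
# matching_rows = data_dict.get(query_term, [])  # indices into pub_list
# print([pub_list[i] for i in matching_rows])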
# Print the length of each list (all four should match)
print(len(pub_list_wo_sc))
print(len(pub_list_stem_wo_sw))
print(len(pub_list_first_stem))
print(len(pub_list))
# with open('publication_list.json', 'w') as f:
# ujson.dump(pub_list, f)
with open('publication_list_stemmed.json', 'w') as f:
ujson.dump(pub_list_first_stem, f)
with open('publication_indexed_dictionary.json', 'w') as f:
ujson.dump(data_dict, f)
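# A separate search script could later reload the saved index; a minimal
# (hypothetical) sketch:
# with open('publication_indexed_dictionary.json', 'r') as f:
#     index = ujson.load(f)
# print(index.get(PorterStemmer().stem('neural'), []))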