-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindexer.py
162 lines (136 loc) Β· 5.87 KB
/
indexer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
from posting import Posting
import json
from io import BytesIO
from gzip import GzipFile
import traceback
import linecache
import os
from configuration import ConfigClass
import utils
class Indexer:
def __init__(self, config):
self.inverted_idx = {}
self.postingDict = {}
self.tweet_index = {}
self.config = config
self.posting_file = []
self.inverted_idx_file = []
self.tweet_index_file = []
self.number_of_term = 0
self.num_of_file = 0
self.config = config
def set_thread(self, thread_number):
self.num_of_file = thread_number
# DO NOT MODIFY THIS SIGNATURE
# You can change the internal implementation as you see fit.
def add_new_doc(self, document):
"""
This function perform indexing process for a document object.
Saved information is captures via two dictionaries ('inverted index' and 'posting')
:param document: a document need to be indexed.
:return: -
"""
document_dictionary = document.term_doc_dictionary
most_recurrent_term = 0
unique_term = 0
number_of_word = 0
number_of_different_term = 0
create_posting_file = 1000000
"""
inverted_idx[term] = [ df , tf , pointer to the posting file (line in the text file) ]
postingDict[term] = [tweet_id, term appearance in tweet , [positional index of the term]]
tweet_index[tweet_id] = [max tf, number of words, number of unique words, number of different term]
We create a posting file every each certain amount of term
"""
# Go over each term in the doc
for term in document_dictionary.keys():
try:
# Update inverted index and posting
if term not in self.inverted_idx.keys():
self.inverted_idx[term] = [1, document_dictionary[term][0], 0]
self.postingDict[term] = []
else:
self.inverted_idx[term][0] += 1
self.inverted_idx[term][1] += document_dictionary[term][0]
if document_dictionary[term][0] > most_recurrent_term:
most_recurrent_term = document_dictionary[term][0]
if document_dictionary[term][0] == 1:
unique_term += 1
number_of_word += document_dictionary[term][0]
if term in self.postingDict:
self.postingDict[term].append((document.tweet_id, document_dictionary[term][0], document_dictionary[term][1]))
else:
self.postingDict[term] = [(document.tweet_id, document_dictionary[term][0], document_dictionary[term][1])]
self.number_of_term += 1
# if self.number_of_term == create_posting_file:
# self.create_posting_text()
# self.create_inverted_index()
# self.create_tweet_index()
# self.num_of_file += 3
# self.postingDict = {} # delete the posting file
# self.inverted_idx = {}
# self.tweet_index = {}
# self.number_of_term = 0
number_of_different_term += 1
except:
print('problem with the following key {}'.format(term[0]))
self.tweet_index[document.tweet_id] = (most_recurrent_term, number_of_word, unique_term, number_of_different_term)
def create_posting_text(self):
p = Posting(self.postingDict, self.num_of_file)
p.create_posting_file()
self.posting_file.append(p.get_posting_path())
# self.num_of_file += 3
#print(self.postingDict)
# self.postingDict = {} # delete the posting file
# self.number_of_term = 0
def create_inverted_index(self):
try:
with open("inverted_index{}.json".format(self.num_of_file), "w", encoding="cp437",errors='ignore') as fp:
json.dump(self.inverted_idx, fp)
self.inverted_idx_file.append("inverted_index{}.json".format(self.num_of_file))
except:
print(traceback.print_exc())
def create_tweet_index(self):
try:
with open("tweet_index{}.json".format(self.num_of_file), "w", encoding="cp437",errors='ignore') as fp:
json.dump(self.tweet_index, fp)
self.tweet_index_file.append("tweet_index{}.json".format(self.num_of_file))
except:
print(traceback.print_exc())
# DO NOT MODIFY THIS SIGNATURE
# You can change the internal implementation as you see fit.
def load_index(self, fn):
"""
Loads a pre-computed index (or indices) so we can answer queries.
Input:
fn - file name of pickled index.
"""
return utils.load_obj(fn)
# DO NOT MODIFY THIS SIGNATURE
# You can change the internal implementation as you see fit.
def save_index(self, fn):
"""
Saves a pre-computed index (or indices) so we can save our work.
Input:
fn - file name of pickled index.
"""
utils.save_obj(self, fn)
# feel free to change the signature and/or implementation of this function
# or drop altogether.
def _is_term_exist(self, term):
"""
Checks if a term exist in the dictionary.
"""
return term in self.inverted_idx
# feel free to change the signature and/or implementation of this function
# or drop altogether.
def get_term_posting_list(self, term):
"""
Return the posting list from the index for a term.
"""
if self._is_term_exist(term):
line = self.inverted_idx[term][2]
posting = linecache.getline('posting_file.txt', line)
term, tweet_id = posting.split(':')
result = list(eval(tweet_id[1:-2]))
return result