query.py
from argparse import ArgumentParser
from nltk.stem import PorterStemmer
import pandas as pd
import re
import numpy as np
from itertools import islice
import statistics
from operator import itemgetter
OFFSET = 'Offset'
FREQUENCY = 'Frequency'
DOC_OCCURRENCES = 'Document Occurrences'
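# k1, k2 and b are the BM25 parameters used in okapi_bm25; gamma is the
# document/collection mixing weight used in jelinek_mercer_smoothing.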
k1 = 1.2
k2 = 500
b = 0.75
gamma = 0.6
########################################################################################################################
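# Note on the index layout (inferred from the code below): term_info.txt stores the byte offset of each
# term's line in term_index.txt, and each such line is assumed to hold the term id followed by
# tab-separated "doc:position" pairs, with doc ids delta-encoded (see create_count_vectors).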
def get_term_posting(term):
    terms_index.seek(int(term_info.loc[term, OFFSET]))
    term_posting = terms_index.readline().rstrip().split('\t')
    # parse the posting list of the term into an n x 2 numpy array, where n is the
    # total number of occurrences of the term in the corpus
    return np.array([[int(doc), int(pos)] for doc, pos in
                     [x.split(':') for x in islice(term_posting, 1, len(term_posting))]])


def query_preprocessing(query):
    # lowercase, tokenize, stem and remove stop words
    query = [term for term in
             [stemmer.stem(term) for term in re.split(r'\W+', query.lower())]
             if term not in stop_words]
    # convert each term to its term id; out-of-vocabulary terms are skipped
    q = []
    for term in query:
        try:
            q.append(term_ids[term])
        except KeyError:
            pass
    return q


########################################################################################################################
def create_count_vectors(query):
    features = list(set(query))
    query_vector = np.ones(len(features), np.int64)
    documents_vectors = {}
    for index, feature in enumerate(features):
        term_posting = get_term_posting(feature)
        try:
            documents_vectors[str(term_posting[0, 0])][index] += 1
        except KeyError:
            documents_vectors[str(term_posting[0, 0])] = np.zeros(len(features), dtype=np.int64)
            documents_vectors[str(term_posting[0, 0])][index] = 1
        for i in range(1, len(term_posting)):
            # doc ids in the posting list appear to be delta-encoded; the running sum recovers absolute ids
            term_posting[i, 0] += term_posting[i - 1, 0]
            try:
                documents_vectors[str(term_posting[i, 0])][index] += 1
            except KeyError:
                documents_vectors[str(term_posting[i, 0])] = np.zeros(len(features), dtype=np.int64)
                documents_vectors[str(term_posting[i, 0])][index] = 1
    return query_vector, documents_vectors, features


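# okapi-tf weighting, as implemented below: tf / (tf + 0.5 + 1.5 * doc_len / AVG_DOC_LEN)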
def get_okapi_tf_vector(vector, doc_len): return vector / (vector + 0.5 + 1.5 * doc_len / AVG_DOC_LEN)
########################################################################################################################
def okapi_tf(query):
    query_vector, doc_vectors, features = create_count_vectors(query)
    # creating okapi-tf vectors of query and documents
    query_vector = get_okapi_tf_vector(query_vector, np.sum(query_vector))
    doc_vectors = {doc: get_okapi_tf_vector(doc_vector, doc_lengths[int(doc)])
                   for doc, doc_vector in doc_vectors.items()}
    # finding cosine similarity scores of query with documents
    query_vector_len = np.sqrt(query_vector.dot(query_vector))
    return {doc: query_vector.dot(doc_vector) / (query_vector_len * np.sqrt(doc_vector.dot(doc_vector)))
            for doc, doc_vector in doc_vectors.items()}


def okapi_tf_idf(query):
    query_vector, doc_vectors, features = create_count_vectors(query)
    # log_d_by_df is log(D / df(i)) for each feature i
    df = np.array([int(term_info.loc[feature, DOC_OCCURRENCES]) for feature in features])
    log_d_by_df = np.log10(DOC_COUNT / df)
    # creating okapi-tf-idf vectors of query and documents
    query_vector = get_okapi_tf_vector(query_vector, np.sum(query_vector)) * log_d_by_df
    doc_vectors = {doc: get_okapi_tf_vector(doc_vector, doc_lengths[int(doc)]) * log_d_by_df
                   for doc, doc_vector in doc_vectors.items()}
    # finding cosine similarity scores of query with documents
    query_vector_len = np.sqrt(query_vector.dot(query_vector))
    return {doc: query_vector.dot(doc_vector) / (query_vector_len * np.sqrt(doc_vector.dot(doc_vector)))
            for doc, doc_vector in doc_vectors.items()}


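# BM25 scoring as implemented below, with K = k1 * (1 - b + b * len(d) / AVG_DOC_LEN):
#   score(d) = sum over query terms i of
#       log10((D + 0.5) / (df(i) + 0.5)) * ((k1 + 1) * tf(d, i) / (K + tf(d, i))) * ((k2 + 1) * tf(q, i) / (k2 + tf(q, i)))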
def okapi_bm25(query):
    query_vector, doc_vectors, features = create_count_vectors(query)
    # log_d_by_df is log((D + 0.5) / (df(i) + 0.5)) for each feature i
    df = np.array([int(term_info.loc[feature, DOC_OCCURRENCES]) for feature in features])
    log_d_by_df = np.log10((DOC_COUNT + 0.5) / (df + 0.5))
    # k1 * (1 - b + b * doc_lengths[int(doc)] / AVG_DOC_LEN) is K
    return {doc: np.sum(log_d_by_df
                        * ((1 + k1) * doc_vector / (k1 * (1 - b + b * doc_lengths[int(doc)] / AVG_DOC_LEN) + doc_vector))
                        * ((1 + k2) * query_vector) / (k2 + query_vector))
            for doc, doc_vector in doc_vectors.items()}


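# Jelinek-Mercer smoothed language model as implemented below, where cf(i) is the corpus frequency of term i:
#   score(d) = product over query terms i of (gamma * tf(d, i) / len(d) + (1 - gamma) * cf(i) / DOC_LEN_SUM)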
def jelinek_mercer_smoothing(query):
    query_vector, doc_vectors, features = create_count_vectors(query)
    # e_tf[i] is the corpus frequency of feature i, i.e. the sum of tf(d, i) over all documents d
    e_tf = np.array([int(term_info.loc[feature, FREQUENCY]) for feature in features])
    return {doc: np.prod(gamma * doc_vector / doc_lengths[int(doc)] + (1 - gamma) * e_tf / DOC_LEN_SUM)
            for doc, doc_vector in doc_vectors.items()}


########################################################################################################################
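# Example invocation (a sketch; assumes the index files docids.txt, doc_lengths.txt, termids.txt,
# term_info.txt, term_index.txt and stoplist.txt are in the working directory):
#   python query.py --score BM25 --query "example search query"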
parser = ArgumentParser()
parser.add_argument('--score', dest='score', help='name of the scoring function (TF, TF-IDF, BM25 or JM)',
                    metavar='SCORE', required=True)
parser.add_argument('--query', dest='query', help='search query',
                    metavar='QUERY', required=True)
options = parser.parse_args()
score_function = options.score.upper()
if score_function == 'TF':
    score_function = okapi_tf
elif score_function == 'TF-IDF':
    score_function = okapi_tf_idf
elif score_function == 'BM25':
    score_function = okapi_bm25
elif score_function == 'JM':
    score_function = jelinek_mercer_smoothing
else:
    print('Please select a valid score function')
    exit(-1)

doc_ids = pd.read_csv('docids.txt', sep='\t', dtype=str, header=None, index_col=0).to_dict()[1]
doc_lengths = pd.read_csv('doc_lengths.txt', sep='\t', dtype=int, header=None, index_col=0).to_dict()[1]
term_ids = pd.read_csv('termids.txt', encoding='utf8', sep='\t', dtype=str, header=None, index_col=1).to_dict()[0]
term_info = pd.read_csv('term_info.txt', sep='\t', dtype=str, header=None, names=(OFFSET, FREQUENCY, DOC_OCCURRENCES),
                        index_col=0)
terms_index = open('term_index.txt', encoding='cp1252')
with open('stoplist.txt') as f:
    stop_words = f.read().split('\n')

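# corpus-level statistics: average document length, number of documents and total corpus length
# (DOC_LEN_SUM serves as the collection length in Jelinek-Mercer smoothing)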
AVG_DOC_LEN = statistics.mean(doc_lengths.values())
DOC_COUNT = len(doc_lengths)
DOC_LEN_SUM = AVG_DOC_LEN * DOC_COUNT
stemmer = PorterStemmer()
# extend the stop word list with the stemmed forms of the stop words and with single letters
stop_words = set(stop_words + [stemmer.stem(stop_word) for stop_word in stop_words]) | {
    'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'}
stop_words.add('')
document_scores = score_function(query_preprocessing(options.query))
doc_score_pairs = [(doc_ids[int(doc)], score) for doc, score in document_scores.items()]
doc_score_pairs.sort(key=itemgetter(1), reverse=True)
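# output: one line per retrieved document with the document identifier from docids.txt,
# its rank, its score and the name of the scoring function used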
for rank, doc_score_pair in enumerate(doc_score_pairs):
    print(doc_score_pair[0], rank + 1, doc_score_pair[1], score_function.__name__)
terms_index.close()