-
Notifications
You must be signed in to change notification settings - Fork 1
/
search_space_utils.py
95 lines (85 loc) · 3.9 KB
/
search_space_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# Import relevant packages
import time
import faiss
import numpy as np
import pandas as pd
import gensim
from preprocessing_utils import *
class InvalidFormatException(Exception):
    """Signals that a supplied model is not in the required model format."""
def test_model(model):
    """
    Check that `model` is a gensim Word2Vec model.

    Parameters
    ----------
    :param model: the object to validate.

    Returns
    -------
    None when the model is accepted.

    Raises
    ------
    InvalidFormatException
        If `model` is not an instance of gensim.models.word2vec.Word2Vec.
    """
    # isinstance (rather than an exact type comparison) also accepts
    # subclasses of Word2Vec, which still satisfy the required format.
    if not isinstance(model, gensim.models.word2vec.Word2Vec):
        error_message = "Model Format does not conform with the required format. \nThe required model format is gensim.models.word2vec.Word2Vec"
        raise InvalidFormatException(error_message)
class Optimized_search():
    """
    Document retrieval for a single query using a FAISS inner-product index
    built over L2-normalised embeddings.
    """

    def __init__(self, query, docs_embeddings, w2v_model, docs_df, top_k):
        """
        Object Oriented implementation of a FAISS optimized document retrieval system.

        Parameters
        ----------
        :param query: user query for which we intend to make a search.
        :param docs_embeddings: set of all document embeddings from which we intend to retrieve a document.
        :param w2v_model: The word2vec model used to train the embedding representation.
        :param docs_df: A dataframe containing all the documents from which we intend to make a search.
        :param top_k: maximum number of relevant documents expected to be retrieved in a search.

        Raises
        ------
        Whatever `test_model` raises when it rejects `w2v_model`.
        """
        self.query = query
        self.docs_embeddings = docs_embeddings
        self.docs_df = docs_df
        self.top_k = top_k
        # Verify the model format; test_model signals a bad model by raising,
        # so reaching the next line means the model was accepted.
        test_model(w2v_model)
        self.model = w2v_model

    @staticmethod
    def _as_query_matrix(raw_embedding):
        """Return `raw_embedding` as an array shaped for a batched index search.

        A 1-D vector becomes a single-row matrix, a 3-D array has its axis 1
        squeezed out, and any other rank is returned unchanged.
        """
        query_embedding = np.asarray(raw_embedding)
        if query_embedding.ndim == 1:
            query_embedding = query_embedding.reshape(1, -1)
        elif query_embedding.ndim == 3:
            query_embedding = query_embedding.squeeze(1)
        return query_embedding

    def optimized_similarity_search(self, query_embedding, embedding_dim):
        """
        Perform fast query-document pair normalized cosine similarity search.

        Parameters
        ----------
        :param query_embedding: The embedding representation of the user defined query (one row per query).
        :param embedding_dim: The embedding dimension (should be the same for both documents and queries).

        Returns
        -------
        Array of document indices for the first query row, as ranked by the
        index search, with padding entries removed.
        """
        index = faiss.index_factory(embedding_dim, "Flat", faiss.METRIC_INNER_PRODUCT)
        # Inner product over L2-normalised vectors is cosine similarity.
        # normalize_L2 is called for its effect on the array it is given (no
        # return value is used), so both embeddings arrays are modified here.
        faiss.normalize_L2(self.docs_embeddings)
        index.add(self.docs_embeddings)
        faiss.normalize_L2(query_embedding)
        # Begin search
        _, neighbour_ids = index.search(query_embedding, self.top_k)
        # Get ranked results for the first (only) query row.
        ranked = np.asarray(neighbour_ids[0])
        # FAISS fills missing neighbours with -1 when top_k exceeds the number
        # of indexed vectors; drop them so they are never used as positions.
        # To tally the results, the similarities can be compared with
        # `1 - scipy.spatial.distance.cosine(query, document)`.
        return ranked[ranked >= 0]

    def fetch_document_info(self, dataframe_idx):
        """
        Document Retrieval: Fetch metadata of one relevant document using an index returned by `optimized_similarity_search`.

        Parameters
        ----------
        :param dataframe_idx: positional index into `docs_df` of the document to fetch.

        Returns
        -------
        dict with keys 'docid' and 'document' (taken from the 'docid' and 'body' columns).
        """
        info = self.docs_df.iloc[dataframe_idx]
        return {'docid': info['docid'], 'document': info['body']}

    def search(self):
        """
        Perform document retrieval and search based on the given query.

        Returns
        -------
        list of metadata dicts (see `fetch_document_info`), in ranked order.
        """
        start_time = time.time()
        query_embedding = self._as_query_matrix(process_query(self.query, self.model))
        embedding_dim = query_embedding.shape[1]
        ranked_indices = self.optimized_similarity_search(query_embedding, embedding_dim)
        print('>>>> Total time taken to perform search: {}'.format(time.time() - start_time))
        return [self.fetch_document_info(index) for index in ranked_indices]