lucene_object.py
import os
import lucene
from java.io import File
from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, StringField, TextField, StoredField
from org.apache.lucene.index import IndexWriter, IndexWriterConfig, DirectoryReader, Term
from org.apache.lucene.store import MMapDirectory
from org.apache.lucene.util import Version
from org.apache.lucene.queryparser.classic import ParseException, QueryParser
from org.apache.lucene.search import IndexSearcher, ScoreDoc, TopScoreDocCollector
from org.apache.lucene.search.similarities import BM25Similarity
from org.apache.lucene.search import PhraseQuery, BooleanQuery, TermQuery, BooleanClause
from org.apache.lucene.util import BytesRef, BytesRefIterator
from org.apache.lucene.search.spans import SpanQuery, SpanTermQuery, SpanNearQuery, NearSpansOrdered, NearSpansUnordered
from org.apache.lucene.search.spans import SpanScorer, SpanWeight, Spans
from org.apache.lucene.search import DocIdSetIterator
from config import LIST_F
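# Lucene_Object wraps a read-only PyLucene index: it opens an MMapDirectory,
# exposes term/document statistics (term vectors, collection term frequencies,
# document frequencies, field lengths), and counts ordered/unordered bigram
# occurrences via SpanNearQuery, optionally backed by MongoDB collections
# that act as term-frequency / collection-frequency caches.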
class Lucene_Object(object):
    lucene_vm_init=None
    index_name=None
    index_dir=None
    index_mm=None
    analyzer=None
    config=None
    reader=None
    searcher=None
    searcher2=None
    dict_term_freq=None
    dict_doc=None
    is_bigram_cache_used=None
    conn_bigram_tf_cache=None
    conn_bigram_cf_cache=None
    conn_mapping_prob_cache=None
    total_field_freq=None
    def __init__(self,LUCENE_INDEX_DIR,similarity='BM25',lucene_vm_flag=False,is_bigram_cache_used=False,mongoObj=None):
        if lucene_vm_flag==False:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
            self.lucene_vm_init=True
        self.index_dir=LUCENE_INDEX_DIR
        self.index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
        self.analyzer = StandardAnalyzer()
        self.config = IndexWriterConfig(self.analyzer)
        self.reader = DirectoryReader.open(self.index_mm)
        self.searcher = IndexSearcher(self.reader)
        self.dict_term_freq={}
        self.dict_doc_field_title={}
        if similarity=='BM25':
            (self.searcher).setSimilarity(BM25Similarity())
        # load bigram cache
        self.is_bigram_cache_used=is_bigram_cache_used
        if is_bigram_cache_used==True:
            seperate_char='/' if self.index_dir.find('/')>-1 else '\\'
            index_name=self.index_dir.split(seperate_char)[-1]
            self.index_name=index_name
            self.conn_bigram_tf_cache=mongoObj.db[index_name+'_tf_cache']
            self.conn_bigram_cf_cache=mongoObj.db[index_name+'_cf_cache']
            if 'stemmed_wikipedia' in LIST_F or 'wikipedia' in LIST_F:
                self.conn_mapping_prob_cache=mongoObj.db[index_name+'_mapping_prob_cache_with_wikipedia']
            else:
                self.conn_mapping_prob_cache=mongoObj.db[index_name+'_mapping_prob_cache']
    def getSecondarySearcher(self):
        if self.searcher2 is None:
            self.searcher2=IndexSearcher(self.reader)
        return self.searcher2
    def retrieve(self,query,field,hitsPerPage):
        querystr=query
        # build query
        q_lucene = QueryParser(field, self.analyzer).parse(querystr)
        # build searcher
        collector = TopScoreDocCollector.create(hitsPerPage)
        (self.searcher).search(q_lucene, collector)
        hits = collector.topDocs().scoreDocs
        len_hits=len(hits)
        single_query_result=[(self.searcher.doc(hits[j].doc),hits[j].doc) for j in range(len_hits)]
        return single_query_result
    def findDoc(self,title,field,is_docid_required=False):
        searcher=self.getSecondarySearcher()
        t=Term(field,title)
        query=TermQuery(t)
        docs=searcher.search(query,1)
        if docs.totalHits==0:
            if is_docid_required==True:
                return None,None
            else:
                return None
        docID=(docs.scoreDocs)[0].doc
        d=searcher.doc(docID)
        if is_docid_required==False:
            return d
        else:
            return d,docID
    def get_terms(self,docid,field):
        terms=self.reader.getTermVector(docid,field)
        te_itr=terms.iterator()
        return [brf.utf8ToString() for brf in BytesRefIterator.cast_(te_itr)]
    def clearCache(self):
        self.dict_term_freq.clear()
    def get_term_freq(self,docid,field,is_cached=False):
        if is_cached==True and (field,docid) in self.dict_term_freq:
            return self.dict_term_freq[(field,docid)]
        if len(self.dict_term_freq)>1100:
            self.dict_term_freq.clear()
        terms=self.reader.getTermVector(docid,field)
        term_freq={}
        if terms is not None:
            te_itr=terms.iterator()
            for bytesref in BytesRefIterator.cast_(te_itr):
                t=bytesref.utf8ToString()
                freq=te_itr.totalTermFreq()
                term_freq[t]=freq
        self.dict_term_freq[(field,docid)]=term_freq
        return self.dict_term_freq[(field,docid)]
    def get_coll_termfreq(self, term, field):
        """
        Returns collection term frequency for the given field.
        :param term: string
        :param field: string, document field
        :return: int
        """
        return self.reader.totalTermFreq(Term(field, term))
    def get_doc_freq(self, term, field):
        """
        Returns document frequency for the given term and field.
        :param term: string, term
        :param field: string, document field
        :return: int
        """
        return self.reader.docFreq(Term(field, term))
    def get_doc_count(self, field):
        """
        Returns number of documents with at least one term for the given field.
        :param field: string, field name
        :return: int
        """
        return self.reader.getDocCount(field)
    def get_coll_length(self, field):
        """
        Returns length of field in the collection.
        :param field: string, field name
        :return: int
        """
        return self.reader.getSumTotalTermFreq(field)
    def get_avg_len(self, field):
        """
        Returns average length of a field in the collection.
        :param field: string, field name
        """
        n = self.reader.getDocCount(field)  # number of documents with at least one term for this field
        len_all = self.reader.getSumTotalTermFreq(field)
        if n == 0:
            return 0
        else:
            return len_all / float(n)
    def get_total_field_freq(self,fields):
        """Returns total occurrences of all fields"""
        if self.total_field_freq is None:
            total_field_freq = 0
            for f in fields:
                total_field_freq += self.get_doc_count(f)
            self.total_field_freq = total_field_freq
        return self.total_field_freq
    def get_mapping_prob_cached(self,term,ordered,slop):
        if self.conn_mapping_prob_cache is not None:
            return self.conn_mapping_prob_cache.find_one({'term':term,'ordered':ordered,'slop':slop})
        else:
            return None
    def insert_mapping_prob_cached(self,term,ordered,slop,weights):
        if self.conn_mapping_prob_cache is not None:
            self.conn_mapping_prob_cache.insert({'term':term,'ordered':ordered,'slop':slop,'weights':weights})
    def get_coll_bigram_freq(self,bigram,field,ordered,slop,title,field_cache='title'):
        # try the MongoDB cache first
        if self.is_bigram_cache_used==True:
            item_tf=self.conn_bigram_tf_cache.find_one({'title':title,'bigram':bigram,'field':field,'ordered':ordered,'slop':slop})
            item_cf=self.conn_bigram_cf_cache.find_one({'bigram':bigram,'field':field,'ordered':ordered,'slop':slop})
            if item_cf is not None:
                cf=int(item_cf['value'])
                if item_tf is not None:
                    tf=int(item_tf['value'])
                else:
                    tf=0
                return (tf,cf)
            #print (bigram,field,ordered,slop,title)
            #assert item_cf is not None
            #print ('oh')
        # cache miss: count the bigram over the whole index with a SpanNearQuery
        searcher=self.getSecondarySearcher()
        SpanClauses=[]
        for term in bigram.split(' '):
            SpanClauses.append(SpanTermQuery(Term(field,term)))
        builder=SpanNearQuery.Builder(field,ordered)
        for i in range(len(SpanClauses)):
            clause=SpanClauses[i]
            builder.addClause(clause)
        builder.setSlop(slop)
        q_lucene=builder.build()
        sw=q_lucene.createWeight(searcher,False)
        list_leaves=self.reader.getContext().leaves()
        frequency=0
        doc_phrase_freq={}
        for leave in list_leaves:
            spans = sw.getSpans(leave, SpanWeight.Postings.POSITIONS)
            if spans is None:
                continue
            while spans.nextDoc()!=DocIdSetIterator.NO_MORE_DOCS:
                id=leave.reader().document(spans.docID()).get(field_cache)
                #id=leave.reader().document(spans.docID()).get('wiki_id')
                if self.is_bigram_cache_used==True:
                    item_tf=self.conn_bigram_tf_cache.find_one({'bigram':bigram,'field':field,'ordered':ordered,'title':id,'slop':slop})
                    if item_tf is not None:
                        continue
                if id not in doc_phrase_freq:
                    doc_phrase_freq[id]=0
                while spans.nextStartPosition()!=Spans.NO_MORE_POSITIONS:
                    doc_phrase_freq[id]+=1
                    frequency+=1
                if self.is_bigram_cache_used==True:
                    self.conn_bigram_tf_cache.insert({'title':id,'bigram':bigram,'field':field,'ordered':ordered,'slop':slop,'value':doc_phrase_freq[id]})
        cf=sum(doc_phrase_freq.values())
        if self.is_bigram_cache_used==True:
            self.conn_bigram_cf_cache.insert({'bigram':bigram,'field':field,'ordered':ordered,'slop':slop,'value':cf})
        tf=doc_phrase_freq.get(title,0)
        return tf,cf
        #return doc_phrase_freq
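# A minimal usage sketch (not part of the original module): the index path,
# field names, query string, and document title below are placeholders, and
# the MongoDB-backed bigram cache is left disabled so no mongoObj is needed.
if __name__ == '__main__':
    lucene_helper = Lucene_Object('/path/to/lucene_index', similarity='BM25')
    # top-10 BM25 retrieval over an assumed 'contents' field
    for doc, docid in lucene_helper.retrieve('example query', 'contents', 10):
        print(docid, doc.get('title'))
    # per-document and collection frequency of an ordered bigram within slop 8
    tf, cf = lucene_helper.get_coll_bigram_freq('example query', 'contents', True, 8, 'Some Title')
    print(tf, cf)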