Skip to content

Commit

Permalink
Fixed various bugs parsing indexed content
Browse files Browse the repository at this point in the history
  • Loading branch information
romanchyla committed May 26, 2020
1 parent 1363a91 commit cffa6d3
Showing 1 changed file with 35 additions and 11 deletions.
46 changes: 35 additions & 11 deletions scripts/parse_index.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
def parse(lines, docid):
import sys
import pprint

def parse(lines, docid, fvalue):
out = [[] for x in range(1000)]
seqs = ['field', 'term', 'doc', 'freq', 'pos']
ix = 0
Expand All @@ -11,28 +14,41 @@ def parse(lines, docid):
if l == '':
i+= 1
continue
if ix > 3:
print 'processing', l, term
#print l
parts = l.split(' ', 1)
if len(parts) <= 1:
i+= 1
continue
key, value = parts[0], parts[1]


if key == seqs[ix]:
if key == 'field':
if value == 'title':
if value == fvalue:
ix += 1
else:
ix = 0
elif key == 'term':
term = value
ix += 1
elif key == 'doc':
doc = value
if value != docid:
j = i
while j+1 < len(lines):
k = lines[j+1].strip().split(' ', 1)[0]
if k == 'pos' or k == 'freq':
i+=1
j+=1
elif k == 'term':
i += 1
ix -= 1
break
else:
break
i += 1
ix -= 1
#ix -= 1
continue
doc = value
ix += 1
elif key == 'freq':
freq = value
Expand All @@ -43,16 +59,19 @@ def parse(lines, docid):
maxpos = pos
out[pos].append(term)
j = i
print 'adding', term, 'position', pos
#print 'adding', term, 'position', pos
while j+1 < len(lines) and lines[j+1].strip().split(' ', 1)[0] == 'pos':
pos = int(lines[j+1].strip().split(' ', 1)[1])
if pos > maxpos:
maxpos = pos
out[pos].append(term)
print 'adding', term, 'position', pos
#print 'adding', term, 'position', pos
j += 1
i = j
ix = 1
i += 1

ix -= 3
term = None

i += 1


Expand Down Expand Up @@ -87,4 +106,9 @@ def parse(lines, docid):
(17, []),
(18, []),
(19, [])]
"""
"""

if __name__ == '__main__':
    # Command-line entry point: <script> <index_file> <docid> <field>
    # argv[1] is the file to read; argv[2] and argv[3] are handed to parse()
    # unchanged (as strings) as its docid and fvalue arguments.
    if len(sys.argv) > 3:
        # Use a context manager so the file handle is closed deterministically.
        with open(sys.argv[1], 'r') as index_file:
            lines = index_file.read().split('\n')
        pprint.pprint(list(enumerate(parse(lines, sys.argv[2], sys.argv[3]))))
    elif len(sys.argv) > 1:
        # Some, but not all, arguments were supplied: report usage instead of
        # failing with an IndexError on sys.argv[2] / sys.argv[3].
        sys.stderr.write('usage: %s <index_file> <docid> <field>\n' % sys.argv[0])
        sys.exit(1)

0 comments on commit cffa6d3

Please sign in to comment.