Skip to content

Commit

Permalink
optimization: avoiding storing of bitmaps when vector is in a single document (#181)
Browse files Browse the repository at this point in the history

* avoiding bitmaps when the vector is present only in 1 doc

* minor refactor of the code

* code comment
  • Loading branch information
Thejas-bhat authored Nov 17, 2023
1 parent 6b9b047 commit 0296d71
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 17 deletions.
30 changes: 22 additions & 8 deletions faiss_vector_posting.go
Original file line number Diff line number Diff line change
Expand Up @@ -309,19 +309,33 @@ func (sb *SegmentBase) SimilarVectors(field string, qVector []float32, k int64,
vecID, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64])
pos += n

bitMapLen, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64])
numDocs, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64])
pos += n

roaringBytes := sb.mem[pos : pos+int(bitMapLen)]
pos += int(bitMapLen)

bitMap := roaring.NewBitmap()
_, err := bitMap.FromBuffer(roaringBytes)
if err != nil {
return nil, err

// if the number of docs is more than one, load the bitmap containing the
// docIDs; otherwise use the optimized format where the single docID is
// varint encoded.
if numDocs > 1 {
bitMapLen, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64])
pos += n

roaringBytes := sb.mem[pos : pos+int(bitMapLen)]
pos += int(bitMapLen)

_, err := bitMap.FromBuffer(roaringBytes)
if err != nil {
return nil, err
}

vecDocIDMap[int64(vecID)] = bitMap.ToArray()
continue
}
docID, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64])
pos += n

vecDocIDMap[int64(vecID)] = bitMap.ToArray()
vecDocIDMap[int64(vecID)] = []uint32{uint32(docID)}
}

vecIndex, err := faiss.ReadIndexFromBuffer(indexBytes, faiss.IOFlagReadOnly)
Expand Down
61 changes: 52 additions & 9 deletions section_faiss_vector_index.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,16 +131,25 @@ LOOP:
vecID, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64])
pos += n

bitMapLen, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64])
numDocs, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64])
pos += n

roaringBytes := sb.mem[pos : pos+int(bitMapLen)]
pos += int(bitMapLen)

bitMap := roaring.NewBitmap()
_, err := bitMap.FromBuffer(roaringBytes)
if err != nil {
return err
if numDocs == 1 {
docID, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64])
pos += n
bitMap.Add(uint32(docID))
} else {
bitMapLen, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64])
pos += n

roaringBytes := sb.mem[pos : pos+int(bitMapLen)]
pos += int(bitMapLen)

_, err := bitMap.FromBuffer(roaringBytes)
if err != nil {
return err
}
}

// remap the docIDs from the old segment to the new document nos.
Expand Down Expand Up @@ -215,6 +224,24 @@ func (v *vectorIndexOpaque) flushVectorSection(vecToDocID map[int64]*roaring.Bit
return 0, err
}

numDocs := docIDs.GetCardinality()
n = binary.PutUvarint(tempBuf, numDocs)
_, err = w.Write(tempBuf[:n])
if err != nil {
return 0, err
}

// an optimization to avoid using the bitmaps if there is only 1 doc
// with the vecID.
if numDocs == 1 {
n = binary.PutUvarint(tempBuf, uint64(docIDs.Minimum()))
_, err = w.Write(tempBuf[:n])
if err != nil {
return 0, err
}
continue
}

// write the docIDs
_, err = writeRoaringWithLen(docIDs, w, tempBuf)
if err != nil {
Expand Down Expand Up @@ -362,7 +389,6 @@ func (vo *vectorIndexOpaque) writeVectorIndexes(w *CountHashWriter) (offset uint

var vecs []float32
var ids []int64

for hash, vecInfo := range content.vecs {
vecs = append(vecs, vecInfo.vec...)
ids = append(ids, int64(hash))
Expand Down Expand Up @@ -449,6 +475,24 @@ func (vo *vectorIndexOpaque) writeVectorIndexes(w *CountHashWriter) (offset uint
return 0, err
}

numDocs := docIDs.GetCardinality()
n = binary.PutUvarint(tempBuf, numDocs)
_, err = w.Write(tempBuf[:n])
if err != nil {
return 0, err
}

// Optimization: when exactly one document holds this vecID, skip the roaring
// bitmap and write that document's ID directly as a uvarint. The readers
// decode a uvarint docID immediately after numDocs when numDocs == 1, so the
// value written here must be the docID itself, not the document count
// (writing numDocs would record every single-doc vector against docID 1).
if numDocs == 1 {
	n = binary.PutUvarint(tempBuf, uint64(docIDs.Minimum()))
	_, err = w.Write(tempBuf[:n])
	if err != nil {
		return 0, err
	}
	continue
}
}

// write the docIDs
_, err = writeRoaringWithLen(docIDs, w, tempBuf)
if err != nil {
Expand All @@ -472,7 +516,6 @@ func (vo *vectorIndexOpaque) process(field index.VectorField, fieldID uint16, do
}

//process field

vec := field.Vector()
dim := field.Dims()
metric := field.Similarity()
Expand Down

0 comments on commit 0296d71

Please sign in to comment.