This repository has been archived by the owner on Nov 13, 2024. It is now read-only.

Use BOW values projected to dense as stub embeddings for test #80

Merged · 10 commits · Oct 22, 2023
1 change: 1 addition & 0 deletions pyproject.toml
@@ -39,6 +39,7 @@ pytest-mock = "^3.6.1"
 pytest-xdist = "^3.3.1"
 types-requests = "^2.31.0.2"
 httpx = "^0.25.0"
+scipy = "^1.7.1"
 
 [build-system]
 requires = ["poetry-core"]
55 changes: 38 additions & 17 deletions tests/unit/stubs/stub_dense_encoder.py
@@ -1,14 +1,48 @@
-import hashlib
+import mmh3
 import numpy as np
+from collections import defaultdict
+from scipy.sparse import csr_matrix
 from typing import Union, List
 
 from pinecone_text.dense.base_dense_ecoder import BaseDenseEncoder
 
 
 class StubDenseEncoder(BaseDenseEncoder):
-    def __init__(self, dimension: int = 3):
+    """
+    Bag-of-words encoder that uses a random projection matrix
+    (per the Johnson–Lindenstrauss lemma) to project sparse BOW
+    vectors to dense vectors.
+    """
+
+    def __init__(self,
+                 dimension: int = 128,
+                 vocab_size: int = 2 ** 18,
+                 seed: int = 42):
+        self.input_dim = vocab_size
         self.dimension = dimension
+        rng = np.random.default_rng(seed)
+        self.random_matrix = rng.standard_normal((self.input_dim, self.dimension))

Contributor:
Wait - isn't that a HUGE shape??
It's a dense matrix, isn't it? Won't it take a lot of memory?

Contributor (author):

I removed the need to actually hold this matrix. For the default dimensions it's a few MB, but for an output dim of 1536 it could become larger than we would like. Anyway, we're not even holding a matrix any more.
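
The later commits are not shown in this diff, but one common way to avoid materializing the (vocab_size, dimension) matrix is to regenerate each word's projection row on the fly from an RNG seeded by that word's hashed id, so memory stays proportional to the output dimension. A minimal sketch of that idea (a hypothetical project_word_counts helper, not the PR's actual code):

import numpy as np

def project_word_counts(word_counts, dimension=128, seed=42):
    # word_counts: dict mapping a non-negative hashed word id -> count in the text.
    # Each word's projection row is re-derived deterministically from (seed, word_id),
    # so nothing of shape (vocab_size, dimension) is ever stored.
    out = np.zeros(dimension, dtype=np.float64)
    for word_id, count in word_counts.items():
        row_rng = np.random.default_rng((seed, word_id))
        out += count * row_rng.standard_normal(dimension)
    norm = np.linalg.norm(out)
    return out / norm if norm > 0.0 else out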

+    def _text_to_sparse_vector(self, text: str) -> csr_matrix:
+        words = text.split()
+        word_counts = defaultdict(int)
+        for word in words:
+            hashed_word = mmh3.hash(word) % self.input_dim
+            word_counts[hashed_word] += 1
+
+        indices = list(word_counts.keys())
+        values = list(word_counts.values())
+        sparse_vector = csr_matrix((values, (np.zeros_like(indices), indices)),
+                                   shape=(1, self.input_dim))
+
+        return sparse_vector
+
+    def _encode_text(self, text: str) -> List[float]:
+        sparse_vector = self._text_to_sparse_vector(text)
+        projected_embedding = sparse_vector.dot(self.random_matrix).flatten()
+        projected_embedding = projected_embedding.astype(np.float32)
+        return list(projected_embedding / np.linalg.norm(projected_embedding))
+
     def encode_documents(self,
                          texts: Union[str, List[str]]

@@ -20,23 +54,10 @@ def encode_queries(self,
                       ) -> Union[List[float], List[List[float]]]:
         return self._encode(texts)
 
-    def consistent_embedding(self, text: str) -> List[float]:
-        # consistent embedding function that project each text to a unique angle
-        embedding = []
-        for i in range(self.dimension):
-            sha256_hash = hashlib.sha256(f"{text} {i}".encode()).hexdigest()
-            int_value = int(sha256_hash, 16)
-            embedding.append(int_value / float(1 << 256))
-
-        l2_norm = np.linalg.norm(embedding)
-        normalized_embedding = [float(value / l2_norm) for value in embedding]
-
-        return normalized_embedding
-
     def _encode(self,
                 texts: Union[str, List[str]]
                 ) -> Union[List[float], List[List[float]]]:
         if isinstance(texts, str):
-            return self.consistent_embedding(texts)
+            return self._encode_text(texts)
         else:
-            return [self.consistent_embedding(text) for text in texts]
+            return [self._encode_text(text) for text in texts]
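
As a usage illustration (a hypothetical test, not part of this PR), the stub could be exercised along these lines, assuming the module is importable as tests.unit.stubs.stub_dense_encoder:

from tests.unit.stubs.stub_dense_encoder import StubDenseEncoder

def test_stub_embeddings_are_deterministic_and_normalized():
    encoder = StubDenseEncoder(dimension=128)

    doc_vec = encoder.encode_documents("the quick brown fox")
    query_vec = encoder.encode_queries("the quick brown fox")

    # Both entry points call _encode, so the same text maps to the same embedding.
    assert doc_vec == query_vec
    assert len(doc_vec) == 128

    # _encode_text L2-normalizes the projection, so the norm should be close to 1.
    norm = sum(v * v for v in doc_vec) ** 0.5
    assert abs(norm - 1.0) < 1e-3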