From 152b2798b0aded27a5c20ca829e518e9bd8f787e Mon Sep 17 00:00:00 2001
From: Amnon Catav
Date: Wed, 18 Oct 2023 00:00:49 +0300
Subject: [PATCH 1/5] use BOW values projected to dense as stub embeddings for
 tests

---
 tests/unit/stubs/stub_dense_encoder.py | 54 ++++++++++++++++++--------
 1 file changed, 37 insertions(+), 17 deletions(-)

diff --git a/tests/unit/stubs/stub_dense_encoder.py b/tests/unit/stubs/stub_dense_encoder.py
index d0e02ff2..8e041dbb 100644
--- a/tests/unit/stubs/stub_dense_encoder.py
+++ b/tests/unit/stubs/stub_dense_encoder.py
@@ -1,5 +1,7 @@
-import hashlib
+import mmh3
 import numpy as np
+from collections import defaultdict
+from scipy.sparse import csr_matrix
 from typing import Union, List
 
 from pinecone_text.dense.base_dense_ecoder import BaseDenseEncoder
@@ -7,8 +9,39 @@
 
 
 class StubDenseEncoder(BaseDenseEncoder):
-    def __init__(self, dimension: int = 3):
+    """
+    Bag-of-words encoder that uses a random projection matrix to
+    project sparse BOW vectors to dense vectors, relying on the
+    Johnson–Lindenstrauss lemma to approximately preserve distances.
+    """
+
+    def __init__(self,
+                 dimension: int = 128,
+                 vocab_size: int = 2 ** 20,
+                 seed: int = 42):
+        self.input_dim = vocab_size
         self.dimension = dimension
+        rng = np.random.default_rng(seed)
+        self.random_matrix = rng.standard_normal((self.input_dim, self.dimension))
+
+    def _text_to_sparse_vector(self, text: str) -> csr_matrix:
+        words = text.split()
+        word_counts = defaultdict(int)
+        for word in words:
+            hashed_word = mmh3.hash(word) % self.input_dim
+            word_counts[hashed_word] += 1
+
+        indices = list(word_counts.keys())
+        values = list(word_counts.values())
+        sparse_vector = csr_matrix((values, (np.zeros_like(indices), indices)),
+                                   shape=(1, self.input_dim))
+
+        return sparse_vector
+
+    def _encode_text(self, text: str) -> List[float]:
+        sparse_vector = self._text_to_sparse_vector(text)
+        projected_embedding = sparse_vector.dot(self.random_matrix).flatten()
+        return list(projected_embedding / np.linalg.norm(projected_embedding))
 
     def encode_documents(self,
                          texts: Union[str, List[str]]
@@ -20,23 +53,10 @@ def encode_queries(self,
                        ) -> Union[List[float], List[List[float]]]:
         return self._encode(texts)
 
-    def consistent_embedding(self, text: str) -> List[float]:
-        # consistent embedding function that project each text to a unique angle
-        embedding = []
-        for i in range(self.dimension):
-            sha256_hash = hashlib.sha256(f"{text} {i}".encode()).hexdigest()
-            int_value = int(sha256_hash, 16)
-            embedding.append(int_value / float(1 << 256))
-
-        l2_norm = np.linalg.norm(embedding)
-        normalized_embedding = [float(value / l2_norm) for value in embedding]
-
-        return normalized_embedding
-
     def _encode(self,
                 texts: Union[str, List[str]]
                 ) -> Union[List[float], List[List[float]]]:
         if isinstance(texts, str):
-            return self.consistent_embedding(texts)
+            return self._encode_text(texts)
         else:
-            return [self.consistent_embedding(text) for text in texts]
+            return [self._encode_text(text) for text in texts]

From 1ba7a36e75ded363cf86d8e0673ae1b537635a96 Mon Sep 17 00:00:00 2001
From: Amnon Catav
Date: Wed, 18 Oct 2023 10:08:43 +0300
Subject: [PATCH 2/5] add scipy to dev dependencies

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 529c3e87..6cb1a456 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -39,6 +39,7 @@ pytest-mock = "^3.6.1"
 pytest-xdist = "^3.3.1"
 types-requests = "^2.31.0.2"
 httpx = "^0.25.0"
+scipy = "^1.7.1"
 
 [build-system]
 requires = ["poetry-core"]
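What PATCH 1/5 implements is the classic hashing trick combined with a Gaussian
random projection: each word is hashed by mmh3 into one of vocab_size buckets,
the bucket counts form a sparse bag-of-words row vector, and multiplying that
row by a fixed Gaussian matrix yields a dense embedding whose pairwise
distances approximately track the BOW distances, per the Johnson–Lindenstrauss
lemma. A minimal self-contained sketch of the same idea, with deliberately
small sizes so the matrix stays cheap (the function name and defaults are
illustrative, not part of the patch):

    import mmh3
    import numpy as np

    def toy_bow_projection(text: str, vocab_size: int = 2 ** 10,
                           dimension: int = 8, seed: int = 42) -> np.ndarray:
        # Hashing trick: map each word to a bucket in a fixed-size vocabulary.
        counts = np.zeros(vocab_size)
        for word in text.split():
            counts[mmh3.hash(word) % vocab_size] += 1

        # Gaussian random projection of the BOW row down to `dimension`.
        rng = np.random.default_rng(seed)
        random_matrix = rng.standard_normal((vocab_size, dimension))
        dense = counts @ random_matrix

        # L2-normalize so cosine similarity reduces to a plain dot product.
        return dense / np.linalg.norm(dense)

Because the projection is linear and the hash is deterministic, texts that
share words hit shared matrix rows and score a higher cosine similarity, which
is exactly the property a stub encoder needs for tests.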
From 5e13407c662f66a3dae5fbd5c154ebbc4a2fb412 Mon Sep 17 00:00:00 2001
From: Amnon Catav
Date: Wed, 18 Oct 2023 10:40:52 +0300
Subject: [PATCH 3/5] use float32

---
 tests/unit/stubs/stub_dense_encoder.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/unit/stubs/stub_dense_encoder.py b/tests/unit/stubs/stub_dense_encoder.py
index 8e041dbb..6e456919 100644
--- a/tests/unit/stubs/stub_dense_encoder.py
+++ b/tests/unit/stubs/stub_dense_encoder.py
@@ -41,6 +41,7 @@ def _text_to_sparse_vector(self, text: str) -> csr_matrix:
     def _encode_text(self, text: str) -> List[float]:
         sparse_vector = self._text_to_sparse_vector(text)
         projected_embedding = sparse_vector.dot(self.random_matrix).flatten()
+        projected_embedding = projected_embedding.astype(np.float32)
         return list(projected_embedding / np.linalg.norm(projected_embedding))
 
     def encode_documents(self,

From a14df188ecd2dabb60f3e4be60b189d3174537ae Mon Sep 17 00:00:00 2001
From: Amnon Catav
Date: Wed, 18 Oct 2023 12:37:08 +0300
Subject: [PATCH 4/5] reduce default vocab size

---
 tests/unit/stubs/stub_dense_encoder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/stubs/stub_dense_encoder.py b/tests/unit/stubs/stub_dense_encoder.py
index 6e456919..2854c31a 100644
--- a/tests/unit/stubs/stub_dense_encoder.py
+++ b/tests/unit/stubs/stub_dense_encoder.py
@@ -17,7 +17,7 @@ class StubDenseEncoder(BaseDenseEncoder):
 
     def __init__(self,
                  dimension: int = 128,
-                 vocab_size: int = 2 ** 20,
+                 vocab_size: int = 2 ** 18,
                  seed: int = 42):
         self.input_dim = vocab_size
         self.dimension = dimension
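PATCH 3/5 casts the projection output to float32, and PATCH 4/5 shrinks the
default vocabulary, likely because the projection matrix is materialized
eagerly: at vocab_size = 2 ** 18 and dimension = 128 it holds
2 ** 18 * 128 float64 values, about 256 MiB per encoder instance, which is
what the final patch below eliminates entirely. A quick property check one
might run against the stub at this point (the import path follows this repo's
test layout; the texts and assertions are illustrative):

    import numpy as np
    from tests.unit.stubs.stub_dense_encoder import StubDenseEncoder

    # Small sizes keep the eagerly built random matrix tiny for this check.
    encoder = StubDenseEncoder(dimension=8, vocab_size=2 ** 12)
    a = encoder.encode_documents("the quick brown fox")
    b = encoder.encode_queries("the quick brown fox")
    c = encoder.encode_documents("completely unrelated words here")

    assert a == b                          # same text -> identical embedding
    assert abs(np.dot(a, a) - 1.0) < 1e-5  # embeddings are unit-normalized
    assert np.dot(a, b) > np.dot(a, c)     # shared words -> higher cosine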
From 6bda91d78a90e3d51fe9d83ece4ed07b156ea8ee Mon Sep 17 00:00:00 2001
From: Amnon Catav
Date: Wed, 18 Oct 2023 17:13:25 +0300
Subject: [PATCH 5/5] remove the need for a random matrix

---
 pyproject.toml                             |  1 -
 .../knowledge_base/test_knowledge_base.py  |  2 +-
 tests/unit/stubs/stub_dense_encoder.py     | 30 +++++++++----------
 3 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 6cb1a456..529c3e87 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -39,7 +39,6 @@ pytest-mock = "^3.6.1"
 pytest-xdist = "^3.3.1"
 types-requests = "^2.31.0.2"
 httpx = "^0.25.0"
-scipy = "^1.7.1"
 
 [build-system]
 requires = ["poetry-core"]

diff --git a/tests/system/knowledge_base/test_knowledge_base.py b/tests/system/knowledge_base/test_knowledge_base.py
index 3ede466a..0f995ef5 100644
--- a/tests/system/knowledge_base/test_knowledge_base.py
+++ b/tests/system/knowledge_base/test_knowledge_base.py
@@ -59,7 +59,7 @@ def chunker():
 @pytest.fixture(scope="module")
 def encoder():
     return StubRecordEncoder(
-        StubDenseEncoder(dimension=3))
+        StubDenseEncoder())
 
 
 @pytest.fixture(scope="module", autouse=True)

diff --git a/tests/unit/stubs/stub_dense_encoder.py b/tests/unit/stubs/stub_dense_encoder.py
index 2854c31a..9d55bb7f 100644
--- a/tests/unit/stubs/stub_dense_encoder.py
+++ b/tests/unit/stubs/stub_dense_encoder.py
@@ -1,14 +1,12 @@
 import mmh3
 import numpy as np
 from collections import defaultdict
-from scipy.sparse import csr_matrix
 from typing import Union, List
 
 from pinecone_text.dense.base_dense_ecoder import BaseDenseEncoder
 
 
 class StubDenseEncoder(BaseDenseEncoder):
-
     """
     Bag-of-words encoder that uses a random projection matrix to
     project sparse BOW vectors to dense vectors, relying on the
@@ -16,31 +14,31 @@ class StubDenseEncoder(BaseDenseEncoder):
     """
 
     def __init__(self,
-                 dimension: int = 128,
-                 vocab_size: int = 2 ** 18,
-                 seed: int = 42):
+                 dimension: int = 8,
+                 vocab_size: int = 2 ** 12):
         self.input_dim = vocab_size
         self.dimension = dimension
-        rng = np.random.default_rng(seed)
-        self.random_matrix = rng.standard_normal((self.input_dim, self.dimension))
 
-    def _text_to_sparse_vector(self, text: str) -> csr_matrix:
+    def _text_to_word_counts(self, text: str) -> defaultdict:
         words = text.split()
         word_counts = defaultdict(int)
         for word in words:
             hashed_word = mmh3.hash(word) % self.input_dim
             word_counts[hashed_word] += 1
+        return word_counts
+
+    def _encode_text(self, text: str) -> List[float]:
+        word_counts = self._text_to_word_counts(text)
 
-        indices = list(word_counts.keys())
-        values = list(word_counts.values())
-        sparse_vector = csr_matrix((values, (np.zeros_like(indices), indices)),
-                                   shape=(1, self.input_dim))
+        # Accumulates the product of the BOW counts with an implicit random matrix
+        projected_embedding = np.zeros(self.dimension, dtype=np.float32)
 
-        return sparse_vector
+        for hashed_word, count in word_counts.items():
+            # Seed the RNG with the hashed word index for determinism
+            rng = np.random.default_rng(hashed_word)
+            random_vector = rng.standard_normal(self.dimension)
+            projected_embedding += count * random_vector
 
-    def _encode_text(self, text: str) -> List[float]:
-        sparse_vector = self._text_to_sparse_vector(text)
-        projected_embedding = sparse_vector.dot(self.random_matrix).flatten()
         projected_embedding = projected_embedding.astype(np.float32)
         return list(projected_embedding / np.linalg.norm(projected_embedding))
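The final patch works because NumPy's seeded generators are reproducible:
seeding a fresh default_rng with the hashed word index regenerates the same
row of a "virtual" vocab_size x dimension projection matrix on demand, so the
matrix itself never has to exist in memory. (mmh3.hash can return negative
values, but the % self.input_dim keeps the seed non-negative, which
default_rng requires.) A sketch of the invariant the stub relies on, with an
arbitrary index value; this determinism is guaranteed for a fixed seed within
a given NumPy version:

    import numpy as np

    dimension = 8
    hashed_word = 1234  # an arbitrary bucket index, as mmh3 would produce

    # Independently constructed generators with the same seed yield the
    # same "matrix row" every time it is requested.
    row_a = np.random.default_rng(hashed_word).standard_normal(dimension)
    row_b = np.random.default_rng(hashed_word).standard_normal(dimension)
    assert np.array_equal(row_a, row_b)

The trade-off is speed for memory: each word now pays for a generator
construction and a fresh standard_normal draw on every encode call, which is
perfectly acceptable for a test stub.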