From d7465d2e84de16aa4620787ddfdca2258c452bde Mon Sep 17 00:00:00 2001
From: goldpulpy
Date: Wed, 9 Oct 2024 02:32:21 +0300
Subject: [PATCH] UPDATE: Readme

---
 README.md | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 54 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 6dad40a..bf5e7ef 100644
--- a/README.md
+++ b/README.md
@@ -84,7 +84,7 @@ embeddings = model.encode(sentences, progress_bar=True)
 # The rounding parameter allows us to round our float values
 # with a default of 2, which means 2 decimal places.
 compute_score(source_embedding, embeddings)
-# Output: [0.86, 0.77, 0.48]
+# Returns: [0.86, 0.77, 0.48]
 ```

 `compute_score` returns in the same index order in which the embedding was encoded.
@@ -110,7 +110,7 @@ You can use the computational functions: `cosine`, `euclidean`, `manhattan`, `ja
 from pysentence_similarity.compute import euclidean

 compute_score(source_embedding, embeddings, compute_function=euclidean)
-## Output: [2.52, 3.28, 5.62]
+# Returns: [2.52, 3.28, 5.62]
 ```

 You can use `max_pooling`, `mean_pooling`, `min_pooling` or your custom function
@@ -128,22 +128,71 @@ embeddings = model.encode(sentences, pooling_function=max_pooling)
 ```python
 from pysentence_similarity import Splitter

-# Default split symbols: '\n'
+# Default split markers: '\n'
 splitter = Splitter()

 # If you want to separate by specific characters.
-splitter = Splitter(split_symbols='.!?')
+splitter = Splitter(split_markers=["!", "?", "."], keep_markers=True)

 # Test text
 text = "Hello world! How are you? I'm fine."

 # Split from text
 splitter.split_from_text(text)
-# Output: ['Hello world!', 'How are you?', "I'm fine."]
+# Returns: ['Hello world!', 'How are you?', "I'm fine."]
 ```

 At this point, sources for the splitting are available: text, file, URL, CSV, and JSON.

+### Storage 💾
+
+Storage lets you save sentences together with their embeddings for easy access, so you don't have to re-encode a large corpus of text every time. It also enables similarity search.
+
+The storage keeps both the **sentences** themselves and their **embeddings**.
+
+```python
+from pysentence_similarity import Model, Storage
+
+# Create an instance of the model
+model = Model("all-MiniLM-L6-v2", dtype="fp16")
+
+# Create an instance of the storage
+storage = Storage()
+sentences = [
+    "This is another test.",
+    "This is yet another test.",
+    "We are testing sentence similarity."
+]
+
+# Convert sentences to embeddings
+embeddings = model.encode(sentences)
+
+# Add sentences and their embeddings
+storage.add(sentences, embeddings)
+
+# Save the storage
+storage.save("my_storage.h5")
+```
+
+Load the saved embeddings back from the storage:
+
+```python
+from pysentence_similarity import Model, Storage, compute_score
+
+# Create an instance of the model and storage
+model = Model("all-MiniLM-L6-v2", dtype="fp16")
+storage = Storage.load("my_storage.h5")
+
+# Convert the query sentence to an embedding
+source_embedding = model.encode("This is a test.")
+
+# Compute similarity scores against the stored embeddings
+compute_score(source_embedding, storage)
+# Returns: [0.86, 0.77, 0.48]
+```
+
 ## License 📜

 This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details
+
+Created by goldpulpy with ❤️
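The patched README lists text, file, URL, CSV, and JSON as splitter sources but only demonstrates `split_from_text`. Below is a minimal sketch of the remaining sources; the method names (`split_from_file`, `split_from_url`, `split_from_csv`, `split_from_json`) are assumed by analogy with `split_from_text`, and the `column`/`key` parameters are hypothetical.

```python
from pysentence_similarity import Splitter

splitter = Splitter(split_markers=["!", "?", "."], keep_markers=True)

# Method names below are assumed by analogy with split_from_text;
# check the library's documentation for the exact signatures.
sentences = splitter.split_from_file("corpus.txt")               # local text file
sentences = splitter.split_from_url("https://example.com/page")  # text fetched from a URL
sentences = splitter.split_from_csv("data.csv", column="text")   # hypothetical column parameter
sentences = splitter.split_from_json("data.json", key="text")    # hypothetical key parameter
```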