Skip to content

Commit

Permalink
More doctests (#47)
Browse files Browse the repository at this point in the history
* Update docs

* add -> add_single

* add_many -> add

* More doctests

* fix wasm
  • Loading branch information
anchpop authored Dec 26, 2024
1 parent bd938d7 commit 53f25f0
Show file tree
Hide file tree
Showing 7 changed files with 185 additions and 34 deletions.
3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,9 @@ fastembed = "4.3.0"

[dev-dependencies]
wasm-bindgen-test = "0.3"
tokio = { version = "1", features = ["rt", "macros"] }
rand = "0.8"
tokio-test = "0.4"


[profile.release]
# Tell `rustc` to optimize for small code size.
Expand Down
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,13 +67,15 @@ let mut victor = Db::new(PathBuf::from("./victor_test_data"));
victor.clear_db().await.unwrap();

victor
.add_many(
.add(
vec!["Pineapple", "Rocks"], // documents
vec!["Pizza Toppings"], // tags (only used for filtering)
)
.await;

victor.add("Cheese pizza", vec!["Pizza Flavors"]).await; // Add another entry with no tags
victor
.add_single("Cheese pizza", vec!["Pizza Flavors"])
.await; // Add another entry with a different tag

// read the 10 closest results from victor that are tagged with "Pizza Toppings"
// (only 2 will be returned because only two embeddings have that tag)
Expand Down
6 changes: 4 additions & 2 deletions examples/in_memory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,15 @@ async fn main() {
victor.clear_db().await.unwrap();

victor
.add_many(
.add(
vec!["Pineapple", "Rocks"], // documents
vec!["Pizza Toppings"], // tags (only used for filtering)
)
.await;

victor.add("Cheese pizza", vec!["Pizza Flavors"]).await; // Add another entry with no tags
victor
.add_single("Cheese pizza", vec!["Pizza Flavors"])
.await; // Add another entry with a different tag

// read the 10 closest results from victor that are tagged with "Pizza Toppings"
// (only 2 will be returned because only two embeddings have that tag)
Expand Down
6 changes: 4 additions & 2 deletions examples/native_filesystem.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,15 @@ async fn main() {
victor.clear_db().await.unwrap();

victor
.add_many(
.add(
vec!["Pineapple", "Rocks"], // documents
vec!["Pizza Toppings"], // tags (only used for filtering)
)
.await;

victor.add("Cheese pizza", vec!["Pizza Flavors"]).await; // Add another entry with no tags
victor
.add_single("Cheese pizza", vec!["Pizza Flavors"])
.await; // Add another entry with a different tag

// read the 10 closest results from victor that are tagged with "Pizza Toppings"
// (only 2 will be returned because only two embeddings have that tag)
Expand Down
76 changes: 62 additions & 14 deletions src/db.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ use crate::{
similarity,
};

/// The main database struct.
///
/// Use [`Victor::add`] to insert documents and [`Victor::search`] to look up
/// the stored entries nearest to a query.
pub struct Victor<D> {
/// Directory handle the database was created with (see [`Victor::new`]).
root: D,
}
Expand Down Expand Up @@ -72,19 +74,32 @@ extern "C" {
}

impl<D: DirectoryHandle> Victor<D> {
/// Construct a Victor database on top of the given directory handle.
///
/// Any value convertible into the handle type `D` is accepted. For example,
/// pass a [`std::path::PathBuf`] to use the native filesystem, or a
/// [`crate::memory::DirectoryHandle`] to use an in-memory database.
pub fn new(root: impl Into<D>) -> Self {
    Self { root: root.into() }
}

/// Add many documents to the database.
/// Embeddings will be generated for each document.
///
/// ```rust
/// # tokio_test::block_on(async {
/// # use victor_db::memory::{Db, DirectoryHandle};
/// # let mut victor = Db::new(DirectoryHandle::default());
/// victor
/// .add(
/// vec!["Pineapple", "Rocks"], // documents
/// vec!["Pizza Toppings"], // tags (only used for filtering)
/// )
/// .await;
/// # })
/// ```
#[cfg(not(target_arch = "wasm32"))]
pub async fn add_many(
&mut self,
content: Vec<impl Into<String>>,
tags: Vec<impl Into<String>>,
) {
pub async fn add(&mut self, content: Vec<impl Into<String>>, tags: Vec<impl Into<String>>) {
let tags = tags.into_iter().map(|t| t.into()).collect::<Vec<String>>();
let model = fastembed::TextEmbedding::try_new(Default::default()).unwrap();
let content = content
Expand All @@ -95,20 +110,36 @@ impl<D: DirectoryHandle> Victor<D> {
let vectors = model.embed(content.clone(), None).unwrap();

let to_add = content.into_iter().zip(vectors.into_iter()).collect();
self.add_embedding_many(to_add, tags).await;
self.add_embeddings(to_add, tags).await;
}

/// Add a single document to the database.
/// Embedding will be generated for the document.
/// When adding many documents, it is more efficient to use `add_many`.
/// When adding many documents, it is more efficient to use `add`.
///
/// ```rust
/// # tokio_test::block_on(async {
/// # use victor_db::memory::{Db, DirectoryHandle};
/// # let mut victor = Db::new(DirectoryHandle::default());
/// victor.add_single("Pepperoni pizza", vec!["Pizza Flavors"]).await;
/// # })
/// ```
#[cfg(not(target_arch = "wasm32"))]
pub async fn add(&mut self, content: impl Into<String>, tags: Vec<impl Into<String>>) {
self.add_many(vec![content], tags).await;
pub async fn add_single(&mut self, content: impl Into<String>, tags: Vec<impl Into<String>>) {
self.add(vec![content], tags).await;
}

/// Add many documen/embedding pairs to the database.
/// Add many document/embedding pairs to the database.
/// This is useful for adding embeddings that have already been generated.
pub async fn add_embedding_many(
///
/// ```rust
/// # tokio_test::block_on(async {
/// # use victor_db::memory::{Db, DirectoryHandle};
/// # let mut victor = Db::new(DirectoryHandle::default());
/// victor.add_embeddings(vec![("Pepperoni pizza", vec![0.1, 0.2, 0.3])], vec!["Pizza Flavors"]).await;
/// # })
/// ```
pub async fn add_embeddings(
&mut self,
to_add: Vec<(impl Into<String>, Vec<f32>)>,
tags: Vec<impl Into<String>>,
Expand All @@ -134,19 +165,35 @@ impl<D: DirectoryHandle> Victor<D> {

/// Add a single document/embedding pair to the database.
/// This is useful for adding embeddings that have already been generated.
/// When adding many documents, it is more efficient to use `add_embedding_many`.
pub async fn add_embedding(
/// When adding many documents, it is more efficient to use `add_embeddings`.
///
/// ```rust
/// # tokio_test::block_on(async {
/// # use victor_db::memory::{Db, DirectoryHandle};
/// # let mut victor = Db::new(DirectoryHandle::default());
/// victor.add_single_embedding("Pepperoni pizza", vec![0.1, 0.2, 0.3], vec!["Pizza Flavors"]).await;
/// # })
/// ```
pub async fn add_single_embedding(
&mut self,
content: impl Into<String>,
vector: Vec<f32>,
tags: Vec<impl Into<String>>,
) {
self.add_embedding_many(vec![(content, vector)], tags).await;
self.add_embeddings(vec![(content, vector)], tags).await;
}

/// Search the database for the nearest neighbors to a given document.
/// An embedding will be generated for the document being searched for.
/// This will return the top `top_n` nearest neighbors.
///
/// ```rust
/// # tokio_test::block_on(async {
/// # use victor_db::memory::{Db, DirectoryHandle};
/// # let mut victor = Db::new(DirectoryHandle::default());
/// victor.search("Pepperoni pizza", vec!["Pizza Flavors"], 10).await;
/// # })
/// ```
#[cfg(not(target_arch = "wasm32"))]
pub async fn search(
&self,
Expand Down Expand Up @@ -557,6 +604,7 @@ impl<D: DirectoryHandle> Victor<D> {
content.to_string()
}

/// Clear the database, deleting all data.
pub async fn clear_db(&mut self) -> Result<(), D::Error> {
// clear db files
let files = Index::get_all_db_filenames(&mut self.root).await?;
Expand Down
104 changes: 100 additions & 4 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,91 @@
//! A browser-optimized vector database. Backed by the private virtual filesystem API on web. On native, supports running with the native filesystem or in memory.
//! A browser-optimized vector database. Backed by the private virtual filesystem API on web.
//!
//! You're viewing this on crates.io, so you're probably interested in the native version. The native version supports running with the native filesystem or in memory.
//!
//! If you want to use it on the web, [check out victor-db on npm](https://www.npmjs.com/package/victor-db).
//!
//! ## In-memory database
//!
//! Use this if you want to run victor in-memory (all data is lost when the program exits).
//!
//! The in-memory version is useful for testing and applications where you don't need to persist data:
//! ```rust
//! # tokio_test::block_on(async {
//! // use victor_db::memory for the in-memory implementation
//! use victor_db::memory::{Db, DirectoryHandle};
//!
//! // create a new in-memory database
//! let mut victor = Db::new(DirectoryHandle::default());
//!
//! // add some embeddings to the database
//! victor
//! .add(
//! vec!["Pineapple", "Rocks"], // documents
//! vec!["Pizza Toppings"], // tags (only used for filtering)
//! )
//! .await;
//!
//! // add another embedding to the database, this time with a different tag
//! victor.add_single("Cheese pizza", vec!["Pizza Flavors"]).await;
//!
//! // read the 10 closest results from victor that are tagged with "Pizza Toppings"
//! // (only 2 will be returned because only two embeddings have that tag)
//! let nearest = victor
//! .search("Hawaiian pizza", vec!["Pizza Toppings"], 10)
//! .await
//! .first()
//! .unwrap()
//! .content
//! .clone();
//! assert_eq!(nearest, "Pineapple".to_string());
//!
//! // Clear the database
//! victor.clear_db().await.unwrap();
//! # })
//! ```
//!
//! ## Native database
//!
//! Use this if you want to persist your database to disk.
//!
//! ```rust
//! # tokio_test::block_on(async {
//! // use victor_db::native for the native filesystem implementation
//! use victor_db::native::Db;
//! use std::path::PathBuf;
//!
//! // create a new native database under "./victor_test_data"
//! let _ = std::fs::create_dir("./victor_test_data");
//! let mut victor = Db::new(PathBuf::from("./victor_test_data"));
//!
//! // add some embeddings to the database
//! victor
//! .add(
//! vec!["Pineapple", "Rocks"], // documents
//! vec!["Pizza Toppings"], // tags (only used for filtering)
//! )
//! .await;
//!
//! // add another embedding to the database, this time with a different tag
//! victor.add_single("Cheese pizza", vec!["Pizza Flavors"]).await;
//!
//! // read the 10 closest results from victor that are tagged with "Pizza Toppings"
//! // (only 2 will be returned because only two embeddings have that tag)
//! let nearest = victor
//! .search("Hawaiian pizza", vec!["Pizza Toppings"], 10)
//! .await
//! .first()
//! .unwrap()
//! .content
//! .clone();
//! assert_eq!(nearest, "Pineapple".to_string());
//!
//! // Clear the database
//! victor.clear_db().await.unwrap();
//! # })
//! ```
//!
//! See the docs for [`Victor`] for more information.
#![deny(missing_docs)]

Expand All @@ -9,6 +96,9 @@ mod packed_vector;
mod similarity;
mod utils;

#[cfg(not(target_arch = "wasm32"))]
pub use db::Victor;

#[cfg(test)]
mod tests;

Expand All @@ -22,7 +112,9 @@ type Victor = crate::db::Victor<filesystem::web::DirectoryHandle>;

// Native

/// Used to tell victor to use the native filesystem.
/// Victor's native filesystem implementation.
///
/// Use this if you want to persist your database to disk.
#[cfg(not(target_arch = "wasm32"))]
pub mod native {
use crate::db::Victor;
Expand All @@ -31,7 +123,9 @@ pub mod native {
pub type Db = Victor<crate::filesystem::native::DirectoryHandle>;
}

/// Used to tell victor to use an in-memory filesystem.
/// Victor's in-memory implementation.
///
/// Use this if you want to run victor in-memory (all data is lost when the program exits).
#[cfg(not(target_arch = "wasm32"))]
pub mod memory {
use crate::db::Victor;
Expand Down Expand Up @@ -105,7 +199,9 @@ impl Db {
})
.unwrap_or(vec![]);

self.victor.add_embedding(content, embedding, tags).await;
self.victor
.add_single_embedding(content, embedding, tags)
.await;
}

/// Search the database for the nearest neighbors to a given embedding.
Expand Down
18 changes: 9 additions & 9 deletions src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ async fn store_and_retrieve() {
let mut victor = Db::new(DirectoryHandle::default());

victor
.add_embedding("hello", embedding.clone(), Vec::<String>::new())
.add_single_embedding("hello", embedding.clone(), Vec::<String>::new())
.await;

let result = victor
Expand All @@ -29,10 +29,10 @@ async fn store_two_and_retrieve() {
let mut victor = Db::new(DirectoryHandle::default());

victor
.add_embedding("hello", embedding_1.clone(), Vec::<String>::new())
.add_single_embedding("hello", embedding_1.clone(), Vec::<String>::new())
.await;
victor
.add_embedding("goodbye", embedding_2.clone(), Vec::<String>::new())
.add_single_embedding("goodbye", embedding_2.clone(), Vec::<String>::new())
.await;

{
Expand Down Expand Up @@ -67,10 +67,10 @@ async fn store_two_and_retrieve_with_tags() {
let mut victor = Db::new(DirectoryHandle::default());

victor
.add_embedding("hello", embedding_1.clone(), vec!["greetings".to_string()])
.add_single_embedding("hello", embedding_1.clone(), vec!["greetings".to_string()])
.await;
victor
.add_embedding("goodbye", embedding_2.clone(), vec!["goodbyes".to_string()])
.add_single_embedding("goodbye", embedding_2.clone(), vec!["goodbyes".to_string()])
.await;

{
Expand Down Expand Up @@ -135,19 +135,19 @@ async fn incompatible_size_panic() {
let mut victor = Db::new(DirectoryHandle::default());

victor
.add_embedding("hello", embedding_1, Vec::<String>::new())
.add_single_embedding("hello", embedding_1, Vec::<String>::new())
.await;
victor
.add_embedding("hello", embedding_2, Vec::<String>::new())
.add_single_embedding("hello", embedding_2, Vec::<String>::new())
.await;
}

#[tokio::test]
async fn add_many() {
async fn add() {
let mut victor = Db::new(DirectoryHandle::default());

victor
.add_many(vec!["pineapple", "rocks"], Vec::<String>::new())
.add(vec!["pineapple", "rocks"], Vec::<String>::new())
.await;

let result = victor
Expand Down

0 comments on commit 53f25f0

Please sign in to comment.