Skip to content

Commit

Permalink
feat(query): Add support for filters in SimilaritySingleEmbedding (#298)
Browse files Browse the repository at this point in the history
Adds support for filters for Qdrant and Lancedb in
SimilaritySingleEmbedding. Also fixes several small bugs and brings
improved tests.
  • Loading branch information
timonv authored Sep 13, 2024
1 parent f158960 commit e902cb7
Show file tree
Hide file tree
Showing 16 changed files with 460 additions and 73 deletions.
3 changes: 3 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions swiftide-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,15 @@ derive_builder = { workspace = true }
tera = { version = "1.20", default-features = false }
uuid = { workspace = true, features = ["v4", "v3"] }

# Integrations
qdrant-client = { workspace = true, optional = true }

[dev-dependencies]
test-case = { workspace = true }

[features]
test-utils = ["dep:mockall"]
qdrant = ["dep:qdrant-client"]

[lints]
workspace = true
15 changes: 15 additions & 0 deletions swiftide-core/src/metadata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,21 @@ where
}
}

/// Builds a [`Metadata`] containing exactly one entry from a `(key, value)` pair.
///
/// The key is converted with `Into<String>` and the value with
/// `Into<serde_json::Value>` before being stored.
impl<K, V> From<(K, V)> for Metadata
where
    K: Into<String>,
    V: Into<serde_json::Value>,
{
    fn from(items: (K, V)) -> Self {
        let (key, value) = items;
        // A one-shot iterator lets `collect` build whatever collection backs `inner`.
        let inner = std::iter::once((key.into(), value.into())).collect();
        Metadata { inner }
    }
}

impl<'a, K, V> From<&'a [(K, V)]> for Metadata
where
K: Into<String> + Clone,
Expand Down
21 changes: 21 additions & 0 deletions swiftide-core/src/node.rs
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,27 @@ impl Node {
}
}

/// Replaces the node's metadata with `metadata`, converted via `Into<Metadata>`.
///
/// Returns the same node so further calls can be chained.
pub fn with_metadata(&mut self, metadata: impl Into<Metadata>) -> &mut Self {
    let replacement = metadata.into();
    self.metadata = replacement;
    self
}

/// Sets the node's vectors to `Some` of the given map of embeddings per embedded field.
///
/// Any previously stored vectors are overwritten. Returns the same node so
/// further calls can be chained.
pub fn with_vectors(
    &mut self,
    vectors: impl Into<HashMap<EmbeddedField, Embedding>>,
) -> &mut Self {
    let by_field = vectors.into();
    self.vectors = Some(by_field);
    self
}

/// Sets the node's sparse vectors to `Some` of the given map of sparse embeddings
/// per embedded field.
///
/// Any previously stored sparse vectors are overwritten. Returns the same node
/// so further calls can be chained.
pub fn with_sparse_vectors(
    &mut self,
    sparse_vectors: impl Into<HashMap<EmbeddedField, SparseEmbedding>>,
) -> &mut Self {
    let by_field = sparse_vectors.into();
    self.sparse_vectors = Some(by_field);
    self
}

/// Creates embeddable data depending on chosen `EmbedMode`.
///
/// # Returns
Expand Down
7 changes: 5 additions & 2 deletions swiftide-core/src/query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,11 @@ impl<T: Clone> Query<T> {
}

impl Query<states::Pending> {
pub fn new() -> Self {
Self::default()
/// Creates a pending query whose `original` text is `query`.
///
/// Every other field takes its `Default` value.
pub fn new(query: impl Into<String>) -> Self {
    let original = query.into();
    Self {
        original,
        ..Default::default()
    }
}

/// Transforms the current query
Expand Down
Original file line number Diff line number Diff line change
@@ -1,21 +1,8 @@
use derive_builder::Builder;

const DEFAULT_TOP_K: u64 = 10;
const DEFAULT_TOP_N: u64 = 10;

/// Search strategies provide a generic way for Retrievers to implement their
/// search in various ways.
///
/// The strategy is also yielded to the Retriever and can contain additional configuration
use crate::{indexing::EmbeddedField, querying};

/// A very simple search where it takes the embedding on the current query
/// and returns `top_k` documents.
#[derive(Debug, Clone, Copy)]
pub struct SimilaritySingleEmbedding {
/// Maximum number of documents to return
top_k: u64,
}
use super::{DEFAULT_TOP_K, DEFAULT_TOP_N};

/// A hybrid search strategy that combines a similarity search with a
/// keyword search / sparse search.
Expand All @@ -41,6 +28,8 @@ pub struct HybridSearch {
sparse_vector_field: EmbeddedField,
}

impl querying::SearchStrategy for HybridSearch {}

impl Default for HybridSearch {
fn default() -> Self {
Self {
Expand Down Expand Up @@ -103,28 +92,3 @@ impl HybridSearch {
&self.sparse_vector_field
}
}

impl Default for SimilaritySingleEmbedding {
fn default() -> Self {
Self {
top_k: DEFAULT_TOP_K,
}
}
}

impl SimilaritySingleEmbedding {
/// Set the maximum amount of documents to be returned
pub fn with_top_k(&mut self, top_k: u64) -> &mut Self {
self.top_k = top_k;

self
}

/// Returns the maximum of documents to be returned
pub fn top_k(&self) -> u64 {
self.top_k
}
}

impl querying::SearchStrategy for SimilaritySingleEmbedding {}
impl querying::SearchStrategy for HybridSearch {}
23 changes: 23 additions & 0 deletions swiftide-core/src/search_strategies/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
//! Search strategies provide a generic way for Retrievers to implement their
//! search in various ways.
//!
//! The strategy is also yielded to the Retriever and can contain additional configuration

mod hybrid_search;
mod similarity_single_embedding;

pub(crate) const DEFAULT_TOP_K: u64 = 10;
pub(crate) const DEFAULT_TOP_N: u64 = 10;

pub use hybrid_search::*;
pub use similarity_single_embedding::*;

/// Marker trait for types that can be used as the filter of a search strategy.
///
/// The trait has no methods; implementors only need to be `Clone + Sync + Send`.
pub trait SearchFilter: Clone + Sync + Send {}

// Qdrant's native filter type; only compiled in when the `qdrant` feature is enabled.
#[cfg(feature = "qdrant")]
impl SearchFilter for qdrant_client::qdrant::Filter {}

// When no filters are applied
impl SearchFilter for () {}
// Lancedb uses a string filter
impl SearchFilter for String {}
72 changes: 72 additions & 0 deletions swiftide-core/src/search_strategies/similarity_single_embedding.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
use crate::querying;

use super::{SearchFilter, DEFAULT_TOP_K};

/// A simple, single vector similarity search where it takes the embedding on the current query
/// and returns `top_k` documents.
///
/// Can optionally be used with a filter. The `FILTER` type parameter defaults to `()`,
/// which is the `SearchFilter` used when no filters are applied.
#[derive(Debug, Clone)]
pub struct SimilaritySingleEmbedding<FILTER: SearchFilter = ()> {
/// Maximum number of documents to return
top_k: u64,

/// Optional filter to be used in the query; `None` when no filter has been set
filter: Option<FILTER>,
}

// Marker impl with an empty body: the strategy is a `SearchStrategy` for every filter type.
impl<FILTER: SearchFilter> querying::SearchStrategy for SimilaritySingleEmbedding<FILTER> {}

impl<FILTER: SearchFilter> Default for SimilaritySingleEmbedding<FILTER> {
    /// Returns a strategy limited to `DEFAULT_TOP_K` documents with no filter set.
    fn default() -> Self {
        let top_k = DEFAULT_TOP_K;
        Self { top_k, filter: None }
    }
}

impl SimilaritySingleEmbedding<()> {
/// Converts this filterless strategy into one typed over a concrete `FILTER`.
///
/// The returned strategy copies `top_k` and has no filter set (`filter` is `None`).
/// The method borrows `self`, so the original strategy is left unchanged.
pub fn into_concrete_filter<FILTER: SearchFilter>(&self) -> SimilaritySingleEmbedding<FILTER> {
SimilaritySingleEmbedding::<FILTER> {
top_k: self.top_k,
filter: None,
}
}
}

impl<FILTER: SearchFilter> SimilaritySingleEmbedding<FILTER> {
    /// Creates a strategy that uses `filter`, with every other setting at its default.
    pub fn from_filter(filter: FILTER) -> Self {
        // Start from the defaults and attach the filter; `top_k` is carried over unchanged.
        Self::default().with_filter(filter)
    }

    /// Sets the maximum number of documents to return.
    ///
    /// Returns the same strategy so further calls can be chained.
    pub fn with_top_k(&mut self, top_k: u64) -> &mut Self {
        self.top_k = top_k;
        self
    }

    /// Returns the maximum number of documents to return.
    pub fn top_k(&self) -> u64 {
        self.top_k
    }

    /// Consumes the strategy and returns one that uses `filter` in the query.
    ///
    /// The filter type may differ from the current one; `top_k` is preserved and
    /// any previously set filter is discarded.
    pub fn with_filter<NEWFILTER: SearchFilter>(
        self,
        filter: NEWFILTER,
    ) -> SimilaritySingleEmbedding<NEWFILTER> {
        let Self { top_k, .. } = self;
        SimilaritySingleEmbedding {
            top_k,
            filter: Some(filter),
        }
    }

    /// Returns a reference to the optional filter; `None` when no filter has been set.
    pub fn filter(&self) -> &Option<FILTER> {
        &self.filter
    }
}
6 changes: 5 additions & 1 deletion swiftide-integrations/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,10 @@ arrow = { workspace = true, optional = true }

[dev-dependencies]
swiftide-core = { path = "../swiftide-core", features = ["test-utils"] }
swiftide-test-utils = { path = "../swiftide-test-utils", features = [
"test-utils",
] }
temp-dir = { workspace = true }

arrow = { workspace = true, features = ["test_utils"] }

Expand All @@ -95,7 +99,7 @@ default = ["rustls"]
# Ensures rustls is used
rustls = ["reqwest/rustls-tls-native-roots"]
# Qdrant for storage
qdrant = ["dep:qdrant-client"]
qdrant = ["dep:qdrant-client", "swiftide-core/qdrant"]
# Redis for caching and storage
redis = ["dep:redis"]
# Tree-sitter for code operations and chunking
Expand Down
Loading

0 comments on commit e902cb7

Please sign in to comment.