Skip to content

Commit

Permalink
Migrate to on-disk qdrant collections (#887)
Browse files Browse the repository at this point in the history
* Specify on-disk qdrant collections

* Address review comment
  • Loading branch information
rsdy authored Sep 5, 2023
1 parent 083954d commit e32c351
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 57 deletions.
4 changes: 4 additions & 0 deletions server/bleep/src/indexes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,10 @@ impl Indexes {
.delete()
.await?;
}

if let Some(ref semantic) = semantic {
semantic.delete_collection().await?;
}
}
config.source.save_index_version()?;

Expand Down
3 changes: 3 additions & 0 deletions server/bleep/src/indexes/schema.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
//! Every change in this file will trigger a reset of the databases.
//! Use with care.
//!
use tantivy::schema::{
BytesOptions, Field, IndexRecordOption, Schema, SchemaBuilder, TextFieldIndexing, TextOptions,
FAST, STORED, STRING,
Expand Down
86 changes: 29 additions & 57 deletions server/bleep/src/semantic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,10 @@ use crate::{query::parser::SemanticQuery, Configuration};
use qdrant_client::{
prelude::{QdrantClient, QdrantClientConfig},
qdrant::{
point_id::PointIdOptions, r#match::MatchValue, vectors::VectorsOptions, vectors_config,
with_payload_selector, with_vectors_selector, CollectionOperationResponse,
CreateCollection, Distance, FieldCondition, FieldType, Filter, Match, PointId,
RetrievedPoint, ScoredPoint, SearchPoints, Value, VectorParams, Vectors, VectorsConfig,
WithPayloadSelector, WithVectorsSelector,
point_id::PointIdOptions, r#match::MatchValue, vectors::VectorsOptions,
with_payload_selector, with_vectors_selector, CollectionOperationResponse, FieldCondition,
FieldType, Filter, Match, PointId, RetrievedPoint, ScoredPoint, SearchPoints, Value,
Vectors, WithPayloadSelector, WithVectorsSelector,
},
};

Expand All @@ -25,10 +24,9 @@ mod schema;

pub use embedder::Embedder;
use embedder::LocalEmbedder;
use schema::{create_collection, EMBEDDING_DIM};
pub use schema::{Embedding, Payload};

pub(crate) const EMBEDDING_DIM: usize = 384;

#[derive(Error, Debug)]
pub enum SemanticError {
/// Represents failure to initialize Qdrant client
Expand Down Expand Up @@ -173,6 +171,17 @@ fn kind_to_value(kind: Option<qdrant_client::qdrant::value::Kind>) -> serde_json
}
}

/// Creates a `FieldType::Text` field index on the collection `collection_name`
/// for each of the fields `repo_ref`, `content_hash`, `branches` and
/// `relative_path`.
///
/// The indexes are requested one after another; the first failing request
/// aborts the loop and its error is returned to the caller.
async fn create_indexes(collection_name: &str, qdrant: &QdrantClient) -> anyhow::Result<()> {
    const TEXT_FIELDS: [&str; 4] = ["repo_ref", "content_hash", "branches", "relative_path"];

    for field_name in TEXT_FIELDS.iter() {
        qdrant
            .create_field_index(collection_name, field_name, FieldType::Text, None, None)
            .await?;
    }

    Ok(())
}

impl Semantic {
pub async fn initialize(
model_dir: &Path,
Expand All @@ -183,20 +192,10 @@ impl Semantic {

match qdrant.has_collection(&config.collection_name).await {
Ok(false) => {
let CollectionOperationResponse { result, time } = qdrant
.create_collection(&CreateCollection {
collection_name: config.collection_name.to_string(),
vectors_config: Some(VectorsConfig {
config: Some(vectors_config::Config::Params(VectorParams {
size: EMBEDDING_DIM as u64,
distance: Distance::Cosine.into(),
..Default::default()
})),
}),
..Default::default()
})
.await
.unwrap();
let CollectionOperationResponse { result, time } =
create_collection(&config.collection_name, &qdrant)
.await
.unwrap();

debug!(
time,
Expand All @@ -211,42 +210,7 @@ impl Semantic {
Err(_) => return Err(SemanticError::QdrantInitializationError),
}

qdrant
.create_field_index(
&config.collection_name,
"repo_ref",
FieldType::Text,
None,
None,
)
.await?;
qdrant
.create_field_index(
&config.collection_name,
"content_hash",
FieldType::Text,
None,
None,
)
.await?;
qdrant
.create_field_index(
&config.collection_name,
"branches",
FieldType::Text,
None,
None,
)
.await?;
qdrant
.create_field_index(
&config.collection_name,
"relative_path",
FieldType::Text,
None,
None,
)
.await?;
create_indexes(&config.collection_name, &qdrant).await?;

if let Some(dylib_dir) = config.dylib_dir.as_ref() {
init_ort_dylib(dylib_dir);
Expand Down Expand Up @@ -280,6 +244,14 @@ impl Semantic {
/// Returns a shared reference to the embedder held by this instance, as a
/// `dyn Embedder` trait object.
pub fn embedder(&self) -> &dyn Embedder {
self.embedder.as_ref()
}
/// Asks qdrant to delete the collection named by `config.collection_name`.
///
/// The response returned by the client is discarded; only a failure of the
/// request itself is propagated to the caller.
pub async fn delete_collection(&self) -> anyhow::Result<()> {
    let collection = &self.config.collection_name;
    let _ = self.qdrant.delete_collection(collection).await?;

    Ok(())
}

pub async fn health_check(&self) -> anyhow::Result<()> {
self.qdrant.health_check().await?;
Expand Down
32 changes: 32 additions & 0 deletions server/bleep/src/semantic/schema.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,15 @@
//! Every change in this file will trigger a reset of the databases.
//! Use with care.
//!
use qdrant_client::{
prelude::QdrantClient,
qdrant::{
vectors_config, CollectionOperationResponse, CreateCollection, Distance, VectorParams,
VectorsConfig,
},
};

/// Number of dimensions of an embedding vector; passed as the vector `size`
/// when the qdrant collection is created.
pub(super) const EMBEDDING_DIM: usize = 384;
/// An embedding, represented as a vector of `f32` values.
pub type Embedding = Vec<f32>;

#[derive(Default, Clone, Debug, serde::Deserialize, serde::Serialize)]
Expand Down Expand Up @@ -40,3 +52,23 @@ impl PartialEq for Payload {
// created payload
}
}

/// Creates the qdrant collection `name`.
///
/// The collection holds vectors of `EMBEDDING_DIM` dimensions compared with
/// cosine distance, and is created with `on_disk` set to `Some(true)`.
/// Every other setting is left at its `Default` value.
///
/// Returns the client's response unchanged, including any error.
pub(super) async fn create_collection(
    name: &str,
    qdrant: &QdrantClient,
) -> anyhow::Result<CollectionOperationResponse> {
    // Per-vector parameters: dimensionality, distance metric, storage location.
    let vector_params = VectorParams {
        size: EMBEDDING_DIM as u64,
        distance: Distance::Cosine.into(),
        on_disk: Some(true),
        ..Default::default()
    };

    let vectors_config = VectorsConfig {
        config: Some(vectors_config::Config::Params(vector_params)),
    };

    let request = CreateCollection {
        collection_name: name.to_string(),
        vectors_config: Some(vectors_config),
        ..Default::default()
    };

    qdrant.create_collection(&request).await
}

0 comments on commit e32c351

Please sign in to comment.