feat: Index markdown in pgvector #392

Open · shamb0 wants to merge 35 commits into master from feat/indexing-into-pgvector

Changes from 16 commits

Commits (35)
bfa44b5  feat: Index markdown in pgvector (shamb0, Oct 16, 2024)
3243fd5  chore(ci): Switch to dependabot for better grouping (#398) (timonv, Oct 20, 2024)
e914cba  chore(deps): bump SethCohen/github-releases-to-discord from 1.15.1 to… (dependabot[bot], Oct 20, 2024)
f305ef8  chore(ci): Explicit allow all for dependabot (timonv, Oct 20, 2024)
b3b3175  fix(ci): Update dependabot.yml via ui (#402) (timonv, Oct 21, 2024)
fe25b17  fix(indexing): Improve splitters consistency and provide defaults (#403) (timonv, Oct 21, 2024)
b531bdd  fix(indexing): Visibility of ChunkMarkdown builder should be public (timonv, Oct 21, 2024)
2a43a75  chore: Improve workspace configuration (#404) (timonv, Oct 21, 2024)
c17e9a9  chore: release v0.13.4 (#400) (SwabbieBosun, Oct 21, 2024)
b6fa280  fix(ci): Remove explicit 'all' from dependabot config (timonv, Oct 21, 2024)
c08658f  chore: Soft update deps (timonv, Oct 21, 2024)
5c3aff8  fix(ci): Add zlib to allowed licenses (timonv, Oct 21, 2024)
57014d2  fix(ci): Add back allow all in dependabot and fix aws pattern (timonv, Oct 21, 2024)
f60d009  feat: Index markdown in pgvector (shamb0, Oct 16, 2024)
4266bbe  Addressed review comments: (shamb0, Oct 22, 2024)
9a32436  Addressed review comments: (shamb0, Oct 22, 2024)
95e925a  Update examples/index_md_into_pgvector.rs (shamb0, Oct 30, 2024)
72ba300  fix(ci): Remove cache fixing ci disk limits (#408) (timonv, Oct 22, 2024)
6781ec3  chore(deps): bump the minor group across 1 directory with 12 updates… (dependabot[bot], Oct 23, 2024)
5c3458c  fix(indexing)!: Node ID no longer memoized (#414) (timonv, Oct 27, 2024)
40709be  fix(indexing): Use atomics for key generation in memory storage (#415) (timonv, Oct 27, 2024)
7fba78d  feat(integrations): Support in process hugging face models via mistra… (timonv, Oct 27, 2024)
ce3945b  chore(deps): bump the minor group across 1 directory with 16 updates… (dependabot[bot], Oct 27, 2024)
ae7718d  chore: release v0.14.0 (#416) (SwabbieBosun, Oct 27, 2024)
3c74464  fix: Revert 0.14 release as mistralrs is unpublished (#417) (timonv, Oct 27, 2024)
e32f721  fix(integrations): Revert mistralrs support (#418) (timonv, Oct 27, 2024)
30c2d01  chore: Re-release 0.14 without mistralrs (#419) (timonv, Oct 27, 2024)
fade2fe  chore: release v0.14.1 (#420) (SwabbieBosun, Oct 27, 2024)
acb34af  feat: Index markdown in pgvector (shamb0, Oct 16, 2024)
b7aa295  chore: release v0.13.4 (#400) (SwabbieBosun, Oct 21, 2024)
bd0b265  Completed release v0.14.1 intake (shamb0, Oct 30, 2024)
3eb579f  Merge branch 'master' into feat/indexing-into-pgvector (shamb0, Oct 30, 2024)
6ad22f1  merge to upstream master (shamb0, Oct 30, 2024)
15b2909  Address review feedback: (shamb0, Nov 1, 2024)
6817d4b  Merge remote-tracking branch 'upstream/master' into feat/indexing-int… (shamb0, Nov 1, 2024)
467 changes: 467 additions & 0 deletions Cargo.lock

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions Cargo.toml
@@ -52,6 +52,8 @@ arrow-array = { version = "52.0", default-features = false }
arrow = { version = "52.2" }
parquet = { version = "52.2", default-features = false, features = ["async"] }
redb = { version = "2.1" }
sqlx = { version = "0.8.2", features = ["postgres", "uuid"] }
pgvector = { version = "0.4.0", features = ["sqlx"] }

# Testing
test-log = "0.2.16"
@@ -61,6 +63,8 @@ temp-dir = "0.1.13"
wiremock = "0.6.0"
test-case = "3.3.1"
insta = { version = "1.39.0", features = ["yaml"] }
tempfile = "3.10.1"
portpicker = "0.1.1"

[workspace.lints.rust]
unsafe_code = "forbid"
8 changes: 8 additions & 0 deletions examples/Cargo.toml
@@ -21,13 +21,17 @@ swiftide = { path = "../swiftide/", features = [
    "ollama",
    "fluvio",
    "lancedb",
    "pgvector",
] }
tracing-subscriber = "0.3"
tracing = { workspace = true }
serde_json = { workspace = true }
spider = { workspace = true }
qdrant-client = { workspace = true }
fluvio = { workspace = true }
temp-dir = { workspace = true }
sqlx = { workspace = true }
swiftide-test-utils = { path = "../swiftide-test-utils" }

[[example]]
doc-scrape-examples = true
@@ -91,3 +95,7 @@ path = "fluvio.rs"
[[example]]
name = "lancedb"
path = "lancedb.rs"

[[example]]
name = "index-md-pgvector"
path = "index_md_into_pgvector.rs"
77 changes: 77 additions & 0 deletions examples/index_md_into_pgvector.rs
@@ -0,0 +1,77 @@
/**
 * This example demonstrates how to use the Pgvector integration with Swiftide.
 */
use std::path::PathBuf;
use swiftide::{
    indexing::{
        self,
        loaders::FileLoader,
        transformers::{
            metadata_qa_text::NAME as METADATA_QA_TEXT_NAME, ChunkMarkdown, Embed, MetadataQAText,
        },
        EmbeddedField,
    },
    integrations::{self, pgvector::PgVector},
};

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    tracing_subscriber::fmt::init();
    tracing::info!("Starting PgVector indexing test");

    // Get the manifest directory path
    let manifest_dir = std::env::var("CARGO_MANIFEST_DIR").expect("CARGO_MANIFEST_DIR not set");

    // Create a PathBuf to the test dataset from the manifest directory
    let test_dataset_path = PathBuf::from(manifest_dir).join("../README.md");

    tracing::info!("Test dataset path: {:?}", test_dataset_path);

    let (_pgv_db_container, pgv_db_url, _temp_dir) = swiftide_test_utils::start_postgres().await;

    tracing::info!("pgv_db_url :: {:#?}", pgv_db_url);

    let llm_client = integrations::ollama::Ollama::default()
        .with_default_prompt_model("llama3.2:latest")
        .to_owned();

    let fastembed =
        integrations::fastembed::FastEmbed::try_default().expect("Could not create FastEmbed");

    // Configure Pgvector with a default vector size and a single embedding;
    // in addition to embedding the text, also store its metadata in a field
    let pgv_storage = PgVector::builder()
        .try_connect_to_pool(pgv_db_url, Some(10))
        .await
        .expect("Failed to connect to postgres server")
        .vector_size(384)
        .with_vector(EmbeddedField::Combined)
        .with_metadata(METADATA_QA_TEXT_NAME)
        .table_name("swiftide_pgvector_test".to_string())
        .build()
        .unwrap();
timonv (Member) commented:

Almost exactly right! I prefer it if builders do not do IO if they can avoid it, for multiple reasons. In this case, that also has the benefit of being able to connect lazily and hiding the details of the connection pool.

i.e. a builder API like:

    let pgv_storage = PgVector::builder()
        .database_url(pgv_db_url)
        .pool_size(10) // With a sane default if omitted
        .vector_size(384)
        .with_vector(EmbeddedField::Combined)
        .with_metadata(METADATA_QA_TEXT_NAME)
        .table_name("swiftide_pgvector_test".to_string())
        .build()
        .unwrap();

And then in PgVector::setup (which is only called once):

    async fn setup(&self) -> Result<()> {
        self.try_connect_to_pool(self.database_url, self.pool_size).await?;
        ...

shamb0 (Author) replied:

@timonv, I'm looking for your input on a design choice here.

If we decide to handle the database connection pool setup within fn setup(&self) instead of PgVectorBuilder, we'll need to mutate PgVector within fn setup(). This change would mean updating the function signature in trait Persist to:

    async fn setup(&mut self) -> Result<()>

For example:

    async fn setup(&mut self) -> Result<()> {
        self.connection_pool = self.try_connect_to_pool(self.database_url, self.pool_size).await?;
        ...
    }

This adjustment would introduce breaking changes across the stack, particularly impacting:

  • swiftide-indexing/src/persist/memory_storage.rs
  • swiftide-integrations/src/lancedb/persist.rs
  • swiftide-integrations/src/qdrant/persist.rs
  • swiftide-integrations/src/redis/persist.rs

Would you prefer moving the IO operations into Persist::setup() for these components? If so, we could handle this as a separate PR to streamline the updates.

Looking forward to your thoughts!
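
An editorial aside: the tension above can also be resolved without changing Persist::setup(&self) to &mut self, by giving the lazily-created pool interior mutability. A minimal sketch, assuming database_url and pool_size fields on the struct and tokio's OnceCell; this is an illustration, not the API shipped in this PR:

    use anyhow::Result;
    use sqlx::postgres::PgPoolOptions;
    use sqlx::PgPool;
    use tokio::sync::OnceCell;

    struct PgVector {
        database_url: String,
        pool_size: u32,
        // Lazily-initialized pool; wrap in Arc if the struct must stay Clone.
        connection_pool: OnceCell<PgPool>,
    }

    impl PgVector {
        // Callable through &self; the first call connects, and any later
        // call (setup is only invoked once per pipeline anyway) reuses it.
        async fn setup(&self) -> Result<()> {
            self.connection_pool
                .get_or_try_init(|| async {
                    PgPoolOptions::new()
                        .max_connections(self.pool_size)
                        .connect(&self.database_url)
                        .await
                })
                .await?;
            Ok(())
        }
    }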


    // Drop the existing test table before running the test
    tracing::info!("Dropping existing test table & index if it exists");
    let drop_table_sql = "DROP TABLE IF EXISTS swiftide_pgvector_test";
    let drop_index_sql = "DROP INDEX IF EXISTS swiftide_pgvector_test_embedding_idx";

    if let Ok(pool) = pgv_storage.get_pool() {
        sqlx::query(drop_table_sql).execute(&pool).await?;
        sqlx::query(drop_index_sql).execute(&pool).await?;
    } else {
        return Err("Failed to get database connection pool".into());
    }

    tracing::info!("Starting indexing pipeline");
    indexing::Pipeline::from_loader(FileLoader::new(test_dataset_path).with_extensions(&["md"]))
        .then_chunk(ChunkMarkdown::from_chunk_range(10..2048))
        .then(MetadataQAText::new(llm_client.clone()))
        .then_in_batch(Embed::new(fastembed.clone()).with_batch_size(100))
        .then_store_with(pgv_storage.clone())
        .run()
        .await?;

    tracing::info!("PgVector Indexing test completed successfully");
    Ok(())
}
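
Editor's note: assuming Docker is available for the throwaway Postgres container and a local Ollama is serving llama3.2, the example registered above should be runnable from the examples crate with:

    cargo run --example index-md-pgvector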
9 changes: 9 additions & 0 deletions swiftide-integrations/Cargo.toml
@@ -34,6 +34,13 @@ async-openai = { workspace = true, optional = true }
qdrant-client = { workspace = true, optional = true, default-features = false, features = [
    "serde",
] }
sqlx = { workspace = true, optional = true, features = [
    "postgres",
    "runtime-tokio",
    "chrono",
    "uuid",
] }
pgvector = { workspace = true, optional = true, features = ["sqlx"] }
redis = { version = "0.27", features = [
    "aio",
    "tokio-comp",
@@ -102,6 +109,8 @@ default = ["rustls"]
rustls = ["reqwest/rustls-tls-native-roots"]
# Qdrant for storage
qdrant = ["dep:qdrant-client", "swiftide-core/qdrant"]
# PgVector for storage
pgvector = ["dep:sqlx", "dep:pgvector"]
# Redis for caching and storage
redis = ["dep:redis"]
# Tree-sitter for code operations and chunking
2 changes: 2 additions & 0 deletions swiftide-integrations/src/lib.rs
@@ -16,6 +16,8 @@ pub mod ollama;
pub mod openai;
#[cfg(feature = "parquet")]
pub mod parquet;
#[cfg(feature = "pgvector")]
pub mod pgvector;
#[cfg(feature = "qdrant")]
pub mod qdrant;
#[cfg(feature = "redb")]
163 changes: 163 additions & 0 deletions swiftide-integrations/src/pgvector/mod.rs
@@ -0,0 +1,163 @@
//! This module integrates with the pgvector database, providing functionality to create and manage vector collections,
//! store data, and optimize indexing for efficient searches.
//!
//! pgvector is utilized in both the `indexing::Pipeline` and `query::Pipeline` modules.
mod persist;
mod pgv_table_types;
use anyhow::Result;
use derive_builder::Builder;
use sqlx::PgPool;
use std::fmt;

use pgv_table_types::{FieldConfig, MetadataConfig, PgDBConnectionPool, VectorConfig};

const DEFAULT_BATCH_SIZE: usize = 50;

/// Represents a Pgvector client with configuration options.
///
/// This struct is used to interact with the Pgvector vector database, providing methods to manage vector collections,
/// store data, and ensure efficient searches. The client can be cloned at low cost as it shares connections.
#[derive(Builder, Clone)]
#[builder(setter(into, strip_option), build_fn(error = "anyhow::Error"))]
pub struct PgVector {
    /// Database connection pool.
    #[builder(default = "PgDBConnectionPool::default()")]
    connection_pool: PgDBConnectionPool,

    /// Table name to store vectors in.
    #[builder(default = "String::from(\"swiftide_pgv_store\")")]
    table_name: String,

    /// Default size of vectors. Vectors can also be of different
    /// sizes by specifying the size in the vector configuration.
    vector_size: Option<i32>,
timonv (Member) commented:

This value isn't optional, is it? What happens if it is None?

shamb0 (Author) replied:

Just to clarify: this parameter can't be None, because if it is, the user won't be able to build the PgVector::fields parameter, and PgVectorBuilder::with_vector() would fail. We're ensuring that users configure this parameter correctly before launching the indexing pipeline.

I'd love to hear your thoughts and any suggestions for enhancing this approach.
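
An editorial illustration of that guarantee: because vector_size carries no #[builder(default)], derive_builder makes build() fail when the field is never set, so the misconfiguration surfaces before the pipeline starts. A sketch of the expected behavior, not a test from this PR:

    use swiftide::indexing::EmbeddedField;
    use swiftide::integrations::pgvector::PgVector;

    fn main() {
        // vector_size is never set here, so build() returns an
        // uninitialized-field error instead of an unsized vector column.
        let result = PgVector::builder()
            .table_name("swiftide_pgvector_test".to_string())
            .with_vector(EmbeddedField::Combined)
            .build();
        assert!(result.is_err());
    }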


    /// Batch size for storing nodes.
    #[builder(default = "Some(DEFAULT_BATCH_SIZE)")]
    batch_size: Option<usize>,
timonv (Member) commented:

Is it intentional that this value is optional and can be None?

shamb0 (Author) replied:

For the batch_size configuration, I decided to keep it optional, drawing inspiration from the reference Qdrant implementation. By default, the batch size for PgVector is set to 50, though we still plan to fine-tune this for optimal performance.

To answer your question, "Is it intentional that this value is optional and can be None?": yes, it's designed that way. In the indexing pipeline, Pipeline::then_store_with() functions as an adapter, routing node streams to backend storage depending on the batch setting. Here's the approach (see the sketch after this list):

  • When Persist::batch_size() returns None, each node is processed individually, with Persist::store() sending each chunk to the backend.
  • If Persist::batch_size() returns Some(), batch processing is enabled. The stream of nodes is grouped into chunks based on the batch size, and Persist::batch_store() sends these batches to storage.

Does this implementation align with the requirements? Let me know if there's anything specific you'd like adjusted.
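
A simplified editorial model of that routing; the real swiftide Pipeline operates over async node streams, so the trait, node type, and function below are stand-ins rather than the actual API:

    // Simplified stand-ins for swiftide's Persist trait and node type.
    trait SimplePersist {
        fn batch_size(&self) -> Option<usize>;
        fn store(&self, node: String);
        fn batch_store(&self, nodes: &[String]);
    }

    // Batch when batch_size() is Some, store one-by-one when it is None,
    // mirroring what Pipeline::then_store_with() is described to do above.
    fn route_nodes<P: SimplePersist>(storage: &P, nodes: &[String]) {
        match storage.batch_size() {
            Some(n) => {
                for batch in nodes.chunks(n) {
                    storage.batch_store(batch);
                }
            }
            None => {
                for node in nodes {
                    storage.store(node.clone());
                }
            }
        }
    }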


    /// Field configuration for the Pgvector table, determining the eventual table schema.
    ///
    /// Supports multiple field types; see [`FieldConfig`] for details.
    #[builder(default)]
    fields: Vec<FieldConfig>,
}

impl fmt::Debug for PgVector {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Access the connection pool synchronously and determine the status.
        let connection_status = self.connection_pool.connection_status();
timonv (Member) commented on the line above:

Does this do a query? Debug is called extensively with tracing, which would mean (I think?) a query on every log / trace statement.


        f.debug_struct("PgVector")
            .field("table_name", &self.table_name)
            .field("vector_size", &self.vector_size)
            .field("batch_size", &self.batch_size)
            .field("connection_status", &connection_status)
            .finish()
    }
}
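
If connection_status() does issue a query, one query-free alternative is to derive the status from local pool state alone. A sketch, under the assumption that the pool is held internally as an Option<PgPool>:

    use sqlx::PgPool;

    // Synchronous and query-free, so it is safe for Debug to call
    // on every log / trace statement.
    fn connection_status(pool: Option<&PgPool>) -> &'static str {
        match pool {
            Some(p) if !p.is_closed() => "connected",
            Some(_) => "closed",
            None => "not connected",
        }
    }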

impl PgVector {
    /// Creates a new instance of `PgVectorBuilder` with default settings.
    ///
    /// # Returns
    ///
    /// A new `PgVectorBuilder`.
    pub fn builder() -> PgVectorBuilder {
        PgVectorBuilder::default()
    }

    /// Retrieves the connection pool for `PostgreSQL`.
    ///
    /// This function returns the connection pool used for interacting with the `PostgreSQL` database.
    /// It fetches the pool from the `PgDBConnectionPool` struct.
    ///
    /// # Returns
    ///
    /// A `Result` that, on success, contains the `PgPool` representing the database connection pool.
    /// On failure, an error is returned.
    ///
    /// # Errors
    ///
    /// This function will return an error if it fails to retrieve the connection pool, which could occur
    /// if the underlying connection to `PostgreSQL` has not been properly established.
    pub fn get_pool(&self) -> Result<PgPool> {
        self.connection_pool.get_pool()
    }
}

impl PgVectorBuilder {
    /// Tries to asynchronously connect to a `Postgres` server and initialize a connection pool.
    ///
    /// This function attempts to establish a connection to the specified `Postgres` server and
    /// sets up a connection pool with an optional maximum number of connections.
    ///
    /// # Arguments
    ///
    /// * `url` - A string reference representing the URL of the `Postgres` server to connect to.
    /// * `connection_max` - An optional value specifying the maximum number of connections in the pool.
    ///
    /// # Returns
    ///
    /// A `Result` that contains an updated `PgVector` instance with the new connection pool on success.
    /// On failure, an error is returned.
    ///
    /// # Errors
    ///
    /// This function returns an error if the connection to the database fails or if retries are exhausted.
    /// Possible reasons include invalid database URLs, unreachable servers, or exceeded retry limits.
    pub async fn try_connect_to_pool(
        mut self,
        url: impl AsRef<str>,
        connection_max: Option<u32>,
    ) -> Result<Self> {
        let pool = self.connection_pool.clone().unwrap_or_default();

        self.connection_pool = Some(pool.try_connect_to_url(url, connection_max).await?);
timonv (Member) commented:

See the other comment. Changing that and moving it to the main struct will also remove the need to clone, unwrap, and Option.

        Ok(self)
    }

    /// Adds a vector configuration to the builder.
    ///
    /// # Arguments
    ///
    /// * `config` - The vector configuration to add, which can be converted into a `VectorConfig`.
    ///
    /// # Returns
    ///
    /// A mutable reference to the builder with the new vector configuration added.
    pub fn with_vector(&mut self, config: impl Into<VectorConfig>) -> &mut Self {
        // Use `get_or_insert_with` to initialize `fields` if it's `None`
        self.fields
            .get_or_insert_with(Self::default_fields)
            .push(FieldConfig::Vector(config.into()));

        self
    }

    /// Sets the metadata configuration for the vector similarity search.
    ///
    /// This method allows you to specify metadata configurations for vector similarity search using `MetadataConfig`.
    /// The provided configuration will be added as a new field in the builder.
    ///
    /// # Arguments
    ///
    /// * `config` - The metadata configuration to use.
    ///
    /// # Returns
    ///
    /// * Returns a mutable reference to `self` for method chaining.
    pub fn with_metadata(&mut self, config: impl Into<MetadataConfig>) -> &mut Self {
        // Use `get_or_insert_with` to initialize `fields` if it's `None`
        self.fields
            .get_or_insert_with(Self::default_fields)
            .push(FieldConfig::Metadata(config.into()));

        self
    }

    fn default_fields() -> Vec<FieldConfig> {
        vec![FieldConfig::ID, FieldConfig::Chunk]
    }
}