feat: Index markdown in pgvector
- Prototype pipeline for loading, chunking, enhancing, embedding, and storing markdown content in pgvector.

Signed-off-by: shamb0 <r.raajey@gmail.com>
shamb0 committed Oct 16, 2024
1 parent ef5d32e commit bfa44b5
Showing 13 changed files with 1,332 additions and 195 deletions.
859 changes: 665 additions & 194 deletions Cargo.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions Cargo.toml
@@ -61,6 +61,8 @@ arrow-array = { version = "52.0", default-features = false }
arrow = { version = "52.2" }
parquet = { version = "52.2", default-features = false, features = ["async"] }
redb = { version = "2.1" }
sqlx = { version = "0.8.2", features = ["postgres", "uuid"] }
pgvector = { version = "0.4.0", features = ["sqlx"] }

# Testing
test-log = "0.2.16"
7 changes: 7 additions & 0 deletions examples/Cargo.toml
@@ -21,13 +21,16 @@ swiftide = { path = "../swiftide/", features = [
"ollama",
"fluvio",
"lancedb",
"pgvector",
] }
tracing-subscriber = "0.3"
tracing = { workspace = true }
serde_json = { workspace = true }
spider = { workspace = true }
qdrant-client = { workspace = true }
fluvio = { workspace = true }
temp-dir = { workspace = true }
sqlx = { workspace = true }

[[example]]
doc-scrape-examples = true
@@ -91,3 +94,7 @@ path = "fluvio.rs"
[[example]]
name = "lancedb"
path = "lancedb.rs"

[[example]]
name = "index-md-pgvector"
path = "index_md_into_pgvector.rs"
75 changes: 75 additions & 0 deletions examples/index_md_into_pgvector.rs
@@ -0,0 +1,75 @@
//! This example demonstrates how to use the PgVector integration with Swiftide.

use std::path::PathBuf;
use swiftide::{
indexing::{
self,
loaders::FileLoader,
transformers::{
metadata_qa_text::NAME as METADATA_QA_TEXT_NAME, ChunkMarkdown, Embed, MetadataQAText,
},
EmbeddedField,
},
integrations::{self, pgvector::PgVector},
};

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
tracing_subscriber::fmt::init();
tracing::info!("Starting PgVector indexing test");

// Get the manifest directory path
let manifest_dir = std::env::var("CARGO_MANIFEST_DIR").expect("CARGO_MANIFEST_DIR not set");

// Build the path to the test dataset relative to the manifest directory
let test_dataset_path = PathBuf::from(manifest_dir).join("test_dataset");
tracing::info!("Test Dataset path: {:?}", test_dataset_path);

let pgv_db_url = std::env::var("DATABASE_URL")
.as_deref()
.unwrap_or("postgresql://myuser:mypassword@localhost:5432/mydatabase")
.to_owned();

let ollama_client = integrations::ollama::Ollama::default()
.with_default_prompt_model("llama3.2:latest")
.to_owned();

let fastembed =
integrations::fastembed::FastEmbed::try_default().expect("Could not create FastEmbed");

// Configure PgVector with a 384-dimensional vector and a single combined embedding,
// and, in addition to embedding the QA metadata, store it in its own field
let pgv_storage = PgVector::builder()
.try_from_url(pgv_db_url, Some(10))
.await
.expect("Failed to connect to postgres server")
.vector_size(384)
.with_vector(EmbeddedField::Combined)
.with_metadata(METADATA_QA_TEXT_NAME)
.table_name("swiftide_pgvector_test".to_string())
.build()
.unwrap();

// Drop the existing test table before running the test
tracing::info!("Dropping existing test table if it exists");
let drop_table_sql = "DROP TABLE IF EXISTS swiftide_pgvector_test";

if let Some(pool) = pgv_storage.get_pool() {
sqlx::query(drop_table_sql).execute(pool).await?;
} else {
return Err("Failed to get database connection pool".into());
}

tracing::info!("Starting indexing pipeline");
indexing::Pipeline::from_loader(FileLoader::new(test_dataset_path).with_extensions(&["md"]))
.then_chunk(ChunkMarkdown::from_chunk_range(10..2048))
.then(MetadataQAText::new(ollama_client.clone()))
.then_in_batch(Embed::new(fastembed).with_batch_size(100))
.then_store_with(pgv_storage.clone())
.run()
.await?;

tracing::info!("Indexing test completed successfully");
Ok(())
}
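
To try this example locally, one likely workflow (based only on what this commit adds) is: bring up Postgres with scripts/docker/docker-compose-db-pg.yml, whose default user, password, and database match the fallback DATABASE_URL above; have an Ollama instance serving llama3.2 for the MetadataQAText step; then run the index-md-pgvector example from the examples crate.
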
41 changes: 41 additions & 0 deletions examples/test_dataset/README.md
@@ -0,0 +1,41 @@
# **Swiftide: A Fast, Streaming Indexing and Query Library for AI Applications**

Swiftide is a Rust-native library designed to simplify the development of Large Language Model (LLM) applications. It addresses the challenge of providing context to LLMs for solving real-world problems by enabling efficient ingestion, transformation, indexing, and querying of extensive data. This process, known as Retrieval Augmented Generation (RAG), enhances the capabilities of LLMs.

## **Key Features:**

* **Fast and Modular Indexing:** Swiftide offers a high-performance, streaming indexing pipeline with asynchronous, parallel processing capabilities.
* **Query Pipeline:** An experimental query pipeline facilitates efficient retrieval and processing of information.
* **Versatility:** The library includes various loaders, transformers, semantic chunkers, embedders, and other components, providing flexibility for different use cases.
* **Extensibility:** Developers can bring their own transformers by extending straightforward traits or using closures (a sketch follows this list).
* **Pipeline Management:** Swiftide supports splitting and merging pipelines for complex workflows.
* **Prompt Templating:** Jinja-like templating simplifies the creation of prompts.
* **Storage Options:** Integration with multiple storage backends, including Qdrant, Redis, and LanceDB.
* **Integrations:** Seamless integration with popular tools and platforms like OpenAI, Groq, Redis, Qdrant, Ollama, FastEmbed-rs, Fluvio, LanceDB, and Treesitter.
* **Evaluation:** Pipeline evaluation using RAGAS for performance assessment.
* **Sparse Vector Support:** Enables hybrid search with sparse vector support.
* **Tracing:** Built-in tracing support for logging and debugging.
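
The closure route mentioned above can be sketched roughly as follows. This is a minimal sketch, not the library's documented API: it reuses the setup from the pgvector example added in this commit (`test_dataset_path`, `fastembed`, `pgv_storage`) and assumes a plain `Fn(Node) -> anyhow::Result<Node>` closure is accepted wherever a transformer is expected; the exact closure signature may differ.

```rust
use swiftide::indexing::Node;

// Sketch: the pipeline from index_md_into_pgvector.rs, with one closure-based
// transformer spliced in between chunking and embedding. The closure signature
// shown here is an assumption.
indexing::Pipeline::from_loader(FileLoader::new(test_dataset_path).with_extensions(&["md"]))
    .then_chunk(ChunkMarkdown::from_chunk_range(10..2048))
    // Hand-rolled transformer: collapse runs of whitespace in each chunk.
    .then(|mut node: Node| {
        let collapsed = node.chunk.split_whitespace().collect::<Vec<_>>().join(" ");
        node.chunk = collapsed;
        Ok(node)
    })
    .then_in_batch(Embed::new(fastembed).with_batch_size(100))
    .then_store_with(pgv_storage.clone())
    .run()
    .await?;
```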

## **Technical Insights:**

* **Rust-Native:** Developed in Rust for performance, safety, and concurrency.
* **Streaming Architecture:** Employs a streaming architecture for efficient processing of large datasets.
* **Modularity:** Highly modular design allows for customization and extensibility.
* **Asynchronous and Parallel Processing:** Leverages asynchronous and parallel processing for optimal performance.
* **Strong Typing:** The query pipeline is fully and strongly typed, ensuring type safety and developer productivity.
* **OpenAI Integration:** Provides seamless integration with OpenAI for powerful LLM capabilities.

## **Getting Started:**

To get started with Swiftide, developers need to set up a Rust project, add the Swiftide library as a dependency, enable the required integration features, and write a pipeline. Comprehensive examples and documentation are available to guide developers through the process.
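
As a rough end-to-end sketch of such a pipeline, condensed from the `index_md_into_pgvector.rs` example added in this commit: the `pgvector` and `fastembed` features are assumed to be enabled on the `swiftide` dependency, the source path and table name are illustrative, and the metadata-enrichment step of the full example is omitted.

```rust
use swiftide::{
    indexing::{
        self,
        loaders::FileLoader,
        transformers::{ChunkMarkdown, Embed},
        EmbeddedField,
    },
    integrations::{self, pgvector::PgVector},
};

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Embedder: FastEmbed with its default model (384-dimensional vectors).
    let fastembed =
        integrations::fastembed::FastEmbed::try_default().expect("Could not create FastEmbed");

    // Storage: a pgvector-backed table holding a single combined embedding per chunk.
    let db_url = "postgresql://myuser:mypassword@localhost:5432/mydatabase".to_string();
    let pgv_storage = PgVector::builder()
        .try_from_url(db_url, Some(10))
        .await
        .expect("Failed to connect to postgres server")
        .vector_size(384)
        .with_vector(EmbeddedField::Combined)
        .table_name("swiftide_readme_demo".to_string())
        .build()
        .unwrap();

    // Load markdown files, chunk them, embed the chunks, and store them in Postgres.
    indexing::Pipeline::from_loader(FileLoader::new(".").with_extensions(&["md"]))
        .then_chunk(ChunkMarkdown::from_chunk_range(10..2048))
        .then_in_batch(Embed::new(fastembed).with_batch_size(100))
        .then_store_with(pgv_storage)
        .run()
        .await?;
    Ok(())
}
```

The full example in this commit additionally enriches each chunk with question/answer metadata (`MetadataQAText` via Ollama) before embedding, and stores that metadata in its own field.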

## **Current Status and Future Roadmap:**

Swiftide is under active development and may introduce breaking changes as it progresses towards version 1.0. The documentation may not cover all features and could be slightly outdated. Despite these considerations, Swiftide offers a promising solution for building efficient and scalable LLM applications. The project's roadmap includes addressing open issues and incorporating proposed features to enhance its functionality and usability.

## **Community and Contributions:**

The Swiftide community welcomes feedback, questions, and contributions. Developers can connect with the community on Discord and contribute to the project by forking the repository, creating pull requests, or opening issues with enhancement tags.

**Overall, Swiftide presents a powerful and flexible framework for building Retrieval Augmented Generation (RAG) pipelines in Rust. Its focus on performance, modularity, and extensibility makes it a valuable tool for developers working with LLMs and AI applications.**

46 changes: 46 additions & 0 deletions scripts/docker/docker-compose-db-pg.yml
@@ -0,0 +1,46 @@
services:
test_env_pgvector:
image: ankane/pgvector:v0.5.1
container_name: test_env_pgvector
environment:
POSTGRES_USER: ${POSTGRES_USER:-myuser}
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-mypassword}
POSTGRES_DB: ${POSTGRES_DB:-mydatabase}
ports:
- "5432:5432"
volumes:
- test_env_pgvector_data:/var/lib/postgresql/data
networks:
- pg-network
healthcheck:
test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-myuser} -d ${POSTGRES_DB:-mydatabase}"]
interval: 10s
timeout: 5s
retries: 5
restart: unless-stopped

pgadmin:
image: dpage/pgadmin4
container_name: test_env_pgadmin
environment:
PGADMIN_DEFAULT_EMAIL: ${PGADMIN_DEFAULT_EMAIL:-admin@admin.com}
PGADMIN_DEFAULT_PASSWORD: ${PGADMIN_DEFAULT_PASSWORD:-root}
ports:
- "8080:80"
volumes:
- test_env_pgadmin_data:/var/lib/pgadmin
depends_on:
- test_env_pgvector
networks:
- pg-network
restart: unless-stopped

networks:
pg-network:
name: pg-network

volumes:
test_env_pgvector_data:
name: test_env_pgvector_data
test_env_pgadmin_data:
name: test_env_pgadmin_data
9 changes: 9 additions & 0 deletions swiftide-integrations/Cargo.toml
@@ -34,6 +34,13 @@ async-openai = { workspace = true, optional = true }
qdrant-client = { workspace = true, optional = true, default-features = false, features = [
"serde",
] }
sqlx = { workspace = true, optional = true, features = [
"postgres",
"runtime-tokio",
"chrono",
"uuid"
] }
pgvector = { workspace = true, optional = true, features = ["sqlx"] }
redis = { version = "0.27", features = [
"aio",
"tokio-comp",
@@ -102,6 +109,8 @@ default = ["rustls"]
rustls = ["reqwest/rustls-tls-native-roots"]
# Qdrant for storage
qdrant = ["dep:qdrant-client", "swiftide-core/qdrant"]
# PgVector for storage
pgvector = ["dep:sqlx", "dep:pgvector"]
# Redis for caching and storage
redis = ["dep:redis"]
# Tree-sitter for code operations and chunking
2 changes: 2 additions & 0 deletions swiftide-integrations/src/lib.rs
@@ -16,6 +16,8 @@ pub mod ollama;
pub mod openai;
#[cfg(feature = "parquet")]
pub mod parquet;
#[cfg(feature = "pgvector")]
pub mod pgvector;
#[cfg(feature = "qdrant")]
pub mod qdrant;
#[cfg(feature = "redb")]