feat: Index markdown in pgvector
- Prototype pipeline for loading, chunking, enhancing, embedding, and storing markdown content in pgvector.

Signed-off-by: shamb0 <r.raajey@gmail.com>
shamb0 committed Oct 16, 2024
1 parent ef5d32e commit bfa44b5
Showing 13 changed files with 1,332 additions and 195 deletions.
859 changes: 665 additions & 194 deletions Cargo.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions Cargo.toml
@@ -61,6 +61,8 @@ arrow-array = { version = "52.0", default-features = false }
arrow = { version = "52.2" }
parquet = { version = "52.2", default-features = false, features = ["async"] }
redb = { version = "2.1" }
sqlx = { version = "0.8.2", features = ["postgres", "uuid"] }
pgvector = { version = "0.4.0", features = ["sqlx"] }

# Testing
test-log = "0.2.16"
7 changes: 7 additions & 0 deletions examples/Cargo.toml
@@ -21,13 +21,16 @@ swiftide = { path = "../swiftide/", features = [
"ollama",
"fluvio",
"lancedb",
"pgvector",
] }
tracing-subscriber = "0.3"
tracing = { workspace = true }
serde_json = { workspace = true }
spider = { workspace = true }
qdrant-client = { workspace = true }
fluvio = { workspace = true }
temp-dir = { workspace = true }
sqlx = { workspace = true }

[[example]]
doc-scrape-examples = true
@@ -91,3 +94,7 @@ path = "fluvio.rs"
[[example]]
name = "lancedb"
path = "lancedb.rs"

[[example]]
name = "index-md-pgvector"
path = "index_md_into_pgvector.rs"
75 changes: 75 additions & 0 deletions examples/index_md_into_pgvector.rs
@@ -0,0 +1,75 @@
//! This example demonstrates how to use the PgVector integration with Swiftide.

use std::path::PathBuf;
use swiftide::{
indexing::{
self,
loaders::FileLoader,
transformers::{
metadata_qa_text::NAME as METADATA_QA_TEXT_NAME, ChunkMarkdown, Embed, MetadataQAText,
},
EmbeddedField,
},
integrations::{self, pgvector::PgVector},
};

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
tracing_subscriber::fmt::init();
tracing::info!("Starting PgVector indexing test");

// Get the manifest directory path
let manifest_dir = std::env::var("CARGO_MANIFEST_DIR").expect("CARGO_MANIFEST_DIR not set");

// Build the path to the test dataset relative to the manifest directory
let test_dataset_path = PathBuf::from(manifest_dir).join("test_dataset");
tracing::info!("Test Dataset path: {:?}", test_dataset_path);

let pgv_db_url = std::env::var("DATABASE_URL")
.as_deref()
.unwrap_or("postgresql://myuser:mypassword@localhost:5432/mydatabase")
.to_owned();

let ollama_client = integrations::ollama::Ollama::default()
.with_default_prompt_model("llama3.2:latest")
.to_owned();

let fastembed =
integrations::fastembed::FastEmbed::try_default().expect("Could not create FastEmbed");

// Configure PgVector with a 384-dimensional vector and a single combined embedding,
// and, in addition to embedding the QA metadata, store it in its own field
let pgv_storage = PgVector::builder()
.try_from_url(pgv_db_url, Some(10))
.await
.expect("Failed to connect to postgres server")
.vector_size(384)
.with_vector(EmbeddedField::Combined)
.with_metadata(METADATA_QA_TEXT_NAME)
.table_name("swiftide_pgvector_test".to_string())
.build()
.unwrap();

// Drop the existing test table before running the test
tracing::info!("Dropping existing test table if it exists");
let drop_table_sql = "DROP TABLE IF EXISTS swiftide_pgvector_test";

if let Some(pool) = pgv_storage.get_pool() {
sqlx::query(drop_table_sql).execute(pool).await?;
} else {
return Err("Failed to get database connection pool".into());
}

tracing::info!("Starting indexing pipeline");
indexing::Pipeline::from_loader(FileLoader::new(test_dataset_path).with_extensions(&["md"]))
.then_chunk(ChunkMarkdown::from_chunk_range(10..2048))
.then(MetadataQAText::new(ollama_client.clone()))
.then_in_batch(Embed::new(fastembed).with_batch_size(100))
.then_store_with(pgv_storage.clone())
.run()
.await?;

tracing::info!("Indexing test completed successfully");
Ok(())
}
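
To try this example locally, one likely workflow (based only on what this commit adds) is: bring up Postgres with scripts/docker/docker-compose-db-pg.yml, whose default user, password, and database match the fallback DATABASE_URL above; have an Ollama instance serving llama3.2 for the MetadataQAText step; then run the index-md-pgvector example from the examples crate.
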
41 changes: 41 additions & 0 deletions examples/test_dataset/README.md
@@ -0,0 +1,41 @@
# **Swiftide: A Fast, Streaming Indexing and Query Library for AI Applications**

Swiftide is a Rust-native library designed to simplify the development of Large Language Model (LLM) applications. It addresses the challenge of providing context to LLMs for solving real-world problems by enabling efficient ingestion, transformation, indexing, and querying of extensive data. This process, known as Retrieval Augmented Generation (RAG), enhances the capabilities of LLMs.

## **Key Features:**

* **Fast and Modular Indexing:** Swiftide offers a high-performance, streaming indexing pipeline with asynchronous, parallel processing capabilities.
* **Query Pipeline:** An experimental query pipeline facilitates efficient retrieval and processing of information.
* **Versatility:** The library includes various loaders, transformers, semantic chunkers, embedders, and other components, providing flexibility for different use cases.
* **Extensibility:** Developers can bring their own transformers by extending straightforward traits or using closures (a sketch follows this list).
* **Pipeline Management:** Swiftide supports splitting and merging pipelines for complex workflows.
* **Prompt Templating:** Jinja-like templating simplifies the creation of prompts.
* **Storage Options:** Integration with multiple storage backends, including Qdrant, Redis, and LanceDB.
* **Integrations:** Seamless integration with popular tools and platforms like OpenAI, Groq, Redis, Qdrant, Ollama, FastEmbed-rs, Fluvio, LanceDB, and Treesitter.
* **Evaluation:** Pipeline evaluation using RAGAS for performance assessment.
* **Sparse Vector Support:** Enables hybrid search with sparse vector support.
* **Tracing:** Built-in tracing support for logging and debugging.
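
The closure route mentioned above can be sketched roughly as follows. This is a minimal sketch, not the library's documented API: it reuses the setup from the pgvector example added in this commit (`test_dataset_path`, `fastembed`, `pgv_storage`) and assumes a plain `Fn(Node) -> anyhow::Result<Node>` closure is accepted wherever a transformer is expected; the exact closure signature may differ.

```rust
use swiftide::indexing::Node;

// Sketch: the pipeline from index_md_into_pgvector.rs, with one closure-based
// transformer spliced in between chunking and embedding. The closure signature
// shown here is an assumption.
indexing::Pipeline::from_loader(FileLoader::new(test_dataset_path).with_extensions(&["md"]))
    .then_chunk(ChunkMarkdown::from_chunk_range(10..2048))
    // Hand-rolled transformer: collapse runs of whitespace in each chunk.
    .then(|mut node: Node| {
        let collapsed = node.chunk.split_whitespace().collect::<Vec<_>>().join(" ");
        node.chunk = collapsed;
        Ok(node)
    })
    .then_in_batch(Embed::new(fastembed).with_batch_size(100))
    .then_store_with(pgv_storage.clone())
    .run()
    .await?;
```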

## **Technical Insights:**

* **Rust-Native:** Developed in Rust for performance, safety, and concurrency.
* **Streaming Architecture:** Employs a streaming architecture for efficient processing of large datasets.
* **Modularity:** Highly modular design allows for customization and extensibility.
* **Asynchronous and Parallel Processing:** Leverages asynchronous and parallel processing for optimal performance.
* **Strong Typing:** The query pipeline is fully and strongly typed, ensuring type safety and developer productivity.
* **OpenAI Integration:** Provides seamless integration with OpenAI for powerful LLM capabilities.

## **Getting Started:**

To get started with Swiftide, developers need to set up a Rust project, add the Swiftide library as a dependency, enable the required integration features, and write a pipeline. Comprehensive examples and documentation are available to guide developers through the process.
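
As a rough end-to-end sketch of such a pipeline, condensed from the `index_md_into_pgvector.rs` example added in this commit: the `pgvector` and `fastembed` features are assumed to be enabled on the `swiftide` dependency, the source path and table name are illustrative, and the metadata-enrichment step of the full example is omitted.

```rust
use swiftide::{
    indexing::{
        self,
        loaders::FileLoader,
        transformers::{ChunkMarkdown, Embed},
        EmbeddedField,
    },
    integrations::{self, pgvector::PgVector},
};

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Embedder: FastEmbed with its default model (384-dimensional vectors).
    let fastembed =
        integrations::fastembed::FastEmbed::try_default().expect("Could not create FastEmbed");

    // Storage: a pgvector-backed table holding a single combined embedding per chunk.
    let db_url = "postgresql://myuser:mypassword@localhost:5432/mydatabase".to_string();
    let pgv_storage = PgVector::builder()
        .try_from_url(db_url, Some(10))
        .await
        .expect("Failed to connect to postgres server")
        .vector_size(384)
        .with_vector(EmbeddedField::Combined)
        .table_name("swiftide_readme_demo".to_string())
        .build()
        .unwrap();

    // Load markdown files, chunk them, embed the chunks, and store them in Postgres.
    indexing::Pipeline::from_loader(FileLoader::new(".").with_extensions(&["md"]))
        .then_chunk(ChunkMarkdown::from_chunk_range(10..2048))
        .then_in_batch(Embed::new(fastembed).with_batch_size(100))
        .then_store_with(pgv_storage)
        .run()
        .await?;
    Ok(())
}
```

The full example in this commit additionally enriches each chunk with question/answer metadata (`MetadataQAText` via Ollama) before embedding, and stores that metadata in its own field.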

## **Current Status and Future Roadmap:**

Swiftide is under active development and may introduce breaking changes as it progresses towards version 1.0. The documentation may not cover all features and could be slightly outdated. Despite these considerations, Swiftide offers a promising solution for building efficient and scalable LLM applications. The project's roadmap includes addressing open issues and incorporating proposed features to enhance its functionality and usability.

## **Community and Contributions:**

The Swiftide community welcomes feedback, questions, and contributions. Developers can connect with the community on Discord and contribute to the project by forking the repository, creating pull requests, or opening issues with enhancement tags.

**Overall, Swiftide presents a powerful and flexible framework for building Retrieval Augmented Generation (RAG) pipelines in Rust. Its focus on performance, modularity, and extensibility makes it a valuable tool for developers working with LLMs and AI applications.**

46 changes: 46 additions & 0 deletions scripts/docker/docker-compose-db-pg.yml
@@ -0,0 +1,46 @@
services:
test_env_pgvector:
image: ankane/pgvector:v0.5.1
container_name: test_env_pgvector
environment:
POSTGRES_USER: ${POSTGRES_USER:-myuser}
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-mypassword}
POSTGRES_DB: ${POSTGRES_DB:-mydatabase}
ports:
- "5432:5432"
volumes:
- test_env_pgvector_data:/var/lib/postgresql/data
networks:
- pg-network
healthcheck:
test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-myuser} -d ${POSTGRES_DB:-mydatabase}"]
interval: 10s
timeout: 5s
retries: 5
restart: unless-stopped

pgadmin:
image: dpage/pgadmin4
container_name: test_env_pgadmin
environment:
PGADMIN_DEFAULT_EMAIL: ${PGADMIN_DEFAULT_EMAIL:-admin@admin.com}
PGADMIN_DEFAULT_PASSWORD: ${PGADMIN_DEFAULT_PASSWORD:-root}
ports:
- "8080:80"
volumes:
- test_env_pgadmin_data:/var/lib/pgadmin
depends_on:
- test_env_pgvector
networks:
- pg-network
restart: unless-stopped

networks:
pg-network:
name: pg-network

volumes:
test_env_pgvector_data:
name: test_env_pgvector_data
test_env_pgadmin_data:
name: test_env_pgadmin_data
9 changes: 9 additions & 0 deletions swiftide-integrations/Cargo.toml
@@ -34,6 +34,13 @@ async-openai = { workspace = true, optional = true }
qdrant-client = { workspace = true, optional = true, default-features = false, features = [
"serde",
] }
sqlx = { workspace = true, optional = true, features = [
"postgres",
"runtime-tokio",
"chrono",
"uuid"
] }
pgvector = { workspace = true, optional = true, features = ["sqlx"] }
redis = { version = "0.27", features = [
"aio",
"tokio-comp",
@@ -102,6 +109,8 @@ default = ["rustls"]
rustls = ["reqwest/rustls-tls-native-roots"]
# Qdrant for storage
qdrant = ["dep:qdrant-client", "swiftide-core/qdrant"]
# PgVector for storage
pgvector = ["dep:sqlx", "dep:pgvector"]
# Redis for caching and storage
redis = ["dep:redis"]
# Tree-sitter for code operations and chunking
2 changes: 2 additions & 0 deletions swiftide-integrations/src/lib.rs
@@ -16,6 +16,8 @@ pub mod ollama;
pub mod openai;
#[cfg(feature = "parquet")]
pub mod parquet;
#[cfg(feature = "pgvector")]
pub mod pgvector;
#[cfg(feature = "qdrant")]
pub mod qdrant;
#[cfg(feature = "redb")]