Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Index markdown in pgvector #392

Open
wants to merge 35 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
bfa44b5
feat: Index markdown in pgvector
shamb0 Oct 16, 2024
3243fd5
chore(ci): Switch to dependabot for better grouping (#398)
timonv Oct 20, 2024
e914cba
chore(deps): bump SethCohen/github-releases-to-discord from 1.15.1 to…
dependabot[bot] Oct 20, 2024
f305ef8
chore(ci): Explicit allow all for dependabot
timonv Oct 20, 2024
b3b3175
fix(ci): Update dependabot.yml via ui (#402)
timonv Oct 21, 2024
fe25b17
fix(indexing): Improve splitters consistency and provide defaults (#403)
timonv Oct 21, 2024
b531bdd
fix(indexing): Visibility of ChunkMarkdown builder should be public
timonv Oct 21, 2024
2a43a75
chore: Improve workspace configuration (#404)
timonv Oct 21, 2024
c17e9a9
chore: release v0.13.4 (#400)
SwabbieBosun Oct 21, 2024
b6fa280
fix(ci): Remove explicit 'all' from dependabot config
timonv Oct 21, 2024
c08658f
chore: Soft update deps
timonv Oct 21, 2024
5c3aff8
fix(ci): Add zlib to allowed licenses
timonv Oct 21, 2024
57014d2
fix(ci): Add back allow all in dependabot and fix aws pattern
timonv Oct 21, 2024
f60d009
feat: Index markdown in pgvector
shamb0 Oct 16, 2024
4266bbe
Addressed review comments:
shamb0 Oct 22, 2024
9a32436
Addressed review comments:
shamb0 Oct 22, 2024
95e925a
Update examples/index_md_into_pgvector.rs
shamb0 Oct 30, 2024
72ba300
fix(ci): Remove cache fixing ci disk limits (#408)
timonv Oct 22, 2024
6781ec3
chore(deps): bump the minor group across 1 directory with 12 updates …
dependabot[bot] Oct 23, 2024
5c3458c
fix(indexing)!: Node ID no longer memoized (#414)
timonv Oct 27, 2024
40709be
fix(indexing): Use atomics for key generation in memory storage (#415)
timonv Oct 27, 2024
7fba78d
feat(integrations): Support in process hugging face models via mistra…
timonv Oct 27, 2024
ce3945b
chore(deps): bump the minor group across 1 directory with 16 updates …
dependabot[bot] Oct 27, 2024
ae7718d
chore: release v0.14.0 (#416)
SwabbieBosun Oct 27, 2024
3c74464
fix: Revert 0.14 release as mistralrs is unpublished (#417)
timonv Oct 27, 2024
e32f721
fix(integrations): Revert mistralrs support (#418)
timonv Oct 27, 2024
30c2d01
chore: Re-release 0.14 without mistralrs (#419)
timonv Oct 27, 2024
fade2fe
chore: release v0.14.1 (#420)
SwabbieBosun Oct 27, 2024
acb34af
feat: Index markdown in pgvector
shamb0 Oct 16, 2024
b7aa295
chore: release v0.13.4 (#400)
SwabbieBosun Oct 21, 2024
bd0b265
Completed release v0.14.1 intake
shamb0 Oct 30, 2024
3eb579f
Merge branch 'master' into feat/indexing-into-pgvector
shamb0 Oct 30, 2024
6ad22f1
merge to upstream master
shamb0 Oct 30, 2024
15b2909
Address review feedback:
shamb0 Nov 1, 2024
6817d4b
Merge remote-tracking branch 'upstream/master' into feat/indexing-int…
shamb0 Nov 1, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
859 changes: 665 additions & 194 deletions Cargo.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ arrow-array = { version = "52.0", default-features = false }
arrow = { version = "52.2" }
parquet = { version = "52.2", default-features = false, features = ["async"] }
redb = { version = "2.1" }
sqlx = { version = "0.8.2", features = ["postgres", "uuid"] }
pgvector = { version = "0.4.0", features = ["sqlx"] }

# Testing
test-log = "0.2.16"
Expand Down
7 changes: 7 additions & 0 deletions examples/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,16 @@ swiftide = { path = "../swiftide/", features = [
"ollama",
"fluvio",
"lancedb",
"pgvector",
] }
tracing-subscriber = "0.3"
tracing = { workspace = true }
serde_json = { workspace = true }
spider = { workspace = true }
qdrant-client = { workspace = true }
fluvio = { workspace = true }
temp-dir = { workspace = true }
sqlx = { workspace = true }

[[example]]
doc-scrape-examples = true
Expand Down Expand Up @@ -91,3 +94,7 @@ path = "fluvio.rs"
[[example]]
name = "lancedb"
path = "lancedb.rs"

[[example]]
name = "index-md-pgvector"
path = "index_md_into_pgvector.rs"
75 changes: 75 additions & 0 deletions examples/index_md_into_pgvector.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
//! Example: index a directory of markdown files into Postgres/pgvector with Swiftide.
//!
//! Pipeline stages: load `.md` files -> chunk markdown -> generate Q&A metadata
//! via Ollama -> embed chunks with FastEmbed -> persist vectors + metadata in pgvector.
use std::path::PathBuf;
use swiftide::{
indexing::{
self,
loaders::FileLoader,
transformers::{
metadata_qa_text::NAME as METADATA_QA_TEXT_NAME, ChunkMarkdown, Embed, MetadataQAText,
},
EmbeddedField,
},
integrations::{self, pgvector::PgVector},
};

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
tracing_subscriber::fmt::init();
tracing::info!("Starting PgVector indexing test");

// Get the manifest directory path
let manifest_dir = std::env::var("CARGO_MANIFEST_DIR").expect("CARGO_MANIFEST_DIR not set");

// Create a PathBuf to test dataset from the manifest directory
let test_dataset_path = PathBuf::from(manifest_dir).join("test_dataset");
tracing::info!("Test Dataset path: {:?}", test_dataset_path);

// Connection string from DATABASE_URL, falling back to the credentials used by
// scripts/docker/docker-compose-db-pg.yml so the example runs against that stack.
let pgv_db_url = std::env::var("DATABASE_URL")
.as_deref()
.unwrap_or("postgresql://myuser:mypassword@localhost:5432/mydatabase")
.to_owned();

// Local Ollama chat model used by MetadataQAText to produce Q&A metadata per chunk.
let ollama_client = integrations::ollama::Ollama::default()
.with_default_prompt_model("llama3.2:latest")
.to_owned();

// Local embedding model; vector_size(384) below must match its output
// dimension — presumably FastEmbed's default model is 384-dim, confirm.
let fastembed =
integrations::fastembed::FastEmbed::try_default().expect("Could not create FastEmbed");

// Configure Pgvector with a default vector size, a single embedding
// and in addition to embedding the text metadata, also store it in a field
// NOTE: try_from_url opens the connection pool eagerly (pool size 10) inside
// the builder — see the review thread below about moving this IO into setup().
let pgv_storage = PgVector::builder()
.try_from_url(pgv_db_url, Some(10))
.await
.expect("Failed to connect to postgres server")
.vector_size(384)
.with_vector(EmbeddedField::Combined)
.with_metadata(METADATA_QA_TEXT_NAME)
.table_name("swiftide_pgvector_test".to_string())
.build()
.unwrap();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Almost exactly right! I prefer it if builders do not do IO if they can avoid it, for multiple reasons. In this case, that also has the benefit of being able to connect lazilly and hiding the details of the connection pool.

i.e. the builder api like:

 let pgv_storage = PgVector::builder()
        .database_url(pgv_db_url)
        .pool_size(10) // With a sane default if ommitted 
        .vector_size(384)
        .with_vector(EmbeddedField::Combined)
        .with_metadata(METADATA_QA_TEXT_NAME)
        .table_name("swiftide_pgvector_test".to_string())
        .build()
        .unwrap();

And then in PgVector::setup (which is only called once):

async fn setup(&self) -> Result<()> {
  self.try_connect_to_pool(self.database_url, self.pool_size).await?;
  ...

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@timonv, I'm looking for your input on a design choice here.

If we decide to handle the database connection pool setup within fn setup(&self) instead of PgVectorBuilder, we'll need to mutate PgVector within fn setup(). This change would mean updating the function signature in trait Persist to:

async fn setup(&mut self) -> Result<()>

For example:

async fn setup(&mut self) -> Result<()> {
  self.connection_pool = self.try_connect_to_pool(self.database_url, self.pool_size).await?;
  ...
}

This adjustment would introduce breaking changes across the stack, particularly impacting:

  • swiftide-indexing/src/persist/memory_storage.rs
  • swiftide-integrations/src/lancedb/persist.rs
  • swiftide-integrations/src/qdrant/persist.rs
  • swiftide-integrations/src/redis/persist.rs

Would you prefer moving the IO operations into Persist::setup() for these components? If so, we could handle this as a separate PR to streamline the updates.

Looking forward to your thoughts!


// Drop the existing test table before running the test
tracing::info!("Dropping existing test table if it exists");
let drop_table_sql = "DROP TABLE IF EXISTS swiftide_pgvector_test";

// Reuse the storage's own pool for the cleanup query so the example is self-contained.
if let Some(pool) = pgv_storage.get_pool() {
sqlx::query(drop_table_sql).execute(pool).await?;
} else {
return Err("Failed to get database connection pool".into());
}

tracing::info!("Starting indexing pipeline");
// Chunks of 10..2048 chars, Q&A metadata per chunk, embeddings in batches of 100,
// everything persisted to the pgvector table configured above.
indexing::Pipeline::from_loader(FileLoader::new(test_dataset_path).with_extensions(&["md"]))
.then_chunk(ChunkMarkdown::from_chunk_range(10..2048))
.then(MetadataQAText::new(ollama_client.clone()))
.then_in_batch(Embed::new(fastembed).with_batch_size(100))
.then_store_with(pgv_storage.clone())
.run()
.await?;

tracing::info!("Indexing test completed successfully");
Ok(())
}
41 changes: 41 additions & 0 deletions examples/test_dataset/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# **Swiftide: A Fast, Streaming Indexing and Query Library for AI Applications**

Swiftide is a Rust-native library designed to simplify the development of Large Language Model (LLM) applications. It addresses the challenge of providing context to LLMs for solving real-world problems by enabling efficient ingestion, transformation, indexing, and querying of extensive data. This process, known as Retrieval Augmented Generation (RAG), enhances the capabilities of LLMs.

## **Key Features:**

* **Fast and Modular Indexing:** Swiftide offers a high-performance, streaming indexing pipeline with asynchronous, parallel processing capabilities.
* **Query Pipeline:** An experimental query pipeline facilitates efficient retrieval and processing of information.
* **Versatility:** The library includes various loaders, transformers, semantic chunkers, embedders, and other components, providing flexibility for different use cases.
* **Extensibility:** Developers can bring their own transformers by extending straightforward traits or using closures.
* **Pipeline Management:** Swiftide supports splitting and merging pipelines for complex workflows.
* **Prompt Templating:** Jinja-like templating simplifies the creation of prompts.
* **Storage Options:** Integration with multiple storage backends, including Qdrant, Redis, and LanceDB.
* **Integrations:** Seamless integration with popular tools and platforms like OpenAI, Groq, Redis, Qdrant, Ollama, FastEmbed-rs, Fluvio, LanceDB, and Treesitter.
* **Evaluation:** Pipeline evaluation using RAGAS for performance assessment.
* **Sparse Vector Support:** Enables hybrid search with sparse vector support.
* **Tracing:** Built-in tracing support for logging and debugging.

## **Technical Insights:**

* **Rust-Native:** Developed in Rust for performance, safety, and concurrency.
* **Streaming Architecture:** Employs a streaming architecture for efficient processing of large datasets.
* **Modularity:** Highly modular design allows for customization and extensibility.
* **Asynchronous and Parallel Processing:** Leverages asynchronous and parallel processing for optimal performance.
* **Strong Typing:** The query pipeline is fully and strongly typed, ensuring type safety and developer productivity.
* **OpenAI Integration:** Provides seamless integration with OpenAI for powerful LLM capabilities.

## **Getting Started:**

To get started with Swiftide, developers need to set up a Rust project, add the Swiftide library as a dependency, enable the required integration features, and write a pipeline. Comprehensive examples and documentation are available to guide developers through the process.

## **Current Status and Future Roadmap:**

Swiftide is under active development and may introduce breaking changes as it progresses towards version 1.0. The documentation may not cover all features and could be slightly outdated. Despite these considerations, Swiftide offers a promising solution for building efficient and scalable LLM applications. The project's roadmap includes addressing open issues and incorporating proposed features to enhance its functionality and usability.

## **Community and Contributions:**

The Swiftide community welcomes feedback, questions, and contributions. Developers can connect with the community on Discord and contribute to the project by forking the repository, creating pull requests, or opening issues with enhancement tags.

**Overall, Swiftide presents a powerful and flexible framework for building Retrieval Augmented Generation (RAG) pipelines in Rust. Its focus on performance, modularity, and extensibility makes it a valuable tool for developers working with LLMs and AI applications.**

46 changes: 46 additions & 0 deletions scripts/docker/docker-compose-db-pg.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Docker Compose stack for local pgvector integration testing:
# a Postgres server with the pgvector extension plus a pgAdmin web UI.
services:
# Postgres image bundled with the pgvector extension (v0.5.1).
test_env_pgvector:
image: ankane/pgvector:v0.5.1
container_name: test_env_pgvector
environment:
# Defaults match the DATABASE_URL fallback used by the example
# (postgresql://myuser:mypassword@localhost:5432/mydatabase).
POSTGRES_USER: ${POSTGRES_USER:-myuser}
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-mypassword}
POSTGRES_DB: ${POSTGRES_DB:-mydatabase}
ports:
- "5432:5432"
volumes:
# Persist database files across container restarts.
- test_env_pgvector_data:/var/lib/postgresql/data
networks:
- pg-network
healthcheck:
# Healthy once the server accepts connections for the configured user/db.
test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-myuser} -d ${POSTGRES_DB:-mydatabase}"]
interval: 10s
timeout: 5s
retries: 5
restart: unless-stopped

# Optional web UI for inspecting the test database (http://localhost:8080).
pgadmin:
image: dpage/pgadmin4
container_name: test_env_pgadmin
environment:
# Dev-only default credentials; override via env vars for anything shared.
PGADMIN_DEFAULT_EMAIL: ${PGADMIN_DEFAULT_EMAIL:-admin@admin.com}
PGADMIN_DEFAULT_PASSWORD: ${PGADMIN_DEFAULT_PASSWORD:-root}
ports:
- "8080:80"
volumes:
- test_env_pgadmin_data:/var/lib/pgadmin
depends_on:
- test_env_pgvector
networks:
- pg-network
restart: unless-stopped

networks:
pg-network:
name: pg-network

volumes:
test_env_pgvector_data:
name: test_env_pgvector_data
test_env_pgadmin_data:
name: test_env_pgadmin_data
9 changes: 9 additions & 0 deletions swiftide-integrations/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,13 @@ async-openai = { workspace = true, optional = true }
qdrant-client = { workspace = true, optional = true, default-features = false, features = [
"serde",
] }
sqlx = { workspace = true, optional = true, features = [
"postgres",
"runtime-tokio",
"chrono",
"uuid"
] }
pgvector = { workspace = true, optional = true, features = ["sqlx"] }
redis = { version = "0.27", features = [
"aio",
"tokio-comp",
Expand Down Expand Up @@ -102,6 +109,8 @@ default = ["rustls"]
rustls = ["reqwest/rustls-tls-native-roots"]
# Qdrant for storage
qdrant = ["dep:qdrant-client", "swiftide-core/qdrant"]
# PgVector for storage
pgvector = ["dep:sqlx", "dep:pgvector"]
# Redis for caching and storage
redis = ["dep:redis"]
# Tree-sitter for code operations and chunking
Expand Down
2 changes: 2 additions & 0 deletions swiftide-integrations/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ pub mod ollama;
pub mod openai;
#[cfg(feature = "parquet")]
pub mod parquet;
#[cfg(feature = "pgvector")]
pub mod pgvector;
#[cfg(feature = "qdrant")]
pub mod qdrant;
#[cfg(feature = "redb")]
Expand Down
Loading
Loading