From 2b3b401dcddb2cb32214850b9b4dbb0481943d38 Mon Sep 17 00:00:00 2001 From: Timon Vonk Date: Mon, 21 Oct 2024 12:32:25 +0200 Subject: [PATCH] fix(indexing): Improve splitters consistency and provide defaults (#403) --- .../src/transformers/chunk_markdown.rs | 104 +++++++++++++----- .../src/transformers/chunk_text.rs | 85 ++++++++++---- 2 files changed, 137 insertions(+), 52 deletions(-) diff --git a/swiftide-indexing/src/transformers/chunk_markdown.rs b/swiftide-indexing/src/transformers/chunk_markdown.rs index 9d6e090b..384fbdd3 100644 --- a/swiftide-indexing/src/transformers/chunk_markdown.rs +++ b/swiftide-indexing/src/transformers/chunk_markdown.rs @@ -4,10 +4,12 @@ use std::sync::Arc; use async_trait::async_trait; use derive_builder::Builder; use swiftide_core::{indexing::IndexingStream, indexing::Node, ChunkerTransformer}; -use text_splitter::{Characters, MarkdownSplitter}; +use text_splitter::{Characters, ChunkConfig, MarkdownSplitter}; -#[derive(Debug, Clone, Builder)] -#[builder(pattern = "owned", setter(strip_option))] +const DEFAULT_MAX_CHAR_SIZE: usize = 2056; + +#[derive(Clone, Builder)] +#[builder(setter(strip_option))] /// A transformer that chunks markdown content into smaller pieces. /// /// The transformer will split the markdown content into smaller pieces based on the specified @@ -17,42 +19,71 @@ use text_splitter::{Characters, MarkdownSplitter}; /// /// Technically that might work with every splitter `text_splitter` provides. pub struct ChunkMarkdown { - #[builder(setter(into))] - chunker: Arc>, + /// Defaults to `None`. If you use a splitter that is resource heavy, this parameter can be + /// tuned. #[builder(default)] - /// The number of concurrent chunks to process. concurrency: Option, - /// The splitter is not perfect in skipping min size nodes. + + /// Optional maximum number of characters per chunk. /// - /// If you provide a custom chunker, you might want to set the range as well. - #[builder(default)] - range: Option>, + /// Defaults to [`DEFAULT_MAX_CHAR_SIZE`]. + #[builder(default = "DEFAULT_MAX_CHAR_SIZE")] + max_characters: usize, + + /// A range of minimum and maximum characters per chunk. + /// + /// Chunks smaller than the range min will be ignored. `max_characters` will be ignored if this + /// is set. + /// + /// If you provide a custom chunker with a range, you might want to set the range as well. + /// + /// Defaults to 0..[`max_characters`] + #[builder(default = "0..DEFAULT_MAX_CHAR_SIZE")] + range: std::ops::Range, + + /// The markdown splitter from [`text_splitter`] + /// + /// Defaults to a new [`MarkdownSplitter`] with the specified `max_characters`. + #[builder(setter(into), default = "self.default_client()")] + chunker: Arc>, +} + +impl std::fmt::Debug for ChunkMarkdown { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ChunkMarkdown") + .field("concurrency", &self.concurrency) + .field("max_characters", &self.max_characters) + .field("range", &self.range) + .finish() + } +} + +impl Default for ChunkMarkdown { + fn default() -> Self { + Self::from_max_characters(DEFAULT_MAX_CHAR_SIZE) + } } impl ChunkMarkdown { + fn builder() -> ChunkMarkdownBuilder { + ChunkMarkdownBuilder::default() + } + /// Create a new transformer with a maximum number of characters per chunk. + #[allow(clippy::missing_panics_doc)] pub fn from_max_characters(max_characters: usize) -> Self { - Self { - chunker: Arc::new(MarkdownSplitter::new(max_characters)), - concurrency: None, - range: None, - } + Self::builder() + .max_characters(max_characters) + .build() + .expect("Cannot fail") } /// Create a new transformer with a range of characters per chunk. /// /// Chunks smaller than the range will be ignored. + #[allow(clippy::missing_panics_doc)] pub fn from_chunk_range(range: std::ops::Range) -> Self { - Self { - chunker: Arc::new(MarkdownSplitter::new(range.clone())), - concurrency: None, - range: Some(range), - } - } - - /// Build a custom markdown chunker. - pub fn builder() -> ChunkMarkdownBuilder { - ChunkMarkdownBuilder::default() + Self::builder().range(range).build().expect("Cannot fail") } /// Set the number of concurrent chunks to process. @@ -63,13 +94,26 @@ impl ChunkMarkdown { } fn min_size(&self) -> usize { - self.range.as_ref().map_or(0, |r| r.start) + self.range.start + } +} + +impl ChunkMarkdownBuilder { + fn default_client(&self) -> Arc> { + let chunk_config: ChunkConfig = self + .range + .clone() + .map(ChunkConfig::::from) + .or_else(|| self.max_characters.map(Into::into)) + .unwrap_or(DEFAULT_MAX_CHAR_SIZE.into()); + + Arc::new(MarkdownSplitter::new(chunk_config)) } } #[async_trait] impl ChunkerTransformer for ChunkMarkdown { - #[tracing::instrument(skip_all, name = "transformers.chunk_markdown")] + #[tracing::instrument(skip_all)] async fn transform_node(&self, node: Node) -> IndexingStream { let chunks = self .chunker @@ -132,8 +176,12 @@ mod test { .await .unwrap(); + dbg!(&nodes.iter().map(|n| n.chunk.clone()).collect::>()); for line in MARKDOWN.lines().filter(|line| !line.trim().is_empty()) { - assert!(nodes.iter().any(|node| node.chunk == line.trim())); + nodes + .iter() + .find(|node| node.chunk == line.trim()) + .unwrap_or_else(|| panic!("Line not found: {line}")); } assert_eq!(nodes.len(), 6); diff --git a/swiftide-indexing/src/transformers/chunk_text.rs b/swiftide-indexing/src/transformers/chunk_text.rs index 0407c9db..1af33ab7 100644 --- a/swiftide-indexing/src/transformers/chunk_text.rs +++ b/swiftide-indexing/src/transformers/chunk_text.rs @@ -4,9 +4,12 @@ use std::sync::Arc; use async_trait::async_trait; use derive_builder::Builder; use swiftide_core::{indexing::IndexingStream, indexing::Node, ChunkerTransformer}; +use text_splitter::{Characters, ChunkConfig, TextSplitter}; + +const DEFAULT_MAX_CHAR_SIZE: usize = 2056; #[derive(Debug, Clone, Builder)] -#[builder(pattern = "owned", setter(strip_option))] +#[builder(setter(strip_option))] /// A transformer that chunks text content into smaller pieces. /// /// The transformer will split the text content into smaller pieces based on the specified @@ -17,42 +20,64 @@ use swiftide_core::{indexing::IndexingStream, indexing::Node, ChunkerTransformer /// /// Technically that might work with every splitter `text_splitter` provides. pub struct ChunkText { - #[builder(setter(into))] - chunker: Arc>, + /// The max number of concurrent chunks to process. + /// + /// Defaults to `None`. If you use a splitter that is resource heavy, this parameter can be + /// tuned. #[builder(default)] - /// The number of concurrent chunks to process. concurrency: Option, - /// The splitter is not perfect in skipping min size nodes. + + /// Optional maximum number of characters per chunk. /// - /// If you provide a custom chunker, you might want to set the range as well. - #[builder(default)] - range: Option>, + /// Defaults to [`DEFAULT_MAX_CHAR_SIZE`]. + #[builder(default = "DEFAULT_MAX_CHAR_SIZE")] + #[allow(dead_code)] + max_characters: usize, + + /// A range of minimum and maximum characters per chunk. + /// + /// Chunks smaller than the range min will be ignored. `max_characters` will be ignored if this + /// is set. + /// + /// If you provide a custom chunker with a range, you might want to set the range as well. + /// + /// Defaults to 0..[`max_characters`] + #[builder(default = "0..DEFAULT_MAX_CHAR_SIZE")] + range: std::ops::Range, + + /// The text splitter from [`text_splitter`] + /// + /// Defaults to a new [`TextSplitter`] with the specified `max_characters`. + #[builder(setter(into), default = "self.default_client()")] + chunker: Arc>, +} + +impl Default for ChunkText { + fn default() -> Self { + Self::from_max_characters(DEFAULT_MAX_CHAR_SIZE) + } } impl ChunkText { + pub fn builder() -> ChunkTextBuilder { + ChunkTextBuilder::default() + } + /// Create a new transformer with a maximum number of characters per chunk. + #[allow(clippy::missing_panics_doc)] pub fn from_max_characters(max_characters: usize) -> Self { - Self { - chunker: Arc::new(text_splitter::TextSplitter::new(max_characters)), - concurrency: None, - range: None, - } + Self::builder() + .max_characters(max_characters) + .build() + .expect("Cannot fail") } /// Create a new transformer with a range of characters per chunk. /// /// Chunks smaller than the range will be ignored. + #[allow(clippy::missing_panics_doc)] pub fn from_chunk_range(range: std::ops::Range) -> Self { - Self { - chunker: Arc::new(text_splitter::TextSplitter::new(range.clone())), - concurrency: None, - range: Some(range), - } - } - - /// Build a custom text chunker. - pub fn builder() -> ChunkTextBuilder { - ChunkTextBuilder::default() + Self::builder().range(range).build().expect("Cannot fail") } /// Set the number of concurrent chunks to process. @@ -63,10 +88,22 @@ impl ChunkText { } fn min_size(&self) -> usize { - self.range.as_ref().map_or(0, |r| r.start) + self.range.start } } +impl ChunkTextBuilder { + fn default_client(&self) -> Arc> { + let chunk_config: ChunkConfig = self + .range + .clone() + .map(ChunkConfig::::from) + .or_else(|| self.max_characters.map(Into::into)) + .unwrap_or(DEFAULT_MAX_CHAR_SIZE.into()); + + Arc::new(TextSplitter::new(chunk_config)) + } +} #[async_trait] impl ChunkerTransformer for ChunkText { #[tracing::instrument(skip_all, name = "transformers.chunk_text")]