From 2b3b401dcddb2cb32214850b9b4dbb0481943d38 Mon Sep 17 00:00:00 2001
From: Timon Vonk <timonv@gmail.com>
Date: Mon, 21 Oct 2024 12:32:25 +0200
Subject: [PATCH] fix(indexing): Improve splitters consistency and provide
 defaults (#403)

---
 .../src/transformers/chunk_markdown.rs        | 104 +++++++++++++-----
 .../src/transformers/chunk_text.rs            |  85 ++++++++++----
 2 files changed, 137 insertions(+), 52 deletions(-)
diff --git a/swiftide-indexing/src/transformers/chunk_markdown.rs b/swiftide-indexing/src/transformers/chunk_markdown.rs
index 9d6e090b..384fbdd3 100644
--- a/swiftide-indexing/src/transformers/chunk_markdown.rs
+++ b/swiftide-indexing/src/transformers/chunk_markdown.rs
@@ -4,10 +4,12 @@ use std::sync::Arc;
 use async_trait::async_trait;
 use derive_builder::Builder;
 use swiftide_core::{indexing::IndexingStream, indexing::Node, ChunkerTransformer};
-use text_splitter::{Characters, MarkdownSplitter};
+use text_splitter::{Characters, ChunkConfig, MarkdownSplitter};
 
-#[derive(Debug, Clone, Builder)]
-#[builder(pattern = "owned", setter(strip_option))]
+const DEFAULT_MAX_CHAR_SIZE: usize = 2056;
+
+#[derive(Clone, Builder)]
+#[builder(setter(strip_option))]
 /// A transformer that chunks markdown content into smaller pieces.
 ///
 /// The transformer will split the markdown content into smaller pieces based on the specified
@@ -17,42 +19,71 @@ use text_splitter::{Characters, MarkdownSplitter};
 ///
 /// Technically that might work with every splitter `text_splitter` provides.
 pub struct ChunkMarkdown {
-    #[builder(setter(into))]
-    chunker: Arc<MarkdownSplitter<Characters>>,
+    /// Defaults to `None`. If you use a splitter that is resource heavy, this parameter can be
+    /// tuned.
     #[builder(default)]
-    /// The number of concurrent chunks to process.
     concurrency: Option<usize>,
-    /// The splitter is not perfect in skipping min size nodes.
+
+    /// Optional maximum number of characters per chunk.
     ///
-    /// If you provide a custom chunker, you might want to set the range as well.
-    #[builder(default)]
-    range: Option<std::ops::Range<usize>>,
+    /// Defaults to [`DEFAULT_MAX_CHAR_SIZE`].
+    #[builder(default = "DEFAULT_MAX_CHAR_SIZE")]
+    max_characters: usize,
+
+    /// A range of minimum and maximum characters per chunk.
+    ///
+    /// Chunks smaller than the range min will be ignored. `max_characters` will be ignored if this
+    /// is set.
+    ///
+    /// If you provide a custom chunker with a range, you might want to set the range as well.
+    ///
+    /// Defaults to 0..[`max_characters`]
+    #[builder(default = "0..DEFAULT_MAX_CHAR_SIZE")]
+    range: std::ops::Range<usize>,
+
+    /// The markdown splitter from [`text_splitter`]
+    ///
+    /// Defaults to a new [`MarkdownSplitter`] with the specified `max_characters`.
+    #[builder(setter(into), default = "self.default_client()")]
+    chunker: Arc<MarkdownSplitter<Characters>>,
+}
+
+impl std::fmt::Debug for ChunkMarkdown {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("ChunkMarkdown")
+            .field("concurrency", &self.concurrency)
+            .field("max_characters", &self.max_characters)
+            .field("range", &self.range)
+            .finish()
+    }
+}
+
+impl Default for ChunkMarkdown {
+    fn default() -> Self {
+        Self::from_max_characters(DEFAULT_MAX_CHAR_SIZE)
+    }
 }
 
 impl ChunkMarkdown {
+    fn builder() -> ChunkMarkdownBuilder {
+        ChunkMarkdownBuilder::default()
+    }
+
     /// Create a new transformer with a maximum number of characters per chunk.
+    #[allow(clippy::missing_panics_doc)]
     pub fn from_max_characters(max_characters: usize) -> Self {
-        Self {
-            chunker: Arc::new(MarkdownSplitter::new(max_characters)),
-            concurrency: None,
-            range: None,
-        }
+        Self::builder()
+            .max_characters(max_characters)
+            .build()
+            .expect("Cannot fail")
     }
 
     /// Create a new transformer with a range of characters per chunk.
     ///
     /// Chunks smaller than the range will be ignored.
+    #[allow(clippy::missing_panics_doc)]
     pub fn from_chunk_range(range: std::ops::Range<usize>) -> Self {
-        Self {
-            chunker: Arc::new(MarkdownSplitter::new(range.clone())),
-            concurrency: None,
-            range: Some(range),
-        }
-    }
-
-    /// Build a custom markdown chunker.
-    pub fn builder() -> ChunkMarkdownBuilder {
-        ChunkMarkdownBuilder::default()
+        Self::builder().range(range).build().expect("Cannot fail")
     }
 
     /// Set the number of concurrent chunks to process.
@@ -63,13 +94,26 @@ impl ChunkMarkdown {
     }
 
     fn min_size(&self) -> usize {
-        self.range.as_ref().map_or(0, |r| r.start)
+        self.range.start
+    }
+}
+
+impl ChunkMarkdownBuilder {
+    fn default_client(&self) -> Arc<MarkdownSplitter<Characters>> {
+        let chunk_config: ChunkConfig<Characters> = self
+            .range
+            .clone()
+            .map(ChunkConfig::<Characters>::from)
+            .or_else(|| self.max_characters.map(Into::into))
+            .unwrap_or(DEFAULT_MAX_CHAR_SIZE.into());
+
+        Arc::new(MarkdownSplitter::new(chunk_config))
     }
 }
 
 #[async_trait]
 impl ChunkerTransformer for ChunkMarkdown {
-    #[tracing::instrument(skip_all, name = "transformers.chunk_markdown")]
+    #[tracing::instrument(skip_all)]
     async fn transform_node(&self, node: Node) -> IndexingStream {
         let chunks = self
             .chunker
@@ -132,8 +176,12 @@ mod test {
             .await
             .unwrap();
 
+        dbg!(&nodes.iter().map(|n| n.chunk.clone()).collect::<Vec<_>>());
         for line in MARKDOWN.lines().filter(|line| !line.trim().is_empty()) {
-            assert!(nodes.iter().any(|node| node.chunk == line.trim()));
+            nodes
+                .iter()
+                .find(|node| node.chunk == line.trim())
+                .unwrap_or_else(|| panic!("Line not found: {line}"));
         }
 
         assert_eq!(nodes.len(), 6);
diff --git a/swiftide-indexing/src/transformers/chunk_text.rs b/swiftide-indexing/src/transformers/chunk_text.rs
index 0407c9db..1af33ab7 100644
--- a/swiftide-indexing/src/transformers/chunk_text.rs
+++ b/swiftide-indexing/src/transformers/chunk_text.rs
@@ -4,9 +4,12 @@ use std::sync::Arc;
 use async_trait::async_trait;
 use derive_builder::Builder;
 use swiftide_core::{indexing::IndexingStream, indexing::Node, ChunkerTransformer};
+use text_splitter::{Characters, ChunkConfig, TextSplitter};
+
+const DEFAULT_MAX_CHAR_SIZE: usize = 2056;
 
 #[derive(Debug, Clone, Builder)]
-#[builder(pattern = "owned", setter(strip_option))]
+#[builder(setter(strip_option))]
 /// A transformer that chunks text content into smaller pieces.
 ///
 /// The transformer will split the text content into smaller pieces based on the specified
@@ -17,42 +20,64 @@ use swiftide_core::{indexing::IndexingStream, indexing::Node, ChunkerTransformer
 ///
 /// Technically that might work with every splitter `text_splitter` provides.
 pub struct ChunkText {
-    #[builder(setter(into))]
-    chunker: Arc<text_splitter::TextSplitter<text_splitter::Characters>>,
+    /// The max number of concurrent chunks to process.
+    ///
+    /// Defaults to `None`. If you use a splitter that is resource heavy, this parameter can be
+    /// tuned.
     #[builder(default)]
-    /// The number of concurrent chunks to process.
     concurrency: Option<usize>,
-    /// The splitter is not perfect in skipping min size nodes.
+
+    /// Optional maximum number of characters per chunk.
     ///
-    /// If you provide a custom chunker, you might want to set the range as well.
-    #[builder(default)]
-    range: Option<std::ops::Range<usize>>,
+    /// Defaults to [`DEFAULT_MAX_CHAR_SIZE`].
+    #[builder(default = "DEFAULT_MAX_CHAR_SIZE")]
+    #[allow(dead_code)]
+    max_characters: usize,
+
+    /// A range of minimum and maximum characters per chunk.
+    ///
+    /// Chunks smaller than the range min will be ignored. `max_characters` will be ignored if this
+    /// is set.
+    ///
+    /// If you provide a custom chunker with a range, you might want to set the range as well.
+    ///
+    /// Defaults to 0..[`max_characters`]
+    #[builder(default = "0..DEFAULT_MAX_CHAR_SIZE")]
+    range: std::ops::Range<usize>,
+
+    /// The text splitter from [`text_splitter`]
+    ///
+    /// Defaults to a new [`TextSplitter`] with the specified `max_characters`.
+    #[builder(setter(into), default = "self.default_client()")]
+    chunker: Arc<TextSplitter<Characters>>,
+}
+
+impl Default for ChunkText {
+    fn default() -> Self {
+        Self::from_max_characters(DEFAULT_MAX_CHAR_SIZE)
+    }
 }
 
 impl ChunkText {
+    pub fn builder() -> ChunkTextBuilder {
+        ChunkTextBuilder::default()
+    }
+
     /// Create a new transformer with a maximum number of characters per chunk.
+    #[allow(clippy::missing_panics_doc)]
     pub fn from_max_characters(max_characters: usize) -> Self {
-        Self {
-            chunker: Arc::new(text_splitter::TextSplitter::new(max_characters)),
-            concurrency: None,
-            range: None,
-        }
+        Self::builder()
+            .max_characters(max_characters)
+            .build()
+            .expect("Cannot fail")
     }
 
     /// Create a new transformer with a range of characters per chunk.
     ///
     /// Chunks smaller than the range will be ignored.
+    #[allow(clippy::missing_panics_doc)]
     pub fn from_chunk_range(range: std::ops::Range<usize>) -> Self {
-        Self {
-            chunker: Arc::new(text_splitter::TextSplitter::new(range.clone())),
-            concurrency: None,
-            range: Some(range),
-        }
-    }
-
-    /// Build a custom text chunker.
-    pub fn builder() -> ChunkTextBuilder {
-        ChunkTextBuilder::default()
+        Self::builder().range(range).build().expect("Cannot fail")
     }
 
     /// Set the number of concurrent chunks to process.
@@ -63,10 +88,22 @@ impl ChunkText {
     }
 
     fn min_size(&self) -> usize {
-        self.range.as_ref().map_or(0, |r| r.start)
+        self.range.start
     }
 }
 
+impl ChunkTextBuilder {
+    fn default_client(&self) -> Arc<TextSplitter<Characters>> {
+        let chunk_config: ChunkConfig<Characters> = self
+            .range
+            .clone()
+            .map(ChunkConfig::<Characters>::from)
+            .or_else(|| self.max_characters.map(Into::into))
+            .unwrap_or(DEFAULT_MAX_CHAR_SIZE.into());
+
+        Arc::new(TextSplitter::new(chunk_config))
+    }
+}
 #[async_trait]
 impl ChunkerTransformer for ChunkText {
     #[tracing::instrument(skip_all, name = "transformers.chunk_text")]