-
Notifications
You must be signed in to change notification settings - Fork 18
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(indexing): Parquet loader (#279)
Ingest and index data from parquet files.
- Loading branch information
Showing
9 changed files
with
223 additions
and
1 deletion.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
use anyhow::{Context as _, Result}; | ||
use arrow_array::StringArray; | ||
use futures_util::StreamExt as _; | ||
use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask}; | ||
use swiftide_core::{ | ||
indexing::{IndexingStream, Node}, | ||
Loader, | ||
}; | ||
use tokio::{fs::File, runtime::Handle}; | ||
|
||
use super::Parquet; | ||
|
||
impl Loader for Parquet { | ||
fn into_stream(self) -> IndexingStream { | ||
let mut builder = tokio::task::block_in_place(|| { | ||
Handle::current().block_on(async { | ||
let file = File::open(self.path).await.expect("Failed to open file"); | ||
|
||
ParquetRecordBatchStreamBuilder::new(file) | ||
.await | ||
.context("Failed to load builder") | ||
.unwrap() | ||
.with_batch_size(self.batch_size) | ||
}) | ||
}); | ||
|
||
let file_metadata = builder.metadata().file_metadata().clone(); | ||
dbg!(file_metadata.schema_descr().columns()); | ||
let column_idx = file_metadata | ||
.schema() | ||
.get_fields() | ||
.iter() | ||
.enumerate() | ||
.find_map(|(pos, column)| { | ||
if self.column_name == column.name() { | ||
Some(pos) | ||
} else { | ||
None | ||
} | ||
}) | ||
.unwrap_or_else(|| panic!("Column {} not found in dataset", &self.column_name)); | ||
|
||
let mask = ProjectionMask::roots(file_metadata.schema_descr(), [column_idx]); | ||
builder = builder.with_projection(mask); | ||
|
||
let stream = builder.build().expect("Failed to build parquet builder"); | ||
|
||
let swiftide_stream = stream.flat_map_unordered(None, move |result_batch| { | ||
let Ok(batch) = result_batch else { | ||
let new_result: Result<Node> = Err(anyhow::anyhow!(result_batch.unwrap_err())); | ||
|
||
return vec![new_result].into(); | ||
}; | ||
assert!(batch.num_columns() == 1, "Number of columns _must_ be 1"); | ||
|
||
let node_values = batch | ||
.column(0) // Should only have one column at this point | ||
.as_any() | ||
.downcast_ref::<StringArray>() | ||
.unwrap() | ||
.into_iter() | ||
.flatten() | ||
.map(Node::from) | ||
.map(Ok) | ||
.collect::<Vec<_>>(); | ||
|
||
IndexingStream::iter(node_values) | ||
}); | ||
|
||
swiftide_stream.boxed().into() | ||
|
||
// let mask = ProjectionMask:: | ||
} | ||
} | ||
|
||
#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use futures_util::TryStreamExt as _;

    use super::*;

    /// Loads the `chunk` column of the bundled `test.parquet` fixture and checks that each
    /// value comes back as a `Node`.
    // The multi-thread flavor is required: `into_stream` calls `tokio::task::block_in_place`,
    // which panics on a current-thread runtime.
    #[test_log::test(tokio::test(flavor = "multi_thread"))]
    async fn test_parquet_loader() {
        let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("src/parquet/test.parquet");

        let loader = Parquet::builder()
            .path(path)
            .column_name("chunk")
            .build()
            .unwrap();

        let result = loader.into_stream().try_collect::<Vec<_>>().await.unwrap();

        let expected = [Node::new("hello"), Node::new("world")];
        assert_eq!(result, expected);
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
//! Stream data from parquet files | ||
use std::path::PathBuf; | ||
|
||
use derive_builder::Builder; | ||
|
||
pub mod loader; | ||
|
||
/// Stream data from parquet files on a single column | ||
/// | ||
/// Provide a path, column and optional batch size. The column must be of type `StringArray`. Then | ||
/// the column is loaded into the chunks of the Node. | ||
/// | ||
/// # Panics | ||
/// | ||
/// The loader can panic during initialization if anything with parquet or arrow fails before | ||
/// starting the stream. | ||
#[derive(Debug, Clone, Builder)] | ||
#[builder(setter(into, strip_option))] | ||
pub struct Parquet { | ||
path: PathBuf, | ||
column_name: String, | ||
#[builder(default = "1024")] | ||
batch_size: usize, | ||
} | ||
|
||
impl Parquet { | ||
pub fn builder() -> ParquetBuilder { | ||
ParquetBuilder::default() | ||
} | ||
} |
Binary file not shown.