Skip to content

Commit

Permalink
Return error instead of panic when reading invalid Parquet metadata (#…
Browse files Browse the repository at this point in the history
…5382)

* Fix handling of invalid metadata

* Don't touch comments

* Fix lint error

---------

Co-authored-by: Matthieu Maitre <mmaitre@microsoft.com>
  • Loading branch information
mmaitre314 and maitre-matt authored Feb 11, 2024
1 parent 1f8470d commit 9d0abcc
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 14 deletions.
16 changes: 10 additions & 6 deletions parquet/regen.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,19 @@ REVISION=46cc3a0647d301bb9579ca8dd2cc356caf2a72d2

SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)"

docker run -v $SOURCE_DIR:/thrift/src -it archlinux pacman -Sy --noconfirm thrift && \
docker run -v $SOURCE_DIR:/thrift -it archlinux /bin/bash -c "\
pacman -Sy --noconfirm wget thrift && \
wget https://raw.githubusercontent.com/apache/parquet-format/$REVISION/src/main/thrift/parquet.thrift -O /tmp/parquet.thrift && \
thrift --gen rs /tmp/parquet.thrift && \
echo "Removing TProcessor" && \
echo 'Removing TProcessor' && \
sed -i '/use thrift::server::TProcessor;/d' parquet.rs && \
echo "Replacing TSerializable" && \
echo 'Replacing TSerializable' && \
sed -i 's/impl TSerializable for/impl crate::thrift::TSerializable for/g' parquet.rs && \
echo "Rewriting write_to_out_protocol" && \
echo 'Rewriting write_to_out_protocol' && \
sed -i 's/fn write_to_out_protocol(&self, o_prot: &mut dyn TOutputProtocol)/fn write_to_out_protocol<T: TOutputProtocol>(\&self, o_prot: \&mut T)/g' parquet.rs && \
echo "Rewriting read_from_in_protocol" && \
echo 'Rewriting read_from_in_protocol' && \
sed -i 's/fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol)/fn read_from_in_protocol<T: TInputProtocol>(i_prot: \&mut T)/g' parquet.rs && \
mv parquet.rs src/format.rs
echo 'Rewriting return value expectations' && \
sed -i 's/Ok(ret.expect(\"return value should have been constructed\"))/ret.ok_or_else(|| thrift::Error::Protocol(ProtocolError::new(ProtocolErrorKind::InvalidData, \"return value should have been constructed\")))/g' parquet.rs && \
mv parquet.rs /thrift/src/format.rs
"
13 changes: 13 additions & 0 deletions parquet/src/file/serialized_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1249,6 +1249,19 @@ mod tests {
Ok(())
}

#[test]
fn test_file_reader_invalid_metadata() {
let data = [
255, 172, 1, 0, 50, 82, 65, 73, 1, 0, 0, 0, 169, 168, 168, 162, 87, 255, 16, 0, 0, 0,
80, 65, 82, 49,
];
let ret = SerializedFileReader::new(Bytes::copy_from_slice(&data));
assert_eq!(
ret.err().unwrap().to_string(),
"Parquet error: Could not parse metadata: bad data"
);
}

#[test]
// Use java parquet-tools get below pageIndex info
// !```
Expand Down
16 changes: 8 additions & 8 deletions parquet/src/format.rs

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 9d0abcc

Please sign in to comment.