From b1ec8550e8c5e75c8020c40ac8adfee97ac804c8 Mon Sep 17 00:00:00 2001 From: Nick Moore <nick@zoic.org> Date: Sun, 24 Nov 2024 21:29:32 +1100 Subject: [PATCH] feat: add BZIP2 (.bz2) support for reading fasta and fastq --- python/biobear/biobear.pyi | 1 + python/biobear/compression.py | 3 +++ python/tests/data/test.fa.bz2 | Bin 0 -> 75 bytes python/tests/data/test.fq.bz2 | Bin 0 -> 189 bytes python/tests/test_session.py | 20 ++++++++++++++++++++ src/file_compression_type.rs | 5 +++++ 6 files changed, 29 insertions(+) create mode 100644 python/tests/data/test.fa.bz2 create mode 100644 python/tests/data/test.fq.bz2 diff --git a/python/biobear/biobear.pyi b/python/biobear/biobear.pyi index bf06cb3..b22af32 100644 --- a/python/biobear/biobear.pyi +++ b/python/biobear/biobear.pyi @@ -30,6 +30,7 @@ class FileCompressionType(enum.Enum): GZIP = 0 BGZIP = 1 NONE = 2 + BZIP2 = 3 class FastaSequenceDataType(enum.Enum): """How to treat the sequence data in a FASTA file.""" diff --git a/python/biobear/compression.py b/python/biobear/compression.py index d70b438..e2ed4b4 100644 --- a/python/biobear/compression.py +++ b/python/biobear/compression.py @@ -25,12 +25,15 @@ class Compression(Enum): INFERRED = "INFERRED" NONE = "NONE" GZIP = "GZIP" + BZIP2 = "BZIP2" @classmethod def from_file(cls, path: os.PathLike) -> "Compression": """Infer the compression type from the file extension.""" if Path(path).suffix == ".gz": return Compression.GZIP + if Path(path).suffix == ".bz2": + return Compression.BZIP2 return Compression.NONE def infer_or_use(self, path: os.PathLike) -> "Compression": diff --git a/python/tests/data/test.fa.bz2 b/python/tests/data/test.fa.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..73b5d3a9a18806429a8d6654738d5472f7b781f7 GIT binary patch literal 75 zcmV-R0JQ%?T4*^jL0KkKS;pg_+W-Iq-+%xRKmZT{D1Zb2J|WxyAOIk$ri}yC1IiS_ hX`nR3(U*J1Igo@QE*&XJN%B<uUC9*TLPHylgKWxj8`uB< literal 0 HcmV?d00001 diff --git a/python/tests/data/test.fq.bz2 b/python/tests/data/test.fq.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..2fd6db576e183e83b8ea0e207e47dc8bc0c63134 GIT binary patch literal 189 zcmV;u07CylT4*^jL0KkKSr{U(hX4RPUw~K;V*4`zZ=fsyu3_8&FaXRjl}wEQ0MGyc z1^{H!L6E@>F&Z!xRPiL#^#*`!kjdo$$k^#=K~Wz$3m0#?qEaD<ibR6SMS_J0286bj zOTkJ@HRb4Cj>NKbtxMUq?^)i>xpxLEjmUd0gc6cl(4?r5Dp5#!F-n$kXtM-57pT~h r5Y?kam#;!nh6yEhYb1pkYS)Z>;m@7Fd2)Y35s<r*DZ+$+!4-TsYbsCf literal 0 HcmV?d00001 diff --git a/python/tests/test_session.py b/python/tests/test_session.py index cb2fd69..a52d42a 100644 --- a/python/tests/test_session.py +++ b/python/tests/test_session.py @@ -139,6 +139,14 @@ def test_read_fastq(): assert len(df) == 2 + fastq_path = DATA / "test.fq.bz2" + options = FASTQReadOptions( + file_extension="fq", file_compression_type=FileCompressionType.BZIP2 + ) + + df = session.read_fastq_file(str(fastq_path), options=options).to_polars() + + assert len(df) == 2 @pytest.mark.skipif( not importlib.util.find_spec("polars"), reason="polars not installed" @@ -285,6 +293,18 @@ def test_read_fasta_gz(): assert len(df) == 2 +def test_read_fasta_bz2(): + """Test reading a fasta.bz2 file.""" + session = connect() + + fasta_path = DATA / "test.fa.bz2" + + options = FASTAReadOptions( + file_extension="fa", file_compression_type=FileCompressionType.BZIP2 + ) + df = session.read_fasta_file(str(fasta_path), options=options).to_polars() + + assert len(df) == 2 @pytest.mark.skipif( not importlib.util.find_spec("polars"), reason="polars not installed" diff --git a/src/file_compression_type.rs b/src/file_compression_type.rs index a0f43c2..34d02f9 100644 --- a/src/file_compression_type.rs +++ b/src/file_compression_type.rs @@ -28,6 +28,7 @@ pub enum FileCompressionType { GZIP, ZSTD, UNCOMPRESSED, + BZIP2, } impl Default for FileCompressionType { @@ -56,6 +57,7 @@ impl Display for FileCompressionType { Self::GZIP => write!(f, "GZIP"), Self::ZSTD => write!(f, "ZSTD"), Self::UNCOMPRESSED => write!(f, "UNCOMPRESSED"), + Self::BZIP2 => write!(f, "BZIP2"), } } } @@ -66,6 +68,7 @@ impl From<FileCompressionType> for DFFileCompressionType { FileCompressionType::GZIP => DFFileCompressionType::GZIP, FileCompressionType::ZSTD => DFFileCompressionType::ZSTD, FileCompressionType::UNCOMPRESSED => DFFileCompressionType::UNCOMPRESSED, + FileCompressionType::BZIP2 => DFFileCompressionType::BZIP2, } } } @@ -78,6 +81,7 @@ impl TryFrom<CompressionTypeVariant> for FileCompressionType { CompressionTypeVariant::GZIP => Ok(Self::GZIP), CompressionTypeVariant::ZSTD => Ok(Self::ZSTD), CompressionTypeVariant::UNCOMPRESSED => Ok(Self::UNCOMPRESSED), + CompressionTypeVariant::BZIP2 => Ok(Self::BZIP2), _ => Err(BioBearError::InvalidCompressionType(value.to_string())), } } @@ -91,6 +95,7 @@ impl TryFrom<DFFileCompressionType> for FileCompressionType { DFFileCompressionType::GZIP => Ok(Self::GZIP), DFFileCompressionType::ZSTD => Ok(Self::ZSTD), DFFileCompressionType::UNCOMPRESSED => Ok(Self::UNCOMPRESSED), + DFFileCompressionType::BZIP2 => Ok(Self::BZIP2), _ => Err(BioBearError::InvalidCompressionType( "Invalid compression type".to_string(), )),