diff --git a/python/biobear/biobear.pyi b/python/biobear/biobear.pyi index bf06cb3..b22af32 100644 --- a/python/biobear/biobear.pyi +++ b/python/biobear/biobear.pyi @@ -30,6 +30,7 @@ class FileCompressionType(enum.Enum): GZIP = 0 BGZIP = 1 NONE = 2 + BZIP2 = 3 class FastaSequenceDataType(enum.Enum): """How to treat the sequence data in a FASTA file.""" diff --git a/python/biobear/compression.py b/python/biobear/compression.py index d70b438..e2ed4b4 100644 --- a/python/biobear/compression.py +++ b/python/biobear/compression.py @@ -25,12 +25,15 @@ class Compression(Enum): INFERRED = "INFERRED" NONE = "NONE" GZIP = "GZIP" + BZIP2 = "BZIP2" @classmethod def from_file(cls, path: os.PathLike) -> "Compression": """Infer the compression type from the file extension.""" if Path(path).suffix == ".gz": return Compression.GZIP + if Path(path).suffix == ".bz2": + return Compression.BZIP2 return Compression.NONE def infer_or_use(self, path: os.PathLike) -> "Compression": diff --git a/python/tests/data/test.fa.bz2 b/python/tests/data/test.fa.bz2 new file mode 100644 index 0000000..73b5d3a Binary files /dev/null and b/python/tests/data/test.fa.bz2 differ diff --git a/python/tests/data/test.fq.bz2 b/python/tests/data/test.fq.bz2 new file mode 100644 index 0000000..2fd6db5 Binary files /dev/null and b/python/tests/data/test.fq.bz2 differ diff --git a/python/tests/test_session.py b/python/tests/test_session.py index cb2fd69..a52d42a 100644 --- a/python/tests/test_session.py +++ b/python/tests/test_session.py @@ -139,6 +139,14 @@ def test_read_fastq(): assert len(df) == 2 + fastq_path = DATA / "test.fq.bz2" + options = FASTQReadOptions( + file_extension="fq", file_compression_type=FileCompressionType.BZIP2 + ) + + df = session.read_fastq_file(str(fastq_path), options=options).to_polars() + + assert len(df) == 2 @pytest.mark.skipif( not importlib.util.find_spec("polars"), reason="polars not installed" @@ -285,6 +293,18 @@ def test_read_fasta_gz(): assert len(df) == 2 +def test_read_fasta_bz2(): + """Test reading a fasta.bz2 file.""" + session = connect() + + fasta_path = DATA / "test.fa.bz2" + + options = FASTAReadOptions( + file_extension="fa", file_compression_type=FileCompressionType.BZIP2 + ) + df = session.read_fasta_file(str(fasta_path), options=options).to_polars() + + assert len(df) == 2 @pytest.mark.skipif( not importlib.util.find_spec("polars"), reason="polars not installed" diff --git a/src/file_compression_type.rs b/src/file_compression_type.rs index a0f43c2..34d02f9 100644 --- a/src/file_compression_type.rs +++ b/src/file_compression_type.rs @@ -28,6 +28,7 @@ pub enum FileCompressionType { GZIP, ZSTD, UNCOMPRESSED, + BZIP2, } impl Default for FileCompressionType { @@ -56,6 +57,7 @@ impl Display for FileCompressionType { Self::GZIP => write!(f, "GZIP"), Self::ZSTD => write!(f, "ZSTD"), Self::UNCOMPRESSED => write!(f, "UNCOMPRESSED"), + Self::BZIP2 => write!(f, "BZIP2"), } } } @@ -66,6 +68,7 @@ impl From for DFFileCompressionType { FileCompressionType::GZIP => DFFileCompressionType::GZIP, FileCompressionType::ZSTD => DFFileCompressionType::ZSTD, FileCompressionType::UNCOMPRESSED => DFFileCompressionType::UNCOMPRESSED, + FileCompressionType::BZIP2 => DFFileCompressionType::BZIP2, } } } @@ -78,6 +81,7 @@ impl TryFrom for FileCompressionType { CompressionTypeVariant::GZIP => Ok(Self::GZIP), CompressionTypeVariant::ZSTD => Ok(Self::ZSTD), CompressionTypeVariant::UNCOMPRESSED => Ok(Self::UNCOMPRESSED), + CompressionTypeVariant::BZIP2 => Ok(Self::BZIP2), _ => Err(BioBearError::InvalidCompressionType(value.to_string())), } } @@ -91,6 +95,7 @@ impl TryFrom for FileCompressionType { DFFileCompressionType::GZIP => Ok(Self::GZIP), DFFileCompressionType::ZSTD => Ok(Self::ZSTD), DFFileCompressionType::UNCOMPRESSED => Ok(Self::UNCOMPRESSED), + DFFileCompressionType::BZIP2 => Ok(Self::BZIP2), _ => Err(BioBearError::InvalidCompressionType( "Invalid compression type".to_string(), )),