From b1ec8550e8c5e75c8020c40ac8adfee97ac804c8 Mon Sep 17 00:00:00 2001
From: Nick Moore <nick@zoic.org>
Date: Sun, 24 Nov 2024 21:29:32 +1100
Subject: [PATCH] feat: add BZIP2 (.bz2) support for reading fasta and fastq

---
 python/biobear/biobear.pyi    |   1 +
 python/biobear/compression.py |   3 +++
 python/tests/data/test.fa.bz2 | Bin 0 -> 75 bytes
 python/tests/data/test.fq.bz2 | Bin 0 -> 189 bytes
 python/tests/test_session.py  |  20 ++++++++++++++++++++
 src/file_compression_type.rs  |   5 +++++
 6 files changed, 29 insertions(+)
 create mode 100644 python/tests/data/test.fa.bz2
 create mode 100644 python/tests/data/test.fq.bz2

diff --git a/python/biobear/biobear.pyi b/python/biobear/biobear.pyi
index bf06cb3..b22af32 100644
--- a/python/biobear/biobear.pyi
+++ b/python/biobear/biobear.pyi
@@ -30,6 +30,7 @@ class FileCompressionType(enum.Enum):
     GZIP = 0
     BGZIP = 1
     NONE = 2
+    BZIP2 = 3
 
 class FastaSequenceDataType(enum.Enum):
     """How to treat the sequence data in a FASTA file."""
diff --git a/python/biobear/compression.py b/python/biobear/compression.py
index d70b438..e2ed4b4 100644
--- a/python/biobear/compression.py
+++ b/python/biobear/compression.py
@@ -25,12 +25,15 @@ class Compression(Enum):
     INFERRED = "INFERRED"
     NONE = "NONE"
     GZIP = "GZIP"
+    BZIP2 = "BZIP2"
 
     @classmethod
     def from_file(cls, path: os.PathLike) -> "Compression":
         """Infer the compression type from the file extension."""
         if Path(path).suffix == ".gz":
             return Compression.GZIP
+        if Path(path).suffix == ".bz2":
+            return Compression.BZIP2
         return Compression.NONE
 
     def infer_or_use(self, path: os.PathLike) -> "Compression":
diff --git a/python/tests/data/test.fa.bz2 b/python/tests/data/test.fa.bz2
new file mode 100644
index 0000000000000000000000000000000000000000..73b5d3a9a18806429a8d6654738d5472f7b781f7
GIT binary patch
literal 75
zcmV-R0JQ%?T4*^jL0KkKS;pg_+W-Iq-+%xRKmZT{D1Zb2J|WxyAOIk$ri}yC1IiS_
hX`nR3(U*J1Igo@QE*&XJN%B<uUC9*TLPHylgKWxj8`uB<

literal 0
HcmV?d00001

diff --git a/python/tests/data/test.fq.bz2 b/python/tests/data/test.fq.bz2
new file mode 100644
index 0000000000000000000000000000000000000000..2fd6db576e183e83b8ea0e207e47dc8bc0c63134
GIT binary patch
literal 189
zcmV;u07CylT4*^jL0KkKSr{U(hX4RPUw~K;V*4`zZ=fsyu3_8&FaXRjl}wEQ0MGyc
z1^{H!L6E@>F&Z!xRPiL#^#*`!kjdo$$k^#=K~Wz$3m0#?qEaD<ibR6SMS_J0286bj
zOTkJ@HRb4Cj>NKbtxMUq?^)i>xpxLEjmUd0gc6cl(4?r5Dp5#!F-n$kXtM-57pT~h
r5Y?kam#;!nh6yEhYb1pkYS)Z>;m@7Fd2)Y35s<r*DZ+$+!4-TsYbsCf

literal 0
HcmV?d00001

diff --git a/python/tests/test_session.py b/python/tests/test_session.py
index cb2fd69..a52d42a 100644
--- a/python/tests/test_session.py
+++ b/python/tests/test_session.py
@@ -139,6 +139,14 @@ def test_read_fastq():
 
     assert len(df) == 2
 
+    fastq_path = DATA / "test.fq.bz2"
+    options = FASTQReadOptions(
+        file_extension="fq", file_compression_type=FileCompressionType.BZIP2
+    )
+
+    df = session.read_fastq_file(str(fastq_path), options=options).to_polars()
+
+    assert len(df) == 2
 
 @pytest.mark.skipif(
     not importlib.util.find_spec("polars"), reason="polars not installed"
@@ -285,6 +293,24 @@ def test_read_fasta_gz():
 
     assert len(df) == 2
 
+
+# Skipped without polars: the result is converted with to_polars().
+@pytest.mark.skipif(
+    not importlib.util.find_spec("polars"), reason="polars not installed"
+)
+def test_read_fasta_bz2():
+    """Test reading a bzip2-compressed FASTA file (test.fa.bz2)."""
+    session = connect()
+
+    fasta_path = DATA / "test.fa.bz2"
+
+    options = FASTAReadOptions(
+        file_extension="fa", file_compression_type=FileCompressionType.BZIP2
+    )
+    df = session.read_fasta_file(str(fasta_path), options=options).to_polars()
+
+    assert len(df) == 2
+
 
 @pytest.mark.skipif(
     not importlib.util.find_spec("polars"), reason="polars not installed"
diff --git a/src/file_compression_type.rs b/src/file_compression_type.rs
index a0f43c2..34d02f9 100644
--- a/src/file_compression_type.rs
+++ b/src/file_compression_type.rs
@@ -28,6 +28,7 @@ pub enum FileCompressionType {
     GZIP,
     ZSTD,
     UNCOMPRESSED,
+    BZIP2,
 }
 
 impl Default for FileCompressionType {
@@ -56,6 +57,7 @@ impl Display for FileCompressionType {
             Self::GZIP => write!(f, "GZIP"),
             Self::ZSTD => write!(f, "ZSTD"),
             Self::UNCOMPRESSED => write!(f, "UNCOMPRESSED"),
+            Self::BZIP2 => write!(f, "BZIP2"),
         }
     }
 }
@@ -66,6 +68,7 @@ impl From<FileCompressionType> for DFFileCompressionType {
             FileCompressionType::GZIP => DFFileCompressionType::GZIP,
             FileCompressionType::ZSTD => DFFileCompressionType::ZSTD,
             FileCompressionType::UNCOMPRESSED => DFFileCompressionType::UNCOMPRESSED,
+            FileCompressionType::BZIP2 => DFFileCompressionType::BZIP2,
         }
     }
 }
@@ -78,6 +81,7 @@ impl TryFrom<CompressionTypeVariant> for FileCompressionType {
             CompressionTypeVariant::GZIP => Ok(Self::GZIP),
             CompressionTypeVariant::ZSTD => Ok(Self::ZSTD),
             CompressionTypeVariant::UNCOMPRESSED => Ok(Self::UNCOMPRESSED),
+            CompressionTypeVariant::BZIP2 => Ok(Self::BZIP2),
             _ => Err(BioBearError::InvalidCompressionType(value.to_string())),
         }
     }
@@ -91,6 +95,7 @@ impl TryFrom<DFFileCompressionType> for FileCompressionType {
             DFFileCompressionType::GZIP => Ok(Self::GZIP),
             DFFileCompressionType::ZSTD => Ok(Self::ZSTD),
             DFFileCompressionType::UNCOMPRESSED => Ok(Self::UNCOMPRESSED),
+            DFFileCompressionType::BZIP2 => Ok(Self::BZIP2),
             _ => Err(BioBearError::InvalidCompressionType(
                 "Invalid compression type".to_string(),
             )),