docs: better for duckdb (#66)

wheretrue · Oct 21, 2023 · b815869 · b815869
1 parent 40f53cc
commit b815869
Show file tree

Hide file tree

Showing 4 changed files with 33 additions and 4 deletions.
diff --git a/Makefile b/Makefile
@@ -2,7 +2,9 @@ build:
 	cargo build --release
 	maturin develop --release
 
-test: build
+test:
+	cargo build
+	maturin develop
 	pytest
 
 run-benchmarks:

diff --git a/README.md b/README.md
@@ -87,6 +87,32 @@ df.head()
 # └──────────────┴─────────────────┴──────┴───────┴───┴────────────┴────────┴───────┴───────────────────────────────────┘
 ```
 
+## Using DuckDB
+
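+biobear can also be used to read files into a [duckdb][] database by way of an Arrow table: convert the query result with `to_arrow()`, and DuckDB can then query that table by its Python variable name.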
+
+```python
+import biobear as bb
+import duckdb
+
+session = bb.connect()
+
+session.sql("""
+    CREATE EXTERNAL TABLE gene_annotations STORED AS GFF LOCATION 'python/tests/data/test.gff'
+""")
+
+result = session.sql("""
+    SELECT * FROM gene_annotations
+""")
+
+gff_table_arrow_table = result.to_arrow()
+
+duckdb_conn = duckdb.connect()
+
+result = duckdb_conn.execute('SELECT * FROM gff_table_arrow_table').fetchall()
+print(result)
+```
+
 ## Performance
 
 Please see [exon][]'s performance metrics for thorough benchmarks, but in short, biobear is generally faster than other Python libraries for reading bioinformatic file formats.
@@ -101,3 +127,4 @@ For example, here's quick benchmarks for reading one FASTA file with 1 million r
 The larger difference for multiple files is due to biobear's ability to read multiple files in parallel.
 
 [exon]: https://github.com/wheretrue/exon/tree/main/exon-benchmarks
+[duckdb]: https://duckdb.org/
diff --git a/python/tests/test_session.py b/python/tests/test_session.py
@@ -22,7 +22,7 @@
 DATA = Path(__file__).parent / "data"
 
 
-def test_connect():
+def test_connect_and_to_arrow():
     """Test connecting to a context."""
     session = connect()
 
@@ -32,7 +32,7 @@ def test_connect():
     session.sql(query)
 
     query = "SELECT * FROM gff_file"
-    arrow_table = session.sql(query).to_arrow_table()
+    arrow_table = session.sql(query).to_arrow()
 
     assert len(arrow_table) == 2
 

diff --git a/src/execution_result.rs b/src/execution_result.rs
@@ -50,7 +50,7 @@ impl PyExecutionResult {
     }
 
     /// Convert to Arrow Table
-    fn to_arrow_table(&self, py: Python) -> PyResult<PyObject> {
+    fn to_arrow(&self, py: Python) -> PyResult<PyObject> {
         let batches = self.collect(py)?.to_object(py);
 
         Python::with_gil(|py| {