diff --git a/resources/home/dnanexus/generate_gcnv_bed/generate_gcnv_bed.py b/resources/home/dnanexus/generate_gcnv_bed/generate_gcnv_bed.py index 81fe341..3e84eae 100644 --- a/resources/home/dnanexus/generate_gcnv_bed/generate_gcnv_bed.py +++ b/resources/home/dnanexus/generate_gcnv_bed/generate_gcnv_bed.py @@ -10,6 +10,7 @@ import argparse from concurrent.futures import as_completed, ProcessPoolExecutor +import numpy as np from os import cpu_count from pathlib import Path import subprocess @@ -101,7 +102,12 @@ def read_single_copy_ratio_file(copy_ratio_file) -> Tuple[str, pd.DataFrame]: comment="@", header=0, names=["chr", "start", "end", sample_name], - dtype={"chr": str, "start": int, "end": int, sample_name: float}, + dtype={ + "chr": "category", + "start": np.uint32, + "end": np.uint32, + sample_name: float, + }, float_precision="high", ) except Exception as exc: diff --git a/resources/home/dnanexus/generate_gcnv_bed/tests/test_generate_gcnv_bed.py b/resources/home/dnanexus/generate_gcnv_bed/tests/test_generate_gcnv_bed.py index 35bec49..b4f7c39 100644 --- a/resources/home/dnanexus/generate_gcnv_bed/tests/test_generate_gcnv_bed.py +++ b/resources/home/dnanexus/generate_gcnv_bed/tests/test_generate_gcnv_bed.py @@ -6,6 +6,7 @@ from glob import glob import gzip from math import sqrt +import numpy as np import os from pathlib import Path from uuid import uuid4 @@ -89,6 +90,15 @@ def test_contents_correctly_parsed_into_dataframe(self): }, ) + expected_df = expected_df.astype( + { + "chr": "category", + "start": np.uint32, + "end": np.uint32, + "sample_1": float, + } + ) + _, parsed_df = read_single_copy_ratio_file( copy_ratio_file=self.sample_1_test_file ) @@ -181,6 +191,16 @@ def test_reading_all_files_to_dataframe_correct(self): ], }, ) + expected_df = expected_df.astype( + { + "chr": "category", + "start": np.uint32, + "end": np.uint32, + "sample_1": float, + "sample_2": float, + "sample_3": float, + } + ) self.assertTrue(copy_ratio_df.equals(expected_df)) @@ -383,9 +403,9 @@ def test_contents_correctly_written_to_file(self): "mean_minus_std2", ], dtype={ - "chr": str, - "start": int, - "end": int, + "chr": "category", + "start": np.uint32, + "end": np.uint32, "sample_1": float, "sample_2": float, "sample_3": float, @@ -482,9 +502,9 @@ def test_contents_correct_when_keep_all_samples_true(self): "mean_minus_std2", ], dtype={ - "chr": str, - "start": int, - "end": int, + "chr": "category", + "start": np.uint32, + "end": np.uint32, "sample_1": float, "sample_2": float, "sample_3": float, @@ -561,9 +581,9 @@ def test_contents_correct_when_keep_all_samples_false(self): "sample_1", ], dtype={ - "chr": str, - "start": int, - "end": int, + "chr": "category", + "start": np.uint32, + "end": np.uint32, "sample_1": float, "mean": float, "mean_plus_std": float,