Skip to content

Commit

Permalink
use more efficient types for dataframes
Browse files: browse the repository at this point in the history
  • Loading branch information
jethror1 committed Nov 21, 2024
1 parent d093fe0 commit 285db20
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 10 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -10,6 +10,7 @@

import argparse
from concurrent.futures import as_completed, ProcessPoolExecutor
import numpy as np
from os import cpu_count
from pathlib import Path
import subprocess
Expand Down Expand Up @@ -101,7 +102,12 @@ def read_single_copy_ratio_file(copy_ratio_file) -> Tuple[str, pd.DataFrame]:
comment="@",
header=0,
names=["chr", "start", "end", sample_name],
dtype={"chr": str, "start": int, "end": int, sample_name: float},
dtype={
"chr": "category",
"start": np.uint32,
"end": np.uint32,
sample_name: float,
},
float_precision="high",
)
except Exception as exc:
Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -6,6 +6,7 @@
from glob import glob
import gzip
from math import sqrt
import numpy as np
import os
from pathlib import Path
from uuid import uuid4
Expand Down Expand Up @@ -89,6 +90,15 @@ def test_contents_correctly_parsed_into_dataframe(self):
},
)

expected_df = expected_df.astype(
{
"chr": "category",
"start": np.uint32,
"end": np.uint32,
"sample_1": float,
}
)

_, parsed_df = read_single_copy_ratio_file(
copy_ratio_file=self.sample_1_test_file
)
Expand Down Expand Up @@ -181,6 +191,16 @@ def test_reading_all_files_to_dataframe_correct(self):
],
},
)
expected_df = expected_df.astype(
{
"chr": "category",
"start": np.uint32,
"end": np.uint32,
"sample_1": float,
"sample_2": float,
"sample_3": float,
}
)

self.assertTrue(copy_ratio_df.equals(expected_df))

Expand Down Expand Up @@ -383,9 +403,9 @@ def test_contents_correctly_written_to_file(self):
"mean_minus_std2",
],
dtype={
"chr": str,
"start": int,
"end": int,
"chr": "category",
"start": np.uint32,
"end": np.uint32,
"sample_1": float,
"sample_2": float,
"sample_3": float,
Expand Down Expand Up @@ -482,9 +502,9 @@ def test_contents_correct_when_keep_all_samples_true(self):
"mean_minus_std2",
],
dtype={
"chr": str,
"start": int,
"end": int,
"chr": "category",
"start": np.uint32,
"end": np.uint32,
"sample_1": float,
"sample_2": float,
"sample_3": float,
Expand Down Expand Up @@ -561,9 +581,9 @@ def test_contents_correct_when_keep_all_samples_false(self):
"sample_1",
],
dtype={
"chr": str,
"start": int,
"end": int,
"chr": "category",
"start": np.uint32,
"end": np.uint32,
"sample_1": float,
"mean": float,
"mean_plus_std": float,
Expand Down

0 comments on commit 285db20

Please sign in to comment.