Skip to content

Commit

Permalink
fix #15 and #16
Browse files Browse the repository at this point in the history
  • Loading branch information
LKremer committed Mar 24, 2023
1 parent 2182dd6 commit 34df36f
Show file tree
Hide file tree
Showing 6 changed files with 85 additions and 26 deletions.
52 changes: 28 additions & 24 deletions scbs/diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,16 +54,18 @@ def calc_fdr(datatype):


@njit(nogil=True)
def welch_t_test(group1, group2, min_cells):
def calc_welch_tstat_df(group1, group2, min_cells):
"""
calculates the t-statistic according to Welch's t-test for unequal variances
Calculates the t-statistic and the degrees of freedom (df) according to Welch's
t-test for unequal variances.
Returns the t-statistic, df, and the two group sizes.
"""
len_g1 = len(group1)
if len_g1 < min_cells:
return np.nan
len_g2 = len(group2)
if len_g1 < min_cells:
return np.nan, np.nan, len_g1, len_g2
if len_g2 < min_cells:
return np.nan
return np.nan, np.nan, len_g1, len_g2

mean_g1 = np.mean(group1)
mean_g2 = np.mean(group2)
Expand All @@ -80,28 +82,37 @@ def welch_t_test(group1, group2, min_cells):
sum2 += sqdif2

if sum1 == 0.0 and sum2 == 0.0:
return np.nan
return np.nan, np.nan, len_g1, len_g2

var_g1 = sum1 / (len_g1 - 1)
var_g2 = sum2 / (len_g2 - 1)

s_delta = math.sqrt(var_g1 / len_g1 + var_g2 / len_g2)
t = (mean_g1 - mean_g2) / s_delta
return t

# calculate degrees of freedom
numerator = ((var_g1 / len_g1) + (var_g2 / len_g2)) ** 2
denominator = ((var_g1 / len_g1) ** 2 / (len_g1 - 1)) + (
(var_g2 / len_g2) ** 2 / (len_g2 - 1)
)
df = numerator / denominator
return t, df, len_g1, len_g2


@njit(nogil=True)
def welch_t_test_df(group1, group2, min_cells):
def calc_welch_tstat(group1, group2, min_cells):
"""
calculates the t-statistic and the degrees of freedom according to Welch's
t-test for unequal variances. todo: somehow merge with welch_t_test()
Calculates the t-statistic according to Welch's t-test for unequal variances.
Mostly copy-paste from calc_welch_tstat_df, this is ugly but gives a slight edge
in performance.
Returns only the t-statistic.
"""
len_g1 = len(group1)
len_g2 = len(group2)
if len_g1 < min_cells:
return np.nan, np.nan, len_g1, len_g2
return np.nan
len_g2 = len(group2)
if len_g2 < min_cells:
return np.nan, np.nan, len_g1, len_g2
return np.nan

mean_g1 = np.mean(group1)
mean_g2 = np.mean(group2)
Expand All @@ -118,21 +129,14 @@ def welch_t_test_df(group1, group2, min_cells):
sum2 += sqdif2

if sum1 == 0.0 and sum2 == 0.0:
return np.nan, np.nan, len_g1, len_g2
return np.nan

var_g1 = sum1 / (len_g1 - 1)
var_g2 = sum2 / (len_g2 - 1)

s_delta = math.sqrt(var_g1 / len_g1 + var_g2 / len_g2)
t = (mean_g1 - mean_g2) / s_delta

# calculate degrees of freedom
numerator = ((var_g1 / len_g1) + (var_g2 / len_g2)) ** 2
denominator = ((var_g1 / len_g1) ** 2 / (len_g1 - 1)) + (
(var_g2 / len_g2) ** 2 / (len_g2 - 1)
)
df = numerator / denominator
return t, df, len_g1, len_g2
return t


@njit(parallel=True)
Expand Down Expand Up @@ -194,7 +198,7 @@ def _move_windows(
group2 = mean_shrunk_resid[index[1, chrom_bin]]
group2 = group2[~np.isnan(group2)]

t = welch_t_test(group1, group2, min_cells)
t = calc_welch_tstat(group1, group2, min_cells)
t_array[i] = t

return windows, t_array
Expand Down Expand Up @@ -273,7 +277,7 @@ def calc_tstat_peaks(
g1_mfrac = np.nanmean(mfracs[index[0, chrom_bin]])
g2_mfrac = np.nanmean(mfracs[index[1, chrom_bin]])

t_stat, df, n_cells_g1, n_cells_g2 = welch_t_test_df(
t_stat, df, n_cells_g1, n_cells_g2 = calc_welch_tstat_df(
group1, group2, min_cells
)
p = 2 * (1 - t.cdf(x=abs(t_stat), df=df)) # raw two-tailed p-value
Expand Down
20 changes: 20 additions & 0 deletions scbs/filter.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
from csv import DictReader
from glob import glob
from datetime import datetime

import scipy.sparse as sp_sparse

Expand Down Expand Up @@ -92,6 +93,22 @@ def _check_cell_number(n, n_before):
)


def _copy_log(original_path, copy_path, n_cells_postfilter, n_cells_prefilter):
content = "run_info.txt was missing before calling scbs filter...\n\n\n"
if os.path.isfile(original_path):
with open(original_path, "r") as infile:
content = infile.read() + "\n\n\n"
with open(copy_path, "w") as outfile:
now = datetime.now()
n_filtered = n_cells_prefilter - n_cells_postfilter
content += (
f"---------- {now.strftime('%a %b %d %H:%M:%S %Y')} ----------"
f"\n{n_filtered} of {n_cells_prefilter} cells were discarded "
"with scbs filter.\n"
)
outfile.write(content)


def filter_(
data_dir, filtered_dir, min_sites, max_sites, min_meth, max_meth, cell_names, keep
):
Expand All @@ -100,6 +117,8 @@ def filter_(
stats_path_out = os.path.join(filtered_dir, "cell_stats.csv")
colname_path = os.path.join(data_dir, "column_header.txt")
colname_path_out = os.path.join(filtered_dir, "column_header.txt")
log_path = os.path.join(data_dir, "run_info.txt")
log_path_out = os.path.join(filtered_dir, "run_info.txt")
if cell_names:
if any([min_sites, max_sites, min_meth, max_meth]):
secho(
Expand Down Expand Up @@ -135,3 +154,4 @@ def filter_(

_filter_text_file(stats_path, cell_idx, stats_path_out, header=True)
_filter_text_file(colname_path, cell_idx, colname_path_out, header=False)
_copy_log(log_path, log_path_out, len(cell_idx), n_cells_prefilter)
4 changes: 2 additions & 2 deletions scbs/numerics.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,8 @@ def _calc_mean_shrunken_residuals_and_mfracs(
shrinkage_factor=1,
):
"""
also reports the methylation % of each cell. todo: merge somehow with
_calc_mean_shrunken_residuals
A copy of _calc_mean_shrunken_residuals() that also returns the average methylation
of both cell groups. The function exists twice for performance reasons.
"""
shrunken_resid = np.full(n_cells, np.nan)
if start > chrom_len:
Expand Down
12 changes: 12 additions & 0 deletions scbs/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,23 @@ def _load_chrom_mat(data_dir, chrom):
return mat


def _check_if_file_exists(directory, file_name, required=False):
file_exists = os.path.isfile(os.path.join(directory, file_name))
if not file_exists:
if required:
raise Exception(f"{file_name} is missing from {directory}")
secho("Warning: ", fg="red", nl=False)
echo(f"{file_name} is missing from {directory}")
return file_exists


def _check_data_dir(data_dir, assert_smoothed=False):
"""
quickly peek into data_dir to make sure the user
did not specify an empty directory
"""
_check_if_file_exists(data_dir, "column_header.txt", required=True)
_check_if_file_exists(data_dir, "cell_stats.csv")
npz_files = glob(os.path.join(data_dir, "*.npz"))
if not npz_files:
raise Exception(
Expand Down
21 changes: 21 additions & 0 deletions tests/data/tiny/data_dir/run_info.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
This directory was generated on Fri Mar 24 20:54:16 2023
with scbs prepare version 0.6.1.
The total runtime was 0:00:00 (hour:min:s).

The following parameters were used:

data_dir:
data_dir2

input_format:
bismark

round_sites:
False

chunksize:
10000000

input_files:
<_io.BufferedReader name='a.cov'>
<_io.BufferedReader name='b.cov.gz'>
2 changes: 2 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,8 @@ def test_filter_cli_threshold(tmp_path):
with open(os.path.join(p, "cell_stats.csv")) as csv:
assert csv.readline().startswith("cell_name,")
assert csv.readline().startswith("b,")
with open(os.path.join(p, "run_info.txt")) as run_info:
assert "scbs prepare version" in run_info.read()


def test_filter_cli_toostrict(tmp_path):
Expand Down

0 comments on commit 34df36f

Please sign in to comment.