Skip to content

Commit

Permalink
fix #15 and #16
Browse files Browse the repository at this point in the history
  • Loading branch information
LKremer committed Mar 24, 2023
1 parent 2182dd6 commit 34df36f
Show file tree
Hide file tree
Showing 6 changed files with 85 additions and 26 deletions.
52 changes: 28 additions & 24 deletions scbs/diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,16 +54,18 @@ def calc_fdr(datatype):


@njit(nogil=True)
def welch_t_test(group1, group2, min_cells):
def calc_welch_tstat_df(group1, group2, min_cells):
"""
calculates the t-statistic according to Welch's t-test for unequal variances
Calculates the t-statistic and the degrees of freedom (df) according to Welch's
t-test for unequal variances.
Returns the t-statistic, df, and the two group sizes.
"""
len_g1 = len(group1)
if len_g1 < min_cells:
return np.nan
len_g2 = len(group2)
if len_g1 < min_cells:
return np.nan, np.nan, len_g1, len_g2
if len_g2 < min_cells:
return np.nan
return np.nan, np.nan, len_g1, len_g2

mean_g1 = np.mean(group1)
mean_g2 = np.mean(group2)
Expand All @@ -80,28 +82,37 @@ def welch_t_test(group1, group2, min_cells):
sum2 += sqdif2

if sum1 == 0.0 and sum2 == 0.0:
return np.nan
return np.nan, np.nan, len_g1, len_g2

var_g1 = sum1 / (len_g1 - 1)
var_g2 = sum2 / (len_g2 - 1)

s_delta = math.sqrt(var_g1 / len_g1 + var_g2 / len_g2)
t = (mean_g1 - mean_g2) / s_delta
return t

# calculate degrees of freedom
numerator = ((var_g1 / len_g1) + (var_g2 / len_g2)) ** 2
denominator = ((var_g1 / len_g1) ** 2 / (len_g1 - 1)) + (
(var_g2 / len_g2) ** 2 / (len_g2 - 1)
)
df = numerator / denominator
return t, df, len_g1, len_g2


@njit(nogil=True)
def welch_t_test_df(group1, group2, min_cells):
def calc_welch_tstat(group1, group2, min_cells):
"""
calculates the t-statistic and the degrees of freedom according to Welch's
t-test for unequal variances. todo: somehow merge with welch_t_test()
Calculates the t-statistic according to Welch's t-test for unequal variances.
Mostly copy-paste from calc_welch_tstat_df, this is ugly but gives a slight edge
in performance.
Returns only the t-statistic.
"""
len_g1 = len(group1)
len_g2 = len(group2)
if len_g1 < min_cells:
return np.nan, np.nan, len_g1, len_g2
return np.nan
len_g2 = len(group2)
if len_g2 < min_cells:
return np.nan, np.nan, len_g1, len_g2
return np.nan

mean_g1 = np.mean(group1)
mean_g2 = np.mean(group2)
Expand All @@ -118,21 +129,14 @@ def welch_t_test_df(group1, group2, min_cells):
sum2 += sqdif2

if sum1 == 0.0 and sum2 == 0.0:
return np.nan, np.nan, len_g1, len_g2
return np.nan

var_g1 = sum1 / (len_g1 - 1)
var_g2 = sum2 / (len_g2 - 1)

s_delta = math.sqrt(var_g1 / len_g1 + var_g2 / len_g2)
t = (mean_g1 - mean_g2) / s_delta

# calculate degrees of freedom
numerator = ((var_g1 / len_g1) + (var_g2 / len_g2)) ** 2
denominator = ((var_g1 / len_g1) ** 2 / (len_g1 - 1)) + (
(var_g2 / len_g2) ** 2 / (len_g2 - 1)
)
df = numerator / denominator
return t, df, len_g1, len_g2
return t


@njit(parallel=True)
Expand Down Expand Up @@ -194,7 +198,7 @@ def _move_windows(
group2 = mean_shrunk_resid[index[1, chrom_bin]]
group2 = group2[~np.isnan(group2)]

t = welch_t_test(group1, group2, min_cells)
t = calc_welch_tstat(group1, group2, min_cells)
t_array[i] = t

return windows, t_array
Expand Down Expand Up @@ -273,7 +277,7 @@ def calc_tstat_peaks(
g1_mfrac = np.nanmean(mfracs[index[0, chrom_bin]])
g2_mfrac = np.nanmean(mfracs[index[1, chrom_bin]])

t_stat, df, n_cells_g1, n_cells_g2 = welch_t_test_df(
t_stat, df, n_cells_g1, n_cells_g2 = calc_welch_tstat_df(
group1, group2, min_cells
)
p = 2 * (1 - t.cdf(x=abs(t_stat), df=df)) # raw two-tailed p-value
Expand Down
20 changes: 20 additions & 0 deletions scbs/filter.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
from csv import DictReader
from glob import glob
from datetime import datetime

import scipy.sparse as sp_sparse

Expand Down Expand Up @@ -92,6 +93,22 @@ def _check_cell_number(n, n_before):
)


def _copy_log(original_path, copy_path, n_cells_postfilter, n_cells_prefilter):
content = "run_info.txt was missing before calling scbs filter...\n\n\n"
if os.path.isfile(original_path):
with open(original_path, "r") as infile:
content = infile.read() + "\n\n\n"
with open(copy_path, "w") as outfile:
now = datetime.now()
n_filtered = n_cells_prefilter - n_cells_postfilter
content += (
f"---------- {now.strftime('%a %b %d %H:%M:%S %Y')} ----------"
f"\n{n_filtered} of {n_cells_prefilter} cells were discarded "
"with scbs filter.\n"
)
outfile.write(content)


def filter_(
data_dir, filtered_dir, min_sites, max_sites, min_meth, max_meth, cell_names, keep
):
Expand All @@ -100,6 +117,8 @@ def filter_(
stats_path_out = os.path.join(filtered_dir, "cell_stats.csv")
colname_path = os.path.join(data_dir, "column_header.txt")
colname_path_out = os.path.join(filtered_dir, "column_header.txt")
log_path = os.path.join(data_dir, "run_info.txt")
log_path_out = os.path.join(filtered_dir, "run_info.txt")
if cell_names:
if any([min_sites, max_sites, min_meth, max_meth]):
secho(
Expand Down Expand Up @@ -135,3 +154,4 @@ def filter_(

_filter_text_file(stats_path, cell_idx, stats_path_out, header=True)
_filter_text_file(colname_path, cell_idx, colname_path_out, header=False)
_copy_log(log_path, log_path_out, len(cell_idx), n_cells_prefilter)
4 changes: 2 additions & 2 deletions scbs/numerics.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,8 @@ def _calc_mean_shrunken_residuals_and_mfracs(
shrinkage_factor=1,
):
"""
also reports the methylation % of each cell. todo: merge somehow with
_calc_mean_shrunken_residuals
A copy of _calc_mean_shrunken_residuals() that also returns the average methylation
of both cell groups. The function exists twice for performance reasons.
"""
shrunken_resid = np.full(n_cells, np.nan)
if start > chrom_len:
Expand Down
12 changes: 12 additions & 0 deletions scbs/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,23 @@ def _load_chrom_mat(data_dir, chrom):
return mat


def _check_if_file_exists(directory, file_name, required=False):
file_exists = os.path.isfile(os.path.join(directory, file_name))
if not file_exists:
if required:
raise Exception(f"{file_name} is missing from {directory}")
secho("Warning: ", fg="red", nl=False)
echo(f"{file_name} is missing from {directory}")
return file_exists


def _check_data_dir(data_dir, assert_smoothed=False):
"""
quickly peek into data_dir to make sure the user
did not specify an empty directory
"""
_check_if_file_exists(data_dir, "column_header.txt", required=True)
_check_if_file_exists(data_dir, "cell_stats.csv")
npz_files = glob(os.path.join(data_dir, "*.npz"))
if not npz_files:
raise Exception(
Expand Down
21 changes: 21 additions & 0 deletions tests/data/tiny/data_dir/run_info.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
This directory was generated on Fri Mar 24 20:54:16 2023
with scbs prepare version 0.6.1.
The total runtime was 0:00:00 (hour:min:s).

The following parameters were used:

data_dir:
data_dir2

input_format:
bismark

round_sites:
False

chunksize:
10000000

input_files:
<_io.BufferedReader name='a.cov'>
<_io.BufferedReader name='b.cov.gz'>
2 changes: 2 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,8 @@ def test_filter_cli_threshold(tmp_path):
with open(os.path.join(p, "cell_stats.csv")) as csv:
assert csv.readline().startswith("cell_name,")
assert csv.readline().startswith("b,")
with open(os.path.join(p, "run_info.txt")) as run_info:
assert "scbs prepare version" in run_info.read()


def test_filter_cli_toostrict(tmp_path):
Expand Down

0 comments on commit 34df36f

Please sign in to comment.