Skip to content

Commit

Permalink
Add new distances (#304)
Browse files Browse the repository at this point in the history
* Add new distances

* add tests

* add kl-divergence

* re-implement kl-divergence
implement T-test statistic
rename the distances

* implement rbf and quadratic polynomial mmd distances

* RTD config

Signed-off-by: zethson <lukas.heumos@posteo.net>

* RTD config

Signed-off-by: zethson <lukas.heumos@posteo.net>

* RTD config (#333)

* RTD config

Signed-off-by: zethson <lukas.heumos@posteo.net>

* RTD config

Signed-off-by: zethson <lukas.heumos@posteo.net>

* RTD config

Signed-off-by: zethson <lukas.heumos@posteo.net>

* RTD config

Signed-off-by: zethson <lukas.heumos@posteo.net>

---------

Signed-off-by: zethson <lukas.heumos@posteo.net>

* Add Negative Binomial NLL distance

* speedup

Signed-off-by: zethson <lukas.heumos@posteo.net>

* Less aggressive subsampling

Signed-off-by: zethson <lukas.heumos@posteo.net>

* Conditional subsampling

Signed-off-by: zethson <lukas.heumos@posteo.net>

* Fix NBNLL

* Enable counts_layer for Distances

Signed-off-by: zethson <lukas.heumos@posteo.net>

* Formatting

Signed-off-by: zethson <lukas.heumos@posteo.net>

* Fix test

Signed-off-by: zethson <lukas.heumos@posteo.net>

* Further distance fixes

Signed-off-by: zethson <lukas.heumos@posteo.net>

* Enable layers for DistanceTests

Signed-off-by: zethson <lukas.heumos@posteo.net>

* use future annotation imports

Signed-off-by: zethson <lukas.heumos@posteo.net>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* bug fix, convert to distance, check sparse csr

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add test for sparse matrices

* use the internal __call__ method in pairwise

* rename nb-nll to nb-ll

* rename distances + add epsilon to kl-divergence and t-test

* fix

* add mse distance

* add Kendall Tau distance

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Change handling of cell_wise_metric in distances
Moved from argument to Distance class attribute. Affects how
precomputed distances are stored and named.
Changed metric used in Edistance to sqeuclidean as in original paper.
Also fixed / added some tests.

* Add fix again that was lost during merge resolve

* Fix CINEMA OT test

---------

Signed-off-by: zethson <lukas.heumos@posteo.net>
Co-authored-by: zethson <lukas.heumos@posteo.net>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Yuge Ji <di93zoj@cm2login1.cos.lrz.de>
Co-authored-by: stefanpeidli <stefanpeidli@gmail.com>
  • Loading branch information
5 people authored Oct 5, 2023
1 parent 2308f58 commit 3a8e597
Show file tree
Hide file tree
Showing 12 changed files with 515 additions and 111 deletions.
34 changes: 0 additions & 34 deletions .github/workflows/publish_docs.yml

This file was deleted.

2 changes: 1 addition & 1 deletion pertpy/tools/_coda/_sccoda.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,7 +417,7 @@ def run_nuts(
modality_key: str = "coda",
num_samples: int = 10000,
num_warmup: int = 1000,
rng_key: int = None,
rng_key: int = 0,
copy: bool = False,
*args,
**kwargs,
Expand Down
2 changes: 1 addition & 1 deletion pertpy/tools/_coda/_tasccoda.py
Original file line number Diff line number Diff line change
Expand Up @@ -568,7 +568,7 @@ def run_nuts(
modality_key: str = "coda",
num_samples: int = 10000,
num_warmup: int = 1000,
rng_key: int = None,
rng_key: int = 0,
copy: bool = False,
*args,
**kwargs,
Expand Down
5 changes: 4 additions & 1 deletion pertpy/tools/_dialogue.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.sparse as sp
import statsmodels.formula.api as smf
import statsmodels.stats.multitest as ssm
from anndata import AnnData
Expand Down Expand Up @@ -466,8 +467,10 @@ def _calculate_cca_sig(
ct_adata = ct_subs[ct]
conf_m = ct_adata.obs[n_counts_key].values

if not isinstance(ct_adata.X, np.ndarray):
ct_adata.X = ct_adata.X.toarray()
R_cca_gene_cor1_x = self._corr2_coeff(
ct_adata.X.toarray().T, mcp_scores[ct].T
ct_adata.X.T, mcp_scores[ct].T
) # TODO: there are some nans here, also in R

# get genes that are most positively and negatively correlated across all MCPS
Expand Down
74 changes: 50 additions & 24 deletions pertpy/tools/_distances/_distance_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,17 @@ class DistanceTest:
group (which normally would be your "control" cells).
Args:
metric: Distance metric to use.
metric: Distance metric to use between groups of cells.
n_perms: Number of permutations to run. Defaults to 1000.
obsm_key: Name of embedding to use for distance computation. Defaults to 'X_pca'.
layer_key: Name of the counts layer containing raw counts to calculate distances for.
Mutually exclusive with 'obsm_key'.
Defaults to None and is then not used.
obsm_key: Name of embedding in adata.obsm to use.
Mutually exclusive with 'layer_key'.
Defaults to None; if neither 'layer_key' nor 'obsm_key' is given, "X_pca" is used internally.
alpha: Significance level. Defaults to 0.05.
correction: Multiple testing correction method. Defaults to 'holm-sidak'.
cell_wise_metric: Metric to use between single cells. Defaults to "euclidean".
Examples:
>>> import pertpy as pt
Expand All @@ -39,18 +45,40 @@ def __init__(
self,
metric: str,
n_perms: int = 1000,
obsm_key: str = "X_pca",
layer_key: str = None,
obsm_key: str = None,
alpha: float = 0.05,
correction: str = "holm-sidak",
cell_wise_metric=None,
):
self.metric = metric
self.n_perms = n_perms

if layer_key and obsm_key:
raise ValueError(
"Cannot use 'counts_layer_key' and 'obsm_key' at the same time.\n"
"Please provide only one of the two keys."
)
if not layer_key and not obsm_key:
obsm_key = "X_pca"
self.layer_key = layer_key
self.obsm_key = obsm_key
self.alpha = alpha
self.correction = correction
self.cell_wise_metric = (
cell_wise_metric if cell_wise_metric else Distance(self.metric, self.obsm_key).cell_wise_metric
)

self.distance = Distance(
self.metric, layer_key=self.layer_key, obsm_key=self.obsm_key, cell_wise_metric=self.cell_wise_metric
)

def __call__(
self, adata: AnnData, groupby: str, contrast: str, cell_wise_metric: str = "euclidean", verbose: bool = True
self,
adata: AnnData,
groupby: str,
contrast: str,
show_progressbar: bool = True,
) -> pd.DataFrame:
"""Run a permutation test using the specified distance metric, testing
all groups of cells against a specified contrast group ("control").
Expand All @@ -59,7 +87,7 @@ def __call__(
adata: Annotated data matrix.
groupby: Key in adata.obs for grouping cells.
contrast: Name of the contrast group.
verbose: Whether to print progress. Defaults to True.
show_progressbar: Whether to print progress. Defaults to True.
Returns:
pandas.DataFrame: Results of the permutation test, with columns:
Expand All @@ -75,14 +103,14 @@ def __call__(
>>> etest = pt.tl.DistanceTest('edistance', n_perms=1000)
>>> tab = etest(adata, groupby='perturbation', contrast='control')
"""
if Distance(self.metric, self.obsm_key).metric_fct.accepts_precomputed:
if self.distance.metric_fct.accepts_precomputed:
# Much faster if the metric can be called on the precomputed
# distance matrix, but not all metrics can do that.
return self.test_precomputed(adata, groupby, contrast, cell_wise_metric, verbose)
return self.test_precomputed(adata, groupby, contrast, show_progressbar)
else:
return self.test_xy(adata, groupby, contrast, verbose)
return self.test_xy(adata, groupby, contrast, show_progressbar)

def test_xy(self, adata: AnnData, groupby: str, contrast: str, verbose: bool = True) -> pd.DataFrame:
def test_xy(self, adata: AnnData, groupby: str, contrast: str, show_progressbar: bool = True) -> pd.DataFrame:
"""Run permutation test for metric not supporting precomputed distances.
Runs a permutation test for a metric that can not be computed using
Expand All @@ -93,8 +121,7 @@ def test_xy(self, adata: AnnData, groupby: str, contrast: str, verbose: bool = T
adata: Annotated data matrix.
groupby: Key in adata.obs for grouping cells.
contrast: Name of the contrast group.
cell_wise_metric: Metric to use for pairwise distances. Defaults to "euclidean".
verbose: Whether to print progress. Defaults to True.
show_progressbar: Whether to print progress. Defaults to True.
Returns:
pandas.DataFrame: Results of the permutation test, with columns:
Expand All @@ -104,11 +131,10 @@ def test_xy(self, adata: AnnData, groupby: str, contrast: str, verbose: bool = T
- pvalue_adj: p-value after multiple testing correction
- significant_adj: whether the group is significantly different from the contrast group after multiple testing correction
"""
distance = Distance(self.metric, self.obsm_key)
groups = adata.obs[groupby].unique()
if contrast not in groups:
raise ValueError(f"Contrast group {contrast} not found in {groupby} of adata.obs.")
fct = track if verbose else lambda iterable: iterable
fct = track if show_progressbar else lambda iterable: iterable
embedding = adata.obsm[self.obsm_key]

# Generate the null distribution
Expand All @@ -127,7 +153,7 @@ def test_xy(self, adata: AnnData, groupby: str, contrast: str, verbose: bool = T

X = embedding[mask][idx] # shuffled group
Y = embedding[mask][~idx] # shuffled contrast
dist = distance(X, Y)
dist = self.distance(X, Y)

df.loc[group, "distance"] = dist
results.append(df.sort_index())
Expand All @@ -138,7 +164,7 @@ def test_xy(self, adata: AnnData, groupby: str, contrast: str, verbose: bool = T
continue
X = embedding[adata.obs[groupby] == group]
Y = embedding[adata.obs[groupby] == contrast]
df.loc[group, "distance"] = distance(X, Y)
df.loc[group, "distance"] = self.distance(X, Y)

# Evaluate the test
# count times shuffling resulted in larger distance
Expand Down Expand Up @@ -172,9 +198,7 @@ def test_xy(self, adata: AnnData, groupby: str, contrast: str, verbose: bool = T

return tab

def test_precomputed(
self, adata: AnnData, groupby: str, contrast: str, cell_wise_metric: str = "euclidean", verbose: bool = True
) -> pd.DataFrame:
def test_precomputed(self, adata: AnnData, groupby: str, contrast: str, verbose: bool = True) -> pd.DataFrame:
"""Run permutation test for metrics that take precomputed distances.
Args:
Expand All @@ -192,8 +216,7 @@ def test_precomputed(
- pvalue_adj: p-value after multiple testing correction
- significant_adj: whether the group is significantly different from the contrast group after multiple testing correction
"""
distance = Distance(self.metric, self.obsm_key)
if not distance.metric_fct.accepts_precomputed:
if not self.distance.metric_fct.accepts_precomputed:
raise ValueError(f"Metric {self.metric} does not accept precomputed distances.")

groups = adata.obs[groupby].unique()
Expand All @@ -204,8 +227,11 @@ def test_precomputed(
# Precompute the pairwise distances
precomputed_distances = {}
for group in groups:
cells = adata[adata.obs[groupby].isin([group, contrast])].obsm[self.obsm_key].copy()
pwd = pairwise_distances(cells, cells, metric=cell_wise_metric)
if self.layer_key:
cells = adata[adata.obs[groupby].isin([group, contrast])].layers[self.layer_key].copy()
else:
cells = adata[adata.obs[groupby].isin([group, contrast])].obsm[self.obsm_key].copy()
pwd = pairwise_distances(cells, cells, metric=self.distance.cell_wise_metric)
precomputed_distances[group] = pwd

# Generate the null distribution
Expand All @@ -223,7 +249,7 @@ def test_precomputed(
idx = shuffled_labels == group

precomputed_distance = precomputed_distances[group]
distance_result = distance.metric_fct.from_precomputed(precomputed_distance, idx)
distance_result = self.distance.metric_fct.from_precomputed(precomputed_distance, idx)

df.loc[group, "distance"] = distance_result
results.append(df.sort_index())
Expand All @@ -237,7 +263,7 @@ def test_precomputed(
idx = labels == group

precomputed_distance = precomputed_distances[group]
distance_result = distance.metric_fct.from_precomputed(precomputed_distance, idx)
distance_result = self.distance.metric_fct.from_precomputed(precomputed_distance, idx)

df.loc[group, "distance"] = distance_result

Expand Down
Loading

0 comments on commit 3a8e597

Please sign in to comment.