Skip to content

Commit

Permalink
Merge branch 'main' into fix/discriminator_classifier_NN_dimensions
Browse files Browse the repository at this point in the history
  • Loading branch information
Lilly-May authored Dec 29, 2023
2 parents 7d16bc1 + 8299e5d commit 1aa5f42
Show file tree
Hide file tree
Showing 2 changed files with 767 additions and 13 deletions.
106 changes: 93 additions & 13 deletions pertpy/tools/_differential_gene_expression.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,36 +14,52 @@
class DifferentialGeneExpression:
"""Support for differential gene expression for scverse."""

def pseudobulk(
def get_pseudobulk(
self,
adata: AnnData,
sample_col: str,
groups_col: str,
obs: pd.DataFrame = None,
layer: str = None,
use_raw: bool = False,
min_prop: float = 0.2,
mode: str = "sum",
min_cells=10,
min_counts: int = 1000,
min_samples: int = 2,
dtype: npt.DTypeLike = np.float32,
skip_checks: bool = False,
) -> AnnData:
"""Generate Pseudobulk for DE analysis.
"""Summarizes expression profiles across cells per sample and group.
Wraps decoupler's get_pseudobulk function.
Generates summarized expression profiles across cells per sample (e.g. sample id) and group (e.g. cell type) based on the metadata found in .obs.
To ensure a minimum quality control, this function removes genes that are not expressed enough across cells (min_prop) or samples (min_smpls),
and samples with not enough cells (min_cells) or gene counts (min_counts).
By default this function expects raw integer counts as input and sums them per sample and group (mode='sum'), but other modes are available.
This function produces some quality control metrics to assess if is necessary to filter some samples.
The number of cells that belong to each sample is stored in `.obs['psbulk_n_cells']`,
the total sum of counts per sample in .obs['psbulk_counts'], and the proportion of cells that express a given gene in `.layers[‘psbulk_props’]`.
Wraps decoupler's `get_pseudobulk` function.
See: https://decoupler-py.readthedocs.io/en/latest/generated/decoupler.get_pseudobulk.html#decoupler.get_pseudobulk
for more details
for more details.
Args:
adata: Input AnnData object.
sample_col: Column of obs where to extract the samples names.
groups_col: Column of obs where to extract the groups names.
obs: If provided, metadata dataframe.
obs: If provided, metadata DataFrame.
layer: If provided, which layer to use.
use_raw: Use raw attribute of adata if present.
min_prop: Minimum proportion of cells with non-zero values.
min_counts: Minimum number of cells per sample.
min_samples: Minimum number of samples per feature.
use_raw: Use raw attribute of the AnnData object if present.
mode: How to perform the pseudobulk.
Available options are 'sum', 'mean' or 'median'. Also accepts callback functions to perform custom aggregations.
Additionally, it is also possible to provide a dictionary of different callback functions, each one stored in a different resulting `.layer`.
In this case, the result of the first callback function of the dictionary is stored in .X by default.
min_cells: Filter to remove samples by a minimum number of cells in a sample-group pair.
min_counts: Filter to remove samples by a minimum number of summed counts in a sample-group pair.
dtype: Type of float used.
skip_checks: Whether to skip input checks.
Set to True when working with positive and negative data, or when counts are not integers.
Returns:
Returns new AnnData object with unormalized pseudobulk profiles per sample and group.
Expand All @@ -55,14 +71,78 @@ def pseudobulk(
obs=obs,
layer=layer,
use_raw=use_raw,
min_prop=min_prop,
mode=mode,
min_counts=min_counts,
min_smpls=min_samples,
dtype=dtype,
min_cells=min_cells,
skip_checks=skip_checks,
)

return pseudobulk_adata

def filter_by_expr(
self,
adata: AnnData,
obs: pd.DataFrame = None,
group: str | None = None,
lib_size: int | float | None = None,
min_count: int = 10,
min_total_count: int = 15,
large_n: int = 10,
min_prop: float = 0.7,
) -> AnnData:
"""Filter AnnData by which genes have sufficiently large counts to be retained in a statistical analysis.
Wraps decoupler's `filter_by_expr` function.
See https://decoupler-py.readthedocs.io/en/latest/generated/decoupler.filter_by_expr.html#decoupler.filter_by_expr
for more details.
Args:
adata: AnnData obtained after running `get_pseudobulk`.
obs: Metadata dataframe, only needed if `adata` is not an `AnnData`.
group: Name of the `.obs` column to group by. If None, assumes all samples belong to one group.
lib_size: Library size. Defaults to the sum of reads per sample if None.
min_count: Minimum count required per gene for at least some samples.
min_total_count: Minimum total count required per gene across all samples.
large_n: Number of samples per group considered to be "large".
min_prop: Minimum proportion of samples in the smallest group that express the gene.
Returns:
AnnData with only the genes that are to be kept.
"""
genes = dc.filter_by_expr(
adata=adata,
obs=obs,
group=group,
lib_size=lib_size,
min_count=min_count,
min_total_count=min_total_count,
large_n=large_n,
min_prop=min_prop,
)
filtered_adata = adata[:, genes].copy()

return filtered_adata

def filter_by_prop(self, adata: AnnData, min_prop: float = 0.2, min_samples: int = 2) -> AnnData:
"""Determine which genes are expressed in a sufficient proportion of cells across samples.
This function selects genes that are sufficiently expressed across cells in each sample and that this condition
is met across a minimum number of samples.
Args:
adata: AnnData obtained after running `get_pseudobulk`. It requieres `.layer['psbulk_props']`.
min_prop: Minimum proportion of cells that express a gene in a sample.
min_samples: Minimum number of samples with bigger or equal proportion of cells with expression than `min_prop`.
Returns:
AnnData with only the genes that are to be kept.
"""
genes = dc.filter_by_prop(adata=adata, min_prop=min_prop, min_smpls=min_samples)
filtered_adata = adata[:, genes].copy()

return filtered_adata

def de_analysis(
self,
adata: AnnData,
Expand Down
Loading

0 comments on commit 1aa5f42

Please sign in to comment.