Initial Release: v1.0.0

savagyan00 committed Mar 5, 2024
0 parents, commit 9bb52c1
Showing 137 changed files with 114,021 additions and 0 deletions.
Binary file added .DS_Store
8 changes: 8 additions & 0 deletions LICENSE
@@ -0,0 +1,8 @@
STANFORD NON-COMMERCIAL SOFTWARE LICENSE AGREEMENT
1. The Board of Trustees of the Leland Stanford Junior University (“Stanford”) provides CytoTRACE 2 software and code (“Service”) free of charge for non-commercial use only. Use of the Service by any commercial entity for any purpose, including research, is prohibited.
2. By using the Service, you agree to be bound by the terms of this Agreement. Please read it carefully.
3. You agree not to use the Service for commercial advantage, or in the course of for-profit activities. You agree not to use the Service on behalf of any organization that is not a non-profit organization. Commercial entities wishing to use this Service should contact Stanford University’s Office of Technology Licensing and reference docket S24-057.
4. THE SERVICE IS OFFERED “AS IS”, AND, TO THE EXTENT PERMITTED BY LAW, STANFORD MAKES NO REPRESENTATIONS AND EXTENDS NO WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED. STANFORD SHALL NOT BE LIABLE FOR ANY CLAIMS OR DAMAGES WITH RESPECT TO ANY LOSS OR OTHER CLAIM BY YOU OR ANY THIRD PARTY ON ACCOUNT OF, OR ARISING FROM THE USE OF THE SERVICE. YOU HEREBY AGREE TO DEFEND AND INDEMNIFY STANFORD, ITS TRUSTEES, EMPLOYEES, OFFICERS, STUDENTS, AGENTS, FACULTY, REPRESENTATIVES, AND VOLUNTEERS (“STANFORD INDEMNITIES”) FROM ANY LOSS OR CLAIM ASSERTED AGAINST STANFORD INDEMNITIES ARISING FROM YOUR USE OF THE SERVICE.
5. All rights not expressly granted to you in this Agreement are reserved and retained by Stanford or its licensors or content providers. This Agreement provides no license under any patent.
6. You agree that this Agreement and any dispute arising under it is governed by the laws of the State of California, United States of America, applicable to agreements negotiated, executed, and performed within California.
7. Subject to your compliance with the terms and conditions set forth in this Agreement, Stanford grants you a revocable, non-exclusive, non-transferable right to access and make use of the Service.
679 changes: 679 additions & 0 deletions README.md


2,281 changes: 2,281 additions & 0 deletions Vignette1_results.csv


Binary file added Vignette2_results.rds
479 changes: 479 additions & 0 deletions cytotrace2_python/README.md


1 change: 1 addition & 0 deletions cytotrace2_python/cytotrace2_py/__init__.py
@@ -0,0 +1 @@
from .cytotrace2_py import *
47 changes: 47 additions & 0 deletions cytotrace2_python/cytotrace2_py/common/argument_parser.py
@@ -0,0 +1,47 @@
import argparse


def argument_parser():
    parser = argparse.ArgumentParser(description="CytoTRACE 2 is a computational method for "
                                                 "predicting cellular potency categories and "
                                                 "absolute developmental potential from "
                                                 "single-cell RNA-sequencing data.")

    # Required arguments
    required = parser.add_argument_group('Required arguments')
    required.add_argument("-f", "--input-path", help="Path to input scRNA-seq data", type=str,
                          default=None, required=True)

    # Options
    parser.add_argument("-a", "--annotation-path", type=str, default="",
                        help="Path to input annotations")

    parser.add_argument("-sp", "--species", default="mouse", choices=["mouse", "human"],
                        help="Species from which the gene names of the input data come (default 'mouse')")

    parser.add_argument("-fm", "--full-model", action="store_true",
                        help="Use the full ensemble of 17 models rather than the default reduced ensemble of the 5 most predictive models")

    parser.add_argument("-bs", "--batch-size", type=int, default=10000,
                        help="Number of cells to subsample for the pipeline steps (default 10000, the recommended value for input data with more than 10K cells)")

    parser.add_argument("-sbs", "--smooth-batch-size", type=int, default=1000,
                        help="Number of cells to subsample further for the smoothing step of the pipeline (default 1000, the recommended value for input data with more than 1K cells)")

    parser.add_argument("-dp", "--disable-parallelization", action="store_true",
                        help="Disable parallelization for CytoTRACE 2")

    parser.add_argument("-mc", "--max-cores", type=int, default=None,
                        help="Maximum number of CPU cores to use for parallelization")

    parser.add_argument("-mpc", "--max-pcs", type=int, default=200,
                        help="Maximum number of principal components to use in the kNN smoothing step (default 200)")

    parser.add_argument("-r", "--seed", type=int, default=14,
                        help="Seed for reproducibility of random processes (default 14)")

    parser.add_argument("-o", "--output-dir", type=str, default="cytotrace2_results",
                        help="Directory path for output predictions and intermediate files (default 'cytotrace2_results')")

    arguments = parser.parse_args()

    return arguments.__dict__
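
A quick sanity check of how this parser behaves (the paths below are placeholders, not files from this commit; argparse converts each --flag-name into a flag_name key of the returned dict):

import sys
from cytotrace2_py.common.argument_parser import argument_parser

# simulate a command line: cytotrace2 -f data/expression.txt -sp human
sys.argv = ["cytotrace2", "-f", "data/expression.txt", "-sp", "human"]
opts = argument_parser()
print(opts["input_path"], opts["species"], opts["batch_size"])  # data/expression.txt human 10000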
305 changes: 305 additions & 0 deletions cytotrace2_python/cytotrace2_py/common/gen_utils.py
@@ -0,0 +1,305 @@
import concurrent.futures
import sys
import os
import math
import numpy as np
import pandas as pd
import torch
import scipy.stats
import random
import warnings
from sklearn.preprocessing import MinMaxScaler
import pkg_resources
from cytotrace2_py.common import models


# functions for one-to-one best-match orthology mapping
def top_hit_human(x):
    r = x.sort_values('%id. target Mouse gene identical to query gene')
    return r.iloc[-1]


def top_hit_mouse(x):
    r = x.sort_values('%id. query gene identical to target Mouse gene')
    return r.iloc[-1]


# data loader
def generic_data_loader(expression, batch_size, train=False):
    test_tensor = torch.utils.data.TensorDataset(torch.Tensor(expression))
    data_loader = torch.utils.data.DataLoader(test_tensor, batch_size=batch_size, shuffle=False, drop_last=False)
    return data_loader


# one-model predictor
def validation(data_loader, model, device):
    prob_pred_list = []
    order_pred_list = []

    model.eval()
    softmax = torch.nn.Softmax(dim=1)
    # expected-value weights for the six ordered potency classes: 0, 0.2, ..., 1
    order_vector = torch.Tensor(np.arange(6).reshape(6, 1) / 5).to(device)
    for batch_idx, X in enumerate(data_loader):
        X = X[0].to(device)

        model_output = model(X)
        _, prob_pred = model_output
        prob_pred = prob_pred.squeeze(2)
        prob_pred = softmax(prob_pred)
        prob_order = torch.matmul(prob_pred, order_vector)

        prob_pred_list.append(prob_pred.detach().cpu().numpy())
        order_pred_list.append(prob_order.squeeze(1).detach().cpu().numpy())

    return np.concatenate(prob_pred_list, 0), np.concatenate(order_pred_list, 0)
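
The order_vector above turns the six softmax class probabilities into one continuous score in [0, 1] by taking their expected value. A minimal numeric sketch (the probabilities are made up for illustration):

import numpy as np

# hypothetical softmax output for one cell over the six potency classes,
# ordered Differentiated (0.0) through Totipotent (1.0)
probs = np.array([0.05, 0.10, 0.60, 0.20, 0.04, 0.01])
order_vector = np.arange(6) / 5  # [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
score = probs @ order_vector
print(score)  # 0.422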


# dispersion function
def disp_fn(x):
    if len(np.unique(x)) == 1:
        return 0
    else:
        return np.var(x) / np.mean(x)


# choosing the top variable genes
def top_var_genes(ranked_data):
    dispersion_index = [disp_fn(ranked_data[:, i]) for i in range(ranked_data.shape[1])]
    top_col_inds = np.argsort(dispersion_index)[-1000:]
    return top_col_inds


def build_mapping_dict():
    fn_mart_export = pkg_resources.resource_filename("cytotrace2_py", "resources/mart_export.txt")
    human_mapping = pd.read_csv(fn_mart_export, sep='\t').dropna().reset_index()
    fn_features = pkg_resources.resource_filename("cytotrace2_py", "resources/features_model_training_17.csv")
    features = pd.read_csv(fn_features)['0']
    mapping_unique = human_mapping.groupby('Gene name').apply(top_hit_human)
    mapping_unique = mapping_unique.groupby('Mouse gene name').apply(top_hit_mouse)
    mt_dict = dict(zip(mapping_unique['Gene name'].values, mapping_unique['Mouse gene name'].values))
    fn_alias_list = pkg_resources.resource_filename("cytotrace2_py", "resources/human_alias_list.txt")
    human_mapping_alias_and_previous_symbols = pd.read_csv(fn_alias_list, sep='\t')
    mt_dict_alias_and_previous_symbols = dict(zip(human_mapping_alias_and_previous_symbols['Alias or Previous Gene name'].values,
                                                  human_mapping_alias_and_previous_symbols['Mouse gene name'].values))

    # add model features whose human symbol is simply the uppercased mouse symbol
    for gene in features[~features.isin(mt_dict.values())].values:
        if gene.upper() in mt_dict.keys():
            print(gene)
            mt_dict[gene.upper()] = gene

    return mt_dict, mt_dict_alias_and_previous_symbols, features


def load(input_path):
    print("cytotrace2: Loading dataset")
    expression = pd.read_csv(input_path, sep='\t', index_col=0).T  # read data as cells x genes
    # duplicate gene/cell names are rejected rather than silently dropped, to avoid random information loss
    if expression.columns.duplicated().any():
        raise ValueError(" Please make sure the gene names are unique.")
    if expression.index.duplicated().any():
        raise ValueError(" Please make sure the cell names are unique.")
    return expression


def preprocess(expression, species):
    gene_names = expression.columns
    mt_dict, mt_dict_alias_and_previous_symbols, features = build_mapping_dict()

    # mapping to orthologs
    if species == "human":
        mapped_genes = gene_names.map(mt_dict)
        mapped_genes = mapped_genes.astype(object)

        unmapped_genes = {value: index for index, value in enumerate(gene_names) if value in gene_names[mapped_genes.isna()]}
        mapped_genes.values[mapped_genes.isna()] = gene_names[mapped_genes.isna()].map(mt_dict_alias_and_previous_symbols)
        expression.columns = mapped_genes.values
        num_genes_mapped = len([i for i in mapped_genes if i in features.values])
        print(" Mapped " + str(num_genes_mapped) + " input gene names to mouse orthologs")
        # drop duplicate columns introduced by the alias mapping
        duplicate_genes = expression.columns[expression.columns.duplicated()].values
        duplicate_genes = [i for i in duplicate_genes if i is not np.nan]
        idx = [unmapped_genes[i.upper()] for i in duplicate_genes if i.upper() in unmapped_genes.keys()]
        expression = expression.iloc[:, [j for j, c in enumerate(expression.columns) if j not in idx]]
    else:
        mapped_genes = gene_names
        expression.columns = mapped_genes

    expression = expression[expression.columns[~expression.columns.isna()]]

    # check the number of input genes mapped to model features
    intersection = set(expression.columns).intersection(features)
    print(" " + str(len(intersection)) + " input genes are present in the model features.")
    if len(intersection) < 9000:
        warnings.warn(" Please verify the input species is correct.\n In case of a correct species input, be advised that model performance might be compromised due to gene space differences.")
    # align columns to the model feature space, zero-filling missing genes
    expression = pd.DataFrame(index=features).join(expression.T).T
    expression = expression.fillna(0)
    cell_names = expression.index
    gene_names = expression.columns

    # rank gene expression values within each cell (highest expression gets rank 1)
    ranked_data = scipy.stats.rankdata(expression.values * -1, axis=1, method='average')
    return cell_names, gene_names, ranked_data
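
Because scipy.stats.rankdata assigns rank 1 to the smallest value, negating the expression matrix first gives each cell's most highly expressed gene rank 1. A toy check:

import numpy as np
import scipy.stats

expr = np.array([[5.0, 0.0, 2.0, 2.0]])  # one cell, four genes (toy values)
ranks = scipy.stats.rankdata(expr * -1, axis=1, method='average')
print(ranks)  # [[1.  4.  2.5 2.5]] -- ties share the average rank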

def predict(ranked_data, cell_names, model_dir, batch_size=10000):
    device = "cpu"
    n_labels = 6
    n_genes = ranked_data.shape[1]
    dropout = 0

    all_preds_test = []
    all_order_test = []

    # collect all model checkpoint paths under model_dir
    all_models_path = pd.Series(np.array([os.path.join(dp, f) for dp, dn, fn in os.walk(os.path.expanduser(model_dir)) for f in fn]))
    all_models_path = all_models_path[all_models_path.str.contains('pt')]

    data_loader = generic_data_loader(ranked_data, batch_size)

    for model_path in all_models_path:
        pytorch_model = torch.load(model_path, map_location=torch.device('cpu'))

        hidden_size = pytorch_model['layers.0.weight'].shape[1]

        model = models.BinaryEncoder(num_layers=n_labels, input_size=n_genes, dropout=dropout, hidden_size=hidden_size, num_labels=1)
        model = model.to(device)
        model.load_state_dict(pytorch_model)

        prob_test, order_test = validation(data_loader, model, device)

        all_preds_test.append(prob_test.reshape(-1, 6, 1))
        all_order_test.append(order_test.reshape(-1, 1))

    # ensemble: average the continuous scores, and take the argmax of the averaged class probabilities
    predicted_order = np.mean(np.concatenate(all_order_test, 1), 1)
    predicted_potency = np.argmax(np.concatenate(all_preds_test, 2).mean(2), 1)
    labels = ['Differentiated', 'Unipotent', 'Oligopotent', 'Multipotent', 'Pluripotent', 'Totipotent']
    labels_dict = dict(zip([0, 1, 2, 3, 4, 5], labels))
    predicted_df = pd.DataFrame({'preKNN_CytoTRACE2_Score': predicted_order, 'preKNN_CytoTRACE2_Potency': predicted_potency})
    predicted_df['preKNN_CytoTRACE2_Potency'] = predicted_df['preKNN_CytoTRACE2_Potency'].map(labels_dict)
    predicted_df.index = cell_names

    return predicted_df


# CytoTRACE-based smoothing functions
def get_markov_matrix(ranked_data, top_col_inds):
    num_samples, num_genes = ranked_data.shape

    sub_mat = ranked_data[:, top_col_inds]

    D = np.corrcoef(sub_mat)  # pairwise Pearson correlations between cells

    D[np.arange(num_samples), np.arange(num_samples)] = 0  # zero the diagonal
    D[np.where(D != D)] = 0  # zero out NaNs
    cutoff = max(np.mean(D), 0)
    D[np.where(D < cutoff)] = 0  # keep only above-average similarities

    A = D / (D.sum(1, keepdims=True) + 1e-5)  # row-normalize into transition probabilities
    return A
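
On toy input, A behaves as a row-stochastic transition matrix: each row holds nonnegative weights over that cell's above-average-similarity neighbors. A small sketch reusing the function above (all values are random toy data):

import numpy as np

rng = np.random.default_rng(14)
toy_ranked = rng.random((5, 20))  # 5 cells x 20 genes (toy values)
A = get_markov_matrix(toy_ranked, np.arange(20))  # use all columns as the variable genes
print(A.sum(axis=1))  # rows sum to ~1 (or 0 if a cell kept no neighbor)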


def rescale_fn(adjusted_score, original_score, deg=1, ratio=None):
    # fit a degree-`deg` polynomial mapping adjusted scores back onto the original scale
    params = np.polyfit(adjusted_score, original_score, deg)
    smoothed_score = 0
    for i, p in enumerate(params):
        smoothed_score += p * np.power(adjusted_score, deg - i)

    if ratio is not None:
        # revert to the original standard deviation, scaled by `ratio`
        smoothed_range = np.std(smoothed_score)
        original_range = np.std(original_score) * ratio

        smoothed_score = (smoothed_score - smoothed_score.mean()) / (smoothed_range + 1e-9)
        smoothed_score = smoothed_score * original_range + original_score.mean()
    return smoothed_score

def smooth_subset(chunk_ranked_data, chunk_predicted_df, top_col_inds, maxiter):
    markov_mat = get_markov_matrix(chunk_ranked_data, top_col_inds)
    score = chunk_predicted_df["preKNN_CytoTRACE2_Score"]
    init_score = score.copy()
    prev_score = score.copy()
    traj = []

    # damped diffusion: 90% neighborhood average, 10% anchor to the initial score
    for _ in range(int(maxiter)):
        cur_score = 0.9 * markov_mat.dot(prev_score) + 0.1 * init_score
        delta = np.mean(np.abs(cur_score - prev_score)) / (np.mean(init_score) + 1e-6)
        traj.append(delta)
        if delta < 1e-6:
            break
        prev_score = cur_score

    return cur_score
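
The update above is a damped fixed-point iteration: each step mixes 90% of the neighborhood average with 10% of the model's initial prediction, so scores smooth out while staying anchored. A self-contained sketch of the same iteration on a hand-built 3-cell transition matrix (all numbers are toy values):

import numpy as np

A = np.array([[0.0, 0.5, 0.5],
              [0.5, 0.0, 0.5],
              [0.5, 0.5, 0.0]])  # row-stochastic toy transition matrix
init = np.array([0.9, 0.1, 0.2])  # toy initial scores; cell 0 is an outlier

prev = init.copy()
for _ in range(10000):
    cur = 0.9 * A.dot(prev) + 0.1 * init
    if np.mean(np.abs(cur - prev)) / (np.mean(init) + 1e-6) < 1e-6:
        break
    prev = cur
print(cur)  # the outlier is pulled toward its neighbors, roughly [0.43, 0.38, 0.39]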

def smoothing_by_diffusion(predicted_df, ranked_data, top_col_inds, smooth_batch_size=1000, smooth_cores_to_use=1, seed=14,
                           maxiter=1e4, rescale=True, rescale_deg=1, rescale_ratio=None):
    # set seed for reproducibility
    np.random.seed(seed)
    if smooth_batch_size > len(ranked_data):
        print("The passed subsample size is greater than the number of cells in the dataset.\n Now setting subsample size to " + str(min(len(ranked_data), 1000)) + ".\n Please consider reducing the smooth_batch_size to a number in the range 1000 - 3000\n for runtime and memory efficiency.")
        smooth_batch_size = min(len(ranked_data), 1000)
    elif len(ranked_data) > 1000 and smooth_batch_size > 1000:
        print("Please consider reducing the smooth_batch_size to a number in the range 1000 - 3000 for runtime and memory efficiency.")

    # calculate the number of chunks
    chunk_number = math.ceil(len(ranked_data) / smooth_batch_size)

    original_names = predicted_df.index
    subsamples_indices = np.arange(len(ranked_data))
    np.random.shuffle(subsamples_indices)
    subsamples = np.array_split(subsamples_indices, chunk_number)

    smoothed_scores = []
    smooth_results = []
    # smooth each chunk in a separate worker process
    with concurrent.futures.ProcessPoolExecutor(max_workers=smooth_cores_to_use) as executor:
        for subsample in subsamples:
            chunk_ranked_data = ranked_data[subsample, :]
            chunk_predicted_df = predicted_df.iloc[subsample, :]
            smooth_results.append(executor.submit(smooth_subset, chunk_ranked_data, chunk_predicted_df, top_col_inds, maxiter))
        for f in concurrent.futures.as_completed(smooth_results):
            smoothed_scores.append(f.result())

    # concatenate the smoothed scores for all chunks and restore the original cell order
    smoothed_scores_concatenated = pd.concat(smoothed_scores)

    return smoothed_scores_concatenated[original_names]


def binning(predicted_df, scores):  # `scores` holds the smoothed scores
    labels = ['Differentiated',
              'Unipotent',
              'Oligopotent',
              'Multipotent',
              'Pluripotent',
              'Totipotent']

    pred_potencies = predicted_df["preKNN_CytoTRACE2_Potency"]
    unique_potency = np.unique(pred_potencies)
    score = 'preKNN_CytoTRACE2_Score'
    df_pred_potency = pd.DataFrame({'preKNN_CytoTRACE2_Potency': pred_potencies, 'preKNN_CytoTRACE2_Score': scores})
    # each potency class owns one sixth of the [0, 1] score range
    limits = np.arange(7) / 6
    for potency_i, potency in enumerate(labels):
        lower = limits[potency_i]
        upper = limits[potency_i + 1]
        if potency in unique_potency:
            # rank the cells of this class and spread them uniformly across the class interval
            data_order = df_pred_potency[df_pred_potency['preKNN_CytoTRACE2_Potency'] == potency]['preKNN_CytoTRACE2_Score'].sort_values()
            index = data_order.index
            n = len(index)
            scaler = MinMaxScaler(feature_range=(lower + 1e-8, upper - 1e-8))
            order = scaler.fit_transform(np.arange(n).reshape(-1, 1))[:, 0]
            df_pred_potency.loc[index, score] = order

    predicted_df["preKNN_CytoTRACE2_Score"] = df_pred_potency[score][predicted_df.index]

    return predicted_df
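
Taken together, the intended order of operations for these helpers is load, preprocess, predict, smoothing_by_diffusion, then binning. A minimal sketch based only on the signatures above (the input path and model directory are placeholders, and the packaged pipeline in cytotrace2_py may differ in details):

from cytotrace2_py.common.gen_utils import (load, preprocess, predict,
                                            top_var_genes, smoothing_by_diffusion, binning)

expression = load("data/expression.txt")  # hypothetical tab-separated input
cell_names, gene_names, ranked_data = preprocess(expression, "human")
predicted_df = predict(ranked_data, cell_names, model_dir="models/")  # placeholder checkpoint dir
smoothed = smoothing_by_diffusion(predicted_df, ranked_data, top_var_genes(ranked_data))
results = binning(predicted_df, smoothed)
print(results.head())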