DeepCycle.py

import argparse

import tensorflow as tf
import tensorflow_probability as tfp
import tensorflow_datasets as tfds

import numpy as np
import pandas as pd
from scipy import stats
from scipy.spatial import distance_matrix
from scipy.stats import gaussian_kde
from scipy import interpolate
from sklearn.mixture import GaussianMixture

import matplotlib.pyplot as plt
import matplotlib
import matplotlib.cm as cm
import seaborn as sns

import math
import os
import shutil
import sys
from copy import copy
import time
import multiprocessing as mp
import json

import anndata

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")


parser = argparse.ArgumentParser(description='Run DeepCycle.')
parser.add_argument('--input_adata',type=str, required=True,help='Anndata input file preprocessed with velocyto and scvelo (moments).')
parser.add_argument('--gene_list',type=str, required=True,help='Subset of genes to run the inference on.')
parser.add_argument('--base_gene',type=str, required=True,help='Gene used to have an initial guess of the phase.')
parser.add_argument('--expression_threshold',type=float, required=True,help='Unsplced/spliced expression threshold.')
parser.add_argument('--gpu',const=True,default=False, nargs='?',help='Use GPUs.')
parser.add_argument('--hotelling',const=True,default=False, nargs='?',help='Use Hotelling filter.')
parser.add_argument('--output_adata',type=str, required=True,help='Anndata output file. ')
args = parser.parse_args()

#REQUIRED INPUTS
input_adata_file = args.input_adata
list_of_genes_file = args.gene_list
output_anndata_file = args.output_adata
base_gene = args.base_gene
GPU = args.gpu
HOTELLING = args.hotelling
expression_threshold = args.expression_threshold

cwd = os.getcwd()
print("[Current working directory]:",cwd)

CycleAE_dir = cwd+'/DeepCycle'
if not os.path.exists( CycleAE_dir ):
    os.makedirs( CycleAE_dir )

#TRAINING SETTINGS
fraction_of_cells_to_validation = 0.17
BATCH_SIZE = 5       # number of data points in each batch
N_EPOCHS = 1000           # times to run the model on complete data
lr = 1e-4          # initial learning rate

print("[TensorFlow version]:", tf.version.VERSION)

tfk = tf.keras
tfkl = tf.keras.layers
tfpl = tfp.layers
tfd = tfp.distributions

if GPU:
    print("[Using GPUs]")
    physical_devices = tf.config.list_physical_devices('GPU')
    tf.config.set_visible_devices(physical_devices,'GPU')
    print("[Number of available GPUs]:", len(tf.config.experimental.list_physical_devices('GPU')))


class Hotelling:
    
    def __init__(self, x, y, dt, eta):
        self.x = x
        self.y = y
        self.sd_x = np.std(x)
        self.sd_y = np.std(y)
        self.delta_t = dt
        self.eta = eta
        self.xmin = self.x.min()
        self.xmax = self.x.max()
        self.ymin = self.y.min()
        self.ymax = self.y.max()
        s = 100000
        self.hx = (self.xmax-self.xmin)/s
        self.hy = (self.ymax-self.ymin)/s
        self.flag = 'OK'
        try:
            self.density, self.potential, self.X, self.Y, self.Z = self.potential_estimation()
            self.x_ss_min, self.x_ss_max = self.find_minima()
            hessian_min = self.hessian_estimation(self.x_ss_min)
            hessian_max = self.hessian_estimation(self.x_ss_max)

            self.covariance_min = np.linalg.inv(hessian_min)
            self.covariance_max = np.linalg.inv(hessian_max)
            pooled_inverse_covariance_matrix = np.linalg.inv(0.5*(self.covariance_max+self.covariance_min))

            if self.confidence_level(self.x_ss_max)<0.05 or self.confidence_level(self.x_ss_min)<0.05:
                self.hotelling_t_squared = 0
                self.flag = 'LOW_DENSITY'
            else:
                self.hotelling_t_squared = (self.x_ss_max - self.x_ss_min).dot(pooled_inverse_covariance_matrix.dot(self.x_ss_max - self.x_ss_min))
                if np.linalg.norm(self.x_ss_min-self.x_ss_max)<0.1:
                    self.hotelling_t_squared = 0
                    self.flag = 'TOO_CLOSE'
        except np.linalg.LinAlgError as err:
            if ( ('singular matrix' in str(err)) or ('Singular matrix' in str(err)) ):
                self.hotelling_t_squared = 0
                self.flag = 'SINGULAR_MATRIX'
            else:
                raise
        self.path = None

    def potential_estimation(self):
        
        X, Y = np.mgrid[self.xmin:self.xmax:200j, self.ymin:self.ymax:200j]
        positions = np.vstack([X.ravel(), Y.ravel()])
        values = np.vstack([self.x, self.y])
        kernel = gaussian_kde(values)
        Z = np.reshape(kernel(positions).T, X.shape)
        
        return kernel, kernel.logpdf, X, Y, Z
    
    def calculate_potential_force(self,path):
        
        #Initialization
        Fv = []
        hx = self.hx
        hy = self.hy
        potential = self.potential
        
        #Calculate force for each point in path
        for i in range(len(path)):
            x = path[i][0]
            y = path[i][1]
            dV_dx = ( potential((x+hx,y)) - potential((x-hx,y)) )/(2*hx)
            dV_dy = ( potential((x,y+hy)) - potential((x,y-hy)) )/(2*hy)
            orthogonal_potential = np.array([dV_dx,dV_dy]).flatten()
            Fv.append(orthogonal_potential)
        
        return Fv
    
    def find_minima(self):
        
        #Initialization
        potential = self.potential
        delta_t = self.delta_t
        eta = self.eta
        
        start_time = time.time()
        
        #2 minima first estimation with Gaussian Mixture Model
        xy_dict = {'x':self.x,'y':self.y}
        xy_df = pd.DataFrame(xy_dict)
        gm = GaussianMixture(n_components=2)
        gm.fit(xy_df)
        points = gm.means_
        
        #Refining the minima estimation with the potential descent
        velocities = [np.array([0.0,0.0]) for i in range(len(points))]
        for i in range(1000):
            potential_force = self.calculate_potential_force( points )
            acceleration = [potential_force[k]-eta*velocities[k] for k in range(len(potential_force)) ]
            old_points = copy(points)
            total_acceleration = 0.0
            for j in range(0,len(points)):
                points[j] += 0.5*acceleration[j]*delta_t**2 + velocities[j]*delta_t
                velocities[j] += acceleration[j]*delta_t
                total_acceleration += np.linalg.norm(acceleration[j])
            if np.linalg.norm(points[0]-points[1])<0.1 or total_acceleration < 0.00001:
                break
        
        x0 , x1 = points
        
        #Sort the minima
        if np.linalg.norm(x1) < np.linalg.norm(x0):
            return x1, x0
        else:
            return x0, x1
    
    def hessian_estimation(self,xp):
        
        #Estimate the inverse of the hessian matrix in x
        x = xp[0]
        y = xp[1]
        hx = self.hx
        hy = self.hy
        potential = self.potential
        d2V_dx2 = float( (potential((x+hx,y))-2*potential((x,y))+potential((x-hx,y)))/(hx**2) )
        d2V_dxdy = float( (potential((x+hx,y+hy))-potential((x+hx,y))-potential((x,y+hy))+potential((x,y)))/(hx*hy) )
        d2V_dy2 = float( (potential((x,y+hy))-2*potential((x,y))+potential((x,y+hy)))/(hy**2) )
        hessian = -np.array([[d2V_dx2,d2V_dxdy],[d2V_dxdy,d2V_dy2]])
        
        return hessian
    
    def plot_density(self,file=None):
        
        xmin = self.xmin
        xmax = self.xmax
        ymin = self.ymin
        ymax = self.ymax

        data_xy = np.array([self.x,self.y])
        cov_matrix = np.cov( data_xy )
        v, w = np.linalg.eigh( cov_matrix )

        x_avg, y_avg = np.mean( data_xy, axis=1 )
        first_component = np.sqrt(v[1])*w[:,1]
        second_component = np.sqrt(v[0])*w[:,0]
        u = [first_component[0],second_component[0]]
        v = [first_component[1],second_component[1]]
        origin = [x_avg], [y_avg]
        
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.imshow(np.rot90(self.Z), cmap=plt.cm.jet_r, extent=[xmin, xmax, ymin, ymax], aspect='auto',zorder=0)
        cset = ax.contour(self.X,self.Y,self.Z,zorder=1)
        ax.clabel(cset, inline=1, fontsize=10)
        ax.plot(self.x_ss_min[0],self.x_ss_min[1], 'k.', markersize=10, color='red')
        ax.plot(self.x_ss_max[0],self.x_ss_max[1], 'k.', markersize=10, color='red')
        ax.quiver(x_avg, y_avg, first_component[0], first_component[1], color='white', angles='xy', scale_units='xy', scale=1,zorder=3)
        ax.quiver(x_avg, y_avg, second_component[0], second_component[0], color='gray', angles='xy', scale_units='xy', scale=1,zorder=3)
        ax.set_xlim([xmin, xmax])
        ax.set_ylim([ymin, ymax])
        ax.set_title('T-squared='+str(self.hotelling_t_squared))
        ax.set_xlabel('spliced')
        ax.set_ylabel('unspliced')
        if file!=None:
            plt.savefig(file)
    
    def confidence_level(self, point):
        density = self.density
        iso = density(point)
        sample = density.resample(size=200)
        insample = density(sample)<iso
        integral = insample.sum() / float(insample.shape[0])
        return integral


#CIRCULAR LAYER FOR THE AUTOENCODER
class Circularize(tfkl.Layer):
    def __init__(self, input_dim=1):
        super(Circularize, self).__init__()

    def call(self, inputs):
        return tf.concat([tf.math.cos(2*np.pi*inputs),tf.math.sin(2*np.pi*inputs)], axis=1)


sys.stdout.write("[Loading anndata]: ")
sys.stdout.flush()
adata = anndata.read_h5ad(input_adata_file)
print(input_adata_file)


sys.stdout.write("[Loading genes]: ")
cell_cycle_genes = []
with open(list_of_genes_file,'r') as fp:
    for line in fp:
        line = line.rstrip()
        if line != '':
            cell_cycle_genes.append(line)
print(list_of_genes_file)

#FILTER GENES BY HOTELLING
gene_list = list(adata.var.index)

hot_dir = CycleAE_dir+'/hotelling/'

def process_gene(gene, adata=adata, hot_dir=hot_dir):
    try:
        n = gene_list.index(gene)
        hotelling = 0
        df = pd.DataFrame({ 'spliced':adata.layers['Ms'][:,n], 'unspliced':adata.layers['Mu'][:,n] })
        if (df.mean()<0.5).all():
            return
        hot = Hotelling( x=df['spliced'], y=df['unspliced'], dt=0.1, eta=10 )
        if hot.flag!='OK':
            lock.acquire()
            print('['+gene+']: discarded because of', hot.flag)
            lock.release()
            return
        hotelling = abs(hot.hotelling_t_squared)
    except ValueError:
        lock.acquire()
        print('['+gene+']: discarded because of ValueError')
        lock.release()
        return
    if hotelling<0.5:
        lock.acquire()
        print('['+gene+']: discarded because of low T-squared')
        lock.release()
        return
    else:
        svgfile_hotelling = hot_dir+gene+'.svg'
        try:
            hot.plot_density(svgfile_hotelling)
        except Exception as e:
            lock.acquire()
            print('['+gene+']: error saving',svgfile_hotelling, e)
            lock.release()
        lock.acquire()
        print('['+gene+']: OK')
        lock.release()
        return(gene)

if HOTELLING:
    try:
        if os.path.exists(hot_dir):
            shutil.rmtree(hot_dir, ignore_errors=True)
            os.mkdir(hot_dir)
        else:
            os.mkdir(hot_dir)
    except:
        print("[ERROR]: Creation of the directory %s failed" % hot_dir)
        raise
        
    def init(l):
        global lock
        lock = l
    
    l = mp.Lock()
    pool = mp.Pool(initializer=init, initargs=(l,))
    pool_output = pool.map(process_gene, cell_cycle_genes)
    
    filtered_cell_cycle_genes = [i for i in pool_output if i]
    
    gene_json_file = hot_dir+'filtered_genes.json'
    with open(gene_json_file,'w') as fp:
        json.dump(filtered_cell_cycle_genes, fp)
    
    print("[Filtered genes]:", gene_json_file)
else:
    filtered_cell_cycle_genes = cell_cycle_genes

#FUNCTION TO GENERATE THE INPUTS OF THE AUTOENCODER
def generate_input(list_of_genes, adata):
    gene_list = list(adata.var.index)
    
    n = gene_list.index(list_of_genes[0])
    df_all = pd.DataFrame({ 'spliced':adata.layers['Ms'][:,n], 'unspliced':adata.layers['Mu'][:,n] })

    for gene in list_of_genes[1:]:
        try:
            n = gene_list.index(gene)
            df = pd.DataFrame({ 'spliced':adata.layers['Ms'][:,n], 'unspliced':adata.layers['Mu'][:,n] })
            if (df.mean()<expression_threshold).all():
                list_of_genes.remove(gene)
                continue
        except ValueError:
            list_of_genes.remove(gene)
            continue
        df_all = pd.concat([df_all, df], axis=1)

    normalized_df=(df_all-df_all.mean())/df_all.std()
    np_data = normalized_df.to_numpy()
    nan_columns = list(np.where(np.any(~np.isfinite(np_data),axis=0))[0])
    nan_indexes = list([ n//2 for n in nan_columns])
    even_columns = list([ 2*n for n in nan_indexes ])
    odd_columns = list([ 2*n+1 for n in nan_indexes ])
    for idx in sorted(nan_indexes, reverse=True):
        del list_of_genes[idx]
    columns_to_drop = sorted(even_columns + odd_columns)
    normalized_df.columns = list(range(normalized_df.shape[1]))
    df_all.columns = list(range(df_all.shape[1]))
    df_all = df_all.drop(df_all.columns[columns_to_drop], axis=1)
    normalized_df = normalized_df.drop(normalized_df.columns[columns_to_drop], axis=1)
    #print("Left genes: ",len(list_of_genes), normalized_df.shape)
    
    return list_of_genes, normalized_df.to_numpy() #cells x (2 genes)

#GENERATE INPUTS
genes, np_data = generate_input(filtered_cell_cycle_genes, adata)
print("[Genes]:", genes)
print("[N. OF USED GENES]",len(genes))
np.savez( CycleAE_dir+"/input_data.npz", genes, np_data, allow_pickle=True)
n_genes = len(genes)
n_cells = np_data.shape[0]
n_columns = np_data.shape[1]
if n_columns != 2*n_genes:
    print("[ERROR]: incoherent number of genes and columns")

n_cells_to_validation = int(n_cells*fraction_of_cells_to_validation)
print( "[Total number of cells]:", n_cells)
print( "[Number of cells used for training]:", n_cells-n_cells_to_validation)
print( "[Number of cells used for validation]:", n_cells_to_validation)


#BUILD INITIAL GUESS BASED ON A GENE 
index_gene = genes.index(base_gene)

angles = 1.5*(np.array([ [math.atan2(np_data[i,2*index_gene+1],np_data[i,2*index_gene])] for i in range(len(np_data)) ]) % (2*np.pi))/(2*np.pi)


#PARAMETERS
INPUT_DIM = len(genes)*2     # size of each input
HIDDEN_DIM = len(genes)*4        # hidden dimension
LATENT_DIM = 1        # latent vector dimension

input_shape = (INPUT_DIM,)
encoded_size = LATENT_DIM
base_depth = HIDDEN_DIM
print("[Model input shape]:", input_shape)

#BUILD THE INPUT DATASET OBJECTS
e_dataset = tf.data.Dataset.from_tensor_slices((np_data,angles)).shuffle(1000).batch(BATCH_SIZE)
n_batches_to_validation = math.ceil(tf.data.experimental.cardinality(e_dataset).numpy() * fraction_of_cells_to_validation)
eval_e_dataset = e_dataset.take(n_batches_to_validation)
train_e_dataset = e_dataset.skip(n_batches_to_validation)

ae_dataset = tf.data.Dataset.from_tensor_slices((np_data,np_data)).shuffle(1000).batch(BATCH_SIZE)
eval_ae_dataset = ae_dataset.take(n_batches_to_validation)
train_ae_dataset = ae_dataset.skip(n_batches_to_validation)

#FUNCTION TO PLOT THE TRAINING
def plot_training(fit):
    best_epoch = fit.epoch[fit.history['val_loss'].index(min(fit.history['val_loss']))]
    fig, ax = plt.subplots(figsize=(3,3))
    ax.plot(fit.epoch,fit.history['val_loss'],'.-',color='red', label='validation')
    ax.plot(fit.epoch,fit.history['loss'],'.-',color='orange', label='train')
    ax.set_yscale('log')
    ax.set(ylabel='MSE')
    ax.axvspan(best_epoch-0.5,best_epoch+0.5, alpha=0.5, color='red')
    ax.legend()
    print("[Best epoch]:", best_epoch)
    print("[MSE]:", min(fit.history['val_loss']))


#ENCODER
inputs = tfk.Input(shape=input_shape)

norm_atans = tfkl.Reshape((1,))((tf.math.atan2(inputs[...,2*index_gene+1],inputs[...,2*index_gene]) % (2*np.pi))/(2*np.pi) )

x = tfkl.GaussianNoise(0.01)(inputs)
x = tfkl.Dense(base_depth,activation=tf.nn.leaky_relu)(x)
x = tfkl.Dense(base_depth,activation=tf.nn.leaky_relu)(x)
x = tfkl.Dense(base_depth,activation=tf.nn.leaky_relu)(x)
x = tfkl.Dense(base_depth,activation=tf.nn.leaky_relu)(x)

noisy_atans = tfkl.GaussianNoise(0.05)(norm_atans)
#manip_inputs = tfkl.concatenate([x,noisy_angle], axis=1)

manip_inputs = tfkl.concatenate([x,norm_atans], axis=1)

manip_inputs = tfkl.Dense(1)(manip_inputs)

outputs = tfkl.GaussianNoise(0.05)(manip_inputs)

encoder = tfk.Model(inputs, outputs, name="non_seq_encoder_noisy")
#encoder.summary()

#DECODER
decoder = tfk.Sequential([
    tfkl.InputLayer(input_shape=encoded_size),
    Circularize(),
    tfkl.GaussianNoise(0.03),
    tfkl.Dense(base_depth, activation=tf.nn.leaky_relu),
    tfkl.Dense(base_depth, activation=tf.nn.leaky_relu),
    tfkl.Dense(base_depth, activation=tf.nn.leaky_relu),
    tfkl.Dense(base_depth, activation=tf.nn.leaky_relu),
    tfkl.Dense(input_shape[0],
               activation=None),
])

#decoder.summary()

#AUTOENCODER
ae = tfk.Model(inputs=encoder.inputs,
                outputs=decoder(encoder.outputs))


#ENCODER PRETRAINING
encoder.compile(optimizer=tf.optimizers.Adam(learning_rate=lr),
            loss=tf.keras.losses.mean_squared_error, metrics=[tf.keras.metrics.kullback_leibler_divergence])

e_fit = encoder.fit(train_e_dataset,
            epochs=N_EPOCHS,
            validation_data=eval_e_dataset, batch_size=BATCH_SIZE,
            callbacks=[tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.8, patience=5, min_lr=0.00001),
                       tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0, patience=20, verbose=1, mode='auto', restore_best_weights=True)
                      ]
            )


if not os.path.exists(CycleAE_dir+'/training'):
    os.makedirs(CycleAE_dir+'/training')
print("[ ENCODER PRE-TRAINING ]")
plot_training(e_fit)
plt.savefig(CycleAE_dir+'/training/encoder_pretraining.svg')
plt.clf()

#AUTOENCODER TRAINING
ae.compile(optimizer=tf.optimizers.Adam(learning_rate=lr/10),
            loss=tf.keras.losses.mean_squared_error, metrics=[tf.keras.metrics.kullback_leibler_divergence])

ae_fit = ae.fit(train_ae_dataset,
            epochs=N_EPOCHS,
            validation_data=eval_ae_dataset, batch_size=BATCH_SIZE,
            callbacks=[tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.8, patience=5, min_lr=0.000001),
                       tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0, patience=20, verbose=1, mode='auto', restore_best_weights=True)]
          )


print("[ AUTOENCODER TRAINING ]")
plot_training(ae_fit)
plt.savefig(CycleAE_dir+'/training/autoencoder_training.svg')
plt.clf()

# Save the entire model
if not os.path.exists( CycleAE_dir+'/model' ):
    os.makedirs( CycleAE_dir+'/model' )
ae.save( CycleAE_dir+'/model' )


#CALCULATE PHASES
predicted_dataset = ae.predict(ae_dataset)
test_input_to_decoder = tf.convert_to_tensor(np.array([np.arange(0,1,step=0.01)]).T )
predicted_decoder_test = decoder(test_input_to_decoder)

distances = distance_matrix(np_data,predicted_decoder_test)

t_closest_theta = tf.convert_to_tensor(np.array([]))
for cell in distances:
    dist = list(cell)
    t2 = test_input_to_decoder[dist.index(min(dist))]
    t_closest_theta = tf.concat([t_closest_theta, t2],0)

t_closest_theta = t_closest_theta*-1+1
adata.obs['cell_cycle_theta'] = t_closest_theta
adata.write_h5ad(output_anndata_file)
print("[Output anndata]:", output_anndata_file)


print("[PRELIMINARY ANALYSIS]")
th=adata.obs['cell_cycle_theta']

minima = min(th)
maxima = max(th)

norm = matplotlib.colors.Normalize(vmin=minima, vmax=maxima)
mapper = cm.ScalarMappable(norm=norm, cmap=cm.coolwarm)

plt.scatter(adata.obsm['X_umap'][:,0],adata.obsm['X_umap'][:,1],c=th,cmap=cm.coolwarm,alpha=0.5)
plt.colorbar(mapper)
plt.savefig(CycleAE_dir+'/umap_cell_cycle_theta.svg')
plt.clf()

flipped_time = np.flip(test_input_to_decoder[:,0])
good_genes = genes
sorted_good_genes = sorted(good_genes)

w = 3
h = (len(sorted_good_genes))//w +1
if (len(sorted_good_genes)) % w >0:
    h += 1

fit_dir = CycleAE_dir+'/fits/'

try:
    if os.path.exists(fit_dir):
        shutil.rmtree(fit_dir, ignore_errors=True)
        os.mkdir(fit_dir)
    else:
        os.mkdir(fit_dir)
except:
    print("[ERROR]: Creation of the directory %s failed" % fit_dir)
    raise


for gene in sorted_good_genes:
    index = genes.index(gene)
    fig, ax = plt.subplots()
    sns.scatterplot(np_data[:,2*index],np_data[:,2*index+1],alpha=0.1,ax=ax)
    sns.kdeplot(np_data[:,2*index],np_data[:,2*index+1],shade=False,cmap="Reds",ax=ax)
    ax.plot(predicted_dataset[:,2*index],predicted_dataset[:,2*index+1],'.',alpha=0.05,color='black')
    ax.set_title(genes[index])
    ax.set_xlabel('spliced (z-score)')
    ax.set_ylabel('unspliced (z-score)')
    ax.set_aspect(1)
    plt.ylim(-5, 5)
    plt.xlim(-5, 5)
    plt.savefig(fit_dir+gene+'.svg')
    plt.clf()
    

for gene in sorted_good_genes:
    index = genes.index(gene)
    fig, ax = plt.subplots()
    ax.plot(flipped_time,predicted_decoder_test[:,2*index],'.-',alpha=0.5,color='red')
    ax.plot(flipped_time,predicted_decoder_test[:,2*index+1],'.-',alpha=0.5,color='green')
    ax.set_title(genes[index])
    ax.set_xlabel('transcriptional phase')
    ax.set_ylabel('expression (z-score)')
    plt.ylim(-5, 5)
    plt.xlim(-5, 5)
    plt.savefig(fit_dir+gene+'_series.svg')
    plt.clf()