neutral_covariance_test.py

# translation of the original Washburne code in R

import scipy.stats
from scipy.special import kolmogorov
import numpy as np
from statsmodels.stats.diagnostic import het_breuschpagan
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smfa
from multiprocessing import Pool
from functools import partial

# TODO not sure if this file is correct, small numerical deviations

def predict(ntests, S, qr=False):
    aa = -1
    bb = 3

    return (aa + bb * np.log(S) - np.log(ntests))

def draw_WFP_CVT(ts, n=None, m=None):
    # constant-volatility transformation

    if n == None:
        n = ts.shape[1] # number of species
    if m == None:
        m = ts.shape[0] # number of timesteps

    #dum = True
    #while dum == True:
    # array of random 1's and -1's
    a = np.random.uniform(-1,1,[n, 1]); a[a<0] = -1; a[a>0] = 1;
    #   if (abs(sum(a) < n)):
    #       dum = False

    vec = ts.dot(a).flatten()
    vec[vec < -1] = -1
    vec[vec > 1] = 1
    output = pd.DataFrame(columns=['f', 'DF'])
    output['f'] = np.arcsin(vec[1:])
    output['DF'] = np.arcsin(vec[1:]) - np.arcsin(vec[:-1])

    return output

def pval(ts, regress='f', formula=None, DT=0, seed=0):
    np.random.seed(seed)

    cvt = draw_WFP_CVT(ts)
    cvt['f2'] = cvt['f']**2

    if not regress == 'f':
        cvt['DF'] = cvt['DF'] / np.sqrt(DT)

    # linear fit as in formula
    try:
        model = smfa.ols(formula=formula, data=cvt).fit()
    except:
        print("code was failing, cvt")
        print(cvt)
    _, r, _, _ = het_breuschpagan(model.resid, model.model.exog) # pvalue chi-squared test _, _, _, pvalue

    return r

def cv_test(ntests, ts, regress='f', formula=None, varformula=None, tm=None, ncores=1, seed=0):
    np.random.seed(seed)

    if not regress in ['f', 'time']:
        print('Error: unknown inputt "regress" - must be either "f" or "time"')
        return

    if formula == None:
        if regress == 'f':
            formula = 'DF~f+f2'
        else:
            formula = 'DF~tm'

    if varformula == None:
        if regress == 'f':
            varformula = 'DF~f+f2'
        else:
            varformula = 'DF~tm+tm2'

    p = np.zeros(ntests)
    b = np.zeros([3, ntests]) # intercept, coef for f, coef for f**2

    DT = 0
    if regress == 'time' and tm == None:
        tm = np.arange(1, len(ts))
        DT = tm[1:] - tm[:-1]
        tm = tm[:-1]

    if ncores == 1:
        for nn in range(ntests):
            p[nn] = pval(ts, regress, formula, seed=np.random.randint(1e6))

            if False:
                if regress == 'f':
                    cvt['DF'] = model.resid ** 2

                b[:, nn] = sm.formula.glm(varformula, data=cvt).fit().params
                # orig code glm function -> iteratively reweighted least squares (IWLS)
                # can give rise to an error: PerfectSeparationError: Perfect separation detected, results not available
    else:
        pval_ts = partial(pval, ts, regress, formula, DT)

        pool = Pool(ncores)
        p = pool.map(pval_ts, np.random.randint(0,1e6,ntests))

    return p

def neutral_covariance_test(ts, ntests=None, regress='f', formula='DF~f+I(f**2)',
                            varformula=None, standard=True, method='logitnorm', verbose=False, ncores=1, seed=0):

    if method not in ['Kolmogorov', 'logitnorm', 'uncorrected']:
        print('Unknown input method. Must be either "Kolmogorov", "logitnorm", or "uncorrected".')
        return

    S = ts.shape[1]  # number of species
    m = ts.shape[0]  # number of timepoints

    # check if timeseries are normalized
    sum_ts = np.sum(ts, axis=1)
    if max(sum_ts) > 1.01 or min(sum_ts) < 0.99:
        raise ValueError('Timeseries is not normalized.')

    upperbound = 0.2889705

    if ntests == None:
        ntests = min(S, int((upperbound * S) ** 3))
    elif ntests > (upperbound * S) ** 3:
        print('Warning: ntests input is large relative to number of species, leading to high false-positive rates for P<0.05')

    pvalues = cv_test(ntests, ts, regress, formula, varformula, ncores=ncores, seed=seed)

    if verbose:
        print("pvals", pvalues)

    ntests = len(pvalues)
    D = scipy.stats.kstest(pvalues, 'uniform').statistic

    if verbose:
        print("D", D)

    if method == 'Kolmogorov':
        Q = predict(ntests, S)
        if verbose:
            print("Q", Q)
        nstar_est = ntests / (1.0 + np.exp(-Q))
        if verbose:
            print("nstar_est", nstar_est)
        P = kolmogorov(D * np.sqrt(nstar_est))  # complementary cumulative Kolmogorov distribution

    return P

def normalize(ts):
    ts /= np.sum(ts, axis=1, keepdims=True)
    return ts


def main():
    #ts = np.loadtxt('../../enterotypes_ibm/timeseries-Faust/5_timeseries/5_timeseries.txt', skiprows=1).T
    #ts = pd.read_csv('../study_with_interaction/timeseries_Langevin_linear_interaction0.csv', index_col=1).values
    ts = pd.read_csv('../study_no_interaction/timeseries_Langevin_linear_dt1.csv', index_col=1).values
    ts = normalize(ts)
    print(neutral_covariance_test(ts, ntests=500, method = 'Kolmogorov', verbose=False))

if __name__ == "__main__":
    main()