diff --git a/doc/images/sm_scatter-2.png b/doc/images/sm_scatter-2.png index a8eecbd..8d974c5 100644 Binary files a/doc/images/sm_scatter-2.png and b/doc/images/sm_scatter-2.png differ diff --git a/starvine/bvcopula/copula/clayton_copula.py b/starvine/bvcopula/copula/clayton_copula.py index a34315e..eabddb0 100644 --- a/starvine/bvcopula/copula/clayton_copula.py +++ b/starvine/bvcopula/copula/clayton_copula.py @@ -14,11 +14,11 @@ class ClaytonCopula(CopulaBase): \f] """ def __init__(self, rotation=0, init_params=None): + super(ClaytonCopula, self).__init__(rotation, params=init_params) self.thetaBounds = ((1e-9, np.inf),) self.theta0 = (1.0, ) self.rotation = rotation self.name = 'clayton' - super(ClaytonCopula, self).__init__(rotation, params=init_params) @CopulaBase._rotPDF def _pdf(self, u, v, rotation=0, *theta): diff --git a/starvine/bvcopula/copula/copula_base.py b/starvine/bvcopula/copula/copula_base.py index 23b8a1e..d2f1475 100644 --- a/starvine/bvcopula/copula/copula_base.py +++ b/starvine/bvcopula/copula/copula_base.py @@ -4,6 +4,7 @@ # H function. from __future__ import print_function, absolute_import, division import numpy as np +import six, abc import scipy.integrate as spi from scipy.optimize import bisect, newton, brentq from scipy.optimize import minimize @@ -12,6 +13,7 @@ warnings.filterwarnings('ignore') +@six.add_metaclass(abc.ABCMeta) class CopulaBase(object): """! @brief Bivariate Copula base class. Meant to be subclassed @@ -21,23 +23,85 @@ class CopulaBase(object): Copula can be rotated by 90, 180, 270 degrees to accommodate negative dependence. """ - def __init__(self, rotation=0, **kwargs): + def __init__(self, rotation=0, thetaBounds=((-np.inf, np.inf),), + theta0=(0.0,), name='defaut', **kwargs): """! @brief Init copula - @param rotation int Copula orientation + @param rotation int Copula orientation in [0, 1, 2, 3] + @param thetaBounds tuple of tuples should have len = len(theta0). + provides upper and lower bounds for each copula parameter + @param theta0 tuple initial parameter guesses + @name str name of coupula """ + self._thetaBounds = thetaBounds self.rotation = rotation + self.theta0 = theta0 + self.name = name self._fittedParams = kwargs.pop("params", None) @property def fittedParams(self): + """! + @brief Fitted parameters from MLE or MCMC + """ return self._fittedParams @fittedParams.setter def fittedParams(self, fp): - # TODO: bounds check input fp! + if not self._bounds_check(*fp): + raise RuntimeError("Fitted params not in bounds.") self._fittedParams = fp + @property + def theta0(self): + """! + @brief Initial parameter guess + """ + return self._theta0 + + @theta0.setter + def theta0(self, theta): + if not self._bounds_check(*theta): + raise RuntimeError("Theta_0 not in bounds.") + self._theta0 = theta + + @property + def thetaBounds(self): + """! + @brief Copula parameter bounds + """ + return self._thetaBounds + + @thetaBounds.setter + def thetaBounds(self, bounds): + """! + @brief Copula parameter bounds + """ + self._thetaBounds = bounds + + @property + def rotation(self): + """! + @brief Copula orientation + """ + return self._rotation + + @rotation.setter + def rotation(self, rot): + assert rot in [0, 1, 2, 3] + self._rotation = rot + + @property + def name(self): + """! + @brief Copula name + """ + return self._name + + @name.setter + def name(self, nm_str): + self._name = nm_str + # ---------------------------- PUBLIC METHODS ------------------------------ # def cdf(self, u, v, *theta): """! @@ -75,9 +139,29 @@ def fitMcmc(self, u, v, *theta0, **kwargs): @param theta0 Initial guess for copula parameter list @return tuple : (np_array Array of MLE fit copula parameters, - int Fitting success flag, 1==success) + np_2darray sample array of shape (nparams, nsamples)) """ - pass + from emcee import EnsembleSampler + wgts = kwargs.pop("weights", np.ones(len(u))) + rotation = 0 + ln_prob = lambda theta: self._ln_prior(*theta, **kwargs) + \ + self._ln_like(u, v, wgts, rotation, *theta) + if None in theta0: + params0 = self.theta0 + else: + params0 = theta0 + ndim = len(params0) + ngen = kwargs.get("ngen", 200) + nburn = kwargs.get("nburn", 100) + nwalkers = kwargs.get("nwalkers", 50) + # initilize walkers in gaussian ball around theta0 + pos_0 = [np.array(params0) + 1e-6 * np.asarray(params0)*np.random.randn(ndim) for i in range(nwalkers)] + emcee_mcmc = EnsembleSampler(nwalkers, ndim, ln_prob) + emcee_mcmc.run_mcmc(pos_0, ngen) + samples = emcee_mcmc.chain[:, nburn:, :].reshape((-1, ndim)) + res = np.mean(samples, axis=0) + self._fittedParams = res + return res, samples def fitMLE(self, u, v, *theta0, **kwargs): """! @@ -157,8 +241,7 @@ def setRotation(self, rotation=0): """! @brief Set the copula's orientation: Allows for modeling negative dependence with the - frank, gumbel, and clayton copulas (Archimedean Copula family is - non-symmetric) + frank, gumbel, and clayton copulas @param rotation int Copula rotation. 0 == 0 deg, 1 == 90 deg rotation, @@ -168,6 +251,7 @@ def setRotation(self, rotation=0): self.rotation = rotation # ---------------------------- PRIVATE METHODS ------------------------------ # + @abc.abstractmethod def _pdf(self, u, v, rotation=0, *theta): """! @brief Pure virtual density function. @@ -210,6 +294,7 @@ def _ppf(self, u, v, rotation=0, *theta): v_hat = self._hinv(u_hat, v, rotation, *theta) return (u_hat, v_hat) + @abc.abstractmethod def _h(self, u, v, rotation=0, *theta): """! @brief Copula conditional distribution function. @@ -226,6 +311,7 @@ def _h(self, u, v, rotation=0, *theta): """ raise NotImplementedError + @abc.abstractmethod def _hinv(self, u, v, rotation=0, *theta): """! @brief Inverse H function. @@ -259,12 +345,43 @@ def _logLike(self, u, v, wgts=None, rotation=0, *theta): wgts = np.ones(len(u)) return np.sum(wgts * np.log(self._pdf(u, v, rotation, *theta))) + def _ln_like(self, u, v, wgts=None, rotation=0, *theta): + """! + @brief log likelihood func wrapper. return -np.inf if + theta is out of bounds + """ + try: + a = self._logLike(u, v, wgts, rotation, *theta) + if np.isnan(a): + return -np.inf + else: + return a + except: + return -np.inf + + def _bounds_check(self, *theta, **kwargs): + """! + @brief Check if parameters are in bounds. + """ + bounds=kwargs.pop("bounds", self.thetaBounds) + assert len(theta) == len(bounds) + for i, param in enumerate(theta): + if bounds[i][0] < param < bounds[i][1]: + pass + else: + return False + return True + + def _ln_prior(self, *theta, **kwargs): + in_bounds = self._bounds_check(*theta, **kwargs) + if in_bounds: + return 0.0 + else: + return -np.inf + def _invhfun_bisect(self, U, V, rotation, *theta): """! @brief Compute inverse of H function using bisection. - Update (11/08/2016) performance improvement: finish with newton iterations - Update (12/04/2016) Newton can minimization needs bounds checks! - TODO: move to scipy.minmize """ # Apply limiters U = np.clip(U, 1e-8, 1. - 1e-8) @@ -307,6 +424,7 @@ def _AIC(self, u, v, rotation=0, *theta, **kwargs): AICc = AIC + (2. * k ** 2. + 2. * k) / (len(u) - k - 1) return AICc + @abc.abstractmethod def _gen(self, t, *theta): """! @brief Copula generator function. @@ -537,9 +655,6 @@ def wrapper(self, *args, **kwargs): return 1. - f(self, 1. - u, 1. - v, rot, *nargs) elif self.rotation == 3: # 270 deg rotation - # Clayton is flipped! - # TODO: Fix GUMBEL 270 rotation - # return 1. - f(self, 1. - u, v, rot, *nargs) # works for gumbel return 1. - f(self, u, 1. - v, rot, *nargs) return wrapper @@ -563,10 +678,13 @@ def icdf_uv_bisect(X, frozen_marginal_model): """ @brief Apply marginal model. @param X np_1darray samples from copula margin (uniform distributed) - @param frozen_marginal_model function frozen scipy.stats.rv_continuous python function object - CDF(ux) - Note: margin CDF model should be a monotonic function with a range in [0, 1] + @param frozen_marginal_model frozen scipy.stats.rv_continuous python object + or function inverse cdf fuction + Note: inv_CDF should be a monotonic function with a domain in [0, 1] @return np_1darray Samples drawn from supplied margin model """ - icdf = frozen_marginal_model.ppf(X) + if hasattr(frozen_marginal_model, 'ppf'): + icdf = frozen_marginal_model.ppf(X) + else: + icdf = frozen_marginal_model(X) return icdf diff --git a/starvine/bvcopula/copula/frank_copula.py b/starvine/bvcopula/copula/frank_copula.py index ece9e3d..54369d1 100644 --- a/starvine/bvcopula/copula/frank_copula.py +++ b/starvine/bvcopula/copula/frank_copula.py @@ -14,11 +14,11 @@ class FrankCopula(CopulaBase): \f$\theta \in [0, \infty) \f$ """ def __init__(self, rotation=0, init_params=None): + super(FrankCopula, self).__init__(rotation, params=init_params) self.thetaBounds = ((1e-9, np.inf),) self.theta0 = (1.0,) self.rotation = rotation self.name = 'frank' - super(FrankCopula, self).__init__(rotation, params=init_params) @CopulaBase._rotPDF def _pdf(self, u, v, rotation=0, *theta): diff --git a/starvine/bvcopula/copula/gauss_copula.py b/starvine/bvcopula/copula/gauss_copula.py index 8561954..37bcdfc 100644 --- a/starvine/bvcopula/copula/gauss_copula.py +++ b/starvine/bvcopula/copula/gauss_copula.py @@ -19,11 +19,11 @@ def __init__(self, rotation=0, init_params=None): @param rotation Int. in (0, 1, 2, 3) @param init_params List of initial copula parameters """ + super(GaussCopula, self).__init__(rotation, params=init_params) self.thetaBounds = ((-1 + 1e-9, 1 - 1e-9),) self.theta0 = (0.7,) self.name = 'gauss' self.rotation = rotation - super(GaussCopula, self).__init__(rotation, params=init_params) @CopulaBase._rotPDF def _pdf(self, u, v, rotation=0, *theta): diff --git a/starvine/bvcopula/copula/gumbel_copula.py b/starvine/bvcopula/copula/gumbel_copula.py index 9a6562a..b1f6d3d 100644 --- a/starvine/bvcopula/copula/gumbel_copula.py +++ b/starvine/bvcopula/copula/gumbel_copula.py @@ -12,11 +12,11 @@ class GumbelCopula(CopulaBase): \f$\theta \in [1, \infty) \f$ """ def __init__(self, rotation=0, init_params=None): + super(GumbelCopula, self).__init__(rotation, params=init_params) self.thetaBounds = ((1 + 1e-9, np.inf),) self.theta0 = (2.0, ) self.rotation = rotation self.name = 'gumbel' - super(GumbelCopula, self).__init__(rotation, params=init_params) @CopulaBase._rotPDF def _pdf(self, u, v, rotation=0, *theta): @@ -79,22 +79,27 @@ def _hinv(self, v, u, rotation=0, *theta): """ U = np.asarray(u) V = np.asarray(v) - # uu = np.zeros(U.size) - # for i, (ui, vi) in enumerate(zip(U, V)): - # uu[i] = self._invhfun_bisect(ui, vi, rotation, *theta) - # return uu + uu = np.zeros(U.size) + for i, (ui, vi) in enumerate(zip(U, V)): + uu[i] = self._invhfun_bisect(ui, vi, rotation, *theta) + return uu # Apply rotation - if rotation == 1: - U = 1. - U - elif rotation == 3: - V = 1. - V - elif rotation == 0: - pass - vv = self.vec_newton_hinv(V, U, theta[0], z_init=np.ones(len(U))) - if rotation == 1 or rotation == 3: - return 1. - vv - else: - return vv + + # TODO: Fix! Properly vectorize hinv! + # TODO: This is causing an issue in c-vine sampling + # It is faster but it is incorrect. + #if rotation == 1: + # U = 1. - U + #elif rotation == 3: + # V = 1. - V + #elif rotation == 0: + # pass + #vv = self.vec_newton_hinv(V, U, theta[0], z_init=np.ones(len(U))) + #if rotation == 1 or rotation == 3: + # return 1. - vv + #else: + # return vv + pass @CopulaBase._rotGen def _gen(self, t, *theta): diff --git a/starvine/bvcopula/copula/indep_copula.py b/starvine/bvcopula/copula/indep_copula.py index 2f9f90f..e009d98 100644 --- a/starvine/bvcopula/copula/indep_copula.py +++ b/starvine/bvcopula/copula/indep_copula.py @@ -7,6 +7,7 @@ class IndepCopula(CopulaBase): def __init__(self, rotation=0): + super(IndepCopula, self).__init__(rotation) self.name = 'indep' self.theta0 = (1.0, ) self.fittedParams = [1.0] diff --git a/starvine/bvcopula/copula/marshall_olkin_copula.py b/starvine/bvcopula/copula/marshall_olkin_copula.py index 9370cf4..1f8da6c 100644 --- a/starvine/bvcopula/copula/marshall_olkin_copula.py +++ b/starvine/bvcopula/copula/marshall_olkin_copula.py @@ -14,11 +14,11 @@ class OlkinCopula(CopulaBase): \f$\theta \in [1, \infty) \f$ """ def __init__(self, rotation=0, init_params=None): + super(OlkinCopula, self).__init__(rotation, params=init_params) self.thetaBounds = ((0, 1.), (0, 1.),) self.theta0 = (0.5, 0.7) self.rotation = rotation self.name = 'olkin' - super(OlkinCopula, self).__init__(rotation, params=init_params) @CopulaBase._rotPDF def _pdf(self, u, v, rotation=0, *theta): diff --git a/starvine/bvcopula/copula/t_copula.py b/starvine/bvcopula/copula/t_copula.py index 2a66e56..3d142f5 100644 --- a/starvine/bvcopula/copula/t_copula.py +++ b/starvine/bvcopula/copula/t_copula.py @@ -22,11 +22,11 @@ class StudentTCopula(CopulaBase): \f$ \theta_1 \in (2, \infty) \f$ """ def __init__(self, rotation=0, init_params=None): + super(StudentTCopula, self).__init__(rotation, params=init_params) self.thetaBounds = ((-1 + 1e-9, 1 - 1e-9), (2.0, np.inf),) self.theta0 = (0.7, 10.0) self.name = 't' self.rotation = 0 - super(StudentTCopula, self).__init__(rotation, params=init_params) @CopulaBase._rotPDF def _pdf(self, u, v, rotation=0, *theta): @@ -149,6 +149,8 @@ def _kTau(self, rotation=0, *theta): kt = (2.0 / np.pi) * np.arcsin(theta[0]) return kt + def _gen(self, t, *theta): + raise NotImplementedError def ggamma(x): return np.log(gammaln(x)) diff --git a/starvine/bvcopula/pc_base.py b/starvine/bvcopula/pc_base.py index f6cc7ff..4c3987e 100644 --- a/starvine/bvcopula/pc_base.py +++ b/starvine/bvcopula/pc_base.py @@ -47,7 +47,7 @@ def __init__(self, x, y, weights=None, resample=0, **kwargs): self.weights = self.weights / np.average(self.weights) if resample > 0: self.resample(resample, kwargs.pop("jitter", 1e-12)) - self.setTrialCopula(kwargs.pop("family", self.defaultFamily)) + self.setTrialCopula(kwargs.pop("family", {})) # default data ranking method self.rank_method = kwargs.pop("rankMethod", 0) self.rank(self.rank_method) @@ -124,7 +124,9 @@ def rank_method(self, method): assert type(method) is int self._rank_method = method - def setTrialCopula(self, family): + def setTrialCopula(self, family={}): + if not family: + family = self.defaultFamily self.trialFamily = family self.copulaBank = {} for name, rotation in iteritems(self.trialFamily): @@ -216,7 +218,7 @@ def copulaTournament(self, criterion='AIC', **kwargs): self.copulaParams = goldParams return (self.copulaModel, self.copulaParams) - def fitCopula(self, copula, thetaGuess=(None, None, )): + def fitCopula(self, copula, thetaGuess=(None, None,)): """! @brief fit specified copula to data. @param copula CopulaBase Copula instance diff --git a/starvine/bvcopula/tests/test_mcmc_fit.py b/starvine/bvcopula/tests/test_mcmc_fit.py new file mode 100644 index 0000000..24ab957 --- /dev/null +++ b/starvine/bvcopula/tests/test_mcmc_fit.py @@ -0,0 +1,44 @@ +## +# \brief Tests mcmc estimation of copula paramters +from __future__ import print_function, division +import unittest +from scipy.stats.mstats import rankdata +# COPULA IMPORTS +from starvine.bvcopula.copula.gauss_copula import GaussCopula as gc +import numpy as np +import os +pwd_ = os.path.dirname(os.path.abspath(__file__)) +dataDir = pwd_ + "/data/" +np.random.seed(123) +tol = 4e-2 + + +class TestMcmcFit(unittest.TestCase): + def testMcmcCoplulaFit(self): + print("--------------------- MCMC COPULA FIT TEST --------------------------") + # Load matlab data set + stocks = np.loadtxt(dataDir + 'stocks.csv', delimiter=',') + x = stocks[:, 0] + y = stocks[:, 1] + + # Rank transform the data + u = rankdata(x) / (len(x) + 1) + v = rankdata(y) / (len(y) + 1) + + # Fit t copula and gaussian copula + thetag0 = [0.2] + g_copula = gc() + theta_g_fit_mle = g_copula.fitMLE(u, v, *thetag0, bounds=((-0.99, 0.99),))[0] + aic_g_fit_mle = g_copula._AIC(u, v, 0, *theta_g_fit_mle) + theta_g_fit_mcmc = g_copula.fitMcmc(u, v, *thetag0, bounds=((-0.99, 0.99),), + ngen=500, nburn=200)[0] + aic_g_fit_mcmc = g_copula._AIC(u, v, 0, *theta_g_fit_mcmc) + print("Gaussian copula MLE paramter [rho]: " + str(theta_g_fit_mle) + " AIC =" + str(aic_g_fit_mle)) + print("Gaussian copula MCMC paramter [rho]: " + str(theta_g_fit_mcmc) + " AIC =" + str(aic_g_fit_mcmc)) + + # check MLE and MCMC solution are the same in this case + self.assertAlmostEqual(theta_g_fit_mle[0], theta_g_fit_mcmc[0], delta=tol) + + # check againt expected + true_rho_ranked = 0.7387 + self.assertAlmostEqual(theta_g_fit_mcmc[0], true_rho_ranked, delta=tol) diff --git a/starvine/sklearn_vine.py b/starvine/sklearn_vine.py new file mode 100644 index 0000000..2e80a87 --- /dev/null +++ b/starvine/sklearn_vine.py @@ -0,0 +1,59 @@ +## +# @brief Pipeline interface for using with sklearn pipelines +## +from __future__ import print_function, absolute_import, division +import numpy as np +import pandas as pd +from sklearn.base import BaseEstimator +from starvine.vine.C_vine import Cvine + + +class VineModel(BaseEstimator): + """! + @brief Implements the fit, fit_transform interface of the sklearn + pipeline workflow for vine copula. + """ + def __init__(self, vine_type='c', + trial_copula={}, + copula_fit_method='mle', + vine_fit_method='ktau', + rank_transform=True): + self.trial_copula_dict = trial_copula + self._rank_transform = rank_transform + if vine_type == 'c': + self._vine = Cvine([]) + else: + raise NotImplementedError + + def fit(self, X, weights=None): + """ + @brief Fit vine copula model to data. + @param X corrolated vectors with shape (Nsamples, Ndim). Can be + unranked or ranked data + """ + if not isinstance(X, pd.DataFrame): + tstData = pd.DataFrame(X) + else: + tstData = X + if self.rank_transform: + x_r = tstData.dropna().rank()/(len(tstData)+1) + else: + x_r = tstData + self.vine = Cvine(x_r, trial_copula=self.trial_copula_dict) + self.vine.constructVine() + + def predict(self, n): + """! + @brief Predict correlated output vectors given uncorrolated inputs. + @param n int. Number of samples to draw. + """ + return self.vine.sample(n) + + @property + def rank_transform(self): + return self._rank_transform + + @rank_transform.setter + def rank_transform(self, rt): + self._rank_transform = bool(rt) + diff --git a/starvine/vine/C_vine.py b/starvine/vine/C_vine.py index 775196c..fe66d52 100644 --- a/starvine/vine/C_vine.py +++ b/starvine/vine/C_vine.py @@ -12,12 +12,13 @@ import networkx as nx from starvine.bvcopula import pc_base as pc import numpy as np +from six import iteritems from starvine.vine.tree import Vtree class Cvine(BaseVine): """! - @brief Cononical vine (C-vine). Provides methods to fit pair + @brief Canonical vine (C-vine). Provides methods to fit pair copula constructions sequentially and simultaneously. Additional methods are provided to draw samples from a constructed C-vine. @@ -53,7 +54,7 @@ class Cvine(BaseVine): \f[ h(x| v, \theta) = F(x|v, \theta) = \frac{\partial C_{xv}(x,v,\theta)}{\partial v} \f] - Where we have defined the convinience conditional distribution + Where we have defined, for convinience, the conditional distribution: \f$ h(\cdot) \f$ The nodes of the lower level trees are formed by using \f$ h(x| v,\theta) \f$ @@ -71,9 +72,8 @@ class Cvine(BaseVine): and the inner product indicies represent the pair copula constructions (PCC) withen the given tree. """ - def __init__(self, data, dataWeights=None): - self.data = data - self.weights = dataWeights + def __init__(self, data, dataWeights=None, **kwargs): + super(Cvine, self).__init__(data, dataWeights, **kwargs) self.nLevels = int(data.shape[1] - 1) self.vine = [] @@ -84,7 +84,7 @@ def constructVine(self): build all tree levels. """ # 0th tree build - tree0 = Ctree(self.data, lvl=0) + tree0 = Ctree(self.data, lvl=0, trial_copula=self.trial_copula_dict) tree0.seqCopulaFit() self.vine.append(tree0) # build all other trees @@ -98,7 +98,8 @@ def buildDeepTrees(self, level=1): """ treeT = Ctree(self.vine[level - 1].evalH(), lvl=level, - parentTree=self.vine[level - 1]) + parentTree=self.vine[level - 1], + trial_copula=self.trial_copula_dict) treeT.seqCopulaFit() if self.nLevels > 1: self.vine.append(treeT) @@ -111,7 +112,7 @@ def buildDeepTrees(self, level=1): class Ctree(Vtree): """! @brief A C-tree is a tree with a single root node. - Each level of a cononical vine is a C-tree. + Each level of a canonical vine is a C-tree. """ def __init__(self, data, lvl=None, **kwargs): """! @@ -139,7 +140,7 @@ def seqCopulaFit(self): def treeNLLH(self, treeCopulaParams=None): """! @brief Compute this tree's negative log likelihood. - For C-trees this is just the sum of copula-log-likeyhoods over all + For C-trees this is just the sum of copula-log-likelihoods over all node-pairs. @param treeCopulaParams np_1darray Copula parameter array. Contains parameters for all PCC in the tree. @@ -195,16 +196,14 @@ def _optimNodePairs(self): # iterate though all child nodes, # root dataset cannot be paired with itself if nodeID != rootNodeID: - ## VV is OK UU is wrong! trialPair = pc.PairCopula(self.tree.node[nodeID]["data"].values, self.tree.node[rootNodeID]["data"].values) - # trialPair = pc.PairCopula(self.tree.node[rootNodeID]["data"].values, - # self.tree.node[nodeID]["data"].values) trialKtau, trialP = trialPair.empKTau() trialKtauSum[i] += abs(trialKtau) - # trialPairings[i].append((rootNodeID, nodeID, trialKtau)) trialPairings[i].append((nodeID, rootNodeID, trialKtau)) + print("Tree level: %d, configuration: %d, Ktau Metric: %f" % (self.level, i, trialKtauSum[i])) bestPairingIndex = np.argmax(np.abs(trialKtauSum)) + print(" === Configuration %d selected === " % (bestPairingIndex)) self.rootNodeID = trialPairings[bestPairingIndex][0][1] return trialPairings[bestPairingIndex] @@ -231,8 +230,6 @@ def _evalH(self): rootData = data["pc"].UU condData[(nonRootID, rootID)] = data["h-dist"](data["pc"].VV, data["pc"].UU) - # condData[(nonRootID, rootID)] = data["h-dist"](nonRootData, - # rootData) return condData def _getEdgeCopulaParams(self, u, v): diff --git a/starvine/vine/base_vine.py b/starvine/vine/base_vine.py index 7d402f7..6bbc0d9 100644 --- a/starvine/vine/base_vine.py +++ b/starvine/vine/base_vine.py @@ -5,6 +5,7 @@ import networkx as nx import numpy as np import pandas as pd +from six import iteritems # from starvine.mvar.mv_plot import matrixPairPlot @@ -12,8 +13,37 @@ class BaseVine(object): """! @brief Regular vine base class. """ - def __init__(self, data=None, weights=None): - pass + def __init__(self, data, dataWeights=None, **kwargs): + self.trial_copula_dict = \ + self._validate_trial_copula( \ + kwargs.get("trial_copula", self._all_trial_copula)) + self.data = data + self.weights = dataWeights + + def _validate_trial_copula(self, trial_copula): + assert isinstance(trial_copula, dict) + for key, val in iteritems(trial_copula): + assert key in self._all_trial_copula + assert self._all_trial_copula[key] == val + + @property + def _all_trial_copula(self): + default_copula = {'t': 0, + 'gauss': 0, + 'frank': 0, + 'frank-90': 1, + 'frank-180': 2, + 'frank-270': 3, + 'clayton': 0, + 'clayton-90': 1, + 'clayton-180': 2, + 'clayton-270': 3, + 'gumbel': 0, + 'gumbel-90': 1, + 'gumbel-180': 2, + 'gumbel-270': 3, + } + return default_copula def loadVineStructure(self, vS): """! @@ -67,8 +97,9 @@ def treeHfun(self, level=0): def sample(self, n=1000): """! @brief Draws n samples from the vine. + @param n int. number of samples to draw @returns size == (n, nvars) pandas.DataFrame - samples from vine + samples from vine """ # gen random samples u_n0 = np.random.rand(n) @@ -81,8 +112,8 @@ def sample(self, n=1000): # sample from edge of last tree u_n1 = edge_info["hinv-dist"](u_n0, u_n1) - edge_sample = {n0: 1. - u_n0, n1: 1. - u_n1} - # matrixPairPlot(pd.DataFrame(edge_sample), savefig="tree_1_edge_sample.png") + edge_sample = {n0: u_n0, n1: u_n1} + # matrixPairPlot(pd.DataFrame(edge_sample), savefig="c_test/tree_1_edge_sample.png") # store edge sample inside graph data struct current_tree.tree[n0][n1]['sample'] = edge_sample @@ -117,6 +148,40 @@ def sample(self, n=1000): # convert sample dict of arrays to dataFrame return pd.DataFrame(sample_result) + def sampleScale(self, n, frozen_margin_dict): + """! + @brief Sample vine copula and apply inverse transform sampling + to margins. + @param n int. number of samples to draw. + @param frozen_margin_dict dict of frozen single dimensional + prob density functions. See: scipy.stats.rv_continuous + """ + df_x = self.sample(n) + return self.scaleSamples(df_x, frozen_margin_dict) + + def scaleSamples(self, df_x, frozen_margin_dict): + """! + @brief Apply inverse transform sampling + @param df_x corrolated samples in [0, 1] from vine copula + @param frozen_margin_dict dict of frozen single dimensional + prob density functions. See: scipy.stats.rv_continuous. + Each entry in the dict is an instance of: + frozen_marginal_model frozen scipy.stats.rv_continuous + python object + or is a inverse cdf fuction + """ + sample_result = {} + assert isinstance(df_x, pd.DataFrame) + for col_name in df_x.columns: + assert col_name in frozen_margin_dict.keys() + frozen_marginal_model = frozen_margin_dict[col_name] + x = df_x[col_name] + if hasattr(frozen_marginal_model, 'ppf'): + sample_result[col_name] = frozen_marginal_model.ppf(x) + else: + sample_result[col_name] = frozen_marginal_model(x) + return pd.DataFrame(sample_result) + def plotVine(self, plotAll=True, savefig=None): """! @brief Plots the vine's graph structure. diff --git a/starvine/vine/tree.py b/starvine/vine/tree.py index 62cf02d..145bd4a 100644 --- a/starvine/vine/tree.py +++ b/starvine/vine/tree.py @@ -7,7 +7,6 @@ from starvine.bvcopula import pc_base as pc import networkx as nx import numpy as np -# from starvine.mvar.mv_plot import matrixPairPlot class Vtree(object): @@ -22,6 +21,7 @@ def __init__(self, data, lvl, parentTree=None, **kwargs): """ assert(type(data) is DataFrame) assert(len(data.shape) == 2) + self.trial_copula_dict = kwargs.get("trial_copula", {}) self.data = data self._upperTree = parentTree # @@ -63,7 +63,8 @@ def setEdges(self, nodePairs=None): pc= \ pc.PairCopula(self.tree.node[pair[0]]["data"], self.tree.node[pair[1]]["data"], - id=(pair[0], pair[1])), + id=(pair[0], pair[1]), + family=self.trial_copula_dict), id=(pair[0], pair[1]), edge_data={pair[0]: self.tree.node[pair[0]]["data"], pair[1]: self.tree.node[pair[1]]["data"]}, @@ -230,7 +231,6 @@ def unrollNodes(l): u_prev_n0 = prev_edge_info['sample'][prev_n0] u_prev_n2 = prev_edge_info['sample'][prev_n2] u_n1 = prev_edge_info["h-dist"](u_prev_n2, u_prev_n0) - # u_n1 = prev_edge_info["h-dist"](u_prev_n0, u_prev_n2) else: u_n1 = np.random.rand(size) else: @@ -240,10 +240,9 @@ def unrollNodes(l): try: u_n0 = edge_info["hinv-dist"](u_n1, next_tree_info['sample'][(n0, n1)]) except: - u_n0 = edge_info["hinv-dist"](u_n1, next_tree_info['sample'][(n1, n0)]) - edge_sample = {n0: 1. - u_n0, n1: 1. - u_n1} - # matrixPairPlot(DataFrame(edge_sample), - # savefig="edge" + str(n0) + "_" + str(n1) + "sample.png") + # u_n0 = edge_info["hinv-dist"](u_n1, next_tree_info['sample'][(n1, n0)]) + raise RuntimeError("Edge with nodes: " + str((n0, n1)), " does not exist.") + edge_sample = {n0: u_n0, n1: u_n1} current_tree[n0][n1]['sample'] = edge_sample # If current tree is 0th tree: copy marginal sample to diff --git a/tests/test_C_vine_construct.py b/tests/test_C_vine_construct.py index e226284..d9ab37a 100644 --- a/tests/test_C_vine_construct.py +++ b/tests/test_C_vine_construct.py @@ -5,6 +5,7 @@ from starvine.vine.C_vine import Cvine from starvine.mvar.mv_plot import matrixPairPlot # extra imports +from scipy.stats import norm, beta import unittest import os import numpy as np @@ -46,8 +47,47 @@ def testCvineConstruct(self): tstVine.plotVine(savefig="c_vine_graph_ex.png") # sample from vine - samples = tstVine.sample(n=8000) - matrixPairPlot(samples, savefig="quad_varaite_resampled_ex.png") + c_vine_samples = tstVine.sample(n=8000) + matrixPairPlot(c_vine_samples, savefig="vine_resampled_ex.png") + + # check that the original data has same correlation coefficients as re-sampled + # data from the fitted c-vine + tst_rho_matrix = ranked_data.corr(method='pearson') + tst_ktau_matrix = ranked_data.corr(method='kendall') + sample_rho_matrix = c_vine_samples.corr(method='pearson') + sample_ktau_matrix = c_vine_samples.corr(method='kendall') + # sort by col labels + tst_rho_matrix = tst_rho_matrix.reindex(sorted(tst_rho_matrix.columns), axis=1) + tst_ktau_matrix = tst_ktau_matrix.reindex(sorted(tst_ktau_matrix.columns), axis=1) + sample_rho_matrix = sample_rho_matrix.reindex(sorted(sample_rho_matrix.columns), axis=1) + sample_ktau_matrix = sample_ktau_matrix.reindex(sorted(sample_ktau_matrix.columns), axis=1) + + print("Original data corr matrix:") + print(tst_rho_matrix) + print("Vine sample corr matrix:") + print(sample_rho_matrix) + print("Diff:") + print(tst_rho_matrix - sample_rho_matrix) + self.assertTrue(np.allclose(tst_rho_matrix - sample_rho_matrix, 0, atol=0.10)) + self.assertTrue(np.allclose(tst_ktau_matrix - sample_ktau_matrix, 0, atol=0.10)) + + # fit marginal distributions to original data + marginal_dict = {} + for col_name in tstData.columns: + marginal_dict[col_name] = beta(*beta.fit(tstData[col_name])) + # scale the samples + c_vine_scaled_samples_a = tstVine.scaleSamples(c_vine_samples, marginal_dict) + matrixPairPlot(c_vine_scaled_samples_a, savefig="vine_varaite_resampled_scaled_a.png") + + c_vine_scaled_samples_b = tstVine.sampleScale(8000, marginal_dict) + + # compute correlation coeffs + sample_scaled_rho_matrix_a = c_vine_scaled_samples_a.corr(method='pearson') + sample_scaled_rho_matrix_b = c_vine_scaled_samples_b.corr(method='pearson') + + # check for consistency + self.assertTrue(np.allclose(tst_rho_matrix - sample_scaled_rho_matrix_a, 0, atol=0.1)) + self.assertTrue(np.allclose(tst_rho_matrix - sample_scaled_rho_matrix_b, 0, atol=0.1)) if __name__ == "__main__": diff --git a/tests/test_weighted_reg.py b/tests/test_weighted_reg.py index ee8eb08..7b5b7ec 100644 --- a/tests/test_weighted_reg.py +++ b/tests/test_weighted_reg.py @@ -79,13 +79,6 @@ def testWgtCopula(self): data = pd.DataFrame([x_wt, y_wt]).T matrixPairPlot(data, savefig='x_gauss_weighted_fit.png') - def testWgtMargins(self): - """! - @brief Test the ability to construct a marginal PDF - given samples with unequal weights. - """ - pass - def testWgtResampledCopula(self): """! @brief Test ability to construct copula