From cf42a93841d9b9165f3b1cd78dd793aae4ef5cf9 Mon Sep 17 00:00:00 2001
From: Eneko Uruñuela
Date: Thu, 10 Feb 2022 15:31:03 +0100
Subject: [PATCH] Compute PCA with all possible components, then give the
 solution with the optimal number for the selected criterion (#49)

* Compute PCA with all possible components then select optimal number

Also share variance explained with all criteria.

* Updated docstring

* Fix linting error
---
 mapca/mapca.py | 130 +++++++++++++++++++++++++++++++++----------------
 1 file changed, 89 insertions(+), 41 deletions(-)

diff --git a/mapca/mapca.py b/mapca/mapca.py
index 6b17ed2..2350247 100644
--- a/mapca/mapca.py
+++ b/mapca/mapca.py
@@ -84,22 +84,32 @@ class MovingAveragePCA:
     n_features_ : int
         Number of features in the training data.
     n_samples_ : int
-        Number of samples in the training data.
-    noise_variance_ : float
-        The estimated noise covariance following the Probabilistic PCA model
-        from Tipping and Bishop 1999.
-        See “Pattern Recognition and Machine Learning” by C. Bishop, 12.2.1 p. 574
-        or http://www.miketipping.com/papers/met-mppca.pdf.
-        It is required to compute the estimated data covariance and score samples.
-
-        Equal to the average of (min(n_features, n_samples) - n_components) smallest
-        eigenvalues of the covariance matrix of X.
-    aic_ : :obj:`numpy.ndarray`, shape (n_components)
-        The Akaike Information Criterion optimization curve.
-    kic_ : :obj:`numpy.ndarray`, shape (n_components)
-        The Kullback-Leibler Information Criterion optimization curve.
-    mdl_ : :obj:`numpy.ndarray`, shape (n_components)
-        The Minimum Description Length optimization curve.
+        Number of samples in the training data.
+    aic_ : dict
+        Dictionary containing the Akaike Information Criterion results:
+        - 'n_components': The number of components chosen by the AIC criterion.
+        - 'value': The AIC curve values.
+        - 'explained_variance_total': The total explained variance of the components.
+    kic_ : dict
+        Dictionary containing the Kullback-Leibler Information Criterion results:
+        - 'n_components': The number of components chosen by the KIC criterion.
+        - 'value': The KIC curve values.
+        - 'explained_variance_total': The total explained variance of the components.
+    mdl_ : dict
+        Dictionary containing the Minimum Description Length results:
+        - 'n_components': The number of components chosen by the MDL criterion.
+        - 'value': The MDL curve values.
+        - 'explained_variance_total': The total explained variance of the components.
+    varexp_90_ : dict
+        Dictionary containing the 90% variance explained results:
+        - 'n_components': The number of components chosen by the 90% variance explained
+          criterion.
+        - 'explained_variance_total': The total explained variance of the components.
+    varexp_95_ : dict
+        Dictionary containing the 95% variance explained results:
+        - 'n_components': The number of components chosen by the 95% variance explained
+          criterion.
+        - 'explained_variance_total': The total explained variance of the components.
 
     References
     ----------
@@ -240,66 +250,104 @@ def _fit(self, img, mask):
         LGR.info("Estimating the dimensionality ...")
         p = n_timepoints
-        self.aic_ = np.zeros(p - 1)
-        self.kic_ = np.zeros(p - 1)
-        self.mdl_ = np.zeros(p - 1)
+        aic = np.zeros(p - 1)
+        kic = np.zeros(p - 1)
+        mdl = np.zeros(p - 1)
 
         for k_idx, k in enumerate(np.arange(1, p)):
             LH = np.log(np.prod(np.power(eigenvalues[k:], 1 / (p - k))) / np.mean(eigenvalues[k:]))
             mlh = 0.5 * N * (p - k) * LH
             df = 1 + 0.5 * k * (2 * p - k + 1)
-            self.aic_[k_idx] = (-2 * mlh) + (2 * df)
-            self.kic_[k_idx] = (-2 * mlh) + (3 * df)
-            self.mdl_[k_idx] = -mlh + (0.5 * df * np.log(N))
+            aic[k_idx] = (-2 * mlh) + (2 * df)
+            kic[k_idx] = (-2 * mlh) + (3 * df)
+            mdl[k_idx] = -mlh + (0.5 * df * np.log(N))
 
-        itc = np.row_stack([self.aic_, self.kic_, self.mdl_])
+        itc = np.row_stack([aic, kic, mdl])
         dlap = np.diff(itc, axis=1)
 
+        # Calculate optimal number of components with each criterion
         # AIC
         a_aic = np.where(dlap[0, :] > 0)[0] + 1
         if a_aic.size == 0:
-            self.n_aic_ = itc[0, :].shape[0]
+            n_aic = itc[0, :].shape[0]
         else:
-            self.n_aic_ = a_aic[0]
+            n_aic = a_aic[0]
 
         # KIC
         a_kic = np.where(dlap[1, :] > 0)[0] + 1
         if a_kic.size == 0:
-            self.n_kic_ = itc[1, :].shape[0]
+            n_kic = itc[1, :].shape[0]
         else:
-            self.n_kic_ = a_kic[0]
+            n_kic = a_kic[0]
 
         # MDL
         a_mdl = np.where(dlap[2, :] > 0)[0] + 1
         if a_mdl.size == 0:
-            self.n_mdl_ = itc[2, :].shape[0]
+            n_mdl = itc[2, :].shape[0]
        else:
-            self.n_mdl_ = a_mdl[0]
+            n_mdl = a_mdl[0]
 
         if self.criterion == "aic":
-            n_components = self.n_aic_
+            n_components = n_aic
         elif self.criterion == "kic":
-            n_components = self.n_kic_
+            n_components = n_kic
         elif self.criterion == "mdl":
-            n_components = self.n_mdl_
+            n_components = n_mdl
 
-        LGR.info("Estimated number of components is %d" % n_components)
+        LGR.info("Performing PCA")
 
-        # PCA with estimated number of components
-        ppca = PCA(n_components=n_components, svd_solver="full", copy=False, whiten=False)
+        # PCA with all possible components (the estimated selection is made after)
+        ppca = PCA(n_components=None, svd_solver="full", copy=False, whiten=False)
         ppca.fit(X)
 
+        # Get cumulative explained variance as components are added
+        cumsum_varexp = np.cumsum(ppca.explained_variance_ratio_)
+
+        # Calculate number of components for 90% varexp
+        n_comp_varexp_90 = np.where(cumsum_varexp >= 0.9)[0][0] + 1
+
+        # Calculate number of components for 95% varexp
+        n_comp_varexp_95 = np.where(cumsum_varexp >= 0.95)[0][0] + 1
+
+        LGR.info("Estimated number of components is %d" % n_components)
+
+        # Save results of each criterion into dictionaries
+        self.aic_ = {
+            "n_components": n_aic,
+            "value": aic,
+            "explained_variance_total": cumsum_varexp[n_aic - 1],
+        }
+        self.kic_ = {
+            "n_components": n_kic,
+            "value": kic,
+            "explained_variance_total": cumsum_varexp[n_kic - 1],
+        }
+        self.mdl_ = {
+            "n_components": n_mdl,
+            "value": mdl,
+            "explained_variance_total": cumsum_varexp[n_mdl - 1],
+        }
+        self.varexp_90_ = {
+            "n_components": n_comp_varexp_90,
+            "explained_variance_total": cumsum_varexp[n_comp_varexp_90 - 1],
+        }
+        self.varexp_95_ = {
+            "n_components": n_comp_varexp_95,
+            "explained_variance_total": cumsum_varexp[n_comp_varexp_95 - 1],
+        }
+
         # Assign attributes from model
-        self.components_ = ppca.components_
-        self.explained_variance_ = ppca.explained_variance_
-        self.explained_variance_ratio_ = ppca.explained_variance_ratio_
-        self.singular_values_ = ppca.singular_values_
+        self.components_ = ppca.components_[:n_components, :]
+        self.explained_variance_ = ppca.explained_variance_[:n_components]
+        self.explained_variance_ratio_ = ppca.explained_variance_ratio_[:n_components]
+        self.singular_values_ = ppca.singular_values_[:n_components]
         self.mean_ = ppca.mean_
-        self.n_components_ = ppca.n_components_
+        self.n_components_ = n_components
         self.n_features_ = ppca.n_features_
         self.n_samples_ = ppca.n_samples_
-        self.noise_variance_ = ppca.noise_variance_
+        # Commenting out noise variance as it depends on the covariance of the estimation
+        # self.noise_variance_ = ppca.noise_variance_
 
         component_maps = np.dot(
             np.dot(X, self.components_.T), np.diag(1.0 / self.explained_variance_)
         )
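
As a rough illustration of how the new dictionary attributes can be consumed downstream, here is a minimal sketch. It assumes the public fit() method mirrors _fit(img, mask) and accepts nibabel images, that MovingAveragePCA is importable from the package root, and that the constructor exposes the criterion setting used in _fit as a keyword argument; the file names are placeholders.

# Minimal sketch, not part of the patch; file paths and the fit() call are assumptions.
import nibabel as nib

from mapca import MovingAveragePCA

func_img = nib.load("func.nii.gz")  # hypothetical 4D functional image
mask_img = nib.load("mask.nii.gz")  # hypothetical 3D brain mask

# The criterion only controls which solution is kept in components_ and friends
pca = MovingAveragePCA(criterion="mdl")
pca.fit(func_img, mask_img)

# Every criterion is now reported, regardless of which one drove the selection
for name, result in (("AIC", pca.aic_), ("KIC", pca.kic_), ("MDL", pca.mdl_)):
    print(
        f"{name}: {result['n_components']} components, "
        f"{result['explained_variance_total']:.3f} total variance explained"
    )
print("90% varexp:", pca.varexp_90_["n_components"])
print("95% varexp:", pca.varexp_95_["n_components"])

Because the PCA is now fit once with all possible components, the per-criterion component counts and their cumulative explained variance come from the same decomposition; the selected criterion only determines where components_, explained_variance_, explained_variance_ratio_, singular_values_ and n_components_ are truncated.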