diff --git a/src/gglasso/helper/data_generation.py b/src/gglasso/helper/data_generation.py
index cb52b5e..0c5786c 100644
--- a/src/gglasso/helper/data_generation.py
+++ b/src/gglasso/helper/data_generation.py
@@ -11,8 +11,7 @@
 
 from .basic_linalg import trp
 
-
-def generate_precision_matrix(p=100, M=10, style = 'powerlaw', gamma = 2.8, prob = 0.1, scale = False, seed = None):
+def generate_precision_matrix(p=100, M=10, style='powerlaw', gamma=2.8, prob=0.1, scale=False, seed=None):
     """
     Generates a sparse precision matrix with associated covariance matrix from a random network.
     
@@ -52,8 +51,8 @@ def generate_precision_matrix(p=100, M=10, style = 'powerlaw', gamma = 2.8, prob
     L = int(p/M)
     assert M*L == p
     
-    A = np.zeros((p,p))
-    Sigma = np.zeros((p,p))
+    A = np.zeros((p, p))
+    Sigma = np.zeros((p, p))
     
     if seed is not None:
         nxseed = seed
@@ -66,9 +65,9 @@ def generate_precision_matrix(p=100, M=10, style = 'powerlaw', gamma = 2.8, prob
             nxseed = int(nxseed +m)
         
         if style == 'powerlaw':
-            G_m = nx.generators.random_graphs.random_powerlaw_tree(n=L, gamma=gamma, tries=max(5*p,1000), seed = nxseed)
+            G_m = nx.generators.random_graphs.random_powerlaw_tree(n=L, gamma=gamma, tries=max(5*p,1000), seed=nxseed)
         elif style == 'erdos':
-            G_m = nx.generators.random_graphs.erdos_renyi_graph(n=L , p=prob, seed=nxseed, directed=False)
+            G_m = nx.generators.random_graphs.erdos_renyi_graph(n=L, p=prob, seed=nxseed, directed=False)
         else:
             raise ValueError(f"{style} is not a valid choice for the network generation.")
         A_m = nx.to_numpy_array(G_m)
@@ -79,15 +78,11 @@ def generate_precision_matrix(p=100, M=10, style = 'powerlaw', gamma = 2.8, prob
         else:
             rng = np.random.default_rng(np.random.randint(low=11111, high=99999))
 
-        B1 = rng.uniform(low = .1, high = .4, size = (L,L))
-        B2 = rng.choice(a = [-1,1], p=[.5, .5], size = (L,L))
+        B1 = rng.uniform(low=.1, high=.4, size=(L,L))
+        B2 = rng.choice(a=[-1,1], p=[.5, .5], size=(L,L))
         
         A_m = A_m * (B1*B2)
         
-        # only use upper triangle and symmetrize
-        #A_m = np.triu(A_m)
-        #A_m = .5 * (A_m + A_m.T)
-        
         A[m*L:(m+1)*L, m*L:(m+1)*L] = A_m
     
     row_sum_od = 1.5 * abs(A).sum(axis = 1) + 1e-10
@@ -105,15 +100,12 @@ def generate_precision_matrix(p=100, M=10, style = 'powerlaw', gamma = 2.8, prob
     if D.min() < 1e-8:
         A += (0.1+abs(D.min())) * np.eye(p)    
         
-    #D = np.linalg.eigvalsh(A)
-    #assert D.min() > 0, f"generated matrix A is not positive definite, min EV is {D.min()}"
-    
-    Ainv = np.linalg.pinv(A, hermitian = True)
+    Ainv = np.linalg.pinv(A, hermitian=True)
     
     # scale by inverse of diagonal and 0.6*1/sqrt(d_ii*d_jj) on off-diag
     if scale:
         d = np.diag(Ainv)
-        scale_mat = np.tile(np.sqrt(d),(Ainv.shape[0],1))
+        scale_mat = np.tile(np.sqrt(d),(Ainv.shape[0], 1))
         scale_mat = (1/0.6)*(scale_mat.T * scale_mat)
         np.fill_diagonal(scale_mat, d)
 
@@ -144,7 +136,7 @@ def time_varying_power_network(p=100, K=10, M=10, scale = False, seed = None):
     assert M*L == p
     assert M >=3
     
-    Sigma_0,_ = generate_precision_matrix(p = p, M = M, style = 'powerlaw', scale = scale, seed = seed) 
+    Sigma_0,_ = generate_precision_matrix(p=p, M=M, style='powerlaw', scale=scale, seed=seed) 
     
     for k in np.arange(K):
         Sigma_k = Sigma_0.copy()
@@ -156,10 +148,10 @@ def time_varying_power_network(p=100, K=10, M=10, scale = False, seed = None):
                   
         Sigma[k,:,:] = Sigma_k
         
-    Theta = np.linalg.pinv(Sigma, hermitian = True)
+    Theta = np.linalg.pinv(Sigma, hermitian=True)
     
     decay = np.exp(-.5 * np.arange(K)) 
-    helper = np.ones((K,L,L)) * decay[:,None,None]
+    helper = np.ones((K, L, L)) * decay[:, None, None]
     for k in np.arange(K):
         np.fill_diagonal(helper[k,:,:], 1)
 
@@ -169,7 +161,7 @@ def time_varying_power_network(p=100, K=10, M=10, scale = False, seed = None):
     
     return Sigma, Theta
     
-def group_power_network(p=100, K=10, M=10, scale = False, seed = None):
+def group_power_network(p=100, K=10, M=10, scale=False, seed=None):
     """
     generates a power law network. In each single network one block disappears (randomly)
     p: dimension
@@ -181,14 +173,14 @@ def group_power_network(p=100, K=10, M=10, scale = False, seed = None):
     L = int(p/M)
     assert M*L == p
     
-    Sigma_0,_ = generate_precision_matrix(p = p, M = M, style = 'powerlaw', scale = scale, seed = seed)
+    Sigma_0,_ = generate_precision_matrix(p=p, M=M, style='powerlaw', scale=scale, seed=seed)
     # contains the number of the block disappearing for each k=1,..,K
     if seed is not None:
         rng = np.random.default_rng(seed)
     else:
         rng = np.random.default_rng(np.random.randint(low=11111, high=99999))
 
-    block = rng.integers(M, size = K)
+    block = rng.integers(M, size=K)
     
     for k in np.arange(K):    
         Sigma_k = Sigma_0.copy()           
@@ -197,7 +189,7 @@ def group_power_network(p=100, K=10, M=10, scale = False, seed = None):
         
         Sigma[k,:,:] = Sigma_k
             
-    Theta = np.linalg.pinv(Sigma, hermitian = True)
+    Theta = np.linalg.pinv(Sigma, hermitian=True)
     Sigma, Theta = ensure_sparsity(Sigma, Theta)
     
     return Sigma, Theta    
@@ -209,12 +201,12 @@ def ensure_sparsity(Sigma, Theta):
     D = np.linalg.eigvalsh(Theta)
     assert D.min() > 0, "generated matrix Theta is not positive definite"
     
-    Sigma = np.linalg.pinv(Theta, hermitian = True)
+    Sigma = np.linalg.pinv(Theta, hermitian=True)
     
     return Sigma, Theta
 
     
-def sample_covariance_matrix(Sigma, N, seed = None):
+def sample_covariance_matrix(Sigma, N, seed=None):
     """
     samples data for a given covariance matrix Sigma (with K layers)
     return: sample covariance matrix S
@@ -226,7 +218,7 @@ def sample_covariance_matrix(Sigma, N, seed = None):
         (p,p) = Sigma.shape
         
         sample = rng.multivariate_normal(np.zeros(p), Sigma, N).T
-        S = np.cov(sample, bias = True)
+        S = np.cov(sample, bias=True)
         
     else:
         assert abs(Sigma - trp(Sigma)).max() <= 1e-10
@@ -239,7 +231,7 @@ def sample_covariance_matrix(Sigma, N, seed = None):
         S = np.zeros((K,p,p))
         for k in np.arange(K):
             # normalize with N --> bias = True
-            S[k,:,:] = np.cov(sample[k,:,:], bias = True)
+            S[k,:,:] = np.cov(sample[k,:,:], bias=True)
             
     return S,sample
 
diff --git a/src/gglasso/helper/experiment_helper.py b/src/gglasso/helper/experiment_helper.py
index a6c79ae..ae84ec1 100644
--- a/src/gglasso/helper/experiment_helper.py
+++ b/src/gglasso/helper/experiment_helper.py
@@ -199,7 +199,6 @@ def plot_runtime(iA, iP, vecN, save = False):
             p1 = ax.plot(iA[j]['residual'], c = color_dict['ADMM'], label = 'ADMM residual')
             p2 = ax.plot(iP[j]['residual'], c = color_dict['PPDNA'], marker = 'o', markersize = 3, label = 'PPDNA residual')
             
-            #ax.tick_params(axis='both', which='major', labelsize=7)
             ax.set_yscale('log')
             ax.set_xscale('log')
             ax.set_ylim(1e-6,0.2)
@@ -213,9 +212,6 @@ def plot_runtime(iA, iP, vecN, save = False):
             ax.vlines(iP[j]['iter_admm']-1, 0, 0.2, 'grey')
             ax.set_xlim(max(iP[j]['iter_admm'] - 5,1), )
             
-            #ax.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())
-            #ax2.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())
-            
             if j in [0,2]:
                 ax.set_ylabel('KKT residual')
             if j in [1,3]:
@@ -377,8 +373,6 @@ def single_surface_plot(L1, L2, C, ax, name = 'eBIC'):
     
     ax.set_xlabel(r'$\lambda_1$', fontsize = 14)
     ax.set_ylabel(r'$\lambda_2$', fontsize = 14)
-    #ax.set_xlabel(r'$w_1$', fontsize = 14)
-    #ax.set_ylabel(r'$w_2$', fontsize = 14)
     ax.set_zlabel(name, fontsize = 14)
     ax.view_init(elev = 18, azim = 51)
     
diff --git a/src/gglasso/helper/model_selection.py b/src/gglasso/helper/model_selection.py
index fab2a98..b385332 100644
--- a/src/gglasso/helper/model_selection.py
+++ b/src/gglasso/helper/model_selection.py
@@ -3,6 +3,7 @@
 """
 
 import numpy as np
+from typing import Optional, Union
 
 from .basic_linalg import Sdot
 from .utils import mean_sparsity, sparsity
@@ -17,18 +18,23 @@
 TAU_MIN = 1e-12
 N_TAU = 20
       
-def lambda_parametrizer(l1 = 0.05, w2 = 0.5):
+def lambda_parametrizer(l1: Union[float, np.ndarray]=0.05,
+                        w2: Union[float, np.ndarray]=0.5
+    ):
     """transforms given l1 and w2 into the respective l2"""
     a = 1/np.sqrt(2)
     l2 = (w2*l1)/(a*(1-w2))
     return l2
 
-def map_l_to_w(l1, l2):
+def map_l_to_w(l1: float, l2: float):
     w1 = l1 + (1/np.sqrt(2)) *l2
     w2 = l2/(w1*np.sqrt(2))   
     return (w1,w2)
     
-def lambda_grid(l1, l2 = None, w2 = None):
+def lambda_grid(l1: np.ndarray,
+                l2: Optional[np.ndarray]=None,
+                w2: Optional[np.ndarray]=None,
+    ):
     """
     l1, l2, w2: values for the grid
     either l2 or w2 has to be specified
@@ -38,16 +44,33 @@ def lambda_grid(l1, l2 = None, w2 = None):
     assert np.all(l2 is not None) | np.all(w2 is not None), "Either a range of lambda2 or w2 values have to be specified"
         
     if np.all(l2 is not None):
-        L1, L2 = np.meshgrid(l1,l2)
+        L1, L2 = np.meshgrid(l1, l2)
     else:
-        l1grid, w2grid = np.meshgrid(l1,w2)
+        l1grid, w2grid = np.meshgrid(l1, w2)
         L2 = lambda_parametrizer(l1grid, w2grid)
         L1 = l1grid.copy()
         
     return L1.squeeze(), L2.squeeze()
 
-def grid_search(solver, S, N, p, reg, l1, l2 = None, w2 = None, method= 'eBIC', gamma = 0.3, \
-                G = None, latent = False, mu_range = None, ix_mu = None, thresholding = False, tol = 1e-7, rtol = 1e-7, verbose = False):
+def grid_search(solver: str,
+                S: Union[np.ndarray, dict],
+                N: np.ndarray,
+                p: Union[np.ndarray, int],
+                reg: str,
+                l1: np.ndarray,
+                l2: Optional[np.ndarray]=None,
+                w2: Optional[np.ndarray]=None,
+                method: str='eBIC',
+                gamma: float=0.3,
+                G: Optional[np.ndarray]=None,
+                latent: bool=False,
+                mu_range: Optional[np.ndarray]=None,
+                ix_mu: Optional[np.ndarray]=None,
+                thresholding: bool=False,
+                tol: float=1e-7,
+                rtol: float=1e-7,
+                verbose: bool=False
+    ):
     """
     method for doing model selection for MGL problems using grid search and AIC/eBIC
     parameters to select: lambda1 (sparsity), lambda2 (group sparsity or total variation)
@@ -111,11 +134,13 @@ def grid_search(solver, S, N, p, reg, l1, l2 = None, w2 = None, method= 'eBIC',
     assert method in ['AIC', 'eBIC']
     assert reg in ['FGL', 'GGL']
     
-    if type(S) == dict:
+    if isinstance(S, dict):
         K = len(S.keys())
-    elif type(S) == np.ndarray:
+    elif isinstance(S, np.ndarray):
         K = S.shape[0]
-        
+    else:
+        raise Exception("S must be specified either as array or dict.")
+    
     assert len(N) == K, f"N must be given as array, is given as {N}."
     
     if latent:
@@ -128,7 +153,8 @@ def grid_search(solver, S, N, p, reg, l1, l2 = None, w2 = None, method= 'eBIC',
         print(L1)
         print(L2)
     
-    grid1 = L1.shape[0]; grid2 = L2.shape[1]
+    grid1 = L1.shape[0]
+    grid2 = L2.shape[1]
     AIC = np.nan*np.zeros((grid1, grid2))
     
     # use default gammas plus the one for eBIC selection
@@ -138,11 +164,11 @@ def grid_search(solver, S, N, p, reg, l1, l2 = None, w2 = None, method= 'eBIC',
      
     BIC = dict()
     for g in gammas:
-        BIC[g] = np.nan*np.zeros((grid1, grid2))
+        BIC[g] = np.nan * np.zeros((grid1, grid2))
         
-    SP = np.nan*np.zeros((grid1, grid2))
+    SP = np.nan * np.zeros((grid1, grid2))
     #SKIP = np.zeros((grid1, grid2), dtype = bool)
-    RANK = np.nan*np.zeros((K, grid1, grid2))
+    RANK = np.nan * np.zeros((K, grid1, grid2))
     
     if thresholding:
         TAU = np.zeros((K, grid1, grid2))
@@ -150,16 +176,22 @@ def grid_search(solver, S, N, p, reg, l1, l2 = None, w2 = None, method= 'eBIC',
         TAU = None
     
     # solver kwargs
-    kwargs = {'reg': reg, 'S': S, 'tol': tol, 'rtol': rtol, 'verbose': False, 'measure': False}
-    if type(S) == dict:
+    kwargs = {'reg': reg,
+              'S': S,
+              'tol': tol,
+              'rtol': rtol,
+              'verbose': False,
+              'measure': False
+    }
+    
+    if isinstance(S, dict):
         Omega_0 = id_dict(p)
         kwargs['G'] = G
-    elif type(S) == np.ndarray:
-        Omega_0 = id_array(K,p)
+    elif isinstance(S, np.ndarray):
+        Omega_0 = id_array(K, p)
         
     kwargs['Omega_0'] = Omega_0.copy()
     
-    
     curr_min = np.inf
     curr_best = None
     
@@ -185,7 +217,6 @@ def grid_search(solver, S, N, p, reg, l1, l2 = None, w2 = None, method= 'eBIC',
                 this_mu = mu_range[ix_mu[:,g2]]
                 kwargs['latent'] = True
                 kwargs['mu1'] = this_mu.copy()
-                #print("MU values", kwargs['mu1'])
                 
             # solve
             sol, info = solver(**kwargs)
@@ -195,55 +226,70 @@ def grid_search(solver, S, N, p, reg, l1, l2 = None, w2 = None, method= 'eBIC',
             # thresholding
             if thresholding:
                 # store also the best solution without solution
-                _no_thr_this_score = ebic(S, sol['Theta'], N, gamma = gamma)
+                _no_thr_this_score = ebic(S, sol['Theta'], N, gamma=gamma)
                 if  _no_thr_this_score < _no_thr_curr_min:
-                    _no_thr_best_params = {'lambda1': L1[g1,g2], 'lambda2':L2[g1,g2]}
+                    _no_thr_best_params = {'lambda1': L1[g1,g2],
+                                           'lambda2': L2[g1,g2]
+                    }
                     _no_thr_curr_min = _no_thr_this_score
                     _no_thr_curr_best = sol.copy()
                 
                 # now tune threshold
-                sol['Theta'], opt_tau, _ = tune_multiple_threshold(sol['Theta'], S, N, tau_range = None,\
-                                                                   method = method, gamma = gamma)
-                TAU[:,g1,g2] = opt_tau
+                sol['Theta'], opt_tau, _ = tune_multiple_threshold(sol['Theta'],
+                                                                   S,
+                                                                   N,
+                                                                   tau_range=None,
+                                                                   method=method,
+                                                                   gamma=gamma
+                )
+                TAU[:, g1, g2] = opt_tau
             
             #########################################         
             # store diagnostics
-            AIC[g1,g2] = aic(S, sol['Theta'], N)
+            AIC[g1, g2] = aic(S, sol['Theta'], N)
             for g in gammas:
-                BIC[g][g1,g2] = ebic(S, sol['Theta'], N, gamma = g)
+                BIC[g][g1, g2] = ebic(S, sol['Theta'], N, gamma=g)
             
             if latent:
-                RANK[:,g1,g2] = [np.linalg.matrix_rank(sol['L'][k]) for k in np.arange(K)]
+                RANK[:, g1, g2] = [np.linalg.matrix_rank(sol['L'][k]) for k in np.arange(K)]
             
-            SP[g1,g2] = mean_sparsity(sol['Theta'])
+            SP[g1, g2] = mean_sparsity(sol['Theta'])
                         
             # new best point found
             if method == 'eBIC':
-                if BIC[gamma][g1,g2] < curr_min:
-                    curr_min = BIC[gamma][g1,g2]
+                if BIC[gamma][g1, g2] < curr_min:
+                    curr_min = BIC[gamma][g1, g2]
                     curr_best = sol.copy()
             elif method == 'AIC':
-                 if AIC[g1,g2] < curr_min:
-                    curr_min = AIC[g1,g2]
+                 if AIC[g1, g2] < curr_min:
+                    curr_min = AIC[g1, g2]
                     curr_best = sol.copy()
         
             if verbose:
-                print(f"Grid point: (l1,l2): {(L1[g1,g2],L2[g1,g2])}, sparsity: {np.round(SP[g1,g2],3)}, best score: {np.round(curr_min,1)}")
+                print(f"Grid point: (l1,l2): {(L1[g1, g2],L2[g1, g2])}, sparsity: {np.round(SP[g1, g2],3)}, best score: {np.round(curr_min, 1)}")
             
     # get optimal lambda
     if method == 'AIC':
         AIC[AIC==-np.inf] = np.nan
-        ix= np.unravel_index(np.nanargmin(AIC), AIC.shape)   
+        ix = np.unravel_index(np.nanargmin(AIC), AIC.shape)   
     elif method == 'eBIC':    
         for g in gammas:
             BIC[g][BIC[g]==-np.inf] = np.nan            
-        ix= np.unravel_index(np.nanargmin(BIC[gamma]), BIC[gamma].shape)
+        ix = np.unravel_index(np.nanargmin(BIC[gamma]), BIC[gamma].shape)
     
     if verbose:
-        print(f"Best regularization parameters: (l1,l2): {(L1[ix],L2[ix])}")
+        print(f"Best regularization parameters: (l1,l2): {(L1[ix], L2[ix])}")
         
-    stats = {'BIC': BIC, 'AIC': AIC, 'SP': SP, 'RANK': RANK, 'TAU': TAU, 'L1': L1, 'L2': L2, \
-             'BEST': {'lambda1': L1[ix], 'lambda2': L2[ix]}, 'GAMMA': gammas}
+    stats = {'BIC': BIC,
+             'AIC': AIC,
+             'SP': SP,
+             'RANK': RANK,
+             'TAU': TAU,
+             'L1': L1,
+             'L2': L2,
+             'BEST': {'lambda1': L1[ix], 'lambda2': L2[ix]},
+             'GAMMA': gammas
+    }
     
     if thresholding:
         stats['NO_THRESHOLDING_SOL'] = _no_thr_curr_best
@@ -251,8 +297,19 @@ def grid_search(solver, S, N, p, reg, l1, l2 = None, w2 = None, method= 'eBIC',
     
     return stats, ix, curr_best
 
-def K_single_grid(S, lambda_range, N, method = 'eBIC', gamma = 0.3, latent = False, mu_range = None,\
-                  thresholding = False, use_block = True, store_all = True, tol = 1e-7, rtol = 1e-7):
+def K_single_grid(S: Union[np.ndarray, dict],
+                  lambda_range: np.ndarray,
+                  N: np.ndarray,
+                  method: str='eBIC',
+                  gamma: float=0.3,
+                  latent: bool=False,
+                  mu_range: Optional[np.ndarray]=None,
+                  thresholding: bool=False,
+                  use_block: bool=True,
+                  store_all: bool=True,
+                  tol: float=1e-7,
+                  rtol: float=1e-7
+    ):
     """
     method for doing model selection for K single Graphical Lasso problems, using grid search and AIC/eBIC
     parameters to select: lambda1 (sparsity), mu1 (lowrank, if latent=True)
@@ -303,11 +360,13 @@ def K_single_grid(S, lambda_range, N, method = 'eBIC', gamma = 0.3, latent = Fal
     """
     assert method in ['AIC', 'eBIC']
     
-    if type(S) == dict:
+    if isinstance(S, dict):
         K = len(S.keys())
-    elif type(S) == np.ndarray:
+    elif isinstance(S, np.ndarray):
         K = S.shape[0]
-        
+    else:
+        raise Exception("S must be specified either as array or dict.")
+       
     assert len(N) == K, f"N must be given as array, is given as {N}."
         
     if latent:
@@ -328,11 +387,11 @@ def K_single_grid(S, lambda_range, N, method = 'eBIC', gamma = 0.3, latent = Fal
     
     BIC = dict()
     for g in gammas:
-        BIC[g] = np.nan*np.zeros((K,_L,_M))
+        BIC[g] = np.nan * np.zeros((K, _L, _M))
         
-    AIC = np.nan*np.zeros((K,_L,_M))
-    SP = np.nan*np.zeros((K,_L,_M))
-    RANK = np.zeros((K,_L,_M))
+    AIC = np.nan * np.zeros((K, _L, _M))
+    SP = np.nan * np.zeros((K, _L, _M))
+    RANK = np.zeros((K, _L, _M))
     
     estimates = dict()
     lowrank = dict()
@@ -347,14 +406,24 @@ def K_single_grid(S, lambda_range, N, method = 'eBIC', gamma = 0.3, latent = Fal
     for k in np.arange(K):
         print(f"------------Range search for instance {k}------------")
         
-        if type(S) == dict:
+        if isinstance(S, dict):
             S_k = S[k].copy()    
-        elif type(S) == np.ndarray:
+        elif isinstance(S, np.ndarray):
             S_k = S[k,:,:].copy()
         
-        best, est_k, lr_k, stats_k = single_grid_search(S = S_k, lambda_range = lambda_range, N = N[k], method = method, gamma = gamma, \
-                                                        latent = latent, mu_range = mu_range, thresholding = thresholding,\
-                                                        use_block = use_block, store_all = store_all, tol = tol, rtol = rtol)
+        best, est_k, lr_k, stats_k = single_grid_search(S=S_k,
+                                                        lambda_range=lambda_range,
+                                                        N=N[k],
+                                                        method=method,
+                                                        gamma=gamma,
+                                                        latent=latent,
+                                                        mu_range=mu_range,
+                                                        thresholding=thresholding,
+                                                        use_block=use_block,
+                                                        store_all=store_all,
+                                                        tol=tol,
+                                                        rtol=rtol
+        )
         
         #store best individual estimator
         est_indv['Theta'][k] = best['Theta'].copy() 
@@ -367,33 +436,34 @@ def K_single_grid(S, lambda_range, N, method = 'eBIC', gamma = 0.3, latent = Fal
         
         for g in gammas:
             BIC[g][k,:,:] = stats_k['BIC'][g].copy()
+
         AIC[k,:,:] = stats_k['AIC'].copy()
         SP[k,:,:] = stats_k['SP'].copy()
         RANK[k,:,:] = stats_k['RANK'].copy()
     
     ###########################################
     # get optimal low rank for each lambda
-    tmpSCORE = np.zeros((K,_L))
+    tmpSCORE = np.zeros((K, _L))
     tmpSCORE[:] = np.nan
 
-    ix_mu = np.zeros((K,_L), dtype = int)
+    ix_mu = np.zeros((K, _L), dtype=int)
     # for each lambda, get optimal mu
     for k in np.arange(K):
         for j in np.arange(_L):       
             if method == 'AIC':
-                ix_mu[k,j] = np.nanargmin(AIC[k,j,:])     
-                tmpSCORE[k,j] = AIC[k,j,ix_mu[k,j]]
+                ix_mu[k, j] = np.nanargmin(AIC[k,j,:])     
+                tmpSCORE[k, j] = AIC[k, j, ix_mu[k,j]]
             elif method == 'eBIC':
-                ix_mu[k,j] = np.nanargmin(BIC[gamma][k,j,:])
-                tmpSCORE[k,j] = BIC[gamma][k,j,ix_mu[k,j]]
+                ix_mu[k, j] = np.nanargmin(BIC[gamma][k,j,:])
+                tmpSCORE[k, j] = BIC[gamma][k, j, ix_mu[k,j]]
     
     # get optimal lambda (uniform over k=1,..,K and individual)
     tmpSCORE[tmpSCORE==-np.inf] = np.nan
     ix_uniform = np.nanargmin(tmpSCORE.sum(axis=0))
-    ix_indv = np.nanargmin(tmpSCORE, axis = 1)
+    ix_indv = np.nanargmin(tmpSCORE, axis=1)
          
     # stack in case of array
-    if type(S) == np.ndarray:
+    if isinstance(S, np.ndarray):
         est_indv['Theta'] = np.stack([e for e in est_indv['Theta'].values()])
         if latent:
             est_indv['L'] = np.stack([e for e in est_indv['L'].values()])
@@ -407,28 +477,45 @@ def K_single_grid(S, lambda_range, N, method = 'eBIC', gamma = 0.3, latent = Fal
             est_uniform['L'] = dict()
         
         for k in np.arange(K):  
-            est_uniform['Theta'][k] = estimates[k][ix_uniform, ix_mu[k,ix_uniform] , :,:]           
+            est_uniform['Theta'][k] = estimates[k][ix_uniform, ix_mu[k, ix_uniform],:,:]           
             if latent:
-                est_uniform['L'][k] = lowrank[k][ix_uniform, ix_mu[k,ix_uniform] , :,:]
+                est_uniform['L'][k] = lowrank[k][ix_uniform, ix_mu[k, ix_uniform],:,:]
                    
-        if type(S) == np.ndarray:
+        if isinstance(S, np.ndarray):
             est_uniform['Theta'] = np.stack([e for e in est_uniform['Theta'].values()])
             if latent:
                 est_uniform['L'] = np.stack([e for e in est_uniform['L'].values()])
     else:
         est_uniform = None
         
-    statistics = {'BIC': BIC[gamma], 'AIC': AIC, 'SP': SP, 'RANK': RANK, \
-                  'LAMB': LAMB, 'MU': MU,\
-                  'ix_uniform': ix_uniform, 'ix_indv': ix_indv, 'ix_mu': ix_mu}
-    
-            
+    statistics = {'BIC': BIC[gamma],
+                  'AIC': AIC,
+                  'SP': SP,
+                  'RANK': RANK,
+                  'LAMB': LAMB,
+                  'MU': MU,
+                  'ix_uniform': ix_uniform,
+                  'ix_indv': ix_indv,
+                  'ix_mu': ix_mu
+    }
+      
     return est_uniform, est_indv, statistics
 
 
-def single_grid_search(S, lambda_range, N, method = 'eBIC', gamma = 0.3, latent = False, mu_range = None,\
-                       thresholding = False, use_block = True, store_all = True, tol = 1e-7, rtol = 1e-7,
-                       lambda1_mask = None):
+def single_grid_search(S: np.ndarray,
+                       lambda_range: np.ndarray,
+                       N: int,
+                       method: str='eBIC',
+                       gamma: float=0.3,
+                       latent: bool=False,
+                       mu_range: np.ndarray=None,
+                       thresholding:  bool=False,
+                       use_block: bool=True,
+                       store_all: bool=True,
+                       tol: float=1e-7,
+                       rtol: float=1e-7,
+                       lambda1_mask: Optional[np.ndarray]=None
+    ):
     """
     method for model selection for SGL problem, doing grid search and selection via eBIC or AIC
 
@@ -497,24 +584,30 @@ def single_grid_search(S, lambda_range, N, method = 'eBIC', gamma = 0.3, latent
     for g in gammas:
         BIC[g] = np.nan*np.zeros((_L, _M))
         
-    AIC = np.nan*np.zeros((_L, _M))
-    SP = np.nan*np.zeros((_L, _M))
-    RANK = np.zeros((_L,_M))
+    AIC = np.nan * np.zeros((_L, _M))
+    SP = np.nan * np.zeros((_L, _M))
+    RANK = np.zeros((_L, _M))
     
     if thresholding:
         TAU = np.zeros((_L, _M))
     else:
         TAU = None
     
-    kwargs = {'S': S, 'Omega_0': np.eye(p), 'X_0': np.eye(p), 'tol': tol, 'rtol': rtol,\
-              'verbose': False, 'measure': False}
+    kwargs = {'S': S,
+              'Omega_0': np.eye(p),
+              'X_0': np.eye(p),
+              'tol': tol,
+              'rtol': rtol,
+              'verbose': False,
+              'measure': False
+    }
         
     if lambda1_mask is not None:
         kwargs['lambda1_mask'] = lambda1_mask
     
     if store_all:
-        estimates = np.zeros((_L,_M,p,p))
-        lowrank = np.zeros((_L,_M,p,p))
+        estimates = np.zeros((_L, _M, p, p))
+        lowrank = np.zeros((_L, _M, p, p))
     else:
         estimates = None
         lowrank = None
@@ -542,17 +635,22 @@ def single_grid_search(S, lambda_range, N, method = 'eBIC', gamma = 0.3, latent
             if latent:
                 if store_all:
                     lowrank[j,m,:,:] = sol['L'].copy()
-                RANK[j,m] = np.linalg.matrix_rank(sol['L'])
+                RANK[j, m] = np.linalg.matrix_rank(sol['L'])
             
             # tune optimal threshold, changes sol['Theta']
             if thresholding:
-                sol['Theta'], opt_tau, _ = tune_threshold(sol['Theta'], S, N,\
-                                                          tau_range = None, method = method, gamma = gamma)
-                TAU[j,m] = opt_tau
+                sol['Theta'], opt_tau, _ = tune_threshold(sol['Theta'],
+                                                          S,
+                                                          N,
+                                                          tau_range=None,
+                                                          method=method,
+                                                          gamma=gamma
+                )
+                TAU[j, m] = opt_tau
              
-            AIC[j,m] = aic_single(S, sol['Theta'], N)
+            AIC[j, m] = aic_single(S, sol['Theta'], N)
             for g in gammas:
-                BIC[g][j, m] = ebic_single(S, sol['Theta'], N, gamma = g, lambda1_mask = lambda1_mask)
+                BIC[g][j, m] = ebic_single(S, sol['Theta'], N, gamma=g, lambda1_mask=lambda1_mask)
             
             SP[j,m] = sparsity(sol['Theta'])
             
@@ -561,12 +659,12 @@ def single_grid_search(S, lambda_range, N, method = 'eBIC', gamma = 0.3, latent
             
             # new best point found
             if method == 'eBIC':
-                if BIC[gamma][j,m] < curr_min:
-                    curr_min = BIC[gamma][j,m]
+                if BIC[gamma][j, m] < curr_min:
+                    curr_min = BIC[gamma][j, m]
                     best_sol = sol.copy()
             elif method == 'AIC':
-                 if AIC[j,m] < curr_min:
-                    curr_min = AIC[j,m]
+                 if AIC[j, m] < curr_min:
+                    curr_min = AIC[j, m]
                     best_sol = sol.copy()
         
 
@@ -580,8 +678,16 @@ def single_grid_search(S, lambda_range, N, method = 'eBIC', gamma = 0.3, latent
         ix = np.unravel_index(np.nanargmin(BIC[gamma]), BIC[gamma].shape)
         
     
-    stats = {'BIC': BIC, 'AIC': AIC, 'SP': SP, 'RANK': RANK, 'LAMBDA': LAMB, 'MU': MU, 'TAU': TAU,\
-             'BEST': {'lambda1': LAMB[ix], 'mu1': MU[ix]}, 'GAMMA': gammas}
+    stats = {'BIC': BIC,
+             'AIC': AIC,
+             'SP': SP,
+             'RANK': RANK,
+             'LAMBDA': LAMB,
+             'MU': MU,
+             'TAU': TAU,
+             'BEST': {'lambda1': LAMB[ix], 'mu1': MU[ix]},
+             'GAMMA': gammas
+    }
             
     return best_sol, estimates, lowrank, stats
 
@@ -598,14 +704,14 @@ def thresholding(A, tau):
        
     return A*mask
 
-def tune_threshold(Theta, S, N, tau_range = None, method = 'eBIC', gamma = 0.1):
+def tune_threshold(Theta, S, N, tau_range=None, method='eBIC', gamma=0.1):
     """
     Pick the best threshold for 2d-array according to eBIC or AIC.
     """
     if tau_range is None:
         # diagonal is upper bound as this would make Theta indefinite.                   
         #tau_range = np.linspace(TAU_MIN, np.diag(Theta).min()*0.9, N_TAU) 
-        tau_range = np.logspace(-12,-1,N_TAU)
+        tau_range = np.logspace(-12, -1, N_TAU)
         
     assert np.all(tau_range > 0)
 
@@ -614,7 +720,7 @@ def tune_threshold(Theta, S, N, tau_range = None, method = 'eBIC', gamma = 0.1):
     for j in range(len(tau_range)):
         tau = tau_range[j]
         if method == 'eBIC':
-            E = ebic(S, thresholding(Theta, tau), N, gamma = gamma)
+            E = ebic(S, thresholding(Theta, tau), N, gamma=gamma)
         elif method == 'AIC':
             E = aic(S, thresholding(Theta, tau), N)
         scores[j] = E
@@ -627,13 +733,13 @@ def tune_threshold(Theta, S, N, tau_range = None, method = 'eBIC', gamma = 0.1):
     t_Theta = thresholding(Theta, opt_tau)
     return t_Theta, opt_tau, scores
 
-def tune_multiple_threshold(Theta, S, N, tau_range, method = 'eBIC', gamma = 0.1):
+def tune_multiple_threshold(Theta, S, N, tau_range, method='eBIC', gamma=0.1):
     """
     Pick the best threshold for 3d-array or dict according to eBIC or AIC. 
     """
-    if type(S) == dict:
+    if isinstance(S, dict):
         K = len(S.keys())
-    elif type(S) == np.ndarray:
+    elif isinstance(S, np.ndarray):
         K = S.shape[0]
     
     t_Theta = Theta.copy()
@@ -641,7 +747,13 @@ def tune_multiple_threshold(Theta, S, N, tau_range, method = 'eBIC', gamma = 0.1
     tau = np.zeros(K)
     
     for k in np.arange(K):
-        Th_k, tau_k, scores_k = tune_threshold(Theta[k], S[k], N[k], tau_range, method, gamma)
+        Th_k, tau_k, scores_k = tune_threshold(Theta[k],
+                                               S[k],
+                                               N[k],
+                                               tau_range,
+                                               method,
+                                               gamma
+        )
         score[k] = scores_k
         tau[k] = tau_k
         t_Theta[k] = Th_k
@@ -659,9 +771,9 @@ def aic(S, Theta, N):
     AIC information criterion after Danaher et al.
     excludes the diagonal
     """
-    if type(S) == dict:
+    if isinstance(S, dict):
         aic = aic_dict(S, Theta, N)
-    elif type(S) == np.ndarray:
+    elif isinstance(S, np.ndarray):
         if len(S.shape) == 3:
             aic = aic_array(S, Theta, N)
         else:
@@ -691,7 +803,7 @@ def aic_array(S,Theta, N):
     
     if isinstance(N, SINGLE_FLOAT_INT_TYPES):
         N = np.ones(K) * N
-    
+
     aic = 0
     for k in np.arange(K):
         aic += aic_single(S[k,:,:], Theta[k,:,:], N[k])
@@ -710,13 +822,13 @@ def aic_single(S, Theta, N):
 
 ################################################################
     
-def ebic(S, Theta, N, gamma = 0.5):
+def ebic(S, Theta, N, gamma=0.5):
     """
     extended BIC after Drton et al.
     """
-    if type(S) == dict:
+    if isinstance(S, dict):
         ebic = ebic_dict(S, Theta, N, gamma)
-    elif type(S) == np.ndarray:
+    elif isinstance(S, np.ndarray):
         if len(S.shape) == 3:
             ebic = ebic_array(S, Theta, N, gamma)
         else:
@@ -728,6 +840,7 @@ def ebic(S, Theta, N, gamma = 0.5):
 
 def ebic_single(S, Theta, N, gamma, lambda1_mask=None):
     (p,p) = S.shape
+    
     assert isinstance(N, SINGLE_FLOAT_INT_TYPES)
     if lambda1_mask is not None:
         assert lambda1_mask.shape == S.shape
diff --git a/src/gglasso/solver/admm_solver.py b/src/gglasso/solver/admm_solver.py
index cbe89f5..6578ea2 100644
--- a/src/gglasso/solver/admm_solver.py
+++ b/src/gglasso/solver/admm_solver.py
@@ -5,15 +5,30 @@
 import numpy as np
 import time
 import warnings
+from typing import Optional, Union
 
 from ..helper.basic_linalg import trp, Gdot
 from .ggl_helper import prox_p, phiplus, prox_rank_norm, f, P_val
 
-
-def ADMM_MGL(S, lambda1, lambda2, reg , Omega_0 , \
-             Theta_0 = np.array([]), X_0 = np.array([]), n_samples = None, \
-             tol = 1e-5 , rtol = 1e-4, stopping_criterion = 'boyd', update_rho = True,\
-             rho= 1., max_iter = 1000, verbose = False, measure = False, latent = False, mu1 = None):
+def ADMM_MGL(S: np.ndarray,
+             lambda1: float,
+             lambda2: float,
+             reg: str,
+             Omega_0: np.ndarray,
+             Theta_0: np.ndarray=np.array([]),
+             X_0: np.ndarray=np.array([]),
+             n_samples: Optional[Union[int, np.ndarray]]=None,
+             tol: float=1e-5,
+             rtol: float=1e-4,
+             stopping_criterion: str='boyd',
+             update_rho: bool=True,
+             rho: float=1.,
+             max_iter: int=1000,
+             verbose: bool=False,
+             measure: bool=False,
+             latent: bool=False,
+             mu1: Optional[Union[float, np.ndarray]]=None
+    ):
     """
     This is an ADMM solver for the (Latent variable) Multiple Graphical Lasso problem (MGL). It jointly estimates K precision matrices of shape (p,p).
 
@@ -117,7 +132,7 @@ def ADMM_MGL(S, lambda1, lambda2, reg , Omega_0 , \
     # else --> n_samples should be array with sample sizes
     if n_samples == None:
         nk = np.ones((K,1,1))
-    elif type(n_samples) == int:
+    elif isinstance(n_samples, int):
         nk = n_samples * np.ones((K,1,1)) 
     else:
         assert len(nk) == K
@@ -166,22 +181,32 @@ def ADMM_MGL(S, lambda1, lambda2, reg , Omega_0 , \
         eigD, eigQ = np.linalg.eigh(W_t)
         
         for k in np.arange(K):
-            Omega_t[k,:,:] = phiplus(beta = nk[k,0,0]/rho, D = eigD[k,:], Q = eigQ[k,:,:])
+            Omega_t[k,:,:] = phiplus(beta=nk[k,0,0]/rho,
+                                     D=eigD[k,:],
+                                     Q=eigQ[k,:,:]
+            )
         
         # Theta Update
-        Theta_t = prox_p(Omega_t + L_t + X_t, (1/rho)*lambda1, (1/rho)*lambda2, reg)
+        Theta_t = prox_p(Omega_t + L_t + X_t,
+                         (1/rho)*lambda1,
+                         (1/rho)*lambda2,
+                         reg
+        )
         
         #L Update
         if latent:
             C_t = Theta_t - X_t - Omega_t
             eigD, eigQ = np.linalg.eigh(C_t)
             for k in np.arange(K):
-                L_t[k] = prox_rank_norm(C_t[k,:,:], mu1[k]/rho, D = eigD[k,:], Q = eigQ[k,:,:])
+                L_t[k] = prox_rank_norm(C_t[k,:,:],
+                                        mu1[k]/rho,
+                                        D=eigD[k,:],
+                                        Q = eigQ[k,:,:]
+                )
                 
         # X Update
         X_t += Omega_t - Theta_t + L_t
         
-        
         if measure:
             end = time.time()
             runtime[iter_t] = end-start
@@ -189,8 +214,14 @@ def ADMM_MGL(S, lambda1, lambda2, reg , Omega_0 , \
         
         # Stopping condition
         if stopping_criterion == 'boyd':
-            r_t,s_t,e_pri,e_dual = ADMM_stopping_criterion(Omega_t, Omega_t_1, Theta_t, L_t, X_t,\
-                                                           S, rho, tol, rtol, latent)
+            r_t,s_t,e_pri,e_dual = ADMM_stopping_criterion(Omega_t,
+                                                           Omega_t_1,
+                                                           Theta_t,
+                                                           L_t,
+                                                           X_t,
+                                                           S,
+                                                           rho, tol, rtol, latent
+            )
         
             # update rho
             if update_rho:
@@ -215,7 +246,13 @@ def ADMM_MGL(S, lambda1, lambda2, reg , Omega_0 , \
                 break
             
         elif stopping_criterion == 'kkt':
-            eta_A = kkt_stopping_criterion(Omega_t, Theta_t, L_t, rho*X_t, S , lambda1, lambda2, nk, reg, latent, mu1)
+            eta_A = kkt_stopping_criterion(Omega_t,
+                                           Theta_t,
+                                           L_t,
+                                           rho*X_t,
+                                           S,
+                                           lambda1, lambda2, nk, reg, latent, mu1
+            )
             residual[iter_t] = eta_A
             
             if verbose:
@@ -265,7 +302,11 @@ def ADMM_MGL(S, lambda1, lambda2, reg , Omega_0 , \
         
     sol = {'Omega': Omega_t, 'Theta': Theta_t, 'L': L_t, 'X': X_t}
     if measure:
-        info = {'status': status , 'runtime': runtime[:iter_t+1], 'residual': residual[:iter_t+1], 'objective': objective[:iter_t+1]}
+        info = {'status': status ,
+                'runtime': runtime[:iter_t+1],
+                'residual': residual[:iter_t+1],
+                'objective': objective[:iter_t+1]
+        }
     else:
         info = {'status': status}
                
@@ -289,7 +330,7 @@ def ADMM_stopping_criterion(Omega, Omega_t_1, Theta, L, X, S, rho, eps_abs, eps_
 
     return r,s,e_pri,e_dual
 
-def kkt_stopping_criterion(Omega, Theta, L, X, S , lambda1, lambda2, nk, reg, latent = False, mu1 = None):
+def kkt_stopping_criterion(Omega, Theta, L, X, S, lambda1, lambda2, nk, reg, latent=False, mu1=None):
     
     assert Omega.shape == Theta.shape == S.shape
     assert S.shape[1] == S.shape[2]
@@ -299,14 +340,17 @@ def kkt_stopping_criterion(Omega, Theta, L, X, S , lambda1, lambda2, nk, reg, la
     
     (K,p,p) = S.shape
     
-    term1 = np.linalg.norm(Theta - prox_p(Theta + X , l1 = lambda1, l2= lambda2, reg = reg)) / (1 + np.linalg.norm(Theta))
+    term1 = np.linalg.norm(Theta - prox_p(Theta+X,
+                                          l1=lambda1,
+                                          l2=lambda2,
+                                          reg=reg)) / (1 + np.linalg.norm(Theta))
     
     term2 = np.linalg.norm(Theta - Omega - L) / (1 + np.linalg.norm(Theta))
     
     proxK = np.zeros((K,p,p))
     eigD, eigQ = np.linalg.eigh(Omega - nk*S - X)
     for k in np.arange(K):       
-        proxK[k,:,:] = phiplus(beta = nk[k,0,0], D = eigD[k,:], Q = eigQ[k,:,:])
+        proxK[k,:,:] = phiplus(beta=nk[k,0,0], D=eigD[k,:], Q=eigQ[k,:,:])
      
     term3 = np.linalg.norm(Omega - proxK) / (1 + np.linalg.norm(Omega))
     
@@ -314,7 +358,11 @@ def kkt_stopping_criterion(Omega, Theta, L, X, S , lambda1, lambda2, nk, reg, la
         proxL = np.zeros((K,p,p))
         eigD, eigQ = np.linalg.eigh(L - X)
         for k in np.arange(K):
-            proxL[k,:,:] = prox_rank_norm(L[k,:,:] - X[k,:,:], beta = mu1[k], D = eigD[k,:], Q = eigQ[k,:,:])
+            proxL[k,:,:] = prox_rank_norm(L[k,:,:] - X[k,:,:],
+                                          beta=mu1[k],
+                                          D=eigD[k,:],
+                                          Q=eigQ[k,:,:]
+            )
         
         term4 = np.linalg.norm(L - proxL) / (1 + np.linalg.norm(L))
     else:
diff --git a/src/gglasso/solver/ggl_helper.py b/src/gglasso/solver/ggl_helper.py
index 1569b37..55bfdad 100644
--- a/src/gglasso/solver/ggl_helper.py
+++ b/src/gglasso/solver/ggl_helper.py
@@ -3,10 +3,9 @@
 """
 
 import numpy as np
-#from tick.prox import ProxTV
 from numba import njit
 
-from ..helper.basic_linalg import trp,Gdot,Sdot
+from ..helper.basic_linalg import trp, Gdot, Sdot
 from .fgl_helper import condat_method
 
 # functions specifically related to the GGL regularizer
@@ -19,15 +18,15 @@ def prox_od_1norm(A, l):
     """
     calculates the prox of the off-diagonal 1norm at a point A
     """    
-    (d1,d2) = A.shape
+    (d1, d2) = A.shape
     res = np.sign(A) * np.maximum(np.abs(A) - l, 0.)
     
-    for i in np.arange(np.minimum(d1,d2)):
+    for i in np.arange(np.minimum(d1, d2)):
         res[i,i] = A[i,i]
     
     return res
 
-def prox_rank_norm(A, beta, D = np.array([]), Q = np.array([])):
+def prox_rank_norm(A, beta, D=np.array([]), Q=np.array([])):
 
     if len(D) != A.shape[0]:
         D, Q = np.linalg.eigh(A)
@@ -50,7 +49,7 @@ def prox_sum_Frob(X, M, l):
     \lambda \sum_{j\neq l} \|X^M_{jl}\|_F 
     """
     (pM, pM) = X.shape
-    assert pM%M == 0
+    assert pM % M == 0
     
     p = int(pM/M)
     Y = np.zeros((pM,pM))
@@ -58,11 +57,11 @@ def prox_sum_Frob(X, M, l):
     for i in np.arange(p):
         for j in np.arange(start=i, stop=p):
             if j == i:
-                Y[i*M:(i+1)*M,j*M:(j+1)*M] = X[i*M:(i+1)*M,j*M:(j+1)*M] 
+                Y[i*M:(i+1)*M, j*M:(j+1)*M] = X[i*M:(i+1)*M, j*M:(j+1)*M] 
             else:
-                B = prox_2norm(X[i*M:(i+1)*M,j*M:(j+1)*M], l)
-                Y[i*M:(i+1)*M,j*M:(j+1)*M] = B
-                Y[j*M:(j+1)*M,i*M:(i+1)*M] = B.T
+                B = prox_2norm(X[i*M:(i+1)*M, j*M:(j+1)*M], l)
+                Y[i*M:(i+1)*M, j*M:(j+1)*M] = B
+                Y[j*M:(j+1)*M, i*M:(i+1)*M] = B.T
                              
     return Y
 
@@ -78,7 +77,7 @@ def jacobian_projection(v, l):
     if a <= l:
         g = np.eye(K)
     else:
-        g = (l/a) * ( np.eye(K) - (1/a**2) * np.outer(v,v) )
+        g = (l/a) * (np.eye(K) - (1/a**2) * np.outer(v,v))
         
     return g
     
@@ -96,7 +95,7 @@ def jacobian_1norm(v,l):
     return np.diag(d)
 
 @njit()
-def jacobian_prox_phi_ggl(v , l1 , l2):
+def jacobian_prox_phi_ggl(v, l1, l2):
     u = prox_1norm(v, l1)
     sig = jacobian_projection(u, l2)
     lam = jacobian_1norm(v, l1)
@@ -106,14 +105,13 @@ def jacobian_prox_phi_ggl(v , l1 , l2):
     return M
 
 # functions specifically related to the FGL regularizer
-
 @njit()
 def construct_B(K):
     dd = np.eye(K)
-    ld = - np.tri(K, k = -1) + np.tri(K, k = -2) 
+    ld = - np.tri(K, k=-1) + np.tri(K, k=-2) 
     
-    B = dd+ld
-    B = B[1:,:]
+    B = dd + ld
+    B = B[1:, :]
     # older numba versions modify B when applying pinv, hence copy
     tB = B.T.copy()
     # this is the left-inverse of B.T, is needed to reconstruct the dual solution z_lambda
@@ -121,19 +119,20 @@ def construct_B(K):
     
     return B, invB
 
-# also implemented in package tick, but our implementation is faster
+# old version using tick package:
+# from tick.prox import ProxTV
+# a = ProxTV(l).call(np.ascontiguousarray(v))
+
 @njit()
 def prox_tv(v,l):
-    a = condat_method(v,l)
-    #a = ProxTV(l).call(np.ascontiguousarray(v))
+    a = condat_method(v, l)
     return a
 
 @njit()
 def prox_phi_fgl(v, l1, l2):
-    res = prox_1norm(prox_tv(v,l2) , l1)
+    res = prox_1norm(prox_tv(v,l2), l1)
     return res
 
-
 @njit()
 def jacobian_tv(v,l):   
     K = len(v)   
@@ -147,14 +146,14 @@ def jacobian_tv(v,l):
     z_tmp[ind1] = 0.
     
     Sigma = np.diag(z_tmp)   
-    P_hat = np.linalg.pinv(Sigma @ B@ B.T @ Sigma)
+    P_hat = np.linalg.pinv(Sigma @ B @ B.T @ Sigma)
     P = np.eye(K) - B.T @ P_hat @ B
     return P
 
 @njit()
-def jacobian_prox_phi_fgl(v , l1 , l2): 
-    x = prox_tv(v,l2)
-    P = jacobian_tv(v,l2)
+def jacobian_prox_phi_fgl(v, l1, l2): 
+    x = prox_tv(v, l2)
+    P = jacobian_tv(v, l2)
     Theta = jacobian_1norm(x, l1)
     
     return Theta @ P
@@ -167,11 +166,11 @@ def P_val(X, l1, l2, reg):
     res = 0
     for i in np.arange(p):
         # start at i+1 because P does NOT operate on diagonal
-        for j in np.arange(start = i + 1 , stop = p):
+        for j in np.arange(start=i+1 , stop=p):
             if reg == 'GGL':
-                res += l1 * np.linalg.norm(X[:,i,j] , 1) + l2 * np.linalg.norm(X[:,i,j] , 2)
+                res += l1 * np.linalg.norm(X[:,i,j], 1) + l2 * np.linalg.norm(X[:,i,j], 2)
             elif reg == 'FGL':
-                res += l1 * np.linalg.norm(X[:,i,j] , 1) + l2 * np.linalg.norm(X[1:,i,j] - X[:-1,i,j] , 1)
+                res += l1 * np.linalg.norm(X[:,i,j], 1) + l2 * np.linalg.norm(X[1:,i,j] - X[:-1,i,j], 1)
                 
     # multiply by 2 as we only summed the upper triangular
     return 2 * res
@@ -186,19 +185,18 @@ def prox_phi(v, l1, l2, reg):
     elif reg == 'FGL':
         res = prox_phi_fgl(v, l1, l2)
     return res
-    
 
 
 @njit()
 def prox_p(X, l1, l2, reg):
     #X is always symmetric and hence we only calculate upper diagonals
     assert np.abs(X - trp(X)).max() <= 1e-5, "input X is not symmetric"
-    assert np.minimum(l1,l2) > 0, "lambda 1 and lambda2 have to be positive"
+    assert np.minimum(l1, l2) > 0, "lambda 1 and lambda2 have to be positive"
     
     (K,p,p) = X.shape
     M = np.zeros((K,p,p))
     for i in np.arange(p):
-        for j in np.arange(start = i, stop = p):
+        for j in np.arange(start=i, stop=p):
             if i == j:
                 # factor 1/2 because we later add again
                 M[:,i,j] = (1/2)*X[:,i,j]
@@ -221,9 +219,9 @@ def jacobian_prox_phi(v , l1 , l2, reg):
     assert reg in ['GGL', 'FGL']
     
     if reg == 'GGL':
-        res = jacobian_prox_phi_ggl(v , l1 , l2)
+        res = jacobian_prox_phi_ggl(v, l1, l2)
     elif reg == 'FGL':
-        res = jacobian_prox_phi_fgl(v , l1 , l2)
+        res = jacobian_prox_phi_fgl(v, l1, l2)
         
     return res
 
@@ -241,21 +239,22 @@ def construct_jacobian_prox_p(X, l1 , l2, reg):
     
     W = np.zeros((K,K,p,p))
     for i in np.arange(p):
-        for j in np.arange(start = i, stop = p):
+        for j in np.arange(start=i, stop=p):
             if i == j:
                 W[:,:,i,j] = np.eye(K)
             else:
-                ij_entry = jacobian_prox_phi(X[:,i,j] , l1 , l2, reg)
+                ij_entry = jacobian_prox_phi(X[:,i,j], l1, l2, reg)
                 W[:,:,i,j] = ij_entry
                 W[:,:,j,i] = ij_entry                 
     return W
 
 
-def eval_jacobian_prox_p(Y , W):
+def eval_jacobian_prox_p(Y, W):
     # W is the result of construct_jacobian_prox_p
     (K,p,p) = Y.shape
-    assert W.shape == (K,K,p,p)
+    assert W.shape == (K, K, p, p)
     fun = np.einsum('lkij,kij->lij', W, Y)
+    # without einsum (but slow):
     # fun = np.zeros((K,p,p))
     # for i in np.arange(p):
     #     for j in np.arange(p):
@@ -300,13 +299,12 @@ def phiplus(beta, D, Q):
     B : array of shape (p,p)
         proximal operator.
     """
-    #B = Q @ np.diag(phip(D,beta)) @ Q.T
-    B = (Q * phip(D,beta))@Q.T   
+    B = (Q * phip(D,beta)) @ Q.T   
     return B
 
 @njit() 
 def phiminus(beta, D, Q):
-    B = (Q * phim(D,beta))@Q.T
+    B = (Q * phim(D,beta)) @ Q.T
     return B
 
 #@njit() 
@@ -315,26 +313,25 @@ def moreau_h(beta, D, Q):
     D: array (p,p)
     Q: array (p,p)
     """
-    
     pp = phiplus(beta, D, Q)
     pm = phiminus(beta, D, Q)
-    psi =  - (beta * np.log (np.linalg.det(pp))) + (0.5 * np.linalg.norm(pm)**2 )
+    psi =  -(beta * np.log (np.linalg.det(pp))) + (0.5 * np.linalg.norm(pm)**2)
     return psi, pp, pm
 
 
 # tile is not numba supported, could be replaced by repeat+reshape
-def construct_gamma(A, beta, D = np.array([]), Q = np.array([])):
+def construct_gamma(A, beta, D=np.array([]), Q=np.array([])):
     (K,p,p) = A.shape
     Gamma = np.zeros((K,p,p))
     
     if D.shape[0] != A.shape[0]:
-        raise KeyError
+        raise KeyError("Shapes don't match.")
     
     for k in np.arange(K):
-        phip_d = phip(D[k,:] , beta) 
+        phip_d = phip(D[k,:], beta) 
             
-        h1 = np.tile(np.sqrt(D[k,:]**2 + 4* beta), (p,1))
-        h1 = h1 + h1.T 
+        h1 = np.tile(np.sqrt(D[k,:]**2 + 4*beta), (p,1))
+        h1 = h1 + h1.T
         
         h2 = np.tile(phip_d, (p,1))
         h2 = h2 + h2.T 
@@ -357,12 +354,10 @@ def eval_jacobian_phiplus(B, Gamma, Q):
     return res
 
 # functions related to the proximal point algorithm
-    
 def Phi_t(Omega, Theta, S, Omega_t, Theta_t, sigma_t, lambda1, lambda2, reg):
     res = f(Omega, S) + P_val(Theta, lambda1, lambda2, reg) + 1/(2*sigma_t) * (np.linalg.norm(Omega - Omega_t)**2 + np.linalg.norm(Theta - Theta_t)**2)
     return res
 
- 
 def hessian_Y(D , Gamma, eigQ, W, sigma_t):
     """
     this is the linear operator for the CG method
@@ -371,18 +366,16 @@ def hessian_Y(D , Gamma, eigQ, W, sigma_t):
     """
     tmp1 = eval_jacobian_phiplus(D, Gamma, eigQ)
     tmp2 = eval_jacobian_prox_p(D, W)
-
     res = - sigma_t * (tmp1 + tmp2) 
     return res
 
-
-def cg_ppdna(Gamma, eigQ, W, sigma_t, b, tol = 1e-6, max_iter = 20):
+def cg_ppdna(Gamma, eigQ, W, sigma_t, b, tol=1e-6, max_iter=20):
     """
     solves the linear system in the PPDNA subproblem
     
-    Gamma, eigQ,W, sigma_t are constructed beforehand
-    b: right-hand-side of linear system
+    Gamma, eigQ, W, sigma_t are constructed beforehand
     
+    b: right-hand-side of linear system
     tol: tolerance to CG method
     max_iter: max iterations of CG method
     """
@@ -390,7 +383,7 @@ def cg_ppdna(Gamma, eigQ, W, sigma_t, b, tol = 1e-6, max_iter = 20):
     dim = b.shape
     x = np.zeros(dim)
     r = b - hessian_Y(x, Gamma, eigQ, W, sigma_t)
-    r_norm = np.linalg.norm(r)**2 #Gdot(r,r)
+    r_norm = np.linalg.norm(r)**2
     p = r.copy()
     
     j = 0
@@ -415,7 +408,7 @@ def cg_ppdna(Gamma, eigQ, W, sigma_t, b, tol = 1e-6, max_iter = 20):
         
     return x, status
 
-def Y_t( X, Omega_t, Theta_t, S, lambda1, lambda2, sigma_t, reg):
+def Y_t(X, Omega_t, Theta_t, S, lambda1, lambda2, sigma_t, reg):
     assert np.min(np.array([lambda1, lambda2, sigma_t])) > 0 , "at least one parameter is not positive"
     assert X.shape[1] == X.shape[2], "dimensions are not as expected"
   
@@ -429,17 +422,16 @@ def Y_t( X, Omega_t, Theta_t, S, lambda1, lambda2, sigma_t, reg):
     grad1 = np.zeros((K,p,p))
     term1 = 0
     for k in np.arange(K):
-        Psi_h, proxh, _ = moreau_h(sigma_t, D = eigD[k,:], Q = eigQ[k,:,:])
+        Psi_h, proxh, _ = moreau_h(sigma_t, D=eigD[k,:], Q=eigQ[k,:,:])
         term1 += (1/sigma_t) * Psi_h
         grad1[k,:,:] = proxh
     
-    term2 = - 1/(2*sigma_t) * (np.linalg.norm(W_t)**2 + np.linalg.norm(V_t)**2) # (Gdot(W_t, W_t) + Gdot(V_t, V_t))
-    term3 = 1/(2*sigma_t) * (np.linalg.norm(Omega_t)**2 + np.linalg.norm(Theta_t)**2) # (Gdot(Omega_t, Omega_t)  +  Gdot(Theta_t, Theta_t))  
+    term2 = - 1/(2*sigma_t) * (np.linalg.norm(W_t)**2 + np.linalg.norm(V_t)**2)
+    term3 = 1/(2*sigma_t) * (np.linalg.norm(Omega_t)**2 + np.linalg.norm(Theta_t)**2)
   
-    Psi_P , U = moreau_P(V_t, sigma_t * lambda1, sigma_t * lambda2, reg)  
+    Psi_P, U = moreau_P(V_t, sigma_t*lambda1, sigma_t*lambda2, reg)  
     term4 = (1/sigma_t) * Psi_P
   
     fun = term1 + term2 + term3 + term4
-    #grad = grad1 - U
   
     return fun, grad1, U, (eigD, eigQ)
diff --git a/src/gglasso/solver/ppdna_solver.py b/src/gglasso/solver/ppdna_solver.py
index 9e961ca..cf1cf67 100644
--- a/src/gglasso/solver/ppdna_solver.py
+++ b/src/gglasso/solver/ppdna_solver.py
@@ -12,7 +12,7 @@
 from .admm_solver import ADMM_MGL
 from ..helper.basic_linalg import trp, Gdot
 
-def get_ppdna_params(ppdna_params = None):
+def get_ppdna_params(ppdna_params=None):
     
     # initialize if empty
     if ppdna_params == None:
@@ -36,10 +36,14 @@ def get_ppdna_params(ppdna_params = None):
          
 
 def get_ppa_sub_params_default():
-    ppa_sub_params = { 'sigma_t' : 1e3, 
-          'eta' : 1e-1, 'tau' : .2, 'rho' : .5, 'mu' : 1e-4,
-          'eps_t' : .95, 'delta_t' : .95} 
-    
+    ppa_sub_params = {'sigma_t': 1e3,
+                      'eta': 1e-1,
+                      'tau': .2,
+                      'rho': .5,
+                      'mu': 1e-4,
+                      'eps_t': .95,
+                      'delta_t': .95
+    } 
     return ppa_sub_params
 
 def check_ppa_sub_params(ppa_sub_params):
@@ -58,7 +62,7 @@ def check_ppa_sub_params(ppa_sub_params):
     
     return
 
-def PPA_subproblem(Omega_t, Theta_t, X_t, S, reg, ppa_sub_params = None, verbose = False):
+def PPA_subproblem(Omega_t, Theta_t, X_t, S, reg, ppa_sub_params=None, verbose=False):
     """
     This is the dual based semismooth Newton method solver for the PPA subproblems
     Algorithm 1 in Zhang et al.
@@ -98,29 +102,41 @@ def PPA_subproblem(Omega_t, Theta_t, X_t, S, reg, ppa_sub_params = None, verbose
         
         # fun evaluation and eig can be reused from Armijo in  laster iter
         if sub_iter == 0:
-            funY_Xt, Omega_sol, Theta_sol, (eigD, eigQ) = Y_t( X_t, Omega_t, Theta_t, S, lambda1, lambda2, sigma_t, reg)
+            funY_Xt, Omega_sol, Theta_sol, (eigD, eigQ) = Y_t(X_t,
+                                                              Omega_t,
+                                                              Theta_t,
+                                                              S,
+                                                              lambda1, lambda2, sigma_t, reg
+            )
         else:
             funY_Xt = Y_t_new
             
         gradY_Xt = Omega_sol - Theta_sol
-        #eigD, eigQ = np.linalg.eigh(W_t)
     
-        Gamma = construct_gamma(W_t, sigma_t, D = eigD, Q = eigQ)
+        Gamma = construct_gamma(W_t, sigma_t, D=eigD, Q=eigQ)
         W = construct_jacobian_prox_p( (1/sigma_t)*V_t, lambda1, lambda2, reg)
         # step 1: CG method
         cg_accur = min(eta, np.linalg.norm(gradY_Xt)**(1+tau))
         
-        D, cg_status = cg_ppdna(Gamma, eigQ, W, sigma_t, -gradY_Xt, tol = cg_accur, max_iter = 10)
+        D, cg_status = cg_ppdna(Gamma, eigQ, W, sigma_t, -gradY_Xt, tol=cg_accur, max_iter=10)
         
         # step 2: line search 
         alpha = 1.
-        Y_t_new, Omega_sol, Theta_sol, (eigD, eigQ) = Y_t(X_t + alpha*D, Omega_t, Theta_t, S, lambda1, lambda2,\
-                                                          sigma_t, reg)
+        Y_t_new, Omega_sol, Theta_sol, (eigD, eigQ) = Y_t(X_t + alpha*D,
+                                                          Omega_t,
+                                                          Theta_t,
+                                                          S,
+                                                          lambda1, lambda2, sigma_t, reg
+        )
             
-        while Y_t_new < funY_Xt + mu * alpha * Gdot(gradY_Xt , D):
+        while Y_t_new < funY_Xt + mu*alpha*Gdot(gradY_Xt , D):
             alpha *= rho
-            Y_t_new, Omega_sol, Theta_sol, (eigD, eigQ) = Y_t(X_t + alpha*D, Omega_t, Theta_t, S, lambda1, lambda2,\
-                                                              sigma_t, reg)
+            Y_t_new, Omega_sol, Theta_sol, (eigD, eigQ) = Y_t(X_t + alpha*D,
+                                                              Omega_t,
+                                                              Theta_t,
+                                                              S,
+                                                              lambda1, lambda2, sigma_t, reg
+            )
             
                 
         # step 3: update variables and check stopping condition
@@ -136,12 +152,23 @@ def PPA_subproblem(Omega_t, Theta_t, X_t, S, reg, ppa_sub_params = None, verbose
     if verbose and not(condA and condB):
         print("Subproblem could not be solve with the given accuracy! -- reached maximal iterations")
             
-    sub_info= {'niter' : sub_iter, 'armijo' : alpha}
+    sub_info= {'niter': sub_iter, 'armijo': alpha}
     
     return Omega_sol, Theta_sol, X_t, sub_info
 
 
-def PPDNA(S, lambda1, lambda2, reg, Omega_0, Theta_0 = np.array([]), X_0 = np.array([]), ppdna_params = None, eps_ppdna = 1e-5 , verbose = False, measure = False):
+def PPDNA(S,
+          lambda1,
+          lambda2,
+          reg,
+          Omega_0,
+          Theta_0=np.array([]),
+          X_0=np.array([]),
+          ppdna_params=None,
+          eps_ppdna=1e-5,
+          verbose=False,
+          measure=False
+    ):
     """
     Proximal Point algorithm for the Multiple Graphical Lasso problem (MGL).
     It solves
@@ -236,7 +263,7 @@ def PPDNA(S, lambda1, lambda2, reg, Omega_0, Theta_0 = np.array([]), X_0 = np.ar
     for iter_t in np.arange(max_iter):
              
         # check stopping criterion
-        eta_P = PPDNA_stopping_criterion(Omega_t, Theta_t, X_t, S , ppa_sub_params, reg)
+        eta_P = PPDNA_stopping_criterion(Omega_t, Theta_t, X_t, S, ppa_sub_params, reg)
         residual[iter_t] = eta_P
         
         if measure:
@@ -246,8 +273,14 @@ def PPDNA(S, lambda1, lambda2, reg, Omega_0, Theta_0 = np.array([]), X_0 = np.ar
             status = 'optimal'
             break
         
-        Omega_t, Theta_t, X_t, sub_info = PPA_subproblem(Omega_t, Theta_t, X_t, S, reg = reg, ppa_sub_params = ppa_sub_params,\
-                                                         verbose = False)
+        Omega_t, Theta_t, X_t, sub_info = PPA_subproblem(Omega_t,
+                                                         Theta_t,
+                                                         X_t,
+                                                         S,
+                                                         reg=reg,
+                                                         ppa_sub_params=ppa_sub_params,
+                                                         verbose=False
+        )
         
         if measure:
             end = time.time()
@@ -261,8 +294,7 @@ def PPDNA(S, lambda1, lambda2, reg, Omega_0, Theta_0 = np.array([]), X_0 = np.ar
             ppa_sub_params['sigma_t'] = 1.3 * ppa_sub_params['sigma_t']
         ppa_sub_params['eps_t'] = 0.95 * ppa_sub_params['eps_t']
         ppa_sub_params['delta_t'] = 0.95 * ppa_sub_params['delta_t']
-                    
-     
+
     if eta_P > eps_ppdna:
             status = 'max iterations reached'    
         
@@ -280,33 +312,40 @@ def PPDNA(S, lambda1, lambda2, reg, Omega_0, Theta_0 = np.array([]), X_0 = np.ar
     sol = {'Omega': Omega_t, 'Theta': Theta_t, 'X': X_t}
     if measure:
         # last runtime irrelevant (as break) and first residual irrelevant
-        info = {'status': status , 'runtime': runtime[:iter_t], 'residual': residual[1:iter_t + 1], 'objective': objective[:iter_t]}
+        info = {'status': status ,
+                'runtime': runtime[:iter_t],
+                'residual': residual[1:iter_t + 1],
+                'objective': objective[:iter_t]
+        }
     else:
         info = {'status': status}
     return sol, info
 
 
-def PPDNA_stopping_criterion(Omega, Theta, X, S , ppa_sub_params, reg): 
+def PPDNA_stopping_criterion(Omega, Theta, X, S, ppa_sub_params, reg): 
     assert Omega.shape == Theta.shape == S.shape
     assert S.shape[1] == S.shape[2]
     
     (K,p,p) = S.shape
     
-    term1 = np.linalg.norm(Theta- prox_p(Theta + X , l1 = ppa_sub_params['lambda1'], l2= ppa_sub_params['lambda2'], reg = reg)) / (1 + np.linalg.norm(Theta))
+    term1 = np.linalg.norm(Theta-prox_p(Theta+X,
+                                        l1=ppa_sub_params['lambda1'],
+                                        l2=ppa_sub_params['lambda2'],
+                                        reg=reg)) / (1+np.linalg.norm(Theta))
     
-    term2 = np.linalg.norm(Theta - Omega) / (1 + np.linalg.norm(Theta))
+    term2 = np.linalg.norm(Theta - Omega) / (1+np.linalg.norm(Theta))
     
     proxK = np.zeros((K,p,p))
-    eigD, eigQ = np.linalg.eigh(Omega-S-X)
+    eigD, eigQ = np.linalg.eigh(Omega - S - X)
     for k in np.arange(K):       
-        proxK[k,:,:] = phiplus(beta = 1., D = eigD[k,:], Q = eigQ[k,:,:])
+        proxK[k,:,:] = phiplus(beta=1., D=eigD[k,:], Q=eigQ[k,:,:])
     
-    term3 = np.linalg.norm(Omega - proxK) / (1 + np.linalg.norm(Omega))
+    term3 = np.linalg.norm(Omega - proxK) / (1+np.linalg.norm(Omega))
     
     return max(term1, term2, term3)
 
 
-def warmPPDNA(S, lambda1, lambda2, reg, Omega_0, Theta_0 = np.array([]), X_0 = np.array([]), admm_params = None, ppdna_params = None, eps = 1e-5 , eps_admm = 1e-3, verbose = False, measure = False):
+def warmPPDNA(S, lambda1, lambda2, reg, Omega_0, Theta_0=np.array([]), X_0=np.array([]), admm_params=None, ppdna_params=None, eps=1e-5 , eps_admm=1e-3, verbose=False, measure=False):
     """
     function for solving a MGL problem with PPDNA but using ADMM as a warm start (i.e. solve to low accuracy with ADMM first)
     """
@@ -319,9 +358,20 @@ def warmPPDNA(S, lambda1, lambda2, reg, Omega_0, Theta_0 = np.array([]), X_0 = n
         phase2 = True
     
     rho = 1.
-    sol1, info1 = ADMM_MGL(S, lambda1, lambda2, reg , Omega_0 , Theta_0, X_0, \
-                           tol = eps_admm, stopping_criterion = 'kkt', verbose = verbose, measure = measure,\
-                           rho = rho, update_rho = False)
+    sol1, info1 = ADMM_MGL(S,
+                           lambda1,
+                           lambda2,
+                           reg,
+                           Omega_0,
+                           Theta_0,
+                           X_0,
+                           tol=eps_admm,
+                           stopping_criterion='kkt',
+                           verbose=verbose,
+                           measure=measure,
+                           rho=rho,
+                           update_rho=False
+    )
     
     assert info1['status'] == 'optimal'
     
@@ -331,8 +381,18 @@ def warmPPDNA(S, lambda1, lambda2, reg, Omega_0, Theta_0 = np.array([]), X_0 = n
     X_0 = rho*sol1['X']
     
     if phase2:
-        sol2, info2 = PPDNA(S, lambda1, lambda2, reg, Omega_0 = Omega_0, Theta_0 = Theta_0, X_0 = X_0,\
-                            ppdna_params = ppdna_params,  eps_ppdna = eps_ppdna , verbose = verbose, measure = measure)
+        sol2, info2 = PPDNA(S,
+                            lambda1,
+                            lambda2,
+                            reg,
+                            Omega_0=Omega_0,
+                            Theta_0=Theta_0,
+                            X_0=X_0,
+                            ppdna_params=ppdna_params,
+                            eps_ppdna=eps_ppdna,
+                            verbose=verbose,
+                            measure=measure
+        )
     
         # append the infos
         if measure: