From 08c43cefaa950742a02346d0583451e86474d27a Mon Sep 17 00:00:00 2001
From: lincoln-harris <ljharris018@gmail.com>
Date: Thu, 29 Aug 2024 15:45:46 -0700
Subject: [PATCH] converted to ensemble model

---
 lupine/lupine.py      | 168 +++++++++++++++++++++++++++---------------
 lupine/lupine_base.py |  19 +++--
 2 files changed, 119 insertions(+), 68 deletions(-)

diff --git a/lupine/lupine.py b/lupine/lupine.py
index b019251..fd8b658 100644
--- a/lupine/lupine.py
+++ b/lupine/lupine.py
@@ -5,9 +5,13 @@
 This modules contains the `Lupine` class and the implementation of
 the `impute` command. `Lupine` is the high-level implementation for
 a PyTorch model for imputing protein-level quantifications using
-deep matrix factorization. Missing values are imputed by taking the
+a multilayer perceptron. Missing values are imputed by taking the
 concatenation of the corresponding protein and run factors and 
 feeding them through a deep neural network.
+
+This module implements the method's `impute` command, which fits an
+ensemble of Lupine models to the provided matrix and writes a single 
+consensus imputed quants matrix as output. 
 """
 from lupine.lupine_base import LupineBase
 import torch
@@ -15,6 +19,7 @@
 import pandas as pd
 import numpy as np
 import torch
+import shutil
 
 from lupine.os_utils import os
 from pathlib import Path
@@ -113,66 +118,113 @@ def forward(self, locs):
 @click.command()
 @click.argument("csv", required=True, nargs=1)
 
-@click.option("--n_prot_factors", default=128, 
-    help="Number of protein factors", required=False, type=int)
-@click.option("--n_run_factors", default=128, 
-    help="Number of run factors", required=False, type=int)
-@click.option("--n_layers", default=2, 
-    help="Number of hidden layers", required=False, type=int)
-@click.option("--n_nodes", default=1024, 
-    help="Number of nodes per layer", required=False, type=int)
-@click.option("--rand_seed", default=None, help="Random seed",
-    required=False, type=int)
+@click.option("--outpath", required=True, nargs=1, type=str,
+	help="Output directory")
+@click.option("--n_models", default=10, type=click.IntRange(min=1),
+	help="The number of models to fit.", required=False)
 @click.option("--biased", default=True, 
-    help="Biased batch selection?", required=False, type=bool)
+	help="Biased batch selection?", required=False, type=bool)
 @click.option("--device", default="cpu", 
-    help="The device to load model on", required=False, type=str)
+	help="The device to load model on", required=False, type=str)
 @click.option("--mode", default="run", 
-    help="The model run mode.", required=False, type=str)
+	help="The model run mode.", required=False, type=str)
 
 def impute(
-        csv, 
-        n_prot_factors, 
-        n_run_factors, 
-        n_layers, 
-        n_nodes, 
-        rand_seed, 
-        biased, 
-        device,
-        mode, 
+		csv, 
+		outpath,
+		n_models,
+		biased, 
+		device,
+		mode, 
 ):
-    """Impute missing values in a protein or peptide quantifications matrix."""
-
-    # Read in the csv
-    mat_pd = pd.read_csv(csv, index_col=0)
-    rows = list(mat_pd.index)
-    cols = list(mat_pd.columns)
-    mat = np.array(mat_pd)
-
-    test_bool = False
-    if mode == "Testing":
-    	test_bool = True
-
-    Path("results/").mkdir(parents=True, exist_ok=True)
-
-    # Init the model 
-    model = Lupine(  
-                n_prots=mat.shape[0],
-                n_runs=mat.shape[1], 
-                n_prot_factors=n_prot_factors,
-                n_run_factors=n_run_factors,
-                n_layers=n_layers,
-                n_nodes=n_nodes,
-                rand_seed=rand_seed,
-                testing=test_bool,
-                biased=biased,
-                device=device
-    )
-    # Fit the model 
-    print("fitting model")
-    model_recon = model.fit_transform(mat)
-
-    print("done!")
-    model_recon_pd = \
-    	pd.DataFrame(model_recon, index=rows, columns=cols)
-    pd.DataFrame(model_recon_pd, "results/lupine_recon_quants.csv")
+	"""
+	Impute missing values in a protein or peptide quantifications
+	matrix.
+	"""
+
+	# Read in the csv
+	mat_pd = pd.read_csv(csv, index_col=0)
+	rows = list(mat_pd.index)
+	cols = list(mat_pd.columns)
+	mat = np.array(mat_pd)
+
+	# `testing` is forwarded to every ensemble member
+	test_bool = mode == "Testing"
+
+	# Define the full hyperparameter search spaces. The generator
+	# is seeded so that the sampled ensemble is reproducible.
+	gen = np.random.default_rng(seed=18)
+	n_layers_hparam_space=[1, 2]
+	n_factors_hparam_space=[32, 64, 128, 256]
+	n_nodes_hparam_space=[256, 512, 1024, 2048]
+
+	print(" ")
+	print("----------------------------------")
+	print("--------   L U P I N E   ---------")
+	print("----------------------------------")
+	print(" ")
+	print(f"Fitting ensemble of models on: {device}\n")
+
+	# Build every path from one base so that `outpath` works
+	# with or without a trailing separator.
+	tmp_dir = Path(outpath) / "tmp"
+	tmp_dir.mkdir(parents=True, exist_ok=True)
+
+	# The driver loop for ensemble model
+	tmp_paths = []
+	for n_iter in range(0, n_models): 
+		print(f"Fitting model {n_iter+1} of {n_models}")
+
+		# Randomly select the hparams
+		n_layers_curr = gen.choice(n_layers_hparam_space)
+		prot_factors_curr = gen.choice(n_factors_hparam_space)
+		run_factors_curr = gen.choice(n_factors_hparam_space)
+		n_nodes_curr = gen.choice(n_nodes_hparam_space)
+
+		curr_seed = int(gen.integers(low=1, high=10_000))
+
+		# Init an individual model 
+		model = Lupine(  
+					n_prots=mat.shape[0],
+					n_runs=mat.shape[1], 
+					n_prot_factors=prot_factors_curr,
+					n_run_factors=run_factors_curr,
+					n_layers=n_layers_curr,
+					n_nodes=n_nodes_curr,
+					rand_seed=curr_seed,
+					testing=test_bool,
+					biased=biased,
+					device=device
+		)
+
+		# Fit the individual model 
+		model_recon = model.fit_transform(mat)
+		model_recon_pd = \
+			pd.DataFrame(model_recon, index=rows, columns=cols)
+
+		# Write. The model index keeps names unique; the hparams
+		#   in the filename may be helpful for debugging. 
+		outpath_curr = tmp_dir / (
+			f"qmat_tmp_{n_iter}_{n_layers_curr}layers_"
+			f"{prot_factors_curr}protFactors_"
+			f"{run_factors_curr}runFactors_"
+			f"{n_nodes_curr}nodes_{curr_seed}seed.csv"
+		)
+		model_recon_pd.to_csv(outpath_curr)
+		tmp_paths.append(outpath_curr)
+
+	# Do the model ensembling: average the per-model matrices
+	# that were actually written above.
+	qmats = [pd.read_csv(p, index_col=0).to_numpy() for p in tmp_paths]
+	qmats_mean = np.mean(qmats, axis=0)
+
+	outpath_ensemble = Path(outpath) / "lupine_recon_quants.csv"
+	pd.DataFrame(qmats_mean, index=rows, columns=cols).to_csv(
+		outpath_ensemble)
+	shutil.rmtree(tmp_dir)
+
+	print(" ")
+	print("Done!")
+	print("----------------------------------")
+	print("----------------------------------")
+	print(" ")
diff --git a/lupine/lupine_base.py b/lupine/lupine_base.py
index 906ffdd..272f34e 100644
--- a/lupine/lupine_base.py
+++ b/lupine/lupine_base.py
@@ -51,7 +51,8 @@ class LupineBase(torch.nn.Module):
 		The tolerance criteria for early stopping, according to the
 		standard early stopping criteria
 	max_epochs : int, optional,
-		The maximum number of training epochs for the model
+		The maximum number of training epochs for the model. 
+		Default 1. 
 	patience : int, optional
 		The number of training epochs to wait before stopping if
 		it seems like the model has converged
@@ -76,7 +77,7 @@ def __init__(
 		learning_rate=0.01,
 		batch_size=128,
 		tolerance=0.001,
-		max_epochs=128,
+		max_epochs=1,
 		patience=10,
 		rand_seed=None,
 		testing=False,
@@ -104,13 +105,11 @@ def __init__(
 			torch.manual_seed(self.rand_seed)
 
 		# For writing the model state to disk
-		self.MODELPATH = "results/OPT_MODEL_INTERNAL.pt"
-
-		# Is there a better way to do this? 
-		try:
-			os.remove(self.MODELPATH)
-		except FileNotFoundError:
-			pass
+		#self.MODELPATH = "results/OPT_MODEL_INTERNAL.pt"
+		# try:
+		# 	os.remove(self.MODELPATH)
+		# except FileNotFoundError:
+		# 	pass
 
 		# Init the run factors
 		self.run_factors = torch.nn.Parameter(
@@ -291,7 +290,7 @@ def fit(self, X_mat, X_val_mat=None):
 				curr_loss = train_loss
 
 			if curr_loss < best_loss:
-				torch.save(self, self.MODELPATH)
+				#torch.save(self, self.MODELPATH)
 				best_loss = curr_loss
 
 			# Evaluate early stopping: