nf-core · PascalIversen · Dec 16, 2024 · Dec 16, 2024
diff --git a/bin/train_and_predict_cv.py b/bin/train_and_predict_cv.py
@@ -19,6 +19,7 @@ def get_parser():
     parser.add_argument("--hyperparameters", type=str, help="hyperparameters for the model")
     parser.add_argument("--cv_data", type=str, help="path to the cv data split")
     parser.add_argument("--response_transformation", type=str, help="response transformation to apply to the dataset")
+    parser.add_argument("--model_checkpoint_dir", type=str, default="TEMPORARY", help="model checkpoint directory, if not provided: temporary directory is used")
     return parser
 
 
@@ -45,6 +46,7 @@ def main():
         prediction_dataset=validation_dataset,
         early_stopping_dataset=es_dataset,
         response_transformation=response_transform,
+        model_checkpoint_dir=args.model_checkpoint_dir
     )
     with open(f"prediction_dataset_{model_name}_{str(args.cv_data).split('.pkl')[0]}_"
               f"{str(args.hyperparameters).split('.yaml')[0]}.pkl",

diff --git a/bin/train_and_predict_final.py b/bin/train_and_predict_final.py
@@ -42,6 +42,8 @@ def get_parser():
     )
     parser.add_argument("--robustness_trial", type=int, help="Robustness trial index.")
     parser.add_argument("--cross_study_datasets", nargs="+", help="Path to cross study datasets.")
+    parser.add_argument("--model_checkpoint_dir", type=str, default="TEMPORARY", help="model checkpoint directory, if not provided: temporary directory is used")
+
     return parser
 
 
@@ -82,7 +84,8 @@ def compute_randomization(
     split_id: str,
     randomization_type: str = "permutation",
     response_transformation=Optional[TransformerMixin],
-    randomization_test_path: str = ""
+    randomization_test_path: str = "",
+    model_checkpoint_dir: str = "TEMPORARY",
 ):
     randomization_test_file = os.path.join(
         randomization_test_path,
@@ -99,7 +102,8 @@ def compute_randomization(
         train_dataset=train_dataset,
         test_dataset=test_dataset,
         early_stopping_dataset=early_stopping_dataset,
-        response_transformation=response_transformation
+        response_transformation=response_transformation,
+        model_checkpoint_dir=model_checkpoint_dir
     )
 
 
@@ -113,7 +117,8 @@ def compute_robustness(
     split_id: str,
     trial: int,
     response_transformation=Optional[TransformerMixin],
-    rob_path: str = ""
+    rob_path: str = "",
+    model_checkpoint_dir: str = "TEMPORARY",
 ):
     robustness_test_file = os.path.join(
         rob_path,
@@ -129,6 +134,7 @@ def compute_robustness(
         hpam_set=hpam_set,
         path_data=path_data,
         response_transformation=response_transformation,
+        model_checkpoint_dir=model_checkpoint_dir
     )
 
 
@@ -197,12 +203,13 @@ def compute_cross(
             prediction_dataset=test_set,
             early_stopping_dataset=es_set,
             response_transformation=transformation,
+            model_checkpoint_dir=args.model_checkpoint_dir
         )
         prediction_dataset = os.path.join(
             predictions_path,
             f"predictions_{args.split_id}.csv",
         )
-        test_set.save(prediction_dataset)
+        test_set.to_csv(prediction_dataset)
         for ds in args.cross_study_datasets:
             if ds == "NONE.csv":
                 continue
@@ -238,6 +245,7 @@ def compute_cross(
             randomization_type=args.randomization_type,
             response_transformation=transformation,
             randomization_test_path=rand_path,
+            model_checkpoint_dir=args.model_checkpoint_dir
         )
     elif args.mode == "robustness":
         rob_path = generate_data_saving_path(
@@ -256,7 +264,8 @@ def compute_cross(
             split_id=args.split_id,
             trial=args.robustness_trial,
             response_transformation=transformation,
-            rob_path=rob_path
+            rob_path=rob_path,
+            model_checkpoint_dir=args.model_checkpoint_dir
         )
     else:
         raise ValueError(f"Invalid mode: {args.mode}. Choose full, randomization, or robustness.")

diff --git a/modules/local/randomization_test/main.nf b/modules/local/randomization_test/main.nf
@@ -9,6 +9,7 @@ process RANDOMIZATION_TEST {
     path(path_data)
     val(randomization_type)
     val(response_transformation)
+    val model_checkpoint_dir
 
     output:
     tuple val(test_mode), val(model_name), path('**randomization*.csv'),     emit: ch_vis
@@ -25,7 +26,8 @@ process RANDOMIZATION_TEST {
         --test_mode $test_mode \\
         --path_data $path_data \\
         --randomization_views_path $randomization_views \\
-        --randomization_type $randomization_type
+        --randomization_type $randomization_type \\
+        --model_checkpoint_dir $model_checkpoint_dir
     """
 
 }
diff --git a/modules/local/robustness_test/main.nf b/modules/local/robustness_test/main.nf
@@ -9,6 +9,7 @@ process ROBUSTNESS_TEST {
     path(path_data)
     val(randomization_type)
     val(response_transformation)
+    val model_checkpoint_dir
 
     output:
     tuple val(test_mode), val(model_name), path('**robustness*.csv'),     emit: ch_vis
@@ -24,7 +25,8 @@ process ROBUSTNESS_TEST {
         --response_transformation $response_transformation \\
         --test_mode $test_mode \\
         --path_data $path_data \\
-        --robustness_trial $robustness_iteration
+        --robustness_trial $robustness_iteration \\
+        --model_checkpoint_dir $model_checkpoint_dir
     """
 
 }
diff --git a/modules/local/train_and_predict_cv/main.nf b/modules/local/train_and_predict_cv/main.nf
@@ -7,6 +7,7 @@ process TRAIN_AND_PREDICT_CV {
     tuple val(model_name), val(test_mode), path(cv_data), path(hyperparameters)
     path path_data
     val response_transformation
+    val model_checkpoint_dir
 
     output:
     tuple val(model_name), val(test_mode), val(cv_data.baseName), path(hyperparameters), path("prediction_dataset_*.pkl"), emit: pred_data
@@ -19,6 +20,7 @@ process TRAIN_AND_PREDICT_CV {
         --test_mode $test_mode \\
         --hyperparameters $hyperparameters \\
         --cv_data $cv_data \\
-        --response_transformation $response_transformation
+        --response_transformation $response_transformation \\
+        --model_checkpoint_dir $model_checkpoint_dir
     """
 }
diff --git a/nextflow.config b/nextflow.config
@@ -27,6 +27,7 @@ params {
     response_transformation = 'None'
     path_data = './data'
     save_datasets = false
+    model_checkpoint_dir = 'TEMPORARY'
 
     // Boilerplate options
     outdir                       = 'results'

diff --git a/subworkflows/local/run_cv/main.nf b/subworkflows/local/run_cv/main.nf
@@ -12,7 +12,7 @@ workflow RUN_CV {
     models                          // model names for full testing
     baselines                        // model names for comparison
     path_data                      // path to data
-
+
     main:
     LOAD_RESPONSE(params.dataset_name, path_data, params.cross_study_datasets)
 
@@ -72,11 +72,8 @@ workflow RUN_CV {
     // [model_name, test_mode, split_X.pkl, hpam_X.yaml]
     ch_test_combis = ch_model_cv.combine(ch_hpam_combis, by: 0)
 
-    TRAIN_AND_PREDICT_CV (
-        ch_test_combis,
-        path_data,
-        params.response_transformation
-    )
+    TRAIN_AND_PREDICT_CV(ch_test_combis, path_data, params.response_transformation, params.model_checkpoint_dir)
+
     // [model_name, test_mode, split_id,
     // [hpam_0.yaml, hpam_1.yaml, ..., hpam_n.yaml],
     // [prediction_dataset_0.pkl, ..., prediction_dataset_n.pkl] ]