Skip to content

Commit

Permalink
pose_process function for the pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
LennyAharon committed Dec 16, 2024
1 parent 0810ed5 commit 578bd8e
Show file tree
Hide file tree
Showing 4 changed files with 344 additions and 15 deletions.
22 changes: 17 additions & 5 deletions configs/pipeline.yaml
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
#TODO
# ask Matt about adding the views variable to the config file so we can pick the views that we have and in that way instead of iterating over files we will iterate over views - which will determine things



# absolute path to lp yaml file
lightning_pose_config: /teamspace/studios/this_studio/lp3d-analysis/configs/config_mirror-mouse-separate.yaml
# lightning_pose_config: /teamspace/studios/this_studio/lp3d-analysis/configs/config_crim13.yaml

# [needed?] pipeline seed for initial data split
pipeline_seeds: 0

intermediate_results_dir: testing
intermediate_results_dir: results_500

# initial training of an ensemble of networks
train_networks:
Expand All @@ -18,8 +23,8 @@ train_networks:
# ensemble seeds
ensemble_seeds:
- 0
# - 1
# - 2
- 1
- 2
# - 3
# - 4
# number of ground truth labels for training
Expand All @@ -40,8 +45,8 @@ train_networks:
# milestone_steps: [2000, 3000, 4000]
# val_check_interval: 50
# train_check_interval: 10
min_steps: 50
max_steps: 50
min_steps: 500
max_steps: 500
milestone_steps: [2000, 3000, 4000]
val_check_interval: 5
train_check_interval: 5
Expand All @@ -54,6 +59,13 @@ post_processing:
eks_multiview:
run: False
overwrite: False
ensemble_mean:
run: True
overwrite: False
ensemble_median:
run: True
overwrite: False


# visualization options
visualization:
Expand Down
196 changes: 196 additions & 0 deletions lp3d_analysis/post_process.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
import os
import pandas as pd
import numpy as np

from omegaconf import DictConfig
from typing import List, Literal
from pathlib import Path

from lightning_pose.utils.scripts import (
compute_metrics,
)

#TODO
#1. in the variables for the function pose_process_ensemble, try to change the issue with the mode variable so we can add the eks mode and things like that
#2. Change the way we are using the cfp_lp.data.csv_file - it will make more sense in the function
#3 change variables names so it will make more sense



def post_process_ensemble(
    cfg_lp: DictConfig,
    results_dir: str,
    model_type: str,
    n_labels: int,
    seed_range: tuple[int, int],
    views: list[str],
    mode: Literal['ensemble_mean', 'ensemble_median'],
    overwrite: bool,
) -> None:
    """Aggregate per-seed predictions into an ensemble mean or median, per view.

    For each view, loads ``predictions_<view>_new.csv`` from every seed
    directory ``<model_type>_<n_labels>_<seed>``, stacks the (x, y, likelihood)
    columns, reduces them with nanmean/nanmedian across seeds, writes the
    result under ``<model_type>_<n_labels>_<s0>-<s1>/<mode>/``, and runs
    compute_metrics on each saved file.

    Args:
        cfg_lp: lightning-pose config; ``data.csv_file`` and
            ``data.view_names`` are mutated before computing metrics.
        results_dir: results directory of one model run; its parent is used
            as the base directory containing all seed directories.
        model_type: model identifier used in the seed directory names.
        n_labels: number of ground-truth labels used in training.
        seed_range: inclusive (first, last) ensemble seeds.
        views: camera view names to process.
        mode: which reduction to apply across the ensemble.
        overwrite: if False, skip views whose output file already exists.
    """
    base_dir = os.path.dirname(results_dir)
    ensemble_dir = os.path.join(
        base_dir,
        f"{model_type}_{n_labels}_{seed_range[0]}-{seed_range[1]}"
    )
    output_dir = os.path.join(ensemble_dir, mode)
    os.makedirs(output_dir, exist_ok=True)

    seed_dirs = [
        os.path.join(base_dir, f"{model_type}_{n_labels}_{seed}")
        for seed in range(seed_range[0], seed_range[1] + 1)
    ]

    new_predictions_files = []

    for view in views:
        preds_file = os.path.join(output_dir, f'predictions_{view}_new.csv')
        # Honor the overwrite flag (previously accepted but never used).
        if os.path.exists(preds_file) and not overwrite:
            print(f'Ensemble predictions file {preds_file} already exists. Skipping.')
            continue

        stacked_arrays = []
        column_structure = None
        row_index = None  # index of the first successfully-read seed file

        for seed_dir in seed_dirs:
            pred_file = os.path.join(
                seed_dir,
                'videos-for-each-labeled-frame',
                f'predictions_{view}_new.csv'
            )
            if not os.path.exists(pred_file):
                print(f"Warning: Could not find predictions file: {pred_file}")
                continue

            df = pd.read_csv(pred_file, header=[0, 1, 2], index_col=0)
            if column_structure is None:
                # Keep only the numeric keypoint columns (x, y, likelihood);
                # all seeds are assumed to share this column layout.
                column_structure = df.loc[
                    :, df.columns.get_level_values(2).isin(['x', 'y', 'likelihood'])
                ].columns
                row_index = df.index

            stacked_arrays.append(df.loc[:, column_structure].to_numpy())

        if not stacked_arrays or column_structure is None:
            print(f"Could not find predictions for view: {view}")
            continue

        # Shape: (n_frames, n_numeric_cols, n_seeds)
        stacked_arrays = np.stack(stacked_arrays, axis=-1)

        if mode == 'ensemble_mean':
            aggregated_array = np.nanmean(stacked_arrays, axis=-1)
        elif mode == 'ensemble_median':
            aggregated_array = np.nanmedian(stacked_arrays, axis=-1)
        else:
            print(f"Invalid mode: {mode}")
            continue

        result_df = pd.DataFrame(
            data=aggregated_array,
            index=row_index,
            columns=column_structure
        )
        # "set" column marks these rows as labeled-data (train) predictions
        # so downstream metric code interprets the file correctly.
        result_df.loc[:, ("set", "", "")] = "train"

        result_df.to_csv(preds_file)
        new_predictions_files.append(preds_file)
        print(f"Saved ensemble {mode} predictions for {view} view to {preds_file}")

        # TODO(review): hard-coded csv filenames; presumably should be derived
        # from `views` (CollectedData_<view>_new.csv) — confirm and generalize.
        cfg_lp.data.csv_file = ['CollectedData_top_new.csv', 'CollectedData_bot_new.csv']
        cfg_lp.data.view_names = [view]

        try:
            compute_metrics(cfg=cfg_lp, preds_file=preds_file, data_module=None)
            # Fixed: previously logged `pred_file` (last seed's input),
            # not the ensemble output actually passed to compute_metrics.
            print(f"Successfully computed metrics for {preds_file}")
        except Exception as e:
            print(f"Error computing metrics\n{e}")












# cfg_lp.data.csv_file = predictions_new_ensembles
# if predictions_new_ensembles and not overwrite:
# for preds_file in predictions_new_ensembles:
# try:
# compute_metrics(cfg=cfg_lp, preds_file=preds_file, data_module=None)
# print(f"Succesfully computed metrics for {preds_file}")
# except Exception as e:
# print(f"Error computing metrics for {preds_file}\n{e}")
# else:
# print("No new predictions to compute metrics on")




# cfg_lp.data.csv_file = preds_file
# print(cfg_lp.data.csv_file)
# cfg_lp.data.view_names = [view]

# try:
# compute_metrics(cfg=cfg_lp, preds_file= preds_file , data_module=None)
# print(f"Succesfully computed metrics for {preds_file}")
# except Exception as e:
# print(f"Error computing metrics\n{e}")






# all_predictions.append(df)
# else:
# print(f"Warning: Could not find predictions file: {pred_file}")

# if not all_predictions:
# print(f"Could not find predictions for view: {view}")
# continue

# combined_df = pd.concat(all_predictions, axis =0)
# print(combined_df.head())



# # group by any identifying columns ( assuming frame/ timestamps columns exists)
# # modify these groupby columns based on csv structure
# group_cols = [col for col in combined_df]






# pp_dir = os.path.join(
# # outputs_dir,
# # 'post-processors',
# # f"{cfg['pseudo_labeler']}_rng={cfg['ensemble_seeds'][0]}-{cfg['ensemble_seeds'][-1]}"
# # )




# Here we will start looping over the post-processes
# want to check if we want to run the particular post-process and have a couple of if statements
# combine predictions from multiple models in the ensemble; if we want ensemble_mean run this function and if we want eks run this
# make a new py file called pose processing and basically want to load predictions from different models, want to take the mean / median of all the x and y and also of likelihood - that will be the ensemble mean and median
# that will all be saved as a data frame in csv file inside the supervised_100_0-1 directory and make another directory for each post processor - ensemble_mean, ensemble_median
# once have that data frame I can run compute metrics from the new set of predictions and it will do the pixel_error







# after looping through all the seeds we want to run through the post-processes
# for this I need to implement ensemble mean and median
# take the predictions files in the videos-for-each-labeled-frame and load the csv files from each seed and each view
# I want the prediction files from supervised-100-0 and supervised 100-1
#. I will have to make a new directory supervised_100_0 and supervised_100_1 and the directory for the ensemble will be supervised_100_0-1 (if had more it is 0-5 for example)
83 changes: 81 additions & 2 deletions lp3d_analysis/utils.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,93 @@
import os
import numpy as np
import pandas as pd

from lightning_pose.utils.scripts import (
compute_metrics,
)

from omegaconf import DictConfig


#TODO
# 1. need to make a decision about the naming of the output file and how to access the correct model name because it is not part of the results_dir
# 2. change names of variables so will look better in the code and make more sense
# 3. should I use cfg_lp_copy?

def extract_ood_frame_predictions(
    cfg_lp: DictConfig,
    data_dir: str,
    results_dir: str,
    overwrite: bool,
    video_dir: str,
) -> None:
    """Collect per-frame OOD predictions from video-snippet csv files.

    OOD labels live in ``*_new.csv`` files in ``data_dir``. For each labeled
    image in such a file, the corresponding snippet csv (same relative path,
    ``.csv`` extension) under ``results_dir/video_dir`` is loaded, its center
    frame extracted, and all center frames are saved as
    ``predictions_<name>.csv``; metrics are then computed on that file.

    Args:
        cfg_lp: lightning-pose config; ``data.csv_file`` is mutated before
            computing metrics.
        data_dir: directory containing the ``*_new.csv`` OOD label files.
        results_dir: model results directory.
        overwrite: if False, skip files whose predictions already exist.
        video_dir: subdirectory of results_dir holding the snippet csvs
            (e.g. 'videos-for-each-labeled-frame').
    """
    new_csv_files = [f for f in os.listdir(data_dir) if f.endswith('_new.csv')]
    print(f"the new csv files are {new_csv_files}")

    for csv_file in new_csv_files:
        # e.g. 'CollectedData_top_new.csv' -> 'predictions_top_new.csv'
        prediction_name = '_'.join(csv_file.split('_')[1:])
        preds_file = os.path.join(results_dir, video_dir, f'predictions_{prediction_name}')

        if os.path.exists(preds_file) and not overwrite:
            print(f'Predictions file {preds_file} already exists. Skipping.')
            continue

        results_list = []
        file_path = os.path.join(data_dir, csv_file)
        df = pd.read_csv(file_path, header=[0, 1, 2], index_col=0)

        for img_path in df.index:
            # 'labeled-data/<vid>/img<#>.png' -> '<vid>/img<#>.csv'
            relative_img_path = '/'.join(img_path.split('/')[1:])
            snippet_path = relative_img_path.replace('png', 'mp4')

            snippet_file = os.path.join(
                results_dir, video_dir, snippet_path.replace('mp4', 'csv'))
            if not os.path.exists(snippet_file):
                continue
            snippet_df = pd.read_csv(snippet_file, header=[0, 1, 2], index_col=0)

            # Center frame only makes sense for an odd frame count.
            # Fixed: original wrote `shape[0] & 2 != 0`, which parses as
            # `shape[0] & (2 != 0)` and was only accidentally an odd-check.
            assert snippet_df.shape[0] % 2 != 0  # ensure odd number of frames
            idx_frame = snippet_df.shape[0] // 2

            # Re-index the center frame by its original image path.
            result = snippet_df[snippet_df.index == idx_frame].rename(
                index={idx_frame: img_path})
            results_list.append(result)

        if results_list:
            results_df = pd.concat(results_list)
            results_df.sort_index(inplace=True)

            # "set" column marks this df as labeled-data (train) predictions.
            results_df.loc[:, ("set", "", "")] = "train"

            results_df.to_csv(preds_file)
            print(f'Saved predictions to {preds_file}')

            cfg_lp.data.csv_file = new_csv_files
            try:
                compute_metrics(cfg=cfg_lp, preds_file=preds_file, data_module=None)
                print(f"Successfully computed metrics for {preds_file}")
            except Exception as e:
                print(f"Error computing metrics\n{e}")


# look for all files that end in _new.csv -> these are OOD labels
# loop through these
#load the csv file and iterate through the rows/index
# loop through these
# for each, load the csv file, and iterate through the rows/index
# 'labeled-data/<vid_name>/img<#>.png'
# 'labeled-data/<vid_name>/img<#>.png'
# s = 'labeled-data/vid_name/img0000.png'
# s2 = '/'.join(s.split('/')[1:])
# s3 = s2.replace('png', 'mp4')
Expand All @@ -20,3 +96,6 @@ def extract_ood_frame_predictions(
# put in dataframe
# save out predictions_<cam_name>.csv
# compute pixel


# Next step - do one of the post processes and the ensemble mean
Loading

0 comments on commit 578bd8e

Please sign in to comment.