diff --git a/benchmarks/cloudmask/target/greene_v0.5/cloudmask_v0.5.in.slurm b/benchmarks/cloudmask/target/greene_v0.5/cloudmask_v0.5.in.slurm index 1ed64b7d..1a7ab1bc 100644 --- a/benchmarks/cloudmask/target/greene_v0.5/cloudmask_v0.5.in.slurm +++ b/benchmarks/cloudmask/target/greene_v0.5/cloudmask_v0.5.in.slurm @@ -84,7 +84,7 @@ echo "USER_SCRATCH: $USER_SCRATCH" echo "PROJECT_DIR: $PROJECT_DIR" echo "PYTHON_DIR: $PYTHON_DIR" echo "PROJECT_DATA: $PROJECT_DATA" -echo "CONTAINERDIR: $CONTAINERDIR" +echo "TARGET: $TARGET" # mkdir -p $OUTPUTS_DIR @@ -144,7 +144,7 @@ echo "# check filesystem" echo "# ===================================" pwd ls -singularity exec --nv $CONTAINERDIR/cloudmask.sif bash -c "python -c \"import os; os.system('ls')\"" +singularity exec --nv ${TARGET}/cloudmask.sif bash -c "python -c \"import os; os.system('ls')\"" PROGRESS "running" 10 @@ -159,7 +159,7 @@ echo "# ===================================" echo "# start gpu log" echo "# ===================================" -cms gpu watch --gpu=0 --delay=0.5 --dense > ${TARGET}/project/{ee.identifier}/gpu0.log & +cms gpu watch --gpu=0 --delay=0.5 --dense > ${OUTPUTS_DIR}/gpu0.log & PROGRESS "running" 21 @@ -167,7 +167,7 @@ echo "# ===================================" echo "# start cloudmask" echo "# ===================================" -singularity exec --nv ${CONTAINERDIR}/cloudmask.sif bash -c "cd ${TARGET}/project/{ee.identifier} ; python cloudmask_v0.5.py --config=config.yaml" +singularity exec --nv ${TARGET}/cloudmask.sif bash -c "cd ${OUTPUTS_DIR} ; python cloudmask_v0.5.py --config=config.yaml" PROGRESS "running" 99 diff --git a/benchmarks/cloudmask/target/greene_v0.5/cloudmask_v0.5.py b/benchmarks/cloudmask/target/greene_v0.5/cloudmask_v0.5.py index 694dd55e..e2df98c4 100755 --- a/benchmarks/cloudmask/target/greene_v0.5/cloudmask_v0.5.py +++ b/benchmarks/cloudmask/target/greene_v0.5/cloudmask_v0.5.py @@ -110,7 +110,7 @@ def cloud_inference(config) -> None: N_CHANNELS = config['image.N_CHANNELS'] # Load model - # modelPath = os.path.expanduser(config['model_file']) + # modelPath = os.path.expanduser(config['output.model_file']) model = tf.keras.models.load_model(modelPath) # Read inference files @@ -150,7 +150,7 @@ def cloud_inference(config) -> None: mask = reconstruct_from_patches(config, mask_patches, nx, ny, patch_size=PATCH_SIZE - CROP_SIZE) # Save reconstructed image (mask) - output_dir = os.path.expanduser(config['output_dir']) + output_dir = os.path.expanduser(config['output.directory']) mask_name = f"{output_dir}/{file_name.name}.h5" with h5py.File(mask_name, 'w') as handle: handle.create_dataset('mask', data=mask) @@ -308,7 +308,7 @@ def string_to_boolean(input_string): # else: # mode: original # modelPath = os.path.expanduser(config['model_file']) - modelPath = os.path.expanduser(config['model_file']) + modelPath = os.path.expanduser(config['output.model_file']) tf.keras.models.save_model(model, modelPath) banner('END slstr_cloud in training mode.') @@ -364,11 +364,11 @@ def main(): print(config) # setup - log_file = os.path.expanduser(config['log_file']) + log_file = os.path.expanduser(config['output.log_file']) user_name = config["submission.submitter"] # MLCommons logging - mlperf_logfile = os.path.expanduser(config['mlperf_logfile']) + mlperf_logfile = os.path.expanduser(config['output.mlperf_logfile']) mllog.config(filename=mlperf_logfile) print("user", user_name) diff --git a/benchmarks/cloudmask/target/greene_v0.5/config-rivanna.in.yaml b/benchmarks/cloudmask/target/greene_v0.5/config-rivanna.in.yaml index 83aef95f..e7e1ff74 100644 --- a/benchmarks/cloudmask/target/greene_v0.5/config-rivanna.in.yaml +++ b/benchmarks/cloudmask/target/greene_v0.5/config-rivanna.in.yaml @@ -101,13 +101,22 @@ training: # Inference data # inference_dir: "/scratch/{os.USER}/data/cloudmask/data/ssts" # Output directory -output_dir: "{os.TARGET}/outputs" +output: + directory: "./outputs" + # Model file + model_file: "./outputs/cloudModel.h5" + # Log file for recording runtimes + log_file: "./outputs/cloudmask_final.log" + # Log file for MLCommons logging + mlperf_logfile: "./outputs/mlperf_cloudmask_final.log" + +output_dir: "./outputs" # Model file -model_file: "{output_dir}/cloudModel-{identifier}.h5" +model_file: "./outputs/cloudModel.h5" # Log file for recording runtimes -log_file: "{output_dir}/cloudmask_final_{identifier}.log" +log_file: "./outputs/cloudmask_final.log" # Log file for MLCommons logging -mlperf_logfile: "{output_dir}/mlperf_cloudmask_final_{identifier}.log" +mlperf_logfile: "./outputs/mlperf_cloudmask_final.log" data: diff --git a/benchmarks/cloudmask/target/greene_v0.5/config.in.yaml b/benchmarks/cloudmask/target/greene_v0.5/config.in.yaml index cb0d42cb..e7e1ff74 100644 --- a/benchmarks/cloudmask/target/greene_v0.5/config.in.yaml +++ b/benchmarks/cloudmask/target/greene_v0.5/config.in.yaml @@ -60,13 +60,14 @@ experiment: # card_name: a100-dgx # card_name: "a100,v100,p100,k80,rtx2080,rtx3090" #card_name: "a100" - directive: "a100-dgx" + # directive: "a100-dgx,a100,v100,p100,k80,rtx2080,rtx3090" + directive: "a100-dgx,v100" gpu_count: "1" cpu_num: 1 mem: "64GB" repeat: "1" # epoch: "2,10,30,50,70,100,200" - epoch: "1" + epoch: "1,2" seed: 1234 learning_rate: 0.001 batch_size: 32 @@ -78,7 +79,7 @@ experiment: early_stoppage_patience: "25" early_stoppage: "False" -identifier: "{experiment.card_name}-{experiment.early_stoppage}-{experiment.early_stoppage_patience}-{experiment.epoch}-{experiment.learning_rate}-{experiment.repeat}" +identifier: "{experiment.directive}_{experiment.early_stoppage}_{experiment.early_stoppage_patience}_{experiment.epoch}_{experiment.learning_rate}_{experiment.repeat}" system: host: "rivanna" @@ -100,13 +101,22 @@ training: # Inference data # inference_dir: "/scratch/{os.USER}/data/cloudmask/data/ssts" # Output directory -output_dir: "{os.TARGET}/outputs" +output: + directory: "./outputs" + # Model file + model_file: "./outputs/cloudModel.h5" + # Log file for recording runtimes + log_file: "./outputs/cloudmask_final.log" + # Log file for MLCommons logging + mlperf_logfile: "./outputs/mlperf_cloudmask_final.log" + +output_dir: "./outputs" # Model file -model_file: "{output_dir}/cloudModel-{identifier}.h5" +model_file: "./outputs/cloudModel.h5" # Log file for recording runtimes -log_file: "{output_dir}/cloudmask_final_{identifier}.log" +log_file: "./outputs/cloudmask_final.log" # Log file for MLCommons logging -mlperf_logfile: "{output_dir}/mlperf_cloudmask_final_{identifier}.log" +mlperf_logfile: "./outputs/mlperf_cloudmask_final.log" data: diff --git a/benchmarks/cloudmask/target/greene_v0.5/deprecated/slstr_cloud.py b/benchmarks/cloudmask/target/greene_v0.5/deprecated/slstr_cloud.py index 0ad885b8..2a14cbd9 100755 --- a/benchmarks/cloudmask/target/greene_v0.5/deprecated/slstr_cloud.py +++ b/benchmarks/cloudmask/target/greene_v0.5/deprecated/slstr_cloud.py @@ -107,7 +107,7 @@ def cloud_inference(config) -> None: N_CHANNELS = config['image.N_CHANNELS'] # Load model - # modelPath = os.path.expanduser(config['model_file']) + # modelPath = os.path.expanduser(config['output.model_file']) model = tf.keras.models.load_model(modelPath) # Read inference files @@ -147,7 +147,7 @@ def cloud_inference(config) -> None: mask = reconstruct_from_patches(config, mask_patches, nx, ny, patch_size=PATCH_SIZE - CROP_SIZE) # Save reconstructed image (mask) - output_dir = os.path.expanduser(config['output_dir']) + output_dir = os.path.expanduser(config['output.directory']) mask_name = f"{output_dir}/{file_name.name}.h5" with h5py.File(mask_name, 'w') as handle: handle.create_dataset('mask', data=mask)