Commit fbbfd29

improve config

laszewsk committed Oct 5, 2023
1 parent 51efc22 commit fbbfd29
Showing 5 changed files with 41 additions and 22 deletions.
@@ -84,7 +84,7 @@ echo "USER_SCRATCH: $USER_SCRATCH"
echo "PROJECT_DIR: $PROJECT_DIR"
echo "PYTHON_DIR: $PYTHON_DIR"
echo "PROJECT_DATA: $PROJECT_DATA"
echo "CONTAINERDIR: $CONTAINERDIR"
echo "TARGET: $TARGET"


# mkdir -p $OUTPUTS_DIR
@@ -144,7 +144,7 @@ echo "# check filesystem"
echo "# ==================================="
pwd
ls
-singularity exec --nv $CONTAINERDIR/cloudmask.sif bash -c "python -c \"import os; os.system('ls')\""
+singularity exec --nv ${TARGET}/cloudmask.sif bash -c "python -c \"import os; os.system('ls')\""

PROGRESS "running" 10

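The filesystem check now resolves the container image under ${TARGET} instead of ${CONTAINERDIR}. Below is a minimal sketch of the same check driven from Python, assuming a TARGET environment variable and the cloudmask.sif image name used above; the wrapper function itself is hypothetical and not part of this commit.

import os
import subprocess

def check_filesystem(target=None):
    """Run `ls` inside the container, mirroring the singularity line above."""
    target = target or os.environ.get("TARGET", ".")
    image = os.path.join(target, "cloudmask.sif")
    # Equivalent of: singularity exec --nv ${TARGET}/cloudmask.sif bash -c "python -c \"import os; os.system('ls')\""
    subprocess.run(
        ["singularity", "exec", "--nv", image,
         "bash", "-c", "python -c \"import os; os.system('ls')\""],
        check=True,
    )

if __name__ == "__main__":
    check_filesystem()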
@@ -159,15 +159,15 @@ echo "# ==================================="
echo "# start gpu log"
echo "# ==================================="

-cms gpu watch --gpu=0 --delay=0.5 --dense > ${TARGET}/project/{ee.identifier}/gpu0.log &
+cms gpu watch --gpu=0 --delay=0.5 --dense > ${OUTPUTS_DIR}/gpu0.log &

PROGRESS "running" 21

echo "# ==================================="
echo "# start cloudmask"
echo "# ==================================="

-singularity exec --nv ${CONTAINERDIR}/cloudmask.sif bash -c "cd ${TARGET}/project/{ee.identifier} ; python cloudmask_v0.5.py --config=config.yaml"
+singularity exec --nv ${TARGET}/cloudmask.sif bash -c "cd ${OUTPUTS_DIR} ; python cloudmask_v0.5.py --config=config.yaml"

PROGRESS "running" 99

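Both the GPU utilization log and the benchmark run now land in ${OUTPUTS_DIR} rather than under ${TARGET}/project/{ee.identifier}. The sketch below orchestrates the same two steps from Python, assuming OUTPUTS_DIR and TARGET are exported by the batch script; the subprocess wrapper is illustrative only, while the command lines are taken verbatim from the script.

import os
import subprocess

outputs_dir = os.environ["OUTPUTS_DIR"]   # assumed to be exported by the batch script
target = os.environ["TARGET"]

# Background GPU log: cms gpu watch --gpu=0 --delay=0.5 --dense > ${OUTPUTS_DIR}/gpu0.log &
gpu_log = open(os.path.join(outputs_dir, "gpu0.log"), "w")
watcher = subprocess.Popen(
    ["cms", "gpu", "watch", "--gpu=0", "--delay=0.5", "--dense"],
    stdout=gpu_log,
)

# Foreground benchmark, matching the singularity exec line above.
subprocess.run(
    ["singularity", "exec", "--nv", os.path.join(target, "cloudmask.sif"),
     "bash", "-c", f"cd {outputs_dir} ; python cloudmask_v0.5.py --config=config.yaml"],
    check=True,
)

watcher.terminate()
gpu_log.close()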
10 changes: 5 additions & 5 deletions benchmarks/cloudmask/target/greene_v0.5/cloudmask_v0.5.py
@@ -110,7 +110,7 @@ def cloud_inference(config) -> None:
N_CHANNELS = config['image.N_CHANNELS']

# Load model
-# modelPath = os.path.expanduser(config['model_file'])
+# modelPath = os.path.expanduser(config['output.model_file'])
model = tf.keras.models.load_model(modelPath)

# Read inference files
@@ -150,7 +150,7 @@ def cloud_inference(config) -> None:
mask = reconstruct_from_patches(config, mask_patches, nx, ny, patch_size=PATCH_SIZE - CROP_SIZE)

# Save reconstructed image (mask)
-    output_dir = os.path.expanduser(config['output_dir'])
+    output_dir = os.path.expanduser(config['output.directory'])
mask_name = f"{output_dir}/{file_name.name}.h5"
with h5py.File(mask_name, 'w') as handle:
handle.create_dataset('mask', data=mask)
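The renamed keys are read with dotted paths (config['output.directory'], config['output.model_file']), which implies the loader flattens the nested output: block added in the YAML files below. Here is a minimal sketch of such a dotted lookup, assuming plain PyYAML; the DottedConfig helper is hypothetical and stands in for whatever config class the benchmark actually uses.

import yaml

class DottedConfig:
    """Resolve keys like 'output.directory' against a nested mapping."""
    def __init__(self, data):
        self.data = data

    def __getitem__(self, key):
        node = self.data
        for part in key.split("."):
            node = node[part]
        return node

config = DottedConfig(yaml.safe_load("""
output:
  directory: "./outputs"
  model_file: "./outputs/cloudModel.h5"
"""))

print(config["output.directory"])   # ./outputs
print(config["output.model_file"])  # ./outputs/cloudModel.h5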
@@ -308,7 +308,7 @@ def string_to_boolean(input_string):
# else: # mode: original
# modelPath = os.path.expanduser(config['model_file'])

-    modelPath = os.path.expanduser(config['model_file'])
+    modelPath = os.path.expanduser(config['output.model_file'])

tf.keras.models.save_model(model, modelPath)
banner('END slstr_cloud in training mode.')
@@ -364,11 +364,11 @@ def main():
print(config)

# setup
-    log_file = os.path.expanduser(config['log_file'])
+    log_file = os.path.expanduser(config['output.log_file'])
user_name = config["submission.submitter"]

# MLCommons logging
-    mlperf_logfile = os.path.expanduser(config['mlperf_logfile'])
+    mlperf_logfile = os.path.expanduser(config['output.mlperf_logfile'])
mllog.config(filename=mlperf_logfile)

print("user", user_name)
17 changes: 13 additions & 4 deletions benchmarks/cloudmask/target/greene_v0.5/config-rivanna.in.yaml
@@ -101,13 +101,22 @@ training:
# Inference data
# inference_dir: "/scratch/{os.USER}/data/cloudmask/data/ssts"
# Output directory
output_dir: "{os.TARGET}/outputs"
output:
directory: "./outputs"
# Model file
model_file: "./outputs/cloudModel.h5"
# Log file for recording runtimes
log_file: "./outputs/cloudmask_final.log"
# Log file for MLCommons logging
mlperf_logfile: "./outputs/mlperf_cloudmask_final.log"

output_dir: "./outputs"
# Model file
model_file: "{output_dir}/cloudModel-{identifier}.h5"
model_file: "./outputs/cloudModel.h5"
# Log file for recording runtimes
log_file: "{output_dir}/cloudmask_final_{identifier}.log"
log_file: "./outputs/cloudmask_final.log"
# Log file for MLCommons logging
mlperf_logfile: "{output_dir}/mlperf_cloudmask_final_{identifier}.log"
mlperf_logfile: "./outputs/mlperf_cloudmask_final.log"


data:
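The flat output_dir, model_file, log_file, and mlperf_logfile keys are now mirrored by a nested output: block whose paths are all relative ./outputs locations. Below is a small sketch that loads such a block and checks it is complete before a run; the file name config.yaml (the expanded configuration the run is started with) and the validation itself are assumptions, not code from the repository.

import os
import yaml

EXPECTED = ("directory", "model_file", "log_file", "mlperf_logfile")

with open("config.yaml") as stream:
    cfg = yaml.safe_load(stream)

output = cfg["output"]
missing = [key for key in EXPECTED if key not in output]
if missing:
    raise KeyError(f"output block is missing keys: {missing}")

# Create the output directory before any model, mask, or log file is written.
os.makedirs(os.path.expanduser(output["directory"]), exist_ok=True)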
24 changes: 17 additions & 7 deletions benchmarks/cloudmask/target/greene_v0.5/config.in.yaml
@@ -60,13 +60,14 @@ experiment:
# card_name: a100-dgx
# card_name: "a100,v100,p100,k80,rtx2080,rtx3090"
#card_name: "a100"
directive: "a100-dgx"
# directive: "a100-dgx,a100,v100,p100,k80,rtx2080,rtx3090"
directive: "a100-dgx,v100"
gpu_count: "1"
cpu_num: 1
mem: "64GB"
repeat: "1"
# epoch: "2,10,30,50,70,100,200"
epoch: "1"
epoch: "1,2"
seed: 1234
learning_rate: 0.001
batch_size: 32
@@ -78,7 +79,7 @@ experiment:
early_stoppage_patience: "25"
early_stoppage: "False"

identifier: "{experiment.card_name}-{experiment.early_stoppage}-{experiment.early_stoppage_patience}-{experiment.epoch}-{experiment.learning_rate}-{experiment.repeat}"
identifier: "{experiment.directive}_{experiment.early_stoppage}_{experiment.early_stoppage_patience}_{experiment.epoch}_{experiment.learning_rate}_{experiment.repeat}"

system:
host: "rivanna"
@@ -100,13 +101,22 @@ training:
# Inference data
# inference_dir: "/scratch/{os.USER}/data/cloudmask/data/ssts"
# Output directory
output_dir: "{os.TARGET}/outputs"
output:
directory: "./outputs"
# Model file
model_file: "./outputs/cloudModel.h5"
# Log file for recording runtimes
log_file: "./outputs/cloudmask_final.log"
# Log file for MLCommons logging
mlperf_logfile: "./outputs/mlperf_cloudmask_final.log"

output_dir: "./outputs"
# Model file
model_file: "{output_dir}/cloudModel-{identifier}.h5"
model_file: "./outputs/cloudModel.h5"
# Log file for recording runtimes
log_file: "{output_dir}/cloudmask_final_{identifier}.log"
log_file: "./outputs/cloudmask_final.log"
# Log file for MLCommons logging
mlperf_logfile: "{output_dir}/mlperf_cloudmask_final_{identifier}.log"
mlperf_logfile: "./outputs/mlperf_cloudmask_final.log"


data:
@@ -107,7 +107,7 @@ def cloud_inference(config) -> None:
N_CHANNELS = config['image.N_CHANNELS']

# Load model
-# modelPath = os.path.expanduser(config['model_file'])
+# modelPath = os.path.expanduser(config['output.model_file'])
model = tf.keras.models.load_model(modelPath)

# Read inference files
@@ -147,7 +147,7 @@ def cloud_inference(config) -> None:
mask = reconstruct_from_patches(config, mask_patches, nx, ny, patch_size=PATCH_SIZE - CROP_SIZE)

# Save reconstructed image (mask)
-    output_dir = os.path.expanduser(config['output_dir'])
+    output_dir = os.path.expanduser(config['output.directory'])
mask_name = f"{output_dir}/{file_name.name}.h5"
with h5py.File(mask_name, 'w') as handle:
handle.create_dataset('mask', data=mask)
