diff --git a/cm-mlops/script/get-git-repo/customize.py b/cm-mlops/script/get-git-repo/customize.py index eed6737de9..044b884740 100644 --- a/cm-mlops/script/get-git-repo/customize.py +++ b/cm-mlops/script/get-git-repo/customize.py @@ -13,7 +13,7 @@ def preprocess(i): meta = i['meta'] env_key = get_env_key(env) - + if 'CM_GIT_REPO_NAME' not in env: update_env(env, 'CM_GIT_REPO{}_NAME', env_key, os.path.basename(env['CM_GIT_URL'])) @@ -44,7 +44,7 @@ def postprocess(i): git_checkout_path = env['CM_GIT_CHECKOUT_PATH'] env_key = get_env_key(env) - + # We remap CM_GIT variables with CM_GIT_REPO prefix so that they don't contaminate the env of the parent script update_env(env, 'CM_GIT_REPO{}_CHECKOUT_PATH', env_key, env['CM_GIT_CHECKOUT_PATH']) update_env(env, 'CM_GIT_REPO{}_URL', env_key, env['CM_GIT_URL']) diff --git a/cm-mlops/script/get-ml-model-huggingface-zoo/_cm.json b/cm-mlops/script/get-ml-model-huggingface-zoo/_cm.json index 06d56a1da5..374625a57f 100644 --- a/cm-mlops/script/get-ml-model-huggingface-zoo/_cm.json +++ b/cm-mlops/script/get-ml-model-huggingface-zoo/_cm.json @@ -47,7 +47,7 @@ }, "prune":{ "env":{ - " CM_MODEL_TASK": "prune" + "CM_MODEL_TASK": "prune" } } } diff --git a/cm-mlops/script/get-ml-model-huggingface-zoo/download_model.py b/cm-mlops/script/get-ml-model-huggingface-zoo/download_model.py index 2cd629a718..5ed130a501 100644 --- a/cm-mlops/script/get-ml-model-huggingface-zoo/download_model.py +++ b/cm-mlops/script/get-ml-model-huggingface-zoo/download_model.py @@ -5,15 +5,16 @@ model_task = os.environ.get('CM_MODEL_TASK', '') if model_task == "prune": - print("Downloading model: "+model_stub) - downloaded_model_path = hf_hub_download(repo_id=model_stub, - filename="pytorch_model.bin", - cache_dir=os.getcwd()) - downloaded_model_path = hf_hub_download(repo_id=model_stub, - filename="config.json", - cache_dir=os.getcwd()) - with open('tmp-run-env.out', 'w') as f: - f.write(f"CM_ML_MODEL_FILE_WITH_PATH={os.path.join(os.getcwd(),'')}") + print("Downloading model: " + model_stub) + + for filename in ["pytorch_model.bin", "config.json"]: + + downloaded_model_path = hf_hub_download(repo_id=model_stub, + filename=filename, + cache_dir=os.getcwd()) + + with open('tmp-run-env.out', 'w') as f: + f.write(f"CM_ML_MODEL_FILE_WITH_PATH={os.path.join(os.getcwd(),'')}") else: model_filename = os.environ.get('CM_MODEL_ZOO_FILENAME', '') diff --git a/cm-mlops/script/get-ml-model-neuralmagic-zoo/_cm.json b/cm-mlops/script/get-ml-model-neuralmagic-zoo/_cm.json index e115ec5f5e..551a410bb2 100644 --- a/cm-mlops/script/get-ml-model-neuralmagic-zoo/_cm.json +++ b/cm-mlops/script/get-ml-model-neuralmagic-zoo/_cm.json @@ -28,6 +28,10 @@ "tags": "get,python3", "names": [ "python3", "python" ] }, + { + "tags": "get,generic-python-lib,_package.protobuf", + "version_max": "3.20.1" + }, { "tags": "get,generic-python-lib,_sparsezoo" } diff --git a/cm-mlops/script/prune-bert-models/README-extra.md b/cm-mlops/script/prune-bert-models/README-extra.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/cm-mlops/script/prune-bert-models/_cm.json b/cm-mlops/script/prune-bert-models/_cm.json index 598d877e8e..61673215f8 100644 --- a/cm-mlops/script/prune-bert-models/_cm.json +++ b/cm-mlops/script/prune-bert-models/_cm.json @@ -4,6 +4,16 @@ "automation_uid": "5b4e0237da074764", "category": "Modular ML/AI applications", "category_sort": 10000, + "default_env": { + "CM_BERT_PRUNE_TASK":"squad", + "CM_BERT_PRUNE_MODEL_NAME":"bert-large-uncased", + "CM_MODEL_ZOO_STUB":"bert-large-uncased", + "CM_BERT_PRUNE_CONSTRAINT": "0.5" + }, + "input_mapping": { + "constraint": "CM_BERT_PRUNE_CONSTRAINT", + "output_dir": "CM_BERT_PRUNE_OUTPUT_DIR" + }, "deps": [ { "tags": "get,python3" @@ -28,46 +38,51 @@ }, { "tags": "get,generic-python-lib,_transformers" - }, + }, { "tags": "get,generic-python-lib,_scikit-learn" }, { - "tags": "get,git,repo,_repo.https://github.com/anandhu-eng/retraining-free-pruning" + "tags": "get,git,repo,_repo.https://github.com/cknowledge/retraining-free-pruning", + "env": { + "CM_GIT_ENV_KEY":"BERT_PRUNER_NEURIPS_2022" + } }, { "names": [ - "get-model" + "get-model" ], - "tags": "get, ml-model, model, zoo, model-zoo, huggingface, _prune" + "tags": "get,ml-model,model,zoo,model-zoo,huggingface,_prune" } ], "tags": [ "prune", + "bert-models", "bert-prune", "prune-bert-models" ], "uid": "76182d4896414216", "variations":{ "path.#":{ - "env":{ - "CM_UNPRUNED_MODEL_PATH":"#" - } + "env":{ + "CM_BERT_PRUNE_CKPT_PATH":"#" + } }, "task.#":{ - "env":{ - "CM_PRUNE_TASK":"#" - } + "env":{ + "CM_BERT_PRUNE_TASK":"#" + } }, - "model-name.#":{ - "adr":{ - "get-model":{ - "tags":"_model-stub.#" - } - }, - "env":{ - "CM_PRUNE_MODEL_NAME":"#" - } + "model.#":{ + "adr":{ + "get-model":{ + "tags":"_model-stub.#" + } + }, + "env":{ + "CM_BERT_PRUNE_MODEL_NAME":"#", + "CM_MODEL_ZOO_STUB":"#" + } } } } diff --git a/cm-mlops/script/prune-bert-models/customize.py b/cm-mlops/script/prune-bert-models/customize.py index ba7901e7f3..5e0def42fe 100644 --- a/cm-mlops/script/prune-bert-models/customize.py +++ b/cm-mlops/script/prune-bert-models/customize.py @@ -6,12 +6,38 @@ def preprocess(i): os_info = i['os_info'] env = i['env'] - env['BERT_PRUNE_REPO_PATH'] = env['CM_GIT_CHECKOUT_PATH'] - print("Pruning repo path:"+env['BERT_PRUNE_REPO_PATH']) - env['CM_UNPRUNED_MODEL_PATH']=env['CM_ML_MODEL_FILE_WITH_PATH']+"models--bert-large-uncased/snapshots/80792f8e8216b29f3c846b653a0ff0a37c210431" - out_dir="/home/ubuntu/prune_model/out" - cmd = "python3 "+env['BERT_PRUNE_REPO_PATH']+"/main.py --model_name " + env['CM_PRUNE_MODEL_NAME'] + " --task_name " + env['CM_PRUNE_TASK'] + " --ckpt_dir "+env['CM_UNPRUNED_MODEL_PATH']+" --constraint 0.5 --output_dir "+out_dir - os.system(cmd) + + ckpt_path = env.get('CM_BERT_PRUNE_CKPT_PATH','') + if ckpt_path == '': + p = env['CM_ML_MODEL_FILE_WITH_PATH'] + x = os.listdir(p) + for y in x: + if y.startswith('models--'): + z = os.path.join(p,y) + if os.path.isdir(z): + z1 = os.path.join(z, 'snapshots') + if os.path.isdir(z1): + z2 = os.listdir(z1) + if len(z2)>0: + ckpt_path=os.path.join(z1, z2[0]) + + env['CM_BERT_PRUNE_CKPT_PATH'] = ckpt_path + + out_dir=env.get('CM_BERT_PRUNE_OUTPUT_DIR','') + if out_dir == '': + out_dir = os.path.join(os.getcwd(), 'pruned-model-output') + env['CM_BERT_PRUNE_OUTPUT_DIR'] = out_dir + + print ('') + print ('Local CM cache path to the updated BERT pruner src from NeurIPS 2022: ' + env['CM_GIT_REPO_BERT_PRUNER_NEURIPS_2022_CHECKOUT_PATH']) + + print ('') + for k in ["CM_ML_MODEL_FILE_WITH_PATH", "CM_BERT_PRUNE_CKPT_PATH", "CM_BERT_PRUNE_OUTPUT_DIR"]: + print ('ENV["{}"]: {}'.format(k, env[k])) + + print ('') + input ('xyz') + return {'return': 0} def postprocess(i): diff --git a/cm-mlops/script/prune-bert-models/run.sh b/cm-mlops/script/prune-bert-models/run.sh index e69de29bb2..68c0779688 100644 --- a/cm-mlops/script/prune-bert-models/run.sh +++ b/cm-mlops/script/prune-bert-models/run.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +echo "====================================================================" +echo "Start pruning ..." +echo "" + +CM_TMP_CURRENT_SCRIPT_PATH=${CM_TMP_CURRENT_SCRIPT_PATH:-$PWD} + +time ${CM_PYTHON_BIN_WITH_PATH} \ + ${CM_GIT_REPO_BERT_PRUNER_NEURIPS_2022_CHECKOUT_PATH}/main.py \ + --model_name ${CM_BERT_PRUNE_MODEL_NAME} \ + --task_name ${CM_BERT_PRUNE_TASK} \ + --ckpt_dir ${CM_BERT_PRUNE_CKPT_PATH} \ + --constraint ${CM_BERT_PRUNE_CONSTRAINT} \ + --output_dir ${CM_BERT_PRUNE_OUTPUT_DIR} + +test $? -eq 0 || exit $? + +echo "===================================================================="