From 74b47e1b669dc9ea8caf5a514046b76a4a90852d Mon Sep 17 00:00:00 2001 From: Michal Futrega Date: Tue, 10 Sep 2024 14:21:40 +0200 Subject: [PATCH] Minor fixes for 4.1.0 submission --- mlperf_logging/compliance_checker/README.md | 6 +++--- .../compliance_checker/training_4.1.0/closed_common.yaml | 2 +- .../training_4.1.0/closed_llama2_70b_lora.yaml | 5 +---- mlperf_logging/rcp_checker/rcp_checker.py | 2 +- mlperf_logging/result_summarizer/config.yaml | 2 +- 5 files changed, 7 insertions(+), 10 deletions(-) diff --git a/mlperf_logging/compliance_checker/README.md b/mlperf_logging/compliance_checker/README.md index 9af31c8..36e4603 100644 --- a/mlperf_logging/compliance_checker/README.md +++ b/mlperf_logging/compliance_checker/README.md @@ -10,7 +10,7 @@ To check a log file for compliance: python -m mlperf_logging.compliance_checker [--config YAML] [--usage training/hpc] [--ruleset MLPERF_EDITION] FILENAME -By default, 3.1.0 training edition rules are used and the default config is set to `4.1.0/common.yaml`. +By default, 4.1.0 training edition rules are used and the default config is set to `4.1.0/common.yaml`. This config will check all common keys and enqueue benchmark specific config to be checked as well. Old training editions, still supported are 4.0.0, 3.1.0, 3.0.0, 2.1.0, 2.0.0, 1.1.0, 1.0.0, 0.7.0 and 0.6.0 @@ -25,14 +25,14 @@ As log examples use [NVIDIA's training logs](https://github.com/mlperf/training_ 4.1.0/common.yaml - currently the default config file, checks common fields complience and equeues benchmark-specific config file 4.1.0/closed_common.yaml - the common rules file for closed submissions. These rules apply to all benchmarks 4.1.0/open_common.yaml - the common rules file for open submissions. These rules apply to all benchmarks - 4.1.0/closed_ssd.yaml - Per-benchmark rules, closed submissions. + 4.1.0/closed_ssd.yaml - Per-benchmark rules, closed submissions. 4.1.0/closed_bert.yaml 4.1.0/closed_dlrm_dcnv2.yaml 4.1.0/closed_gpt3.yaml 4.1.0/closed_gnn.yaml 4.1.0/closed_llama2_70b_lora.yaml 4.1.0/closed_stable_diffusion.yaml - 4.1.0/open_ssd.yaml - Per-benchmark rules, closed submissions. + 4.1.0/open_ssd.yaml - Per-benchmark rules, open submissions. 4.1.0/open_bert.yaml 4.1.0/open_dlrm_dcnv2.yaml 4.1.0/open_gpt3.yaml diff --git a/mlperf_logging/compliance_checker/training_4.1.0/closed_common.yaml b/mlperf_logging/compliance_checker/training_4.1.0/closed_common.yaml index e551bfe..501cf1f 100755 --- a/mlperf_logging/compliance_checker/training_4.1.0/closed_common.yaml +++ b/mlperf_logging/compliance_checker/training_4.1.0/closed_common.yaml @@ -2,7 +2,7 @@ - KEY: NAME: submission_benchmark REQ: EXACTLY_ONE - CHECK: " v['value'] in ['resnet', 'ssd', 'stable_diffusion', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d', 'gnn','llama2_70b_lora'] " + CHECK: " v['value'] in ['resnet', 'ssd', 'stable_diffusion', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d', 'gnn', 'llama2_70b_lora'] " POST: " enqueue_config('training_4.1.0/closed_{}.yaml'.format(v['value'])) " - KEY: diff --git a/mlperf_logging/compliance_checker/training_4.1.0/closed_llama2_70b_lora.yaml b/mlperf_logging/compliance_checker/training_4.1.0/closed_llama2_70b_lora.yaml index 3d80b91..46de03e 100755 --- a/mlperf_logging/compliance_checker/training_4.1.0/closed_llama2_70b_lora.yaml +++ b/mlperf_logging/compliance_checker/training_4.1.0/closed_llama2_70b_lora.yaml @@ -7,10 +7,7 @@ - KEY: NAME: opt_base_learning_rate REQ: EXACTLY_ONE - -- KEY: - NAME: lora_alpha - REQ: EXACTLY_ONE + - KEY: NAME: opt_learning_rate_training_steps diff --git a/mlperf_logging/rcp_checker/rcp_checker.py b/mlperf_logging/rcp_checker/rcp_checker.py index 90d5254..4ee5cb3 100644 --- a/mlperf_logging/rcp_checker/rcp_checker.py +++ b/mlperf_logging/rcp_checker/rcp_checker.py @@ -162,7 +162,7 @@ class RCP_Checker: def __init__(self, usage, ruleset, benchmark, verbose, rcp_file=None): if ruleset not in {'1.0.0', "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0", "4.0.0", "4.1.0"}: - raise Exception('RCP Checker only supported in 1.0.0, 1.1.0, 2.0.0, 2.1.0, 3.0.0, 3.1.0, 4.0.0, and "4.1.0"') + raise Exception('RCP Checker only supported in 1.0.0, 1.1.0, 2.0.0, 2.1.0, 3.0.0, 3.1.0, 4.0.0, and 4.1.0') self.usage = usage self.ruleset = ruleset self.benchmark = benchmark diff --git a/mlperf_logging/result_summarizer/config.yaml b/mlperf_logging/result_summarizer/config.yaml index 4ad71db..e59a58f 100644 --- a/mlperf_logging/result_summarizer/config.yaml +++ b/mlperf_logging/result_summarizer/config.yaml @@ -75,7 +75,7 @@ columns: llama2_70b_lora: ["Benchmark results (minutes)", "LLM-Finetune", "SCROLSS Gov Report", "LLama2-70B-LoRA"] gnn: ["Benchmark results (minutes)", "Graph node classification", "IGBH-Full", "R-GAT"] default: [" ", " ", " "] - "4.0.1": + "4.1.0": bert: ["Benchmark results (minutes)", "NLP", "Wikipedia", "BERT"] gpt3: ["Benchmark results (minutes)", "LLM", "C4", "GPT3"] dlrm_dcnv2: ["Benchmark results (minutes)", "Recommendation", "1TB Multihot Clickthrough", "DLRM DCNv2"]