diff --git a/evals/elsuite/ml_agent_bench/scripts/install_all_requirements.sh b/evals/elsuite/ml_agent_bench/scripts/install_all_requirements.sh index c2a9a991d7..0554c77a01 100644 --- a/evals/elsuite/ml_agent_bench/scripts/install_all_requirements.sh +++ b/evals/elsuite/ml_agent_bench/scripts/install_all_requirements.sh @@ -11,4 +11,9 @@ fi find "$start_directory" -type f -name 'requirements.txt' | while read -r file; do echo "Installing requirements from: $file" pip install -r "$file" + + if [[ $? -ne 0 ]]; then + echo "Error: Failed to install requirements from $file" + exit 1 + fi done diff --git a/evals/registry/data/ml_agent_bench/babylm.jsonl b/evals/registry/data/ml_agent_bench/babylm.jsonl deleted file mode 100644 index f47f2bbe2e..0000000000 --- a/evals/registry/data/ml_agent_bench/babylm.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:86b162bf21ea6ade9b9d6143d1589756dec77ee1e241174cfa952e7419c71d85 -size 873 diff --git a/evals/registry/data/ml_agent_bench/clrs.jsonl b/evals/registry/data/ml_agent_bench/clrs.jsonl deleted file mode 100644 index a3b33d4129..0000000000 --- a/evals/registry/data/ml_agent_bench/clrs.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3e607412a0dea8d6abbc18767119d384aeb6de61d0086d1d8f38eae8cd767c0a -size 1151 diff --git a/evals/registry/data/ml_agent_bench/fathomnet/fathomnet.jsonl b/evals/registry/data/ml_agent_bench/fathomnet/fathomnet.jsonl deleted file mode 100644 index 6144cf5f36..0000000000 --- a/evals/registry/data/ml_agent_bench/fathomnet/fathomnet.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8b81c93d2295485a0c27d47fcbb5dea20891d0a6f4a3f189fe5b2f30a7569d7d -size 624 diff --git a/evals/registry/data/ml_agent_bench/identify_contrails/identify-contrails.jsonl b/evals/registry/data/ml_agent_bench/identify_contrails/identify-contrails.jsonl deleted file mode 100644 index 3d3db4794c..0000000000 --- a/evals/registry/data/ml_agent_bench/identify_contrails/identify-contrails.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6a92e2e73e26f9e2371de48cf588ae3c4210c3c66bc07da2f7c7330e0ac3564e -size 616 diff --git a/evals/registry/data/ml_agent_bench/llama-inference.jsonl b/evals/registry/data/ml_agent_bench/llama-inference.jsonl deleted file mode 100644 index 55531ca820..0000000000 --- a/evals/registry/data/ml_agent_bench/llama-inference.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9cf3153e2fc19fd7a917cf0f3651854970e197ef40fe1a5ef3c75c2e8bf1f733 -size 604 diff --git a/evals/registry/eval_sets/ml-agent-bench.yaml b/evals/registry/eval_sets/ml-agent-bench.yaml index 64a999b99d..98c40f2468 100644 --- a/evals/registry/eval_sets/ml-agent-bench.yaml +++ b/evals/registry/eval_sets/ml-agent-bench.yaml @@ -9,7 +9,6 @@ ml-agent-bench: - ml-agent-bench.humanoid - ml-agent-bench.imdb - ml-agent-bench.inverted-pendulum - - ml-agent-bench.llama-inference - ml-agent-bench.ogbn-arxiv - ml-agent-bench.parkinsons-disease - ml-agent-bench.pong diff --git a/evals/registry/evals/ml-agent-bench.yaml b/evals/registry/evals/ml-agent-bench.yaml index 02ea57513d..72f33be693 100644 --- a/evals/registry/evals/ml-agent-bench.yaml +++ b/evals/registry/evals/ml-agent-bench.yaml @@ -1,12 +1,3 @@ -ml-agent-bench.babylm: - id: ml-agent-bench.babylm.v0 - metrics: - [model_score, naive_baseline_score, human_baseline_score, model_score_normalized, naive_baseline_score_normalized, human_baseline_score_normalized, model_score_humanrelative] -ml-agent-bench.babylm.v0: - class: evals.elsuite.ml_agent_bench.eval:MLAgentBench - args: - samples_jsonl: ml_agent_bench/babylm.jsonl - ml-agent-bench.ant: id: ml-agent-bench.ant.gpu.v0 metrics: @@ -24,6 +15,10 @@ ml-agent-bench.cifar10: id: ml-agent-bench.cifar10.v0 metrics: [model_score, naive_baseline_score, human_baseline_score, model_score_normalized, naive_baseline_score_normalized, human_baseline_score_normalized, model_score_humanrelative] +ml-agent-bench.cifar10.v0: + class: evals.elsuite.ml_agent_bench.eval:MLAgentBench + args: + samples_jsonl: ml_agent_bench/cifar10.jsonl ml-agent-bench.bipedal-walker: id: ml-agent-bench.bipedal-walker.v0 @@ -43,29 +38,6 @@ ml-agent-bench.cartpole.v0: args: samples_jsonl: ml_agent_bench/cartpole.jsonl -ml-agent-bench.cifar10.v0: - class: evals.elsuite.ml_agent_bench.eval:MLAgentBench - args: - samples_jsonl: ml_agent_bench/cifar10.jsonl - -ml-agent-bench.clrs: - id: ml-agent-bench.clrs.v0 - metrics: - [model_score, naive_baseline_score, human_baseline_score, model_score_normalized, naive_baseline_score_normalized, human_baseline_score_normalized, model_score_humanrelative] -ml-agent-bench.clrs.v0: - class: evals.elsuite.ml_agent_bench.eval:MLAgentBench - args: - samples_jsonl: ml_agent_bench/clrs.jsonl - -ml-agent-bench.fathomnet: - id: ml-agent-bench.fathomnet.v0 - metrics: - [model_score, naive_baseline_score, human_baseline_score, model_score_normalized, naive_baseline_score_normalized, human_baseline_score_normalized, model_score_humanrelative] -ml-agent-bench.fathomnet.v0: - class: evals.elsuite.ml_agent_bench.eval:MLAgentBench - args: - samples_jsonl: ml_agent_bench/fathomnet/fathomnet.jsonl - ml-agent-bench.feedback: id: ml-agent-bench.feedback.v0 metrics: @@ -97,15 +69,6 @@ ml-agent-bench.humanoid.gpu.v0: args: samples_jsonl: ml_agent_bench/humanoid/gpu.jsonl -ml-agent-bench.identify-contrails: - id: ml-agent-bench.identify-contrails.v0 - metrics: - [model_score, naive_baseline_score, human_baseline_score, model_score_normalized, naive_baseline_score_normalized, human_baseline_score_normalized, model_score_humanrelative] -ml-agent-bench.identify-contrails.v0: - class: evals.elsuite.ml_agent_bench.eval:MLAgentBench - args: - samples_jsonl: ml_agent_bench/identify_contrails/identify-contrails.jsonl - ml-agent-bench.imdb: id: ml-agent-bench.imdb.v0 metrics: @@ -133,15 +96,6 @@ ml-agent-bench.parkinsons-disease.v0: args: samples_jsonl: ml_agent_bench/parkinsons_disease/parkinsons-disease.jsonl -ml-agent-bench.llama-inference: - id: ml-agent-bench.llama-inference.v0 - metrics: - [model_score, naive_baseline_score, human_baseline_score, model_score_normalized, naive_baseline_score_normalized, human_baseline_score_normalized, model_score_humanrelative] -ml-agent-bench.llama-inference.v0: - class: evals.elsuite.ml_agent_bench.eval:MLAgentBench - args: - samples_jsonl: ml_agent_bench/llama-inference.jsonl - ml-agent-bench.ogbn-arxiv: id: ml-agent-bench.ogbn-arxiv.v0 metrics: