Merge pull request #148 from klei22/add_mmlu_benchmark
Add scripts compatible with MMLU Benchmark
gkielian authored Apr 15, 2024
2 parents ff56285 + d3392db commit 0c097b1
Showing 6 changed files with 241 additions and 0 deletions.
68 changes: 68 additions & 0 deletions data/mmlu/README.md
@@ -0,0 +1,68 @@
MMLU Benchmark Dataset
=======================

This folder contains scripts compatible with the MMLU (Massive Multitask
Language Understanding) benchmark dataset.

Introduction
------------
The MMLU benchmark is a collection of educational and assessment data designed
to evaluate language understanding capabilities across a diverse range of
subjects. This dataset includes questions from multiple academic and
professional fields, providing a broad spectrum for testing comprehension and
reasoning in language models.

Downloading the Dataset
-----------------------
To download the MMLU benchmark dataset, use the provided `get_dataset.py`
script. The script scrapes the Hugging Face dataset pages for Parquet file
links, downloads each file, converts it to JSON, and appends its contents to a
single output text file.

Here's how to use the script:

```bash
python3 get_dataset.py
```
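
By default the combined output is written to `input.txt`; the script's single
option, `--output_text_file`, changes the destination:

```bash
# for example, write the combined dataset to mmlu_combined.txt instead of input.txt
python3 get_dataset.py --output_text_file mmlu_combined.txt
```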

Dataset Structure
-----------------

The dataset includes multiple-choice questions with the following features:

- `question`: the text of the question being asked.
- `subject`: the academic subject or field the question pertains to.
- `choices`: a list of possible answers to the question.
- `answer`: the index of the correct answer in the `choices` list.

The dataset covers numerous subjects including but not limited to abstract
algebra, anatomy, astronomy, business ethics, and virology. This diversity
supports comprehensive assessments of language understanding across different
knowledge domains.
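
For illustration, here is a minimal sketch (using a made-up record) of the
plain-text form that `get_dataset.py` emits for each question:

```python
# A hypothetical MMLU record as it appears in the converted JSON
# (values invented for illustration).
item = {
    "question": "Which planet is known as the Red Planet?",
    "subject": "astronomy",
    "choices": ["Venus", "Mars", "Jupiter", "Saturn"],
    "answer": 1,  # zero-based index into `choices`
}

# The script renders each record into the output text file like this:
print(f"Question: {item['question']}")
for idx, choice in enumerate(item["choices"]):
    print(f"Choice {idx + 1}: {choice}")
print(f"Correct Answer: Choice {item['answer'] + 1}: {item['choices'][item['answer']]}")
```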

Dataset Licensing Information
-----------------------------

This dataset is distributed under the MIT license.

Citation
--------

If you use the MMLU benchmark dataset in your research, please cite the following:

```bibtex
@article{hendryckstest2021,
title={Measuring Massive Multitask Language Understanding},
author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
journal={Proceedings of the International Conference on Learning Representations (ICLR)},
year={2021}
}
@article{hendrycks2021ethics,
title={Aligning AI With Shared Human Values},
author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},
journal={Proceedings of the International Conference on Learning Representations (ICLR)},
year={2021}
}
```

83 changes: 83 additions & 0 deletions data/mmlu/get_dataset.py
@@ -0,0 +1,83 @@
import argparse
import json
import os
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
import pandas as pd

def download_file(url, filename):
    """Stream a remote file to disk, showing a progress bar."""
    response = requests.get(url, stream=True)
    response.raise_for_status()
    total_size = int(response.headers.get("content-length", 0))
    block_size = 1024
    progress_bar = tqdm(total=total_size, unit="iB", unit_scale=True)
    with open(filename, "wb") as f:
        for data in response.iter_content(block_size):
            progress_bar.update(len(data))
            f.write(data)
    progress_bar.close()
    if total_size != 0 and progress_bar.n != total_size:
        raise Exception("Error: Failed to download the file completely.")
    print(f"Downloaded {filename}")

def convert_to_json(parquet_path, json_path):
    """Convert a Parquet file to a JSON records file, skipping existing output."""
    if not os.path.exists(json_path):
        df = pd.read_parquet(parquet_path)
        df.to_json(json_path, orient="records")
        print(f"Converted {parquet_path} to JSON at {json_path}")
    else:
        print(f"{json_path} already exists, skipping conversion.")

def emit_json_contents(json_path, output_text_file):
    """Append each question, its choices, and the correct answer to the text file."""
    with open(json_path, "r") as f:
        data = json.load(f)
    with open(output_text_file, "a") as f:
        for item in data:
            f.write(f"Question: {item['question']}\n")
            for idx, choice in enumerate(item['choices']):
                f.write(f"Choice {idx + 1}: {choice}\n")
            correct_answer = item['choices'][item['answer']]
            f.write(f"Correct Answer: Choice {item['answer'] + 1}: {correct_answer}\n\n")

def find_parquet_links(url):
    """Scrape a Hugging Face tree page for links to .parquet files."""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    base_url = "https://huggingface.co"
    links = [base_url + a['href'] for a in soup.find_all('a', href=True)
             if a['href'].endswith('.parquet?download=true')]
    return links

def main(base_url, subdirectories, output_text_file):
    download_dir = "./downloaded_parquets"
    json_dir = "./json_output"
    os.makedirs(download_dir, exist_ok=True)
    os.makedirs(json_dir, exist_ok=True)
    # Truncate the output file so repeated runs start fresh.
    open(output_text_file, "w").close()

    for subdir in subdirectories:
        url = f"{base_url}/{subdir}/"
        print(f"Downloading .parquet files for {subdir}...")
        links = find_parquet_links(url)
        for link in links:
            print(f"Found file: {link}")
            file_name = f"{subdir}_{os.path.basename(link.split('?')[0])}"
            parquet_path = os.path.join(download_dir, file_name)
            json_path = os.path.join(json_dir, file_name.replace('.parquet', '.json'))
            if not os.path.isfile(parquet_path):
                download_file(link, parquet_path)
            if not os.path.exists(json_path):
                convert_to_json(parquet_path, json_path)
            emit_json_contents(json_path, output_text_file)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Scrape specific subdirectories for Parquet files.")
    parser.add_argument("--output_text_file", type=str, default="input.txt",
                        help="Path to the output text file.")
    args = parser.parse_args()
    base_url = "https://huggingface.co/datasets/cais/mmlu/tree/main"
    subdirectories = [
        "abstract_algebra", "anatomy", "astronomy", "business_ethics", "clinical_knowledge",
        "college_biology", "college_chemistry", "college_computer_science", "college_mathematics",
        "college_medicine", "college_physics", "computer_security", "virology"
    ]
    main(base_url, subdirectories, args.output_text_file)

1 change: 1 addition & 0 deletions data/mmlu/prepare.py
1 change: 1 addition & 0 deletions data/mmlu/utils/meta_util.py
87 changes: 87 additions & 0 deletions data/mmlu/utils/phoneme_list.txt
@@ -0,0 +1,87 @@
i:
I
iI
eI
a
A:
Q
0
'
O:
U
u:
V
@
eI
aI
OI
aU
oU
p
b
t
d
k
g
f
v
T
D
s
z
S
Z
h
m
n
N
l
r
w
j
iu
i
e
o
u
W
A
y
E
ME
O
oo
ou
ye

\n
\r
:
,
F
C
Y
?
.
B
c
R
M
L
c
;
!
H
P
q

G
-
x
$
&
3
J
K
X
_
1 change: 1 addition & 0 deletions data/mmlu/utils/txt_to_phonemes.sh
