diff --git a/data/mmlu/README.md b/data/mmlu/README.md
new file mode 100644
index 000000000..3692a7ba7
--- /dev/null
+++ b/data/mmlu/README.md
@@ -0,0 +1,68 @@
+MMLU Benchmark Dataset
+=======================
+
+This folder contains scripts for downloading and preparing the MMLU (Massive
+Multitask Language Understanding) benchmark dataset.
+
+Introduction
+------------
+
+The MMLU benchmark is a collection of educational and assessment data designed
+to evaluate language understanding across a diverse range of subjects. The
+dataset includes questions from multiple academic and professional fields,
+providing a broad spectrum for testing comprehension and reasoning in language
+models.
+
+Downloading the Dataset
+-----------------------
+
+To download the MMLU benchmark dataset, use the provided `get_dataset.py`
+script. It scrapes the Parquet files from the dataset's Hugging Face
+repository, converts each file to JSON, and appends the contents to a single
+text file.
+
+Here's how to use the script:
+
+```bash
+python3 get_dataset.py
+```
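+
+By default the output is written to `input.txt`; use the `--output_text_file`
+flag to choose a different destination (the filename below is only an
+example):
+
+```bash
+python3 get_dataset.py --output_text_file mmlu.txt
+```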
+ print(f"Converted {parquet_path} to JSON at {json_path}") + else: + print(f"{json_path} already exists, skipping conversion.") + +def emit_json_contents(json_path, output_text_file): + with open(json_path, "r") as f: + data = json.load(f) + with open(output_text_file, "a") as f: + for item in data: + f.write(f"Question: {item['question']}\n") + for idx, choice in enumerate(item['choices']): + f.write(f"Choice {idx + 1}: {choice}\n") + correct_answer = item['choices'][item['answer']] + f.write(f"Correct Answer: Choice {item['answer'] + 1}: {correct_answer}\n\n") + +def find_parquet_links(url): + response = requests.get(url) + soup = BeautifulSoup(response.text, 'html.parser') + base_url = "https://huggingface.co" + links = [base_url + a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.parquet?download=true')] + return links + +def main(base_url, subdirectories, output_text_file): + download_dir = "./downloaded_parquets" + json_dir = "./json_output" + os.makedirs(download_dir, exist_ok=True) + os.makedirs(json_dir, exist_ok=True) + open(output_text_file, "w").close() + + for subdir in subdirectories: + url = f"{base_url}/{subdir}/" + print(f"Downloading .parquet files for {subdir}...") + links = find_parquet_links(url) + for link in links: + print(f"Found file: {link}") + file_name = f"{subdir}_{os.path.basename(link.split('?')[0])}" + parquet_path = os.path.join(download_dir, file_name) + json_path = os.path.join(json_dir, file_name.replace('.parquet', '.json')) + if not os.path.isfile(parquet_path): + download_file(link, parquet_path) + if not os.path.exists(json_path): + convert_to_json(parquet_path, json_path) + emit_json_contents(json_path, output_text_file) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Scrape specific subdirectories for Parquet files.") + parser.add_argument("--output_text_file", type=str, default="input.txt", help="Path to the output text file.") + args = parser.parse_args() + base_url = "https://huggingface.co/datasets/cais/mmlu/tree/main" + subdirectories = [ + "abstract_algebra", "anatomy", "astronomy", "business_ethics", "clinical_knowledge", "college_biology", + "college_chemistry", "college_computer_science", "college_mathematics", "college_medicine", "college_physics", + "computer_security", "virology" + ] + main(base_url, subdirectories, args.output_text_file) + diff --git a/data/mmlu/prepare.py b/data/mmlu/prepare.py new file mode 120000 index 000000000..713f6b001 --- /dev/null +++ b/data/mmlu/prepare.py @@ -0,0 +1 @@ +../template/prepare.py \ No newline at end of file diff --git a/data/mmlu/utils/meta_util.py b/data/mmlu/utils/meta_util.py new file mode 120000 index 000000000..d25555789 --- /dev/null +++ b/data/mmlu/utils/meta_util.py @@ -0,0 +1 @@ +../template/utils/meta_util.py \ No newline at end of file diff --git a/data/mmlu/utils/phoneme_list.txt b/data/mmlu/utils/phoneme_list.txt new file mode 100644 index 000000000..107a397c1 --- /dev/null +++ b/data/mmlu/utils/phoneme_list.txt @@ -0,0 +1,87 @@ +i: +I +iI +eI +a +A: +Q +0 +' +O: +U +u: +V +@ +eI +aI +OI +aU +oU +p +b +t +d +k +g +f +v +T +D +s +z +S +Z +h +m +n +N +l +r +w +j +iu +i +e +o +u +W +A +y +E +ME +O +oo +ou +ye + +\n +\r +: +, +F +C +Y +? +. +B +c +R +M +L +c +; +! 
+
+The dataset covers numerous subjects including, but not limited to, abstract
+algebra, anatomy, astronomy, business ethics, and virology. This diversity
+supports comprehensive assessment of language understanding across different
+knowledge domains.
+
+Dataset Licensing Information
+-----------------------------
+
+The MMLU dataset is released under the MIT license.
+
+Citation
+--------
+
+If you use the MMLU benchmark dataset in your research, please cite the
+following:
+
+```bibtex
+@article{hendryckstest2021,
+  title={Measuring Massive Multitask Language Understanding},
+  author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
+  journal={Proceedings of the International Conference on Learning Representations (ICLR)},
+  year={2021}
+}
+
+@article{hendrycks2021ethics,
+  title={Aligning AI With Shared Human Values},
+  author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},
+  journal={Proceedings of the International Conference on Learning Representations (ICLR)},
+  year={2021}
+}
+```
diff --git a/data/mmlu/get_dataset.py b/data/mmlu/get_dataset.py
new file mode 100644
index 000000000..5201dc91f
--- /dev/null
+++ b/data/mmlu/get_dataset.py
@@ -0,0 +1,83 @@
+import argparse
+import json
+import os
+
+import pandas as pd
+import requests
+from bs4 import BeautifulSoup
+from tqdm import tqdm
+
+
+def download_file(url, filename):
+    """Stream a file from `url` to `filename`, showing a progress bar."""
+    response = requests.get(url, stream=True)
+    response.raise_for_status()
+    total_size = int(response.headers.get("content-length", 0))
+    block_size = 1024
+    progress_bar = tqdm(total=total_size, unit="iB", unit_scale=True)
+    with open(filename, "wb") as f:
+        for data in response.iter_content(block_size):
+            progress_bar.update(len(data))
+            f.write(data)
+    progress_bar.close()
+    if total_size != 0 and progress_bar.n != total_size:
+        raise Exception("Error: Failed to download the file completely.")
+    print(f"Downloaded {filename}")
+
+
+def convert_to_json(parquet_path, json_path):
+    """Convert a Parquet file to JSON, skipping files already converted."""
+    if not os.path.exists(json_path):
+        df = pd.read_parquet(parquet_path)
+        # "records" orientation writes a JSON array of row objects, which is
+        # the shape emit_json_contents expects.
+        df.to_json(json_path, orient="records")
+        print(f"Converted {parquet_path} to JSON at {json_path}")
+    else:
+        print(f"{json_path} already exists, skipping conversion.")
+
+
+def emit_json_contents(json_path, output_text_file):
+    """Append each question, its choices, and the correct answer as text."""
+    with open(json_path, "r") as f:
+        data = json.load(f)
+    with open(output_text_file, "a") as f:
+        for item in data:
+            f.write(f"Question: {item['question']}\n")
+            for idx, choice in enumerate(item['choices']):
+                f.write(f"Choice {idx + 1}: {choice}\n")
+            correct_answer = item['choices'][item['answer']]
+            f.write(f"Correct Answer: Choice {item['answer'] + 1}: {correct_answer}\n\n")
+
+
+def find_parquet_links(url):
+    """Collect the Parquet download links from a Hugging Face tree page."""
+    response = requests.get(url)
+    response.raise_for_status()
+    soup = BeautifulSoup(response.text, 'html.parser')
+    base_url = "https://huggingface.co"
+    links = [base_url + a['href'] for a in soup.find_all('a', href=True)
+             if a['href'].endswith('.parquet?download=true')]
+    return links
+
+
+def main(base_url, subdirectories, output_text_file):
+    download_dir = "./downloaded_parquets"
+    json_dir = "./json_output"
+    os.makedirs(download_dir, exist_ok=True)
+    os.makedirs(json_dir, exist_ok=True)
+    # Truncate the output file so repeated runs do not append duplicates.
+    open(output_text_file, "w").close()
+
+    for subdir in subdirectories:
+        url = f"{base_url}/{subdir}/"
+        print(f"Downloading .parquet files for {subdir}...")
+        links = find_parquet_links(url)
+        for link in links:
+            print(f"Found file: {link}")
+            # Strip the ?download=true query string and prefix the subject.
+            file_name = f"{subdir}_{os.path.basename(link.split('?')[0])}"
+            parquet_path = os.path.join(download_dir, file_name)
+            json_path = os.path.join(json_dir, file_name.replace('.parquet', '.json'))
+            if not os.path.isfile(parquet_path):
+                download_file(link, parquet_path)
+            if not os.path.exists(json_path):
+                convert_to_json(parquet_path, json_path)
+            emit_json_contents(json_path, output_text_file)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Scrape specific subdirectories for Parquet files.")
+    parser.add_argument("--output_text_file", type=str, default="input.txt",
+                        help="Path to the output text file.")
+    args = parser.parse_args()
+    base_url = "https://huggingface.co/datasets/cais/mmlu/tree/main"
+    subdirectories = [
+        "abstract_algebra", "anatomy", "astronomy", "business_ethics",
+        "clinical_knowledge", "college_biology", "college_chemistry",
+        "college_computer_science", "college_mathematics", "college_medicine",
+        "college_physics", "computer_security", "virology",
+    ]
+    main(base_url, subdirectories, args.output_text_file)
diff --git a/data/mmlu/prepare.py b/data/mmlu/prepare.py
new file mode 120000
index 000000000..713f6b001
--- /dev/null
+++ b/data/mmlu/prepare.py
@@ -0,0 +1 @@
+../template/prepare.py
\ No newline at end of file
diff --git a/data/mmlu/utils/meta_util.py b/data/mmlu/utils/meta_util.py
new file mode 120000
index 000000000..d25555789
--- /dev/null
+++ b/data/mmlu/utils/meta_util.py
@@ -0,0 +1 @@
+../template/utils/meta_util.py
\ No newline at end of file
diff --git a/data/mmlu/utils/phoneme_list.txt b/data/mmlu/utils/phoneme_list.txt
new file mode 100644
index 000000000..107a397c1
--- /dev/null
+++ b/data/mmlu/utils/phoneme_list.txt
@@ -0,0 +1,87 @@
+i:
+I
+iI
+eI
+a
+A:
+Q
+0
+'
+O:
+U
+u:
+V
+@
+eI
+aI
+OI
+aU
+oU
+p
+b
+t
+d
+k
+g
+f
+v
+T
+D
+s
+z
+S
+Z
+h
+m
+n
+N
+l
+r
+w
+j
+iu
+i
+e
+o
+u
+W
+A
+y
+E
+ME
+O
+oo
+ou
+ye
+
+\n
+\r
+:
+,
+F
+C
+Y
+?
+.
+B
+c
+R
+M
+L
+c
+;
+!
+H
+P
+q
+
+G
+-
+x
+$
+&
+3
+J
+K
+X
+_
diff --git a/data/mmlu/utils/txt_to_phonemes.sh b/data/mmlu/utils/txt_to_phonemes.sh
new file mode 120000
index 000000000..baec9c434
--- /dev/null
+++ b/data/mmlu/utils/txt_to_phonemes.sh
@@ -0,0 +1 @@
+../template/utils/txt_to_phonemes.sh
\ No newline at end of file