Merge pull request #148 from klei22/add_mmlu_benchmark
Add scripts compatible with MMLU Benchmark
gkielian authored Apr 15, 2024
2 parents ff56285 + d3392db commit 0c097b1
Showing 6 changed files with 241 additions and 0 deletions.
68 changes: 68 additions & 0 deletions data/mmlu/README.md
@@ -0,0 +1,68 @@
MMLU Benchmark Dataset
=======================

This folder contains scripts compatible with the MMLU (Massive Multitask
Language Understanding) benchmark dataset.

Introduction
------------
The MMLU benchmark is a collection of educational and assessment data designed
to evaluate language understanding capabilities across a diverse range of
subjects. This dataset includes questions from multiple academic and
professional fields, providing a broad spectrum for testing comprehension and
reasoning in language models.

Downloading the Dataset
-----------------------
To download the MMLU benchmark dataset, use the provided `get_dataset.py`
script. The script scrapes the Hugging Face dataset pages for Parquet file
links, downloads each file, converts it to JSON, and appends its contents to a
single output text file.

Here's how to use the script:

```bash
python3 get_dataset.py
```
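
By default the combined output is written to `input.txt`; the script's single
option, `--output_text_file`, changes the destination:

```bash
# for example, write the combined dataset to mmlu_combined.txt instead of input.txt
python3 get_dataset.py --output_text_file mmlu_combined.txt
```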

Dataset Structure
-----------------

The dataset includes multiple-choice questions with the following features:

- `question`: the text of the question being asked.
- `subject`: the academic subject or field the question pertains to.
- `choices`: a list of possible answers to the question.
- `answer`: the index of the correct answer in the `choices` list.

The dataset covers numerous subjects including but not limited to abstract
algebra, anatomy, astronomy, business ethics, and virology. This diversity
supports comprehensive assessments of language understanding across different
knowledge domains.
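
For illustration, here is a minimal sketch (using a made-up record) of the
plain-text form that `get_dataset.py` emits for each question:

```python
# A hypothetical MMLU record as it appears in the converted JSON
# (values invented for illustration).
item = {
    "question": "Which planet is known as the Red Planet?",
    "subject": "astronomy",
    "choices": ["Venus", "Mars", "Jupiter", "Saturn"],
    "answer": 1,  # zero-based index into `choices`
}

# The script renders each record into the output text file like this:
print(f"Question: {item['question']}")
for idx, choice in enumerate(item["choices"]):
    print(f"Choice {idx + 1}: {choice}")
print(f"Correct Answer: Choice {item['answer'] + 1}: {item['choices'][item['answer']]}")
```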

Dataset Licensing Information
-----------------------------

This dataset is distributed under the MIT license.

Citation
--------

If you use the MMLU benchmark dataset in your research, please cite the following:

```bibtex
@article{hendryckstest2021,
title={Measuring Massive Multitask Language Understanding},
author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
journal={Proceedings of the International Conference on Learning Representations (ICLR)},
year={2021}
}
@article{hendrycks2021ethics,
title={Aligning AI With Shared Human Values},
author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},
journal={Proceedings of the International Conference on Learning Representations (ICLR)},
year={2021}
}
```

83 changes: 83 additions & 0 deletions data/mmlu/get_dataset.py
@@ -0,0 +1,83 @@
import argparse
import json
import os
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
import pandas as pd

def download_file(url, filename):
    """Stream a remote file to disk, showing a progress bar."""
    response = requests.get(url, stream=True)
    response.raise_for_status()
    total_size = int(response.headers.get("content-length", 0))
    block_size = 1024
    progress_bar = tqdm(total=total_size, unit="iB", unit_scale=True)
    with open(filename, "wb") as f:
        for data in response.iter_content(block_size):
            progress_bar.update(len(data))
            f.write(data)
    progress_bar.close()
    if total_size != 0 and progress_bar.n != total_size:
        raise Exception("Error: Failed to download the file completely.")
    print(f"Downloaded {filename}")

def convert_to_json(parquet_path, json_path):
    """Convert a Parquet file to a JSON records file, skipping existing output."""
    if not os.path.exists(json_path):
        df = pd.read_parquet(parquet_path)
        df.to_json(json_path, orient="records")
        print(f"Converted {parquet_path} to JSON at {json_path}")
    else:
        print(f"{json_path} already exists, skipping conversion.")

def emit_json_contents(json_path, output_text_file):
    """Append each question, its choices, and the correct answer to the text file."""
    with open(json_path, "r") as f:
        data = json.load(f)
    with open(output_text_file, "a") as f:
        for item in data:
            f.write(f"Question: {item['question']}\n")
            for idx, choice in enumerate(item['choices']):
                f.write(f"Choice {idx + 1}: {choice}\n")
            correct_answer = item['choices'][item['answer']]
            f.write(f"Correct Answer: Choice {item['answer'] + 1}: {correct_answer}\n\n")

def find_parquet_links(url):
    """Scrape a Hugging Face tree page for links to .parquet files."""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    base_url = "https://huggingface.co"
    links = [base_url + a['href'] for a in soup.find_all('a', href=True)
             if a['href'].endswith('.parquet?download=true')]
    return links

def main(base_url, subdirectories, output_text_file):
    download_dir = "./downloaded_parquets"
    json_dir = "./json_output"
    os.makedirs(download_dir, exist_ok=True)
    os.makedirs(json_dir, exist_ok=True)
    # Truncate the output file so repeated runs start fresh.
    open(output_text_file, "w").close()

    for subdir in subdirectories:
        url = f"{base_url}/{subdir}/"
        print(f"Downloading .parquet files for {subdir}...")
        links = find_parquet_links(url)
        for link in links:
            print(f"Found file: {link}")
            file_name = f"{subdir}_{os.path.basename(link.split('?')[0])}"
            parquet_path = os.path.join(download_dir, file_name)
            json_path = os.path.join(json_dir, file_name.replace('.parquet', '.json'))
            if not os.path.isfile(parquet_path):
                download_file(link, parquet_path)
            if not os.path.exists(json_path):
                convert_to_json(parquet_path, json_path)
            emit_json_contents(json_path, output_text_file)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Scrape specific subdirectories for Parquet files.")
    parser.add_argument("--output_text_file", type=str, default="input.txt",
                        help="Path to the output text file.")
    args = parser.parse_args()
    base_url = "https://huggingface.co/datasets/cais/mmlu/tree/main"
    subdirectories = [
        "abstract_algebra", "anatomy", "astronomy", "business_ethics", "clinical_knowledge",
        "college_biology", "college_chemistry", "college_computer_science", "college_mathematics",
        "college_medicine", "college_physics", "computer_security", "virology"
    ]
    main(base_url, subdirectories, args.output_text_file)

1 change: 1 addition & 0 deletions data/mmlu/prepare.py
1 change: 1 addition & 0 deletions data/mmlu/utils/meta_util.py
87 changes: 87 additions & 0 deletions data/mmlu/utils/phoneme_list.txt
@@ -0,0 +1,87 @@
i:
I
iI
eI
a
A:
Q
0
'
O:
U
u:
V
@
eI
aI
OI
aU
oU
p
b
t
d
k
g
f
v
T
D
s
z
S
Z
h
m
n
N
l
r
w
j
iu
i
e
o
u
W
A
y
E
ME
O
oo
ou
ye

\n
\r
:
,
F
C
Y
?
.
B
c
R
M
L
c
;
!
H
P
q

G
-
x
$
&
3
J
K
X
_
1 change: 1 addition & 0 deletions data/mmlu/utils/txt_to_phonemes.sh
