diff --git a/data/cosmopedia_100k/README.md b/data/cosmopedia_100k/README.md
new file mode 100644
index 0000000000..49cb226975
--- /dev/null
+++ b/data/cosmopedia_100k/README.md
@@ -0,0 +1,48 @@
+Cosmopedia-100k Dataset
+=======================
+
+This folder contains scripts for downloading and preparing the Cosmopedia-100k dataset.
+
+Introduction
+------------
+Cosmopedia-100k is a 100,000-sample subset of the Cosmopedia dataset, a synthetic collection of textbooks, blog posts, stories, posts, and WikiHow articles generated by Mixtral-8x7B-Instruct-v0.1. Cosmopedia aims to map the world knowledge present in web datasets such as RefinedWeb and RedPajama into synthetic content covering a wide range of topics. Cosmopedia-100k offers a more manageable size for researchers and enthusiasts exploring the capabilities of synthetic data.
+
+Downloading the Dataset
+-----------------------
+To download the Cosmopedia-100k dataset, use the provided `get_dataset.py` script. The script scrapes Parquet file links from the specified URL, downloads each file, converts it to JSON, and appends the prompt and text of every record to a single output text file.
+
+Here's how to use the script:
+
+```bash
+python get_dataset.py --url https://huggingface.co/datasets/HuggingFaceTB/cosmopedia-100k/tree/main/data -o output_text_file.txt
+```
+
+Replace `output_text_file.txt` with the path where you want the dataset's text content to be saved.
+
+Dataset Structure
+-----------------
+The dataset contains the following features:
+
+- `prompt`: The prompt used to generate the content with Mixtral-8x7B-Instruct-v0.1.
+- `text`: The synthetic generated content.
+- `seed_data`: The name of the dataset or external source used in the prompt.
+- `token_length`: The number of tokens in the text, computed using Mistral-7B's tokenizer.
+- `format`: The style of the text, e.g. a textbook, a blog post, or a story.
+- `audience`: The target audience defined in the prompt.
+
+The dataset is divided into 8 splits according to the source of the seed data used. It covers a variety of styles and audiences, which enhances the diversity of the content.
+
+Citation
+--------
+If you use the Cosmopedia-100k dataset in your research, please cite the following:
+
+```bibtex
+@software{benallal2024cosmopedia,
+  author = {Ben Allal, Loubna and Lozhkov, Anton and Penedo, Guilherme and Wolf, Thomas and von Werra, Leandro},
+  title = {Cosmopedia-100k},
+  month = February,
+  year = 2024,
+  url = {https://huggingface.co/datasets/HuggingFaceTB/cosmopedia-100k}
+}
+```
+
diff --git a/data/cosmopedia_100k/get_dataset.py b/data/cosmopedia_100k/get_dataset.py
new file mode 100644
index 0000000000..2b6c62555f
--- /dev/null
+++ b/data/cosmopedia_100k/get_dataset.py
@@ -0,0 +1,88 @@
+import argparse
+import json
+import os
+from bs4 import BeautifulSoup
+from tqdm import tqdm
+import pandas as pd
+import requests
+
+def download_file(url, filename):
+    response = requests.get(url, stream=True)
+    response.raise_for_status()  # Ensure the download was successful.
+    total_size = int(response.headers.get("content-length", 0))
+    block_size = 1024  # 1 Kibibyte
+    progress_bar = tqdm(total=total_size, unit="iB", unit_scale=True)
+    with open(filename, "wb") as f:
+        for data in response.iter_content(block_size):
+            progress_bar.update(len(data))
+            f.write(data)
+    if total_size and progress_bar.n != total_size:
+        raise Exception("Error: Failed to download the file completely.")
+    print(f"Downloaded {filename}")
+
+def convert_to_json(parquet_path, json_path):
+    if os.path.exists(json_path):
+        print(f"{json_path} already exists, skipping conversion.")
+        return
+    pd.read_parquet(parquet_path).to_json(json_path, orient="records")
+    print(f"Converted {parquet_path} to JSON at {json_path}")
+
+def emit_json_contents(json_path, output_text_file):
+    with open(json_path, "r") as f:
+        data = json.load(f)
+
+    with open(output_text_file, "a") as f:
+        for item in data:
+            content_line = f"{item['prompt']}"
+            f.write(content_line.strip())
+            f.write("\n")  # Separator between prompt and text
+            content_line = f"{item['text']}"
+            f.write(content_line.strip())
+            f.write("\n\n")  # Separator between entries
+
+def find_parquet_links(url):
+    response = requests.get(url)
+    soup = BeautifulSoup(response.text, 'html.parser')
+    base_url = "https://huggingface.co"
+    links = [base_url + a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.parquet?download=true')]
+    return links
+
+def main(url, output_text_file):
+
+    download_dir = "./downloaded_parquets"
+    json_dir = "./json_output"
+
+    os.makedirs(download_dir, exist_ok=True)
+    os.makedirs(json_dir, exist_ok=True)
+
+    # Ensure the output text file is empty before starting
+    open(output_text_file, "w").close()
+
+    for link in find_parquet_links(url):
+        file_name = os.path.basename(link.split("?")[0])  # Extract filename
+        parquet_path = os.path.join(download_dir, file_name)
+        json_path = os.path.join(json_dir, file_name.replace('.parquet', '.json'))
+
+        if not os.path.isfile(parquet_path):
+            download_file(link, parquet_path)  # Download if not already present
+
+        convert_to_json(parquet_path, json_path)  # Convert to JSON
+
+        emit_json_contents(json_path, output_text_file)  # Append contents to the text file
+
+
+if __name__ == "__main__":
+    description = "Scrape Parquet files from a URL, convert them to JSON, and save their contents to a text file."
+    parser = argparse.ArgumentParser(description=description)
+
+    parser.add_argument("--url", type=str, required=True, help="URL to scrape for Parquet files.")
+    parser.add_argument(
+        "-o",
+        "--output_text_file",
+        type=str,
+        default="input.txt",
+        help="Path to the output text file.",
+    )
+    args = parser.parse_args()
+
+    main(args.url, args.output_text_file)
diff --git a/data/cosmopedia_100k/prepare.py b/data/cosmopedia_100k/prepare.py
new file mode 120000
index 0000000000..713f6b0012
--- /dev/null
+++ b/data/cosmopedia_100k/prepare.py
@@ -0,0 +1 @@
+../template/prepare.py
\ No newline at end of file
diff --git a/data/cosmopedia_100k/utils/meta_util.py b/data/cosmopedia_100k/utils/meta_util.py
new file mode 120000
index 0000000000..62953cf396
--- /dev/null
+++ b/data/cosmopedia_100k/utils/meta_util.py
@@ -0,0 +1 @@
+../../template/utils/meta_util.py
\ No newline at end of file
diff --git a/data/cosmopedia_100k/utils/txt_to_phonemes.sh b/data/cosmopedia_100k/utils/txt_to_phonemes.sh
new file mode 120000
index 0000000000..51a03dc660
--- /dev/null
+++ b/data/cosmopedia_100k/utils/txt_to_phonemes.sh
@@ -0,0 +1 @@
+../../template/utils/txt_to_phonemes.sh
\ No newline at end of file
diff --git a/data/create_new_dataset.sh b/data/create_new_dataset.sh
index 20705dbb9a..f7880ab522 100755
--- a/data/create_new_dataset.sh
+++ b/data/create_new_dataset.sh
@@ -1,14 +1,14 @@
 #!/bin/bash
 
 new_dataset="${1}"
 
-mkdir "$new_dataset"
+mkdir -p "${new_dataset}/utils"
 pushd "$new_dataset"
 
 # Use softlinks so we can use template/prepare.py for development
 ln -s ../template/prepare.py prepare.py
-ln -s ../template/meta_util.py meta_util.py
-ln -s ../template/txt_to_phonemes.sh txt_to_phonemes.sh
+ln -s ../../template/utils/meta_util.py utils/meta_util.py
+ln -s ../../template/utils/txt_to_phonemes.sh utils/txt_to_phonemes.sh
 
 # Different datasets may have different phoneme sets
-cp ../template/phoneme_list.txt .
+cp ../template/utils/phoneme_list.txt utils/phoneme_list.txt
diff --git a/data/template/meta_util.py b/data/template/utils/meta_util.py
similarity index 100%
rename from data/template/meta_util.py
rename to data/template/utils/meta_util.py
diff --git a/data/template/phoneme_list.txt b/data/template/utils/phoneme_list.txt
similarity index 100%
rename from data/template/phoneme_list.txt
rename to data/template/utils/phoneme_list.txt
diff --git a/data/template/txt_to_phonemes.sh b/data/template/utils/txt_to_phonemes.sh
similarity index 100%
rename from data/template/txt_to_phonemes.sh
rename to data/template/utils/txt_to_phonemes.sh
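
As a quick sanity check on the converted data, a minimal sketch like the one below can load one of the JSON files written by `convert_to_json()` and print the fields described in the README. The shard filename is illustrative only; actual names mirror the Parquet files listed on the dataset page.

```python
import json

# Illustrative shard name; real filenames mirror the downloaded Parquet files.
json_path = "json_output/train-00000-of-00002.json"

with open(json_path, "r") as f:
    # convert_to_json() writes each shard with orient="records", i.e. a JSON array of objects.
    records = json.load(f)

print(f"{len(records)} records in {json_path}")
first = records[0]
for key in ("prompt", "text", "seed_data", "format", "audience", "token_length"):
    print(f"{key}: {str(first.get(key))[:80]}")
```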