Merge pull request #142 from klei22/add_cosmo100k_dataset
Add scripts compatible with cosmo100k dataset
gkielian authored Apr 11, 2024
2 parents a5bdbfe + 981ef69 commit ff56285
Showing 9 changed files with 143 additions and 4 deletions.
48 changes: 48 additions & 0 deletions data/cosmopedia_100k/README.md
@@ -0,0 +1,48 @@
Cosmopedia-100k Dataset
=======================

This folder contains scripts compatible with the cosmopedia-100k dataset.

Introduction
------------
Cosmopedia-100k is a smaller subset of the Cosmopedia dataset, a synthetic corpus of textbooks, blog posts, stories, posts, and WikiHow articles generated by Mixtral-8x7B-Instruct-v0.1. Cosmopedia aims to map the world knowledge present in web datasets such as RefinedWeb and RedPajama by generating synthetic content that covers a wide range of topics, and the 100k subset offers a more manageable size for researchers and enthusiasts exploring the capabilities of synthetic data.

Downloading the Dataset
-----------------------
To download the Cosmopedia-100k dataset, use the provided `get_dataset.py` script. The script scrapes the Parquet file links from the specified URL, downloads each file, converts it to JSON, and appends its contents to a single text file.

Here's how to use the script:

```bash
python get_dataset.py --url https://huggingface.co/datasets/HuggingFaceTB/cosmopedia-100k/tree/main/data -o output_text_file.txt
```

Replace `output_text_file.txt` with the path where you want the dataset's text content to be saved.
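
The script caches the downloaded Parquet files in `./downloaded_parquets/` and the intermediate JSON files in `./json_output/` (relative to the directory it is run from), re-downloading or re-converting only what is missing. Each dataset entry is written to the output file as its prompt followed by its text, with a blank line between entries, roughly like this (illustrative placeholders, not real dataset content):

```text
<prompt of entry 1>
<text of entry 1>

<prompt of entry 2>
<text of entry 2>
```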

Dataset Structure
-----------------
The dataset contains the following features:

- `prompt`: The prompt used to generate the content with Mixtral-8x7B-Instruct-v0.1.
- `text`: The synthetically generated content.
- `seed_data`: The name of the dataset or external source used in the prompt.
- `token_length`: The number of tokens in the text, computed using Mistral-7B's tokenizer.
- `format`: The style of text, which can be a textbook, a blogpost, a story, etc.
- `audience`: The target audience defined in the prompt.

The dataset is divided into 8 splits depending on the source of the seed data used. It covers a variety of styles and audiences, enhancing the diversity of the content.
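
For quick exploration without running the download script, the dataset can also be loaded directly with the Hugging Face `datasets` library. The snippet below is a minimal sketch, not part of this repository; it assumes `datasets` is installed and that the hub repository exposes a single `train` split.

```python
from collections import Counter

from datasets import load_dataset

# Download and load the 100k-example subset from the Hugging Face Hub.
ds = load_dataset("HuggingFaceTB/cosmopedia-100k", split="train")

# Inspect the available columns and one example's metadata.
print(ds.column_names)  # expected: ['prompt', 'text', 'seed_data', 'token_length', 'format', 'audience']
print(ds[0]["format"], "->", ds[0]["audience"])

# Rough breakdown of styles (textbook, blogpost, story, ...) across the subset.
print(Counter(ds["format"]).most_common())
```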

Citation
--------
If you use the Cosmopedia-100k dataset in your research, please cite the following:

```bibtex
@software{benallal2024cosmopedia,
  author = {Ben Allal, Loubna and Lozhkov, Anton and Penedo, Guilherme and Wolf, Thomas and von Werra, Leandro},
  title = {Cosmopedia-100k},
  month = feb,
  year = 2024,
  url = {https://huggingface.co/datasets/HuggingFaceTB/cosmopedia-100k}
}
```

88 changes: 88 additions & 0 deletions data/cosmopedia_100k/get_dataset.py
@@ -0,0 +1,88 @@
import argparse
import json
import os
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
import requests

def download_file(url, filename):
    response = requests.get(url, stream=True)
    response.raise_for_status()  # Ensure the download was successful.
    total_size = int(response.headers.get("content-length", 0))
    block_size = 1024  # 1 Kibibyte
    progress_bar = tqdm(total=total_size, unit="iB", unit_scale=True)
    with open(filename, "wb") as f:
        for data in response.iter_content(block_size):
            progress_bar.update(len(data))
            f.write(data)
    if total_size and progress_bar.n != total_size:
        raise Exception("Error: Failed to download the file completely.")
    print(f"Downloaded {filename}")

def convert_to_json(parquet_path, json_path):
    if not os.path.exists(json_path):
        df = pd.read_parquet(parquet_path)
        df.to_json(json_path, orient="records")
        print(f"Converted {parquet_path} to JSON at {json_path}")
    else:
        print(f"{json_path} already exists, skipping conversion.")

def emit_json_contents(json_path, output_text_file):
    with open(json_path, "r") as f:
        data = json.load(f)

    with open(output_text_file, "a") as f:
        for item in data:
            content_line = f"{item['prompt']}"
            f.write(content_line.strip())
            f.write("\n")  # Separator between prompts and texts
            content_line = f"{item['text']}"
            f.write(content_line.strip())
            f.write("\n\n")  # Separator between entries

def find_parquet_links(url):
    # Collect absolute links to every Parquet file exposed on the dataset page.
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    base_url = "https://huggingface.co"
    links = [base_url + a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.parquet?download=true')]
    return links

def main(url, output_text_file):
    download_dir = "./downloaded_parquets"
    json_dir = "./json_output"

    os.makedirs(download_dir, exist_ok=True)
    os.makedirs(json_dir, exist_ok=True)

    # Ensure the output text file is empty before starting
    open(output_text_file, "w").close()

    for link in find_parquet_links(url):
        file_name = os.path.basename(link.split("?")[0])  # Extract filename
        parquet_path = os.path.join(download_dir, file_name)
        json_path = os.path.join(json_dir, file_name.replace('.parquet', '.json'))

        if not os.path.isfile(parquet_path):
            download_file(link, parquet_path)  # Download if not present

        convert_to_json(parquet_path, json_path)  # Convert to JSON

        emit_json_contents(json_path, output_text_file)  # Emit contents


if __name__ == "__main__":
    description = "Scrape and convert Parquet files from URL to JSON and save its contents to a text file."
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument("--url", type=str, required=True, help="URL to scrape for Parquet files.")
    parser.add_argument(
        "-o",
        "--output_text_file",
        type=str,
        default="input.txt",
        help="Path to the output text file.",
    )
    args = parser.parse_args()

    main(args.url, args.output_text_file)
1 change: 1 addition & 0 deletions data/cosmopedia_100k/prepare.py
1 change: 1 addition & 0 deletions data/cosmopedia_100k/utils/meta_util.py
1 change: 1 addition & 0 deletions data/cosmopedia_100k/utils/txt_to_phonemes.sh
8 changes: 4 additions & 4 deletions data/create_new_dataset.sh
@@ -1,14 +1,14 @@
#!/bin/bash

new_dataset="${1}"
-mkdir "$new_dataset"
+mkdir -p "${new_dataset}/utils"
pushd "$new_dataset"

# Use softlinks so we can use template/prepare.py for development
ln -s ../template/prepare.py prepare.py
-ln -s ../template/meta_util.py meta_util.py
-ln -s ../template/txt_to_phonemes.sh txt_to_phonemes.sh
+ln -s ../template/utils/meta_util.py utils/meta_util.py
+ln -s ../template/utils/txt_to_phonemes.sh utils/txt_to_phonemes.sh

# Different datasets may have different phoneme sets
-cp ../template/phoneme_list.txt .
+cp ../template/utils/phoneme_list.txt utils/phoneme_list.txt

File renamed without changes.
File renamed without changes.
File renamed without changes.
