From fa398b7b022843b76eb61c44cb931d33babf21c0 Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Tue, 30 Jul 2024 12:06:37 -0700 Subject: [PATCH 01/14] add notebook Signed-off-by: Sarah Yurick --- .../multiple_quality_models.ipynb | 697 ++++++++++++++++++ 1 file changed, 697 insertions(+) create mode 100644 tutorials/distributed_data_classification/multiple_quality_models.ipynb diff --git a/tutorials/distributed_data_classification/multiple_quality_models.ipynb b/tutorials/distributed_data_classification/multiple_quality_models.ipynb new file mode 100644 index 000000000..d1358edb4 --- /dev/null +++ b/tutorials/distributed_data_classification/multiple_quality_models.ipynb @@ -0,0 +1,697 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Distributed Data Classification with Multiple Quality Classifiers\n", + "\n", + "The notebook demonstrates the use of five quality classifiers for distributed data classification. Each fold was trained on 80% of the data, so the results can be ensembled into a single prediction. These classifers help with annotation which helps data blending for foundation model training.\n", + "\n", + "The classifiers are accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "env: PYTHONWARNINGS=ignore\n", + "env: DASK_DATAFRAME__QUERY_PLANNING=False\n" + ] + } + ], + "source": [ + "# Silence Warnings (HuggingFace internal warnings)\n", + "\n", + "%env PYTHONWARNINGS=ignore\n", + "%env DASK_DATAFRAME__QUERY_PLANNING=False\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from nemo_curator import QualityClassifier\n", + "from nemo_curator.datasets import DocumentDataset\n", + "from nemo_curator.utils.distributed_utils import get_client\n", + "import cudf\n", + "import dask_cudf" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "client = get_client(cluster_type=\"gpu\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Set File Paths " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "output_file_path = \"output_data_dir/\"\n", + "output_file_path = \"/home/nfs/syurick/NeMo-Curator/tutorials/distributed_data_classification/nb_q_result\"\n", + "quality_model_paths = [\n", + " \"quality_model0.pth\",\n", + " \"quality_model1.pth\",\n", + " \"quality_model2.pth\",\n", + " \"quality_model3.pth\",\n", + " \"quality_model4.pth\",\n", + "]\n", + "quality_model_paths = [\n", + " \"/home/nfs/syurick/LLM_quality_classifier_inference/ensemble/quality_surge_full_22828_1e5_5ep_bs64_1024_fold0_best.pth\",\n", + " \"/home/nfs/syurick/LLM_quality_classifier_inference/ensemble/quality_surge_full_22828_1e5_5ep_bs64_1024_fold1_best.pth\",\n", + " \"/home/nfs/syurick/LLM_quality_classifier_inference/ensemble/quality_surge_full_22828_1e5_5ep_bs64_1024_fold2_best.pth\",\n", + " 
\"/home/nfs/syurick/LLM_quality_classifier_inference/ensemble/quality_surge_full_22828_1e5_5ep_bs64_1024_fold3_best.pth\",\n", + " \"/home/nfs/syurick/LLM_quality_classifier_inference/ensemble/quality_surge_full_22828_1e5_5ep_bs64_1024_fold4_best.pth\",\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create and Run Classifiers" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Create sample DataFrame\n", + "text = [\n", + " \"Quantum computing is set to revolutionize the field of cryptography.\",\n", + " \"Investing in index funds is a popular strategy for long-term financial growth.\",\n", + " \"Recent advancements in gene therapy offer new hope for treating genetic disorders.\",\n", + " \"Online learning platforms have transformed the way students access educational resources.\",\n", + " \"Traveling to Europe during the off-season can be a more budget-friendly option.\",\n", + " \"Training regimens for athletes have become more sophisticated with the use of data analytics.\",\n", + " \"Streaming services are changing the way people consume television and film content.\",\n", + " \"Vegan recipes have gained popularity as more people adopt plant-based diets.\",\n", + " \"Climate change research is critical for developing sustainable environmental policies.\",\n", + " \"Telemedicine has become increasingly popular due to its convenience and accessibility.\",\n", + "]\n", + "df = cudf.DataFrame({\"text\": text})\n", + "dataset = DocumentDataset(dask_cudf.from_cudf(df, npartitions=1))\n", + "write_to_filename = False\n", + "\n", + "# Alternatively, read existing directory of JSONL files\n", + "# input_file_path=\"/input_data_dir/\"\n", + "# input_dataset = DocumentDataset.read_json(\n", + "# input_file_path, backend=\"cudf\", add_filename=True\n", + "# )\n", + "# write_to_filename = True" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + 
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting Quality classifier inference\n", + "Starting Quality classifier inference\n", + "Starting Quality classifier inference\n", + "Starting Quality classifier inference\n", + "Starting Quality classifier inference\n" + ] + } + ], + "source": [ + "fold = 0\n", + "pred_columns = []\n", + "for quality_model_path in quality_model_paths:\n", + " pred_column = \"quality_pred_\" + str(fold)\n", + " prob_column = \"quality_prob_\" + str(fold)\n", + " pred_columns.append(pred_column)\n", + "\n", + " classifier = QualityClassifier(\n", + " model_path=quality_model_path,\n", + " batch_size=1024,\n", + " pred_column=pred_column,\n", + " prob_column=prob_column,\n", + " )\n", + " dataset = classifier(dataset=dataset)\n", + " fold += 1" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:04<00:00, 2.44it/s]\n", + "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:00<00:00, 30.69it/s]\n", + "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:00<00:00, 29.82it/s]\n", + "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:00<00:00, 31.67it/s]\n", + "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:00<00:00, 82.84it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 613 ms, sys: 361 ms, total: 974 ms\n", + "Wall time: 10.7 s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:00<00:00, 31.16it/s]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textquality_prob_0quality_pred_0quality_prob_1quality_pred_1quality_prob_2quality_pred_2quality_prob_3quality_pred_3quality_prob_4quality_pred_4
0Quantum computing is set to revolutionize the ...[0.3728572130203247, 0.499016135931015, 0.1281...Medium[0.3728572130203247, 0.499016135931015, 0.1281...Medium[0.3728572130203247, 0.499016135931015, 0.1281...Medium[0.3728572130203247, 0.499016135931015, 0.1281...Medium[0.3728572130203247, 0.499016135931015, 0.1281...Medium
1Investing in index funds is a popular strategy...[0.34133055806159973, 0.5345032215118408, 0.12...Medium[0.34133055806159973, 0.5345032215118408, 0.12...Medium[0.34133055806159973, 0.5345032215118408, 0.12...Medium[0.34133055806159973, 0.5345032215118408, 0.12...Medium[0.34133055806159973, 0.5345032215118408, 0.12...Medium
2Recent advancements in gene therapy offer new ...[0.3898108899593353, 0.4821754992008209, 0.128...Medium[0.3898108899593353, 0.4821754992008209, 0.128...Medium[0.3898108899593353, 0.4821754992008209, 0.128...Medium[0.3898108899593353, 0.4821754992008209, 0.128...Medium[0.3898108899593353, 0.4821754992008209, 0.128...Medium
3Online learning platforms have transformed the...[0.38701269030570984, 0.4876796007156372, 0.12...Medium[0.38701269030570984, 0.4876796007156372, 0.12...Medium[0.38701269030570984, 0.4876796007156372, 0.12...Medium[0.38701269030570984, 0.4876796007156372, 0.12...Medium[0.38701269030570984, 0.4876796007156372, 0.12...Medium
4Traveling to Europe during the off-season can ...[0.32102224230766296, 0.5830105543136597, 0.09...Medium[0.32102224230766296, 0.5830105543136597, 0.09...Medium[0.32102224230766296, 0.5830105543136597, 0.09...Medium[0.32102224230766296, 0.5830105543136597, 0.09...Medium[0.32102224230766296, 0.5830105543136597, 0.09...Medium
5Training regimens for athletes have become mor...[0.34178370237350464, 0.5548713207244873, 0.10...Medium[0.34178370237350464, 0.5548713207244873, 0.10...Medium[0.34178370237350464, 0.5548713207244873, 0.10...Medium[0.34178370237350464, 0.5548713207244873, 0.10...Medium[0.34178370237350464, 0.5548713207244873, 0.10...Medium
6Streaming services are changing the way people...[0.35998600721359253, 0.525088906288147, 0.114...Medium[0.35998600721359253, 0.525088906288147, 0.114...Medium[0.35998600721359253, 0.525088906288147, 0.114...Medium[0.35998600721359253, 0.525088906288147, 0.114...Medium[0.35998600721359253, 0.525088906288147, 0.114...Medium
7Vegan recipes have gained popularity as more p...[0.3145926594734192, 0.5717698335647583, 0.113...Medium[0.3145926594734192, 0.5717698335647583, 0.113...Medium[0.3145926594734192, 0.5717698335647583, 0.113...Medium[0.3145926594734192, 0.5717698335647583, 0.113...Medium[0.3145926594734192, 0.5717698335647583, 0.113...Medium
8Climate change research is critical for develo...[0.3767526149749756, 0.5015591979026794, 0.121...Medium[0.3767526149749756, 0.5015591979026794, 0.121...Medium[0.3767526149749756, 0.5015591979026794, 0.121...Medium[0.3767526149749756, 0.5015591979026794, 0.121...Medium[0.3767526149749756, 0.5015591979026794, 0.121...Medium
9Telemedicine has become increasingly popular d...[0.34938278794288635, 0.5144903659820557, 0.13...Medium[0.34938278794288635, 0.5144903659820557, 0.13...Medium[0.34938278794288635, 0.5144903659820557, 0.13...Medium[0.34938278794288635, 0.5144903659820557, 0.13...Medium[0.34938278794288635, 0.5144903659820557, 0.13...Medium
\n", + "
" + ], + "text/plain": [ + " text \\\n", + "0 Quantum computing is set to revolutionize the ... \n", + "1 Investing in index funds is a popular strategy... \n", + "2 Recent advancements in gene therapy offer new ... \n", + "3 Online learning platforms have transformed the... \n", + "4 Traveling to Europe during the off-season can ... \n", + "5 Training regimens for athletes have become mor... \n", + "6 Streaming services are changing the way people... \n", + "7 Vegan recipes have gained popularity as more p... \n", + "8 Climate change research is critical for develo... \n", + "9 Telemedicine has become increasingly popular d... \n", + "\n", + " quality_prob_0 quality_pred_0 \\\n", + "0 [0.3728572130203247, 0.499016135931015, 0.1281... Medium \n", + "1 [0.34133055806159973, 0.5345032215118408, 0.12... Medium \n", + "2 [0.3898108899593353, 0.4821754992008209, 0.128... Medium \n", + "3 [0.38701269030570984, 0.4876796007156372, 0.12... Medium \n", + "4 [0.32102224230766296, 0.5830105543136597, 0.09... Medium \n", + "5 [0.34178370237350464, 0.5548713207244873, 0.10... Medium \n", + "6 [0.35998600721359253, 0.525088906288147, 0.114... Medium \n", + "7 [0.3145926594734192, 0.5717698335647583, 0.113... Medium \n", + "8 [0.3767526149749756, 0.5015591979026794, 0.121... Medium \n", + "9 [0.34938278794288635, 0.5144903659820557, 0.13... Medium \n", + "\n", + " quality_prob_1 quality_pred_1 \\\n", + "0 [0.3728572130203247, 0.499016135931015, 0.1281... Medium \n", + "1 [0.34133055806159973, 0.5345032215118408, 0.12... Medium \n", + "2 [0.3898108899593353, 0.4821754992008209, 0.128... Medium \n", + "3 [0.38701269030570984, 0.4876796007156372, 0.12... Medium \n", + "4 [0.32102224230766296, 0.5830105543136597, 0.09... Medium \n", + "5 [0.34178370237350464, 0.5548713207244873, 0.10... Medium \n", + "6 [0.35998600721359253, 0.525088906288147, 0.114... Medium \n", + "7 [0.3145926594734192, 0.5717698335647583, 0.113... Medium \n", + "8 [0.3767526149749756, 0.5015591979026794, 0.121... 
Medium \n", + "9 [0.34938278794288635, 0.5144903659820557, 0.13... Medium \n", + "\n", + " quality_prob_2 quality_pred_2 \\\n", + "0 [0.3728572130203247, 0.499016135931015, 0.1281... Medium \n", + "1 [0.34133055806159973, 0.5345032215118408, 0.12... Medium \n", + "2 [0.3898108899593353, 0.4821754992008209, 0.128... Medium \n", + "3 [0.38701269030570984, 0.4876796007156372, 0.12... Medium \n", + "4 [0.32102224230766296, 0.5830105543136597, 0.09... Medium \n", + "5 [0.34178370237350464, 0.5548713207244873, 0.10... Medium \n", + "6 [0.35998600721359253, 0.525088906288147, 0.114... Medium \n", + "7 [0.3145926594734192, 0.5717698335647583, 0.113... Medium \n", + "8 [0.3767526149749756, 0.5015591979026794, 0.121... Medium \n", + "9 [0.34938278794288635, 0.5144903659820557, 0.13... Medium \n", + "\n", + " quality_prob_3 quality_pred_3 \\\n", + "0 [0.3728572130203247, 0.499016135931015, 0.1281... Medium \n", + "1 [0.34133055806159973, 0.5345032215118408, 0.12... Medium \n", + "2 [0.3898108899593353, 0.4821754992008209, 0.128... Medium \n", + "3 [0.38701269030570984, 0.4876796007156372, 0.12... Medium \n", + "4 [0.32102224230766296, 0.5830105543136597, 0.09... Medium \n", + "5 [0.34178370237350464, 0.5548713207244873, 0.10... Medium \n", + "6 [0.35998600721359253, 0.525088906288147, 0.114... Medium \n", + "7 [0.3145926594734192, 0.5717698335647583, 0.113... Medium \n", + "8 [0.3767526149749756, 0.5015591979026794, 0.121... Medium \n", + "9 [0.34938278794288635, 0.5144903659820557, 0.13... Medium \n", + "\n", + " quality_prob_4 quality_pred_4 \n", + "0 [0.3728572130203247, 0.499016135931015, 0.1281... Medium \n", + "1 [0.34133055806159973, 0.5345032215118408, 0.12... Medium \n", + "2 [0.3898108899593353, 0.4821754992008209, 0.128... Medium \n", + "3 [0.38701269030570984, 0.4876796007156372, 0.12... Medium \n", + "4 [0.32102224230766296, 0.5830105543136597, 0.09... Medium \n", + "5 [0.34178370237350464, 0.5548713207244873, 0.10... 
Medium \n", + "6 [0.35998600721359253, 0.525088906288147, 0.114... Medium \n", + "7 [0.3145926594734192, 0.5717698335647583, 0.113... Medium \n", + "8 [0.3767526149749756, 0.5015591979026794, 0.121... Medium \n", + "9 [0.34938278794288635, 0.5144903659820557, 0.13... Medium " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "\n", + "dataset.df.compute()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:04<00:00, 2.41it/s]\n", + "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:00<00:00, 27.78it/s]\n", + "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:00<00:00, 29.53it/s]\n", + "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:00<00:00, 28.27it/s]\n", + "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:00<00:00, 30.57it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Writing to disk complete for 1 partitions\n" + ] + } + ], + "source": [ + "dataset.to_json(output_file_dir=output_file_path, write_to_filename=write_to_filename)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Inspect the Output" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading 1 files\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
quality_pred_0quality_pred_1quality_pred_2quality_pred_3quality_pred_4quality_prob_0quality_prob_1quality_prob_2quality_prob_3quality_prob_4text
0MediumMediumMediumMediumMedium[0.3728572130000001, 0.4990161359000001, 0.128...[0.3728572130000001, 0.4990161359000001, 0.128...[0.3728572130000001, 0.4990161359000001, 0.128...[0.3728572130000001, 0.4990161359000001, 0.128...[0.3728572130000001, 0.4990161359000001, 0.128...Quantum computing is set to revolutionize the ...
1MediumMediumMediumMediumMedium[0.3413305581, 0.5345032215, 0.12416620550000003][0.3413305581, 0.5345032215, 0.12416620550000003][0.3413305581, 0.5345032215, 0.12416620550000003][0.3413305581, 0.5345032215, 0.12416620550000003][0.3413305581, 0.5345032215, 0.12416620550000003]Investing in index funds is a popular strategy...
2MediumMediumMediumMediumMedium[0.38981089000000013, 0.4821754992000001, 0.12...[0.38981089000000013, 0.4821754992000001, 0.12...[0.38981089000000013, 0.4821754992000001, 0.12...[0.38981089000000013, 0.4821754992000001, 0.12...[0.38981089000000013, 0.4821754992000001, 0.12...Recent advancements in gene therapy offer new ...
3MediumMediumMediumMediumMedium[0.38701269030000013, 0.48767960070000005, 0.1...[0.38701269030000013, 0.48767960070000005, 0.1...[0.38701269030000013, 0.48767960070000005, 0.1...[0.38701269030000013, 0.48767960070000005, 0.1...[0.38701269030000013, 0.48767960070000005, 0.1...Online learning platforms have transformed the...
4MediumMediumMediumMediumMedium[0.3210222423000001, 0.5830105542999999, 0.095...[0.3210222423000001, 0.5830105542999999, 0.095...[0.3210222423000001, 0.5830105542999999, 0.095...[0.3210222423000001, 0.5830105542999999, 0.095...[0.3210222423000001, 0.5830105542999999, 0.095...Traveling to Europe during the off-season can ...
\n", + "
" + ], + "text/plain": [ + " quality_pred_0 quality_pred_1 quality_pred_2 quality_pred_3 quality_pred_4 \\\n", + "0 Medium Medium Medium Medium Medium \n", + "1 Medium Medium Medium Medium Medium \n", + "2 Medium Medium Medium Medium Medium \n", + "3 Medium Medium Medium Medium Medium \n", + "4 Medium Medium Medium Medium Medium \n", + "\n", + " quality_prob_0 \\\n", + "0 [0.3728572130000001, 0.4990161359000001, 0.128... \n", + "1 [0.3413305581, 0.5345032215, 0.12416620550000003] \n", + "2 [0.38981089000000013, 0.4821754992000001, 0.12... \n", + "3 [0.38701269030000013, 0.48767960070000005, 0.1... \n", + "4 [0.3210222423000001, 0.5830105542999999, 0.095... \n", + "\n", + " quality_prob_1 \\\n", + "0 [0.3728572130000001, 0.4990161359000001, 0.128... \n", + "1 [0.3413305581, 0.5345032215, 0.12416620550000003] \n", + "2 [0.38981089000000013, 0.4821754992000001, 0.12... \n", + "3 [0.38701269030000013, 0.48767960070000005, 0.1... \n", + "4 [0.3210222423000001, 0.5830105542999999, 0.095... \n", + "\n", + " quality_prob_2 \\\n", + "0 [0.3728572130000001, 0.4990161359000001, 0.128... \n", + "1 [0.3413305581, 0.5345032215, 0.12416620550000003] \n", + "2 [0.38981089000000013, 0.4821754992000001, 0.12... \n", + "3 [0.38701269030000013, 0.48767960070000005, 0.1... \n", + "4 [0.3210222423000001, 0.5830105542999999, 0.095... \n", + "\n", + " quality_prob_3 \\\n", + "0 [0.3728572130000001, 0.4990161359000001, 0.128... \n", + "1 [0.3413305581, 0.5345032215, 0.12416620550000003] \n", + "2 [0.38981089000000013, 0.4821754992000001, 0.12... \n", + "3 [0.38701269030000013, 0.48767960070000005, 0.1... \n", + "4 [0.3210222423000001, 0.5830105542999999, 0.095... \n", + "\n", + " quality_prob_4 \\\n", + "0 [0.3728572130000001, 0.4990161359000001, 0.128... \n", + "1 [0.3413305581, 0.5345032215, 0.12416620550000003] \n", + "2 [0.38981089000000013, 0.4821754992000001, 0.12... \n", + "3 [0.38701269030000013, 0.48767960070000005, 0.1... 
\n", + "4 [0.3210222423000001, 0.5830105542999999, 0.095... \n", + "\n", + " text \n", + "0 Quantum computing is set to revolutionize the ... \n", + "1 Investing in index funds is a popular strategy... \n", + "2 Recent advancements in gene therapy offer new ... \n", + "3 Online learning platforms have transformed the... \n", + "4 Traveling to Europe during the off-season can ... " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_dataset = DocumentDataset.read_json(output_file_path, backend=\"cudf\", add_filename=write_to_filename)\n", + "output_dataset.df.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "NeMo-Curator-env-2", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 13ff99cd21e46727125a14fcb3245f27f08b2458 Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Wed, 7 Aug 2024 13:26:07 -0700 Subject: [PATCH 02/14] continue debugging Signed-off-by: Sarah Yurick --- .../multiple_quality_models.ipynb | 397 +++--------------- 1 file changed, 54 insertions(+), 343 deletions(-) diff --git a/tutorials/distributed_data_classification/multiple_quality_models.ipynb b/tutorials/distributed_data_classification/multiple_quality_models.ipynb index d1358edb4..4f799597b 100644 --- a/tutorials/distributed_data_classification/multiple_quality_models.ipynb +++ b/tutorials/distributed_data_classification/multiple_quality_models.ipynb @@ -40,7 +40,7 @@ "metadata": {}, "outputs": [], "source": [ - "from nemo_curator import QualityClassifier\n", + "from nemo_curator.classifiers import QualityClassifier\n", "from nemo_curator.datasets import DocumentDataset\n", "from 
nemo_curator.utils.distributed_utils import get_client\n", "import cudf\n", @@ -96,7 +96,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -127,68 +127,62 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Starting Quality classifier inference\n", - "Starting Quality classifier inference\n", - "Starting Quality classifier inference\n", "Starting Quality classifier inference\n", "Starting Quality classifier inference\n" ] } ], "source": [ - "fold = 0\n", "pred_columns = []\n", - "for quality_model_path in quality_model_paths:\n", - " pred_column = \"quality_pred_\" + str(fold)\n", - " prob_column = \"quality_prob_\" + str(fold)\n", - " pred_columns.append(pred_column)\n", + "prob_columns = []\n", + "\n", + "# Fold 0\n", + "quality_model_path = quality_model_paths[0]\n", + "pred_column = \"quality_pred_0\"\n", + "prob_column = \"quality_prob_0\"\n", + "pred_columns.append(pred_column)\n", + "prob_columns.append(prob_column)\n", + "classifier_0 = QualityClassifier(\n", + " model_path=quality_model_path,\n", + " batch_size=1024,\n", + " pred_column=pred_column,\n", + " prob_column=prob_column,\n", + ")\n", + "dataset_0 = classifier_0(dataset=dataset)\n", "\n", - " classifier = QualityClassifier(\n", - " model_path=quality_model_path,\n", - " batch_size=1024,\n", - " pred_column=pred_column,\n", - " prob_column=prob_column,\n", - " )\n", - " dataset = classifier(dataset=dataset)\n", - " fold += 1" + "# Fold 1\n", + "quality_model_path = quality_model_paths[1]\n", + "pred_column = \"quality_pred_1\"\n", + "prob_column = \"quality_prob_1\"\n", + "pred_columns.append(pred_column)\n", + "prob_columns.append(prob_column)\n", + "classifier_1 = QualityClassifier(\n", + " model_path=quality_model_path,\n", + " batch_size=1024,\n", + " pred_column=pred_column,\n", + " 
prob_column=prob_column,\n", + ")\n", + "dataset_1 = classifier_1(dataset=dataset)" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:04<00:00, 2.44it/s]\n", - "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:00<00:00, 30.69it/s]\n", - "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:00<00:00, 29.82it/s]\n", - "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:00<00:00, 31.67it/s]\n", - "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:00<00:00, 82.84it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 613 ms, sys: 361 ms, total: 974 ms\n", - "Wall time: 10.7 s\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:00<00:00, 31.16it/s]\n" + "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:03<00:00, 3.18it/s]\n", + "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:03<00:00, 3.23it/s]\n" ] }, { @@ -217,12 +211,6 @@ " quality_pred_0\n", " quality_prob_1\n", " quality_pred_1\n", - " quality_prob_2\n", - " quality_pred_2\n", - " quality_prob_3\n", - " quality_pred_3\n", - " quality_prob_4\n", - " quality_pred_4\n", " \n", " \n", " \n", @@ -233,12 +221,6 @@ " Medium\n", " [0.3728572130203247, 0.499016135931015, 0.1281...\n", " Medium\n", - " [0.3728572130203247, 0.499016135931015, 0.1281...\n", - " Medium\n", - " [0.3728572130203247, 0.499016135931015, 0.1281...\n", - " Medium\n", - " [0.3728572130203247, 0.499016135931015, 0.1281...\n", - " Medium\n", " \n", " \n", " 1\n", @@ -247,12 +229,6 @@ " Medium\n", " [0.34133055806159973, 0.5345032215118408, 0.12...\n", " Medium\n", - " [0.34133055806159973, 0.5345032215118408, 0.12...\n", - " Medium\n", - " [0.34133055806159973, 0.5345032215118408, 0.12...\n", - " Medium\n", - " [0.34133055806159973, 0.5345032215118408, 0.12...\n", - " Medium\n", " \n", " \n", " 2\n", @@ -261,12 +237,6 @@ " 
Medium\n", " [0.3898108899593353, 0.4821754992008209, 0.128...\n", " Medium\n", - " [0.3898108899593353, 0.4821754992008209, 0.128...\n", - " Medium\n", - " [0.3898108899593353, 0.4821754992008209, 0.128...\n", - " Medium\n", - " [0.3898108899593353, 0.4821754992008209, 0.128...\n", - " Medium\n", " \n", " \n", " 3\n", @@ -275,12 +245,6 @@ " Medium\n", " [0.38701269030570984, 0.4876796007156372, 0.12...\n", " Medium\n", - " [0.38701269030570984, 0.4876796007156372, 0.12...\n", - " Medium\n", - " [0.38701269030570984, 0.4876796007156372, 0.12...\n", - " Medium\n", - " [0.38701269030570984, 0.4876796007156372, 0.12...\n", - " Medium\n", " \n", " \n", " 4\n", @@ -289,12 +253,6 @@ " Medium\n", " [0.32102224230766296, 0.5830105543136597, 0.09...\n", " Medium\n", - " [0.32102224230766296, 0.5830105543136597, 0.09...\n", - " Medium\n", - " [0.32102224230766296, 0.5830105543136597, 0.09...\n", - " Medium\n", - " [0.32102224230766296, 0.5830105543136597, 0.09...\n", - " Medium\n", " \n", " \n", " 5\n", @@ -303,12 +261,6 @@ " Medium\n", " [0.34178370237350464, 0.5548713207244873, 0.10...\n", " Medium\n", - " [0.34178370237350464, 0.5548713207244873, 0.10...\n", - " Medium\n", - " [0.34178370237350464, 0.5548713207244873, 0.10...\n", - " Medium\n", - " [0.34178370237350464, 0.5548713207244873, 0.10...\n", - " Medium\n", " \n", " \n", " 6\n", @@ -317,12 +269,6 @@ " Medium\n", " [0.35998600721359253, 0.525088906288147, 0.114...\n", " Medium\n", - " [0.35998600721359253, 0.525088906288147, 0.114...\n", - " Medium\n", - " [0.35998600721359253, 0.525088906288147, 0.114...\n", - " Medium\n", - " [0.35998600721359253, 0.525088906288147, 0.114...\n", - " Medium\n", " \n", " \n", " 7\n", @@ -331,12 +277,6 @@ " Medium\n", " [0.3145926594734192, 0.5717698335647583, 0.113...\n", " Medium\n", - " [0.3145926594734192, 0.5717698335647583, 0.113...\n", - " Medium\n", - " [0.3145926594734192, 0.5717698335647583, 0.113...\n", - " Medium\n", - " [0.3145926594734192, 0.5717698335647583, 
0.113...\n", - " Medium\n", " \n", " \n", " 8\n", @@ -345,12 +285,6 @@ " Medium\n", " [0.3767526149749756, 0.5015591979026794, 0.121...\n", " Medium\n", - " [0.3767526149749756, 0.5015591979026794, 0.121...\n", - " Medium\n", - " [0.3767526149749756, 0.5015591979026794, 0.121...\n", - " Medium\n", - " [0.3767526149749756, 0.5015591979026794, 0.121...\n", - " Medium\n", " \n", " \n", " 9\n", @@ -359,12 +293,6 @@ " Medium\n", " [0.34938278794288635, 0.5144903659820557, 0.13...\n", " Medium\n", - " [0.34938278794288635, 0.5144903659820557, 0.13...\n", - " Medium\n", - " [0.34938278794288635, 0.5144903659820557, 0.13...\n", - " Medium\n", - " [0.34938278794288635, 0.5144903659820557, 0.13...\n", - " Medium\n", " \n", " \n", "\n", @@ -395,43 +323,7 @@ "8 [0.3767526149749756, 0.5015591979026794, 0.121... Medium \n", "9 [0.34938278794288635, 0.5144903659820557, 0.13... Medium \n", "\n", - " quality_prob_1 quality_pred_1 \\\n", - "0 [0.3728572130203247, 0.499016135931015, 0.1281... Medium \n", - "1 [0.34133055806159973, 0.5345032215118408, 0.12... Medium \n", - "2 [0.3898108899593353, 0.4821754992008209, 0.128... Medium \n", - "3 [0.38701269030570984, 0.4876796007156372, 0.12... Medium \n", - "4 [0.32102224230766296, 0.5830105543136597, 0.09... Medium \n", - "5 [0.34178370237350464, 0.5548713207244873, 0.10... Medium \n", - "6 [0.35998600721359253, 0.525088906288147, 0.114... Medium \n", - "7 [0.3145926594734192, 0.5717698335647583, 0.113... Medium \n", - "8 [0.3767526149749756, 0.5015591979026794, 0.121... Medium \n", - "9 [0.34938278794288635, 0.5144903659820557, 0.13... Medium \n", - "\n", - " quality_prob_2 quality_pred_2 \\\n", - "0 [0.3728572130203247, 0.499016135931015, 0.1281... Medium \n", - "1 [0.34133055806159973, 0.5345032215118408, 0.12... Medium \n", - "2 [0.3898108899593353, 0.4821754992008209, 0.128... Medium \n", - "3 [0.38701269030570984, 0.4876796007156372, 0.12... Medium \n", - "4 [0.32102224230766296, 0.5830105543136597, 0.09... 
Medium \n", - "5 [0.34178370237350464, 0.5548713207244873, 0.10... Medium \n", - "6 [0.35998600721359253, 0.525088906288147, 0.114... Medium \n", - "7 [0.3145926594734192, 0.5717698335647583, 0.113... Medium \n", - "8 [0.3767526149749756, 0.5015591979026794, 0.121... Medium \n", - "9 [0.34938278794288635, 0.5144903659820557, 0.13... Medium \n", - "\n", - " quality_prob_3 quality_pred_3 \\\n", - "0 [0.3728572130203247, 0.499016135931015, 0.1281... Medium \n", - "1 [0.34133055806159973, 0.5345032215118408, 0.12... Medium \n", - "2 [0.3898108899593353, 0.4821754992008209, 0.128... Medium \n", - "3 [0.38701269030570984, 0.4876796007156372, 0.12... Medium \n", - "4 [0.32102224230766296, 0.5830105543136597, 0.09... Medium \n", - "5 [0.34178370237350464, 0.5548713207244873, 0.10... Medium \n", - "6 [0.35998600721359253, 0.525088906288147, 0.114... Medium \n", - "7 [0.3145926594734192, 0.5717698335647583, 0.113... Medium \n", - "8 [0.3767526149749756, 0.5015591979026794, 0.121... Medium \n", - "9 [0.34938278794288635, 0.5144903659820557, 0.13... Medium \n", - "\n", - " quality_prob_4 quality_pred_4 \n", + " quality_prob_1 quality_pred_1 \n", "0 [0.3728572130203247, 0.499016135931015, 0.1281... Medium \n", "1 [0.34133055806159973, 0.5345032215118408, 0.12... Medium \n", "2 [0.3898108899593353, 0.4821754992008209, 0.128... Medium \n", @@ -444,43 +336,27 @@ "9 [0.34938278794288635, 0.5144903659820557, 0.13... 
Medium " ] }, - "execution_count": 7, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "%%time\n", + "# Only saving Classifier 0 results\n", + "# Classifier 0: 0.3728572130203247\n", + "# Classifier 1: 0.3029068112373352\n", "\n", - "dataset.df.compute()" + "merged_df = dataset_0.df.merge(dataset_1.df, on=\"text\")\n", + "merged_df.compute()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:04<00:00, 2.41it/s]\n", - "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:00<00:00, 27.78it/s]\n", - "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:00<00:00, 29.53it/s]\n", - "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:00<00:00, 28.27it/s]\n", - "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:00<00:00, 30.57it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Writing to disk complete for 1 partitions\n" - ] - } - ], + "outputs": [], "source": [ - "dataset.to_json(output_file_dir=output_file_path, write_to_filename=write_to_filename)" + "# dataset.to_json(output_file_dir=output_file_path, write_to_filename=write_to_filename)" ] }, { @@ -492,185 +368,20 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 23, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Reading 1 files\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
quality_pred_0quality_pred_1quality_pred_2quality_pred_3quality_pred_4quality_prob_0quality_prob_1quality_prob_2quality_prob_3quality_prob_4text
0MediumMediumMediumMediumMedium[0.3728572130000001, 0.4990161359000001, 0.128...[0.3728572130000001, 0.4990161359000001, 0.128...[0.3728572130000001, 0.4990161359000001, 0.128...[0.3728572130000001, 0.4990161359000001, 0.128...[0.3728572130000001, 0.4990161359000001, 0.128...Quantum computing is set to revolutionize the ...
1MediumMediumMediumMediumMedium[0.3413305581, 0.5345032215, 0.12416620550000003][0.3413305581, 0.5345032215, 0.12416620550000003][0.3413305581, 0.5345032215, 0.12416620550000003][0.3413305581, 0.5345032215, 0.12416620550000003][0.3413305581, 0.5345032215, 0.12416620550000003]Investing in index funds is a popular strategy...
2MediumMediumMediumMediumMedium[0.38981089000000013, 0.4821754992000001, 0.12...[0.38981089000000013, 0.4821754992000001, 0.12...[0.38981089000000013, 0.4821754992000001, 0.12...[0.38981089000000013, 0.4821754992000001, 0.12...[0.38981089000000013, 0.4821754992000001, 0.12...Recent advancements in gene therapy offer new ...
3MediumMediumMediumMediumMedium[0.38701269030000013, 0.48767960070000005, 0.1...[0.38701269030000013, 0.48767960070000005, 0.1...[0.38701269030000013, 0.48767960070000005, 0.1...[0.38701269030000013, 0.48767960070000005, 0.1...[0.38701269030000013, 0.48767960070000005, 0.1...Online learning platforms have transformed the...
4MediumMediumMediumMediumMedium[0.3210222423000001, 0.5830105542999999, 0.095...[0.3210222423000001, 0.5830105542999999, 0.095...[0.3210222423000001, 0.5830105542999999, 0.095...[0.3210222423000001, 0.5830105542999999, 0.095...[0.3210222423000001, 0.5830105542999999, 0.095...Traveling to Europe during the off-season can ...
\n", - "
" - ], - "text/plain": [ - " quality_pred_0 quality_pred_1 quality_pred_2 quality_pred_3 quality_pred_4 \\\n", - "0 Medium Medium Medium Medium Medium \n", - "1 Medium Medium Medium Medium Medium \n", - "2 Medium Medium Medium Medium Medium \n", - "3 Medium Medium Medium Medium Medium \n", - "4 Medium Medium Medium Medium Medium \n", - "\n", - " quality_prob_0 \\\n", - "0 [0.3728572130000001, 0.4990161359000001, 0.128... \n", - "1 [0.3413305581, 0.5345032215, 0.12416620550000003] \n", - "2 [0.38981089000000013, 0.4821754992000001, 0.12... \n", - "3 [0.38701269030000013, 0.48767960070000005, 0.1... \n", - "4 [0.3210222423000001, 0.5830105542999999, 0.095... \n", - "\n", - " quality_prob_1 \\\n", - "0 [0.3728572130000001, 0.4990161359000001, 0.128... \n", - "1 [0.3413305581, 0.5345032215, 0.12416620550000003] \n", - "2 [0.38981089000000013, 0.4821754992000001, 0.12... \n", - "3 [0.38701269030000013, 0.48767960070000005, 0.1... \n", - "4 [0.3210222423000001, 0.5830105542999999, 0.095... \n", - "\n", - " quality_prob_2 \\\n", - "0 [0.3728572130000001, 0.4990161359000001, 0.128... \n", - "1 [0.3413305581, 0.5345032215, 0.12416620550000003] \n", - "2 [0.38981089000000013, 0.4821754992000001, 0.12... \n", - "3 [0.38701269030000013, 0.48767960070000005, 0.1... \n", - "4 [0.3210222423000001, 0.5830105542999999, 0.095... \n", - "\n", - " quality_prob_3 \\\n", - "0 [0.3728572130000001, 0.4990161359000001, 0.128... \n", - "1 [0.3413305581, 0.5345032215, 0.12416620550000003] \n", - "2 [0.38981089000000013, 0.4821754992000001, 0.12... \n", - "3 [0.38701269030000013, 0.48767960070000005, 0.1... \n", - "4 [0.3210222423000001, 0.5830105542999999, 0.095... \n", - "\n", - " quality_prob_4 \\\n", - "0 [0.3728572130000001, 0.4990161359000001, 0.128... \n", - "1 [0.3413305581, 0.5345032215, 0.12416620550000003] \n", - "2 [0.38981089000000013, 0.4821754992000001, 0.12... \n", - "3 [0.38701269030000013, 0.48767960070000005, 0.1... 
\n", - "4 [0.3210222423000001, 0.5830105542999999, 0.095... \n", - "\n", - " text \n", - "0 Quantum computing is set to revolutionize the ... \n", - "1 Investing in index funds is a popular strategy... \n", - "2 Recent advancements in gene therapy offer new ... \n", - "3 Online learning platforms have transformed the... \n", - "4 Traveling to Europe during the off-season can ... " - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "output_dataset = DocumentDataset.read_json(output_file_path, backend=\"cudf\", add_filename=write_to_filename)\n", - "output_dataset.df.head()" + "# output_dataset = DocumentDataset.read_json(output_file_path, backend=\"cudf\", add_filename=write_to_filename)\n", + "# output_dataset.df.head()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From 79aced5a0d237716f4884af76f9f39455e6d92f9 Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Fri, 8 Nov 2024 16:02:48 -0800 Subject: [PATCH 03/14] add working example Signed-off-by: Sarah Yurick --- nemo_curator/classifiers/__init__.py | 2 + nemo_curator/classifiers/pytorch_deberta.py | 228 +++++++++ .../multiple_quality_models.ipynb | 408 ---------------- .../pytorch_ensemble_classification.ipynb | 437 ++++++++++++++++++ 4 files changed, 667 insertions(+), 408 deletions(-) create mode 100644 nemo_curator/classifiers/pytorch_deberta.py delete mode 100644 tutorials/distributed_data_classification/multiple_quality_models.ipynb create mode 100644 tutorials/distributed_data_classification/pytorch_ensemble_classification.ipynb diff --git a/nemo_curator/classifiers/__init__.py b/nemo_curator/classifiers/__init__.py index f10d63c15..a26e4eac3 100644 --- a/nemo_curator/classifiers/__init__.py +++ b/nemo_curator/classifiers/__init__.py @@ -18,6 +18,7 @@ from .aegis import AegisClassifier from .domain import DomainClassifier from .fineweb_edu import 
FineWebEduClassifier +from .pytorch_deberta import PyTorchClassifier from .quality import QualityClassifier __all__ = [ @@ -25,4 +26,5 @@ "QualityClassifier", "AegisClassifier", "FineWebEduClassifier", + "PyTorchClassifier", ] diff --git a/nemo_curator/classifiers/pytorch_deberta.py b/nemo_curator/classifiers/pytorch_deberta.py new file mode 100644 index 000000000..550bc8985 --- /dev/null +++ b/nemo_curator/classifiers/pytorch_deberta.py @@ -0,0 +1,228 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +os.environ["RAPIDS_NO_INITIALIZE"] = "1" +from dataclasses import dataclass +from typing import List, Optional + +import torch +import torch.nn as nn +from crossfit.backend.torch.hf.model import HFModel +from transformers import AutoConfig, AutoModel +from transformers.models.deberta_v2 import DebertaV2TokenizerFast + +from nemo_curator.classifiers.base import DistributedDataClassifier, _run_classifier_helper +from nemo_curator.datasets import DocumentDataset + + + +@dataclass +class PyTorchModelConfig: + base_model: str = "microsoft/deberta-v3-base" + fc_dropout: float = 0.2 + max_len: int = 512 + + +class NCCustomModel(nn.Module): + def __init__( + self, + config: dataclass, + out_dim: int, + config_path: str = None, + pretrained: bool = False, + autocast: bool = False, + ): + super().__init__() + self.config = config + if config_path is None: + self.config = AutoConfig.from_pretrained( + config.base_model, output_hidden_states=True + ) + else: + self.config = torch.load(config_path) + + if pretrained: + self.model = AutoModel.from_pretrained(config.base_model, config=self.config) + else: + self.model = AutoModel(self.config) + + self.fc_dropout = nn.Dropout(config.fc_dropout) + self.fc = nn.Linear(self.config.hidden_size, out_dim) + self._init_weights(self.fc) + self.autocast = autocast + + def _init_weights(self, module): + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def feature(self, input_ids, attention_mask): + outputs = self.model(input_ids=input_ids, attention_mask=attention_mask) + last_hidden_states = outputs[0] + return 
last_hidden_states + + def _forward(self, batch): + feature = self.feature(batch["input_ids"], batch["attention_mask"]) + output = self.fc(self.fc_dropout(feature)) + output = output.to(torch.float32) + return torch.softmax(output[:, 0, :], dim=1) + + def forward(self, batch): + if self.autocast: + with torch.autocast(device_type="cuda"): + return self._forward(batch) + else: + return self._forward(batch) + + +class PyTorchModel(HFModel): + def __init__( + self, + config: dataclass, + out_dim: int, + model_path: str, + autocast: bool = False, + ): + self.config = config + self.out_dim = out_dim + self.model_path = model_path + self.autocast = autocast + super().__init__(self.config.base_model) + + def load_model(self, device: str = "cuda"): + model = NCCustomModel( + self.config, + out_dim=self.out_dim, + config_path=None, + pretrained=True, + autocast=self.autocast, + ) + model = model.to(device) + + if os.path.exists(self.model_path): + sd = torch.load(self.model_path, map_location="cpu") + if "model_state_dict" in sd: + sd = sd["model_state_dict"] + sd = {k[7:] if k.startswith("module.") else k: sd[k] for k in sd.keys()} + model.load_state_dict(sd, strict=True) + else: + raise ValueError(f"Model path {self.model_path} does not exist") + + return model.eval() + + def load_tokenizer(self): + # TODO: Allow user to pass in their own tokenizer if base_model is not Deberta + return DebertaV2TokenizerFast.from_pretrained(self.config.base_model) + + def load_config(self): + return AutoConfig.from_pretrained(self.path_or_name) + + +class PyTorchClassifier(DistributedDataClassifier): + """ + PyTorchClassifier is a general classifier designed for running generic PTH model files. + This class is optimized for running on multi-node, multi-GPU setups to enable fast and efficient inference on large datasets. + + Attributes: + pretrained_model_name_or_path (str): The path to your PyTorch model file. + labels (list[str]): The classes output by the model classifier. 
+ out_dim (int, optional): Set to 1 for a binary classification task. Otherwise, defaults to len(labels). + filter_by (list[str], optional): The classes to filter the dataset by. If None, all classes will be included. Defaults to None. + batch_size (int): The number of samples per batch for inference. Defaults to 256. + text_field (str): The field in the dataset that should be classified. + pred_column (str): The column name where predictions will be stored. Defaults to "pred". + prob_column (str): The column name where prediction probabilities will be stored. Defaults to "prob". + max_chars (int): The maximum number of characters in each document to consider for classification. Defaults to 6000. + device_type (str): The type of device to use for inference, either "cuda" or "cpu". Defaults to "cuda". + autocast (bool): Whether to use mixed precision for faster inference. Defaults to True. + base_model (str): The base model on which your PyTorch model was trained. Defaults to "microsoft/deberta-v3-base". + fc_dropout (float): Dropout rate used during training. Defaults to 0.2. + max_len (int): Maximum sequence length used during training. Defaults to 512. 
+ """ + + def __init__( + self, + pretrained_model_name_or_path: str, + labels: List[str], + out_dim: Optional[int] = None, + filter_by: Optional[List[str]] = None, + batch_size: int = 256, + text_field: str = "text", + pred_column: str = "pred", + prob_column: str = "prob", + max_chars: int = 6000, + device_type: str = "cuda", + autocast: bool = True, + base_model: str = "microsoft/deberta-v3-base", + fc_dropout: float = 0.2, + max_len: int = 512, + ): + config = PyTorchModelConfig( + base_model=base_model, + fc_dropout=fc_dropout, + max_len=max_len, + ) + + self.labels = labels + if out_dim: + self.out_dim = out_dim + else: + self.out_dim = len(labels) + + self.text_field = text_field + self.prob_column = prob_column + + model = PyTorchModel( + config=config, + out_dim=self.out_dim, + model_path=pretrained_model_name_or_path, + autocast=autocast, + ) + + super().__init__( + model=model, + labels=self.labels, + filter_by=filter_by, + batch_size=batch_size, + out_dim=self.out_dim, + pred_column=pred_column, + max_chars=max_chars, + device_type=device_type, + autocast=autocast, + ) + + def _run_classifier(self, dataset: DocumentDataset): + print("Starting PyTorch classifier inference", flush=True) + df = dataset.df + df = _run_classifier_helper( + df=df, + model=self.model, + labels=self.labels, + max_chars=self.max_chars, + batch_size=self.batch_size, + label_col=self.pred_column, + text_field=self.text_field, + prob_col=self.prob_column, + ) + return DocumentDataset(df) diff --git a/tutorials/distributed_data_classification/multiple_quality_models.ipynb b/tutorials/distributed_data_classification/multiple_quality_models.ipynb deleted file mode 100644 index 4f799597b..000000000 --- a/tutorials/distributed_data_classification/multiple_quality_models.ipynb +++ /dev/null @@ -1,408 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Distributed Data Classification with Multiple Quality Classifiers\n", - "\n", - "The notebook 
demonstrates the use of five quality classifiers for distributed data classification. Each fold was trained on 80% of the data, so the results can be ensembled into a single prediction. These classifers help with annotation which helps data blending for foundation model training.\n", - "\n", - "The classifiers are accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "env: PYTHONWARNINGS=ignore\n", - "env: DASK_DATAFRAME__QUERY_PLANNING=False\n" - ] - } - ], - "source": [ - "# Silence Warnings (HuggingFace internal warnings)\n", - "\n", - "%env PYTHONWARNINGS=ignore\n", - "%env DASK_DATAFRAME__QUERY_PLANNING=False\n", - "import warnings\n", - "warnings.filterwarnings(\"ignore\")" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from nemo_curator.classifiers import QualityClassifier\n", - "from nemo_curator.datasets import DocumentDataset\n", - "from nemo_curator.utils.distributed_utils import get_client\n", - "import cudf\n", - "import dask_cudf" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "client = get_client(cluster_type=\"gpu\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Set File Paths " - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "output_file_path = \"output_data_dir/\"\n", - "output_file_path = \"/home/nfs/syurick/NeMo-Curator/tutorials/distributed_data_classification/nb_q_result\"\n", - "quality_model_paths = [\n", - " \"quality_model0.pth\",\n", - " \"quality_model1.pth\",\n", - " \"quality_model2.pth\",\n", - " \"quality_model3.pth\",\n", - " 
\"quality_model4.pth\",\n", - "]\n", - "quality_model_paths = [\n", - " \"/home/nfs/syurick/LLM_quality_classifier_inference/ensemble/quality_surge_full_22828_1e5_5ep_bs64_1024_fold0_best.pth\",\n", - " \"/home/nfs/syurick/LLM_quality_classifier_inference/ensemble/quality_surge_full_22828_1e5_5ep_bs64_1024_fold1_best.pth\",\n", - " \"/home/nfs/syurick/LLM_quality_classifier_inference/ensemble/quality_surge_full_22828_1e5_5ep_bs64_1024_fold2_best.pth\",\n", - " \"/home/nfs/syurick/LLM_quality_classifier_inference/ensemble/quality_surge_full_22828_1e5_5ep_bs64_1024_fold3_best.pth\",\n", - " \"/home/nfs/syurick/LLM_quality_classifier_inference/ensemble/quality_surge_full_22828_1e5_5ep_bs64_1024_fold4_best.pth\",\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Create and Run Classifiers" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "# Create sample DataFrame\n", - "text = [\n", - " \"Quantum computing is set to revolutionize the field of cryptography.\",\n", - " \"Investing in index funds is a popular strategy for long-term financial growth.\",\n", - " \"Recent advancements in gene therapy offer new hope for treating genetic disorders.\",\n", - " \"Online learning platforms have transformed the way students access educational resources.\",\n", - " \"Traveling to Europe during the off-season can be a more budget-friendly option.\",\n", - " \"Training regimens for athletes have become more sophisticated with the use of data analytics.\",\n", - " \"Streaming services are changing the way people consume television and film content.\",\n", - " \"Vegan recipes have gained popularity as more people adopt plant-based diets.\",\n", - " \"Climate change research is critical for developing sustainable environmental policies.\",\n", - " \"Telemedicine has become increasingly popular due to its convenience and accessibility.\",\n", - "]\n", - "df = cudf.DataFrame({\"text\": 
text})\n", - "dataset = DocumentDataset(dask_cudf.from_cudf(df, npartitions=1))\n", - "write_to_filename = False\n", - "\n", - "# Alternatively, read existing directory of JSONL files\n", - "# input_file_path=\"/input_data_dir/\"\n", - "# input_dataset = DocumentDataset.read_json(\n", - "# input_file_path, backend=\"cudf\", add_filename=True\n", - "# )\n", - "# write_to_filename = True" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Starting Quality classifier inference\n", - "Starting Quality classifier inference\n" - ] - } - ], - "source": [ - "pred_columns = []\n", - "prob_columns = []\n", - "\n", - "# Fold 0\n", - "quality_model_path = quality_model_paths[0]\n", - "pred_column = \"quality_pred_0\"\n", - "prob_column = \"quality_prob_0\"\n", - "pred_columns.append(pred_column)\n", - "prob_columns.append(prob_column)\n", - "classifier_0 = QualityClassifier(\n", - " model_path=quality_model_path,\n", - " batch_size=1024,\n", - " pred_column=pred_column,\n", - " prob_column=prob_column,\n", - ")\n", - "dataset_0 = classifier_0(dataset=dataset)\n", - "\n", - "# Fold 1\n", - "quality_model_path = quality_model_paths[1]\n", - "pred_column = \"quality_pred_1\"\n", - "prob_column = \"quality_prob_1\"\n", - "pred_columns.append(pred_column)\n", - "prob_columns.append(prob_column)\n", - "classifier_1 = QualityClassifier(\n", - " model_path=quality_model_path,\n", - " batch_size=1024,\n", - " pred_column=pred_column,\n", - " prob_column=prob_column,\n", - ")\n", - "dataset_1 = classifier_1(dataset=dataset)" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:03<00:00, 3.18it/s]\n", - "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:03<00:00, 3.23it/s]\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
textquality_prob_0quality_pred_0quality_prob_1quality_pred_1
0Quantum computing is set to revolutionize the ...[0.3728572130203247, 0.499016135931015, 0.1281...Medium[0.3728572130203247, 0.499016135931015, 0.1281...Medium
1Investing in index funds is a popular strategy...[0.34133055806159973, 0.5345032215118408, 0.12...Medium[0.34133055806159973, 0.5345032215118408, 0.12...Medium
2Recent advancements in gene therapy offer new ...[0.3898108899593353, 0.4821754992008209, 0.128...Medium[0.3898108899593353, 0.4821754992008209, 0.128...Medium
3Online learning platforms have transformed the...[0.38701269030570984, 0.4876796007156372, 0.12...Medium[0.38701269030570984, 0.4876796007156372, 0.12...Medium
4Traveling to Europe during the off-season can ...[0.32102224230766296, 0.5830105543136597, 0.09...Medium[0.32102224230766296, 0.5830105543136597, 0.09...Medium
5Training regimens for athletes have become mor...[0.34178370237350464, 0.5548713207244873, 0.10...Medium[0.34178370237350464, 0.5548713207244873, 0.10...Medium
6Streaming services are changing the way people...[0.35998600721359253, 0.525088906288147, 0.114...Medium[0.35998600721359253, 0.525088906288147, 0.114...Medium
7Vegan recipes have gained popularity as more p...[0.3145926594734192, 0.5717698335647583, 0.113...Medium[0.3145926594734192, 0.5717698335647583, 0.113...Medium
8Climate change research is critical for develo...[0.3767526149749756, 0.5015591979026794, 0.121...Medium[0.3767526149749756, 0.5015591979026794, 0.121...Medium
9Telemedicine has become increasingly popular d...[0.34938278794288635, 0.5144903659820557, 0.13...Medium[0.34938278794288635, 0.5144903659820557, 0.13...Medium
\n", - "
" - ], - "text/plain": [ - " text \\\n", - "0 Quantum computing is set to revolutionize the ... \n", - "1 Investing in index funds is a popular strategy... \n", - "2 Recent advancements in gene therapy offer new ... \n", - "3 Online learning platforms have transformed the... \n", - "4 Traveling to Europe during the off-season can ... \n", - "5 Training regimens for athletes have become mor... \n", - "6 Streaming services are changing the way people... \n", - "7 Vegan recipes have gained popularity as more p... \n", - "8 Climate change research is critical for develo... \n", - "9 Telemedicine has become increasingly popular d... \n", - "\n", - " quality_prob_0 quality_pred_0 \\\n", - "0 [0.3728572130203247, 0.499016135931015, 0.1281... Medium \n", - "1 [0.34133055806159973, 0.5345032215118408, 0.12... Medium \n", - "2 [0.3898108899593353, 0.4821754992008209, 0.128... Medium \n", - "3 [0.38701269030570984, 0.4876796007156372, 0.12... Medium \n", - "4 [0.32102224230766296, 0.5830105543136597, 0.09... Medium \n", - "5 [0.34178370237350464, 0.5548713207244873, 0.10... Medium \n", - "6 [0.35998600721359253, 0.525088906288147, 0.114... Medium \n", - "7 [0.3145926594734192, 0.5717698335647583, 0.113... Medium \n", - "8 [0.3767526149749756, 0.5015591979026794, 0.121... Medium \n", - "9 [0.34938278794288635, 0.5144903659820557, 0.13... Medium \n", - "\n", - " quality_prob_1 quality_pred_1 \n", - "0 [0.3728572130203247, 0.499016135931015, 0.1281... Medium \n", - "1 [0.34133055806159973, 0.5345032215118408, 0.12... Medium \n", - "2 [0.3898108899593353, 0.4821754992008209, 0.128... Medium \n", - "3 [0.38701269030570984, 0.4876796007156372, 0.12... Medium \n", - "4 [0.32102224230766296, 0.5830105543136597, 0.09... Medium \n", - "5 [0.34178370237350464, 0.5548713207244873, 0.10... Medium \n", - "6 [0.35998600721359253, 0.525088906288147, 0.114... Medium \n", - "7 [0.3145926594734192, 0.5717698335647583, 0.113... Medium \n", - "8 [0.3767526149749756, 0.5015591979026794, 0.121... 
Medium \n", - "9 [0.34938278794288635, 0.5144903659820557, 0.13... Medium " - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Only saving Classifier 0 results\n", - "# Classifier 0: 0.3728572130203247\n", - "# Classifier 1: 0.3029068112373352\n", - "\n", - "merged_df = dataset_0.df.merge(dataset_1.df, on=\"text\")\n", - "merged_df.compute()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "# dataset.to_json(output_file_dir=output_file_path, write_to_filename=write_to_filename)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Inspect the Output" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [], - "source": [ - "# output_dataset = DocumentDataset.read_json(output_file_path, backend=\"cudf\", add_filename=write_to_filename)\n", - "# output_dataset.df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "NeMo-Curator-env-2", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.14" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/tutorials/distributed_data_classification/pytorch_ensemble_classification.ipynb b/tutorials/distributed_data_classification/pytorch_ensemble_classification.ipynb new file mode 100644 index 000000000..c02fac299 --- /dev/null +++ b/tutorials/distributed_data_classification/pytorch_ensemble_classification.ipynb @@ -0,0 +1,437 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Distributed Data Classification with Multiple 
Classifiers\n", + "\n", + "Cross-validation is a machine learning technique in which multiple models are trained on multiple subsets of your data and validated on the remaining data portions. It is useful because it reduces the risk of overfitting to your data and provides a better estimate of how the model will perform on unseen data. This is particularly valuable when dealing with limited data, as it allows for more efficient use of the available samples.\n", + "\n", + "In this tutorial, we demonstrate how to use NeMo Curator's `PyTorchClassifier` class to load and perform batched inference with multiple pretrained models. We assume the user has pretrained PTH model files, with [DeBERTaV3](https://huggingface.co/microsoft/deberta-v3-base) as the base model used for training. The classifiers are accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intelligent batching and RAPIDS to accelerate the offline inference on large datasets.\n", + "\n", + "First, let's run some preliminary imports and set up our Dask client."
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "env: PYTHONWARNINGS=ignore\n" + ] + } + ], + "source": [ + "# Silence Warnings (HuggingFace internal warnings)\n", + "%env PYTHONWARNINGS=ignore\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from nemo_curator.classifiers import PyTorchClassifier\n", + "from nemo_curator.datasets import DocumentDataset\n", + "from nemo_curator import get_client\n", + "import cudf\n", + "import dask_cudf" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cuDF Spilling is enabled\n" + ] + } + ], + "source": [ + "client = get_client(cluster_type=\"gpu\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prepare Dataset and Set File Paths\n", + "\n", + "Next, we need to create or read the dataset on which we want to run inference. In this notebook, we provide a sample dataset with 10 text sentences to evaluate. Alternatively, the user may read in their own existing data (e.g., JSON or Parquet files) as demonstrated by the commented code." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Create sample DataFrame\n", + "text = [\n", + " \"Quantum computing is set to revolutionize the field of cryptography.\",\n", + " \"Investing in index funds is a popular strategy for long-term financial growth.\",\n", + " \"Recent advancements in gene therapy offer new hope for treating genetic disorders.\",\n", + " \"Online learning platforms have transformed the way students access educational resources.\",\n", + " \"Traveling to Europe during the off-season can be a more budget-friendly option.\",\n", + " \"Training regimens for athletes have become more sophisticated with the use of data analytics.\",\n", + " \"Streaming services are changing the way people consume television and film content.\",\n", + " \"Vegan recipes have gained popularity as more people adopt plant-based diets.\",\n", + " \"Climate change research is critical for developing sustainable environmental policies.\",\n", + " \"Telemedicine has become increasingly popular due to its convenience and accessibility.\",\n", + "]\n", + "df = cudf.DataFrame({\"text\": text})\n", + "dataset = DocumentDataset(dask_cudf.from_cudf(df, npartitions=1))\n", + "write_to_filename = False\n", + "\n", + "# Alternatively, read existing directory of JSONL files\n", + "# input_file_path=\"/input_data_dir/\"\n", + "# input_dataset = DocumentDataset.read_json(\n", + "# input_file_path, backend=\"cudf\", add_filename=True\n", + "# )\n", + "# write_to_filename = True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The user should also specify where to write the results, as well as the local file paths to the pretrained PyTorch classifiers. Finally, the user should include the labels the classifier is expected to produce." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "output_file_path = \"output_data_dir/\"\n", + "model_paths = [\n", + " \"model0.pth\",\n", + " \"model1.pth\",\n", + " \"model2.pth\",\n", + " \"model3.pth\",\n", + " \"model4.pth\",\n", + "]\n", + "labels = [\"label_a\", \"label_b\", \"label_c\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Run Classification with Multiple Models\n", + "\n", + "Now we can use the `PyTorchClassifier` class to load each of our PyTorch models and run inference. We will write the results to a JSON file." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting PyTorch classifier inference\n", + "Starting PyTorch classifier inference\n", + "Starting PyTorch classifier inference\n", + "Starting PyTorch classifier inference\n", + "Starting PyTorch classifier inference\n" + ] + } + ], + "source": [ + "fold = 0\n", + "pred_columns = []\n", + "for model_path in model_paths:\n", + " pred_column = \"pred_\" + str(fold)\n", + " prob_column = \"prob_\" + str(fold)\n", + " pred_columns.append(pred_column)\n", + "\n", + " classifier = PyTorchClassifier(\n", + " pretrained_model_name_or_path=model_path,\n", + " labels=labels,\n", + " batch_size=1024,\n", + " text_field=\"text\",\n", + " pred_column=pred_column,\n", + " prob_column=prob_column,\n", + " )\n", + " dataset = classifier(dataset=dataset)\n", + " fold += 1" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GPU: tcp://127.0.0.1:34075, Part: 0: 100%|██████████| 10/10 [00:08<00:00, 1.23it/s]\n", + "GPU: tcp://127.0.0.1:34075, Part: 0: 100%|██████████| 10/10 [00:05<00:00, 1.83it/s]\n", + "GPU: tcp://127.0.0.1:34075, Part: 0: 100%|██████████| 10/10 [00:05<00:00, 1.81it/s]\n", + 
"GPU: tcp://127.0.0.1:34075, Part: 0: 100%|██████████| 10/10 [00:05<00:00, 1.80it/s]\n", + "GPU: tcp://127.0.0.1:34075, Part: 0: 100%|██████████| 10/10 [00:04<00:00, 2.02it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Writing to disk complete for 1 partitions\n", + "CPU times: user 5.39 s, sys: 3.1 s, total: 8.49 s\n", + "Wall time: 48.8 s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GPU: tcp://127.0.0.1:34075, Part: 0: 100%|██████████| 10/10 [00:05<00:00, 1.80it/s]\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "dataset.to_json(output_file_dir=output_file_path, write_to_filename=write_to_filename)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Inspect the Output\n", + "\n", + "Finally, let's verify that everything worked as expected." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading 1 files\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pred_0pred_1pred_2pred_3pred_4prob_0prob_1prob_2prob_3prob_4text
0label_blabel_blabel_blabel_blabel_b[0.37283509970000006, 0.49910834430000006, 0.1...[0.3027972281, 0.5215288401, 0.1756739765][0.41288739440000005, 0.5265461801999999, 0.06...[0.32485893370000013, 0.46514019370000004, 0.2...[0.3685780168000001, 0.5256645678999999, 0.105...Quantum computing is set to revolutionize the ...
1label_blabel_blabel_blabel_blabel_b[0.34135937690000007, 0.5343321562, 0.1243084297][0.34347015620000004, 0.5304207801999999, 0.12...[0.4346009791000001, 0.5130862594, 0.052312787...[0.3181181848000001, 0.4944583774000001, 0.187...[0.39643365140000003, 0.5143401027, 0.08922628...Investing in index funds is a popular strategy...
2label_blabel_blabel_blabel_blabel_b[0.38975748420000006, 0.48216831680000005, 0.1...[0.33265304570000004, 0.5090963244, 0.1582506448][0.44722059370000006, 0.4945448935000001, 0.05...[0.3444236219000001, 0.45550799370000006, 0.20...[0.3919632137000001, 0.5084934831, 0.099543325...Recent advancements in gene therapy offer new ...
3label_blabel_blabel_blabel_blabel_b[0.38686266540000014, 0.48784771560000006, 0.1...[0.3482291102, 0.5138959289, 0.13787493110000001][0.4499093592, 0.49849084020000006, 0.05159985...[0.3489176929000001, 0.45996120570000004, 0.19...[0.38338246940000015, 0.5131927133, 0.10342480...Online learning platforms have transformed the...
4label_blabel_blabel_blabel_blabel_b[0.3207181096000001, 0.5833522080999999, 0.095...[0.3277938664, 0.5600519180000001, 0.112154245...[0.39969193940000003, 0.5546463728000001, 0.04...[0.3249147236000001, 0.5021025537999999, 0.172...[0.35228130220000003, 0.5585800408999999, 0.08...Traveling to Europe during the off-season can ...
\n", + "
" + ], + "text/plain": [ + " pred_0 pred_1 pred_2 pred_3 pred_4 \\\n", + "0 label_b label_b label_b label_b label_b \n", + "1 label_b label_b label_b label_b label_b \n", + "2 label_b label_b label_b label_b label_b \n", + "3 label_b label_b label_b label_b label_b \n", + "4 label_b label_b label_b label_b label_b \n", + "\n", + " prob_0 \\\n", + "0 [0.37283509970000006, 0.49910834430000006, 0.1... \n", + "1 [0.34135937690000007, 0.5343321562, 0.1243084297] \n", + "2 [0.38975748420000006, 0.48216831680000005, 0.1... \n", + "3 [0.38686266540000014, 0.48784771560000006, 0.1... \n", + "4 [0.3207181096000001, 0.5833522080999999, 0.095... \n", + "\n", + " prob_1 \\\n", + "0 [0.3027972281, 0.5215288401, 0.1756739765] \n", + "1 [0.34347015620000004, 0.5304207801999999, 0.12... \n", + "2 [0.33265304570000004, 0.5090963244, 0.1582506448] \n", + "3 [0.3482291102, 0.5138959289, 0.13787493110000001] \n", + "4 [0.3277938664, 0.5600519180000001, 0.112154245... \n", + "\n", + " prob_2 \\\n", + "0 [0.41288739440000005, 0.5265461801999999, 0.06... \n", + "1 [0.4346009791000001, 0.5130862594, 0.052312787... \n", + "2 [0.44722059370000006, 0.4945448935000001, 0.05... \n", + "3 [0.4499093592, 0.49849084020000006, 0.05159985... \n", + "4 [0.39969193940000003, 0.5546463728000001, 0.04... \n", + "\n", + " prob_3 \\\n", + "0 [0.32485893370000013, 0.46514019370000004, 0.2... \n", + "1 [0.3181181848000001, 0.4944583774000001, 0.187... \n", + "2 [0.3444236219000001, 0.45550799370000006, 0.20... \n", + "3 [0.3489176929000001, 0.45996120570000004, 0.19... \n", + "4 [0.3249147236000001, 0.5021025537999999, 0.172... \n", + "\n", + " prob_4 \\\n", + "0 [0.3685780168000001, 0.5256645678999999, 0.105... \n", + "1 [0.39643365140000003, 0.5143401027, 0.08922628... \n", + "2 [0.3919632137000001, 0.5084934831, 0.099543325... \n", + "3 [0.38338246940000015, 0.5131927133, 0.10342480... \n", + "4 [0.35228130220000003, 0.5585800408999999, 0.08... 
\n", + "\n", + " text \n", + "0 Quantum computing is set to revolutionize the ... \n", + "1 Investing in index funds is a popular strategy... \n", + "2 Recent advancements in gene therapy offer new ... \n", + "3 Online learning platforms have transformed the... \n", + "4 Traveling to Europe during the off-season can ... " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_dataset = DocumentDataset.read_json(output_file_path, backend=\"cudf\", add_filename=write_to_filename)\n", + "output_dataset.df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Thank you for reading! In this tutorial, we demonstrated how to use the `PyTorchClassifier` to load locally-stored PyTorch models and run inference on our dataset.\n", + "\n", + "For more information about NeMo Curator's `DistributedDataClassifier`, please reference the [documentation page](https://docs.nvidia.com/nemo-framework/user-guide/latest/datacuration/distributeddataclassification.html). For an example on how to run NeMo Curator's `DomainClassifier` and `QualityClassifier`, please see [this sample notebook](https://github.com/NVIDIA/NeMo-Curator/blob/main/tutorials/distributed_data_classification/distributed_data_classification.ipynb)." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "nemo_curator_dev", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 876f9980b73ab9f47a85a7a60aa3b4ca161dd615 Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Fri, 8 Nov 2024 16:05:52 -0800 Subject: [PATCH 04/14] run black Signed-off-by: Sarah Yurick --- nemo_curator/classifiers/pytorch_deberta.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/nemo_curator/classifiers/pytorch_deberta.py b/nemo_curator/classifiers/pytorch_deberta.py index 550bc8985..a28c956f0 100644 --- a/nemo_curator/classifiers/pytorch_deberta.py +++ b/nemo_curator/classifiers/pytorch_deberta.py @@ -24,11 +24,13 @@ from transformers import AutoConfig, AutoModel from transformers.models.deberta_v2 import DebertaV2TokenizerFast -from nemo_curator.classifiers.base import DistributedDataClassifier, _run_classifier_helper +from nemo_curator.classifiers.base import ( + DistributedDataClassifier, + _run_classifier_helper, +) from nemo_curator.datasets import DocumentDataset - @dataclass class PyTorchModelConfig: base_model: str = "microsoft/deberta-v3-base" @@ -55,7 +57,9 @@ def __init__( self.config = torch.load(config_path) if pretrained: - self.model = AutoModel.from_pretrained(config.base_model, config=self.config) + self.model = AutoModel.from_pretrained( + config.base_model, config=self.config + ) else: self.model = AutoModel(self.config) From 785875ead6e1beb52c15dfecd7e0e42bac22b695 Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Mon, 4 Nov 2024 15:59:16 -0800 Subject: [PATCH 05/14] minor updates from main Signed-off-by: Sarah Yurick --- nemo_curator/__init__.py | 2 + nemo_curator/pii/algorithm.py | 
4 +- pyproject.toml | 150 +++++++++++++++++++++++++++++++++- 3 files changed, 150 insertions(+), 6 deletions(-) diff --git a/nemo_curator/__init__.py b/nemo_curator/__init__.py index c9982b72a..83924e4e5 100644 --- a/nemo_curator/__init__.py +++ b/nemo_curator/__init__.py @@ -29,3 +29,5 @@ # See https://github.com/NVIDIA/NeMo-Curator/issues/33 # This also happens when reading and writing to files dask.config.set({"dataframe.convert-string": False}) + +__version__ = "0.6.0.dev0" diff --git a/nemo_curator/pii/algorithm.py b/nemo_curator/pii/algorithm.py index 2b5e16ed0..2c1559fb0 100644 --- a/nemo_curator/pii/algorithm.py +++ b/nemo_curator/pii/algorithm.py @@ -52,7 +52,7 @@ def __init__( language: str = DEFAULT_LANGUAGE, supported_entities: List[str] = None, anonymize_action: str = "replace", - **kwargs + **kwargs, ): """ Parameters: @@ -154,7 +154,7 @@ def from_config(config: Mapping[str, Any]): language=language, supported_entities=supported_entities, anonymize_action=operator_name, - **operator_config + **operator_config, ) @staticmethod diff --git a/pyproject.toml b/pyproject.toml index a026c06c2..8ddfd8a49 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,16 +12,158 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[project] +name = "nemo_curator" +description = "Scalable Data Preprocessing Tool for Training Large Language Models" +readme = { file = "README.md", content-type = "text/markdown" } +authors = [ + { name = "Joseph Jennings", email = "jjennings@nvidia.com" }, + { name = "Mostofa Patwary", email = "mpatwary@nvidia.com" }, + { name = "Sandeep Subramanian", email = "sasubramania@nvidia.com" }, + { name = "Shrimai Prabhumoye", email = "sprabhumoye@nvidia.com" }, + { name = "Ayush Dattagupta", email = "adattagupta@nvidia.com" }, + { name = "Vibhu Jawa", email = "vjawa@nvidia.com" }, + { name = "Jiwei Liu", email = "jiweil@nvidia.com" }, + { name = "Ryan Wolf", email = "rywolf@nvidia.com" }, + { name = "Sarah Yurick", email = "syurick@nvidia.com" }, +] +classifiers = [ + "Development Status :: 3 - Alpha", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", +] +requires-python = ">=3.10, <3.11" +dependencies = [ + "awscli>=1.22.55", + "beautifulsoup4", + "charset_normalizer>=3.1.0", + "comment_parser", + "crossfit>=0.0.6", + "dask-mpi>=2021.11.0", + "dask[complete]>=2021.7.1", + "distributed>=2021.7.1", + "fasttext==0.9.2", + "ftfy==6.1.1", + "in-place==0.5.0", + "jieba==0.42.1", + "justext==3.0.1", + "lxml_html_clean", + "mecab-python3", + "mwparserfromhell==0.6.5", + "nemo_toolkit[nlp]>=1.23.0", + "numpy<2", + "openai", + "peft", + "presidio-analyzer==2.2.351", + "presidio-anonymizer==2.2.351", + "pycld2", + "resiliparse", + "spacy>=3.6.0, <3.8.0", + "unidic-lite==1.0.8", + "usaddress==0.5.10", + "warcio==1.7.4", + "zstandard==0.18.0", +] +dynamic = ["version"] + +[project.optional-dependencies] +# Installs CPU + GPU text curation modules +cuda12x = [ + "cudf-cu12>=24.10", + "cugraph-cu12>=24.10", + "cuml-cu12>=24.10", + "dask-cuda>=24.10", + "dask-cudf-cu12>=24.10", + "spacy[cuda12x]>=3.6.0, <3.8.0", +] +# Installs CPU + GPU text curation modules with 
RAPIDS Nightlies +cuda12x_nightly = [ + "cudf-cu12>=24.12.0a0,<=24.12", + "cugraph-cu12>=24.12.0a0,<=24.12", + "cuml-cu12>=24.12.0a0,<=24.12", + "dask-cuda>=24.12.0a0,<=24.12", + "dask-cudf-cu12>=24.12.0a0,<=24.12", + "spacy[cuda12x]>=3.6.0, <3.8.0", +] +# Installs CPU + GPU text and image curation modules +image = [ + "nvidia-dali-cuda120", + "nvidia-nvjpeg2k-cu12", + "timm>=1.0.8", + "nemo_curator[cuda12x]", +] +# Installs CPU + GPU text and image curation modules with RAPIDS Nightlies +image_nightly = [ + "nvidia-dali-cuda120", + "nvidia-nvjpeg2k-cu12", + "timm>=1.0.8", + "nemo_curator[cuda12x_nightly]", +] +# Installs all of the above with Stable RAPIDS +all = [ + "nemo_curator[image]", +] +# Installs all of the above with RAPIDS Nightlies +all_nightly = [ + "nemo_curator[image_nightly]", +] + +[project.scripts] +get_common_crawl_urls = "nemo_curator.scripts.get_common_crawl_urls:console_script" +get_wikipedia_urls = "nemo_curator.scripts.get_wikipedia_urls:console_script" +download_and_extract = "nemo_curator.scripts.download_and_extract:console_script" +text_cleaning = "nemo_curator.scripts.text_cleaning:console_script" +add_id = "nemo_curator.scripts.add_id:console_script" +make_data_shards = "nemo_curator.scripts.make_data_shards:console_script" +prepare_fasttext_training_data = "nemo_curator.scripts.prepare_fasttext_training_data:console_script" +train_fasttext = "nemo_curator.scripts.train_fasttext:console_script" +filter_documents = "nemo_curator.scripts.filter_documents:console_script" +separate_by_metadata = "nemo_curator.scripts.separate_by_metadata:console_script" +prepare_task_data = "nemo_curator.scripts.prepare_task_data:console_script" +find_matching_ngrams = "nemo_curator.scripts.find_matching_ngrams:console_script" +remove_matching_ngrams = "nemo_curator.scripts.remove_matching_ngrams:console_script" +gpu_compute_minhashes = "nemo_curator.scripts.fuzzy_deduplication.compute_minhashes:console_script" +minhash_buckets = 
"nemo_curator.scripts.fuzzy_deduplication.minhash_lsh:console_script" +jaccard_map_buckets = "nemo_curator.scripts.fuzzy_deduplication.map_buckets:console_script" +jaccard_shuffle = "nemo_curator.scripts.fuzzy_deduplication.jaccard_shuffle:console_script" +jaccard_compute = "nemo_curator.scripts.fuzzy_deduplication.jaccard_compute:console_script" +gpu_connected_component = "nemo_curator.scripts.fuzzy_deduplication.connected_components:console_script" +buckets_to_edges = "nemo_curator.scripts.fuzzy_deduplication.buckets_to_edges:console_script" +gpu_exact_dups = "nemo_curator.scripts.find_exact_duplicates:console_script" +deidentify = "nemo_curator.scripts.find_pii_and_deidentify:console_script" +domain_classifier_inference = "nemo_curator.scripts.classifiers.domain_classifier_inference:console_script" +quality_classifier_inference = "nemo_curator.scripts.classifiers.quality_classifier_inference:console_script" +aegis_classifier_inference = "nemo_curator.scripts.classifiers.aegis_classifier_inference:console_script" +fineweb_edu_classifier_inference = "nemo_curator.scripts.classifiers.fineweb_edu_classifier_inference:console_script" +verify_classification_results = "nemo_curator.scripts.verify_classification_results:console_script" +blend_datasets = "nemo_curator.scripts.blend_datasets:console_script" +semdedup_extract_embeddings = "nemo_curator.scripts.semdedup.compute_embeddings:console_script" +semdedup_clustering = "nemo_curator.scripts.semdedup.clustering:console_script" +semdedup_extract_unique_ids = "nemo_curator.scripts.semdedup.extract_dedup_data:console_script" + +[project.urls] +Homepage = "https://github.com/NVIDIA/NeMo-Curator" + +[tool.black] +line-length = 88 + [tool.isort] profile = "black" # black-compatible line_length = 88 # should match black parameters py_version = 310 -extend_skip = ["setup.py"] - -[tool.black] -line_length = 88 [tool.pytest.ini_options] markers = [ "gpu: marks tests as GPU tests (deselect with '-m \"not gpu\"')" ] + 
+[tool.setuptools.dynamic] +version = { attr = "nemo_curator.__version__" } + +[tool.setuptools.packages.find] +include = ["*"] +exclude = ["tests", "tests.*"] From 29313257f209d7d4f39760cae72c2c875ea67433 Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Tue, 5 Nov 2024 11:22:00 -0800 Subject: [PATCH 06/14] update readme, add pin Signed-off-by: Sarah Yurick --- README.md | 14 ++++++-------- pyproject.toml | 4 ++++ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 71ece6851..4513c7af6 100644 --- a/README.md +++ b/README.md @@ -83,14 +83,12 @@ You can get NeMo-Curator in 3 ways. #### PyPi ```bash -pip install cython pip install --extra-index-url https://pypi.nvidia.com nemo-curator[all] ``` #### Source ```bash git clone https://github.com/NVIDIA/NeMo-Curator.git -pip install cython pip install --extra-index-url https://pypi.nvidia.com "./NeMo-Curator[all]" ``` @@ -119,17 +117,17 @@ pip install --extra-index-url https://pypi.nvidia.com nemo-curator[all] # Instal #### Using Nightly Dependencies for RAPIDS -You can also install NeMo Curator using the [RAPIDS Nightly Builds](https://docs.rapids.ai/install). To do so, you can set the environment variable `RAPIDS_NIGHTLY=1`. 
+You can also install NeMo Curator using the [RAPIDS Nightly Builds](https://docs.rapids.ai/install): ```bash -# installing from pypi -RAPIDS_NIGHTLY=1 pip install --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple "nemo-curator[cuda12x]" +# Installing from PyPi +pip install --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple "nemo-curator[cuda12x_nightly]" -# installing from source -RAPIDS_NIGHTLY=1 pip install --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple ".[cuda12x]" +# Installing from source +pip install --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple "./NeMo-Curator[cuda12x_nightly]" ``` -When the `RAPIDS_NIGHTLY` variable is set to 0 (which is the default), it will use the stable version of RAPIDS. +For the image curation modules and all modules, you can use `[image_nightly]` and `[all_nightly]`, respectively. ## Use NeMo Curator ### Python API Quick Example diff --git a/pyproject.toml b/pyproject.toml index 8ddfd8a49..e086dc9d1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -79,6 +79,8 @@ cuda12x = [ "dask-cuda>=24.10", "dask-cudf-cu12>=24.10", "spacy[cuda12x]>=3.6.0, <3.8.0", + # See https://github.com/NVIDIA/cuda-python/issues/215 + "cuda-python<=12.6.0", ] # Installs CPU + GPU text curation modules with RAPIDS Nightlies cuda12x_nightly = [ @@ -88,6 +90,8 @@ cuda12x_nightly = [ "dask-cuda>=24.12.0a0,<=24.12", "dask-cudf-cu12>=24.12.0a0,<=24.12", "spacy[cuda12x]>=3.6.0, <3.8.0", + # See https://github.com/NVIDIA/cuda-python/issues/215 + "cuda-python<=12.6.0", ] # Installs CPU + GPU text and image curation modules image = [ From 0e88c415ce35c1eb76b53654c55e94be55261820 Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Tue, 5 Nov 2024 13:15:51 -0800 Subject: [PATCH 07/14] test remove cython Signed-off-by: Sarah Yurick --- .github/workflows/test.yml | 3 +-- Dockerfile | 2 +- docs/user-guide/image/gettingstarted.rst | 2 -- 3 files changed, 2 
insertions(+), 5 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index baa968f47..1d8cc9258 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -37,9 +37,8 @@ jobs: # Installing wheel beforehand due to fasttext issue: # https://github.com/facebookresearch/fastText/issues/512#issuecomment-1837367666 - # Explicitly install cython: https://github.com/VKCOM/YouTokenToMe/issues/94 run: | - pip install wheel cython + pip install wheel pip install --no-cache-dir . pip install pytest - name: Run tests diff --git a/Dockerfile b/Dockerfile index aa782055e..1ce8b1b3d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -28,6 +28,6 @@ RUN bash -exu < Date: Tue, 5 Nov 2024 13:20:42 -0800 Subject: [PATCH 08/14] re-add cython Signed-off-by: Sarah Yurick --- .github/workflows/test.yml | 3 ++- Dockerfile | 2 +- README.md | 2 ++ docs/user-guide/image/gettingstarted.rst | 2 ++ 4 files changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 1d8cc9258..baa968f47 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -37,8 +37,9 @@ jobs: # Installing wheel beforehand due to fasttext issue: # https://github.com/facebookresearch/fastText/issues/512#issuecomment-1837367666 + # Explicitly install cython: https://github.com/VKCOM/YouTokenToMe/issues/94 run: | - pip install wheel + pip install wheel cython pip install --no-cache-dir . 
pip install pytest - name: Run tests diff --git a/Dockerfile b/Dockerfile index 1ce8b1b3d..aa782055e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -28,6 +28,6 @@ RUN bash -exu < Date: Wed, 6 Nov 2024 12:22:08 -0800 Subject: [PATCH 09/14] try using setuptools_scm Signed-off-by: Sarah Yurick --- nemo_curator/__init__.py | 2 -- pyproject.toml | 7 ++++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/nemo_curator/__init__.py b/nemo_curator/__init__.py index 83924e4e5..c9982b72a 100644 --- a/nemo_curator/__init__.py +++ b/nemo_curator/__init__.py @@ -29,5 +29,3 @@ # See https://github.com/NVIDIA/NeMo-Curator/issues/33 # This also happens when reading and writing to files dask.config.set({"dataframe.convert-string": False}) - -__version__ = "0.6.0.dev0" diff --git a/pyproject.toml b/pyproject.toml index e086dc9d1..68735bea3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ # limitations under the License. [build-system] -requires = ["setuptools"] +requires = ["setuptools", "setuptools_scm"] build-backend = "setuptools.build_meta" [project] @@ -165,8 +165,9 @@ markers = [ "gpu: marks tests as GPU tests (deselect with '-m \"not gpu\"')" ] -[tool.setuptools.dynamic] -version = { attr = "nemo_curator.__version__" } +[tool.setuptools_scm] +version_scheme = "post-release" +local_scheme = "node-and-date" [tool.setuptools.packages.find] include = ["*"] From 2fc37483ea6ea23cf86b8b1b9b19c3e9d2d30f96 Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Wed, 6 Nov 2024 12:35:49 -0800 Subject: [PATCH 10/14] add fetch-depth Signed-off-by: Sarah Yurick --- .github/workflows/test.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index baa968f47..a6eddc483 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -15,23 +15,31 @@ concurrency: jobs: build_and_test: runs-on: ${{ matrix.os }} + strategy: fail-fast: false matrix: os: [ubuntu-latest] python-version: ["3.10"] + 
steps: - uses: actions/checkout@v4 + with: + # Ensure full history and tags are fetched + fetch-depth: 0 + - name: Optionally free up space on Ubuntu run: | sudo rm -rf /usr/share/dotnet sudo rm -rf /opt/ghc sudo rm -rf "/usr/local/share/boost" sudo rm -rf "$AGENT_TOOLSDIRECTORY" + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} + - name: Install NeMo-Curator and pytest # TODO: Remove pytest when optional test dependencies are added to setup.py @@ -42,6 +50,7 @@ jobs: pip install wheel cython pip install --no-cache-dir . pip install pytest + - name: Run tests run: | python -m pytest -v --cpu From 95918ac53968c0b76b3b26b16f631a9b3edf5d60 Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Wed, 6 Nov 2024 12:44:26 -0800 Subject: [PATCH 11/14] add fetch-tags Signed-off-by: Sarah Yurick --- .github/workflows/test.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index a6eddc483..d51f60d2f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -27,6 +27,10 @@ jobs: with: # Ensure full history and tags are fetched fetch-depth: 0 + fetch-tags: true + + - name: Show tags for debugging + run: git tag - name: Optionally free up space on Ubuntu run: | From d3ac5f0115e0ac05d542505b20e86a99826333fb Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Tue, 12 Nov 2024 11:16:45 -0800 Subject: [PATCH 12/14] move everything to tutorials Signed-off-by: Sarah Yurick --- .github/workflows/test.yml | 13 ------------- nemo_curator/classifiers/__init__.py | 2 -- ...ipynb => pytorch-ensemble-classification.ipynb} | 14 +++++++++----- .../pytorch_deberta.py | 0 4 files changed, 9 insertions(+), 20 deletions(-) rename tutorials/distributed_data_classification/{pytorch_ensemble_classification.ipynb => pytorch-ensemble-classification.ipynb} (91%) rename {nemo_curator/classifiers => 
tutorials/distributed_data_classification}/pytorch_deberta.py (100%) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d51f60d2f..baa968f47 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -15,35 +15,23 @@ concurrency: jobs: build_and_test: runs-on: ${{ matrix.os }} - strategy: fail-fast: false matrix: os: [ubuntu-latest] python-version: ["3.10"] - steps: - uses: actions/checkout@v4 - with: - # Ensure full history and tags are fetched - fetch-depth: 0 - fetch-tags: true - - - name: Show tags for debugging - run: git tag - - name: Optionally free up space on Ubuntu run: | sudo rm -rf /usr/share/dotnet sudo rm -rf /opt/ghc sudo rm -rf "/usr/local/share/boost" sudo rm -rf "$AGENT_TOOLSDIRECTORY" - - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - - name: Install NeMo-Curator and pytest # TODO: Remove pytest when optional test dependencies are added to setup.py @@ -54,7 +42,6 @@ jobs: pip install wheel cython pip install --no-cache-dir . 
pip install pytest - - name: Run tests run: | python -m pytest -v --cpu diff --git a/nemo_curator/classifiers/__init__.py b/nemo_curator/classifiers/__init__.py index a26e4eac3..f10d63c15 100644 --- a/nemo_curator/classifiers/__init__.py +++ b/nemo_curator/classifiers/__init__.py @@ -18,7 +18,6 @@ from .aegis import AegisClassifier from .domain import DomainClassifier from .fineweb_edu import FineWebEduClassifier -from .pytorch_deberta import PyTorchClassifier from .quality import QualityClassifier __all__ = [ @@ -26,5 +25,4 @@ "QualityClassifier", "AegisClassifier", "FineWebEduClassifier", - "PyTorchClassifier", ] diff --git a/tutorials/distributed_data_classification/pytorch_ensemble_classification.ipynb b/tutorials/distributed_data_classification/pytorch-ensemble-classification.ipynb similarity index 91% rename from tutorials/distributed_data_classification/pytorch_ensemble_classification.ipynb rename to tutorials/distributed_data_classification/pytorch-ensemble-classification.ipynb index c02fac299..3acf4a641 100644 --- a/tutorials/distributed_data_classification/pytorch_ensemble_classification.ipynb +++ b/tutorials/distributed_data_classification/pytorch-ensemble-classification.ipynb @@ -8,7 +8,7 @@ "\n", "Cross-validation is a machine learning technique in which multiple models are trained on multiple subsets of your data and validated on the remaining data portions. It is useful because it reduces the risk of overfitting to your data and provides a better estimate of how the model will perform on unseen data. This is particularly valuable when dealing with limited data, as it allows for more efficient use of the available samples.\n", "\n", - "In this tutorial, we demonstrate how to use NeMo Curator's `PyTorchClassifier` class to load and perform batched inference with multiple pretrained models. We assume the user has pretrained PTH model files, with [DeBERTaV3](https://huggingface.co/microsoft/deberta-v3-base) as the base model used for training. 
The classifiers are accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets.\n", + "In this tutorial, we will use a custom `PyTorchClassifier` class to load and perform batched inference with multiple pretrained models. We assume the user has pretrained PTH model files, with [DeBERTaV3](https://huggingface.co/microsoft/deberta-v3-base) as the base model used for training. The classifiers are accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets.\n", "\n", "First, let's run some preliminary imports and set up our Dask client." ] @@ -35,15 +35,17 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "from nemo_curator.classifiers import PyTorchClassifier\n", "from nemo_curator.datasets import DocumentDataset\n", "from nemo_curator import get_client\n", "import cudf\n", - "import dask_cudf" + "import dask_cudf\n", + "\n", + "# For importing the PyTorchClassifier\n", + "%run -i \"./pytorch_deberta.py\"" ] }, { @@ -133,7 +135,9 @@ "source": [ "# Run Classification with Multiple Models\n", "\n", - "Now we can use the `PyTorchClassifier` class to load each of our PyTorch models and run inference. We will write the results to a JSON file." + "Now we can use our custom `PyTorchClassifier` class to load each of our PyTorch models and run inference. We will write the results to a JSON file.\n", + "\n", + "The `PyTorchClassifier` functions very similarly to the `DomainClassifier`, `QualityClassifier`, `AegisClassifier`, and `FineWebEduClassifier` as documented in the [user guide](https://docs.nvidia.com/nemo-framework/user-guide/latest/datacuration/distributeddataclassification.html). 
The main difference is that the `PyTorchClassifier` accepts a local file path to a pretrained model, whereas the other existing classifiers grab their pretrained models from Hugging Face. Feel free to check out the `pytorch_deberta.py` script in this directory to see how this class uses NeMo Curator's `DistributedDataClassifier` as well as the PyTorch, Transformers, and CrossFit libraries to perform extremely fast classification tasks using pretrained PyTorch model files." ] }, { diff --git a/nemo_curator/classifiers/pytorch_deberta.py b/tutorials/distributed_data_classification/pytorch_deberta.py similarity index 100% rename from nemo_curator/classifiers/pytorch_deberta.py rename to tutorials/distributed_data_classification/pytorch_deberta.py From 120f9d7995d6db523708e31c4912d4e637ca5e54 Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Tue, 12 Nov 2024 11:23:28 -0800 Subject: [PATCH 13/14] remove _init_weights Signed-off-by: Sarah Yurick --- .../pytorch_deberta.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/tutorials/distributed_data_classification/pytorch_deberta.py b/tutorials/distributed_data_classification/pytorch_deberta.py index a28c956f0..215bac604 100644 --- a/tutorials/distributed_data_classification/pytorch_deberta.py +++ b/tutorials/distributed_data_classification/pytorch_deberta.py @@ -65,22 +65,8 @@ def __init__( self.fc_dropout = nn.Dropout(config.fc_dropout) self.fc = nn.Linear(self.config.hidden_size, out_dim) - self._init_weights(self.fc) self.autocast = autocast - def _init_weights(self, module): - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() 
- module.weight.data.fill_(1.0) - def feature(self, input_ids, attention_mask): outputs = self.model(input_ids=input_ids, attention_mask=attention_mask) last_hidden_states = outputs[0] From e543084df6db64aef3c58bb142210e7accbf8893 Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Wed, 20 Nov 2024 19:19:28 -0800 Subject: [PATCH 14/14] remove py file and add it to ipynb Signed-off-by: Sarah Yurick --- .../pytorch-ensemble-classification.ipynb | 329 ++++++++++++++++-- .../pytorch_deberta.py | 218 ------------ 2 files changed, 302 insertions(+), 245 deletions(-) delete mode 100644 tutorials/distributed_data_classification/pytorch_deberta.py diff --git a/tutorials/distributed_data_classification/pytorch-ensemble-classification.ipynb b/tutorials/distributed_data_classification/pytorch-ensemble-classification.ipynb index 3acf4a641..77a3960e1 100644 --- a/tutorials/distributed_data_classification/pytorch-ensemble-classification.ipynb +++ b/tutorials/distributed_data_classification/pytorch-ensemble-classification.ipynb @@ -8,7 +8,7 @@ "\n", "Cross-validation is a machine learning technique in which multiple models are trained on multiple subsets of your data and validated on the remaining data portions. It is useful because it reduces the risk of overfitting to your data and provides a better estimate of how the model will perform on unseen data. This is particularly valuable when dealing with limited data, as it allows for more efficient use of the available samples.\n", "\n", - "In this tutorial, we will use a custom `PyTorchClassifier` class to load and perform batched inference with multiple pretrained models. We assume the user has pretrained PTH model files, with [DeBERTaV3](https://huggingface.co/microsoft/deberta-v3-base) as the base model used for training. 
The classifiers are accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets.\n", + "In this tutorial, we demonstrate how to use NeMo Curator's `DistributedDataClassifier` to build our own `PyTorchClassifier` class for loading and performing batched inference with multiple pretrained models. We assume the user has pretrained PTH model files, with [DeBERTaV3](https://huggingface.co/microsoft/deberta-v3-base) as the base model used for training. The classifiers are accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets.\n", "\n", "First, let's run some preliminary imports and set up our Dask client." ] @@ -27,6 +27,9 @@ } ], "source": [ + "import os\n", + "os.environ[\"RAPIDS_NO_INITIALIZE\"] = \"1\"\n", + "\n", "# Silence Warnings (HuggingFace internal warnings)\n", "%env PYTHONWARNINGS=ignore\n", "import warnings\n", @@ -35,22 +38,47 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from dataclasses import dataclass\n", + "from typing import List, Optional" + ] + }, + { + "cell_type": "code", + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ - "from nemo_curator.datasets import DocumentDataset\n", - "from nemo_curator import get_client\n", "import cudf\n", "import dask_cudf\n", - "\n", - "# For importing the PyTorchClassifier\n", - "%run -i \"./pytorch_deberta.py\"" + "import torch\n", + "import torch.nn as nn\n", + "from crossfit.backend.torch.hf.model import HFModel\n", + "from transformers import AutoConfig, AutoModel\n", + "from transformers.models.deberta_v2 import DebertaV2TokenizerFast" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# NeMo 
Curator modules\n", + "from nemo_curator import get_client\n", + "from nemo_curator.classifiers.base import (\n", + " DistributedDataClassifier,\n", + " _run_classifier_helper,\n", + ")\n", + "from nemo_curator.datasets import DocumentDataset" + ] + }, + { + "cell_type": "code", + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -69,14 +97,263 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "# Create `PyTorchClassifier` Class\n", + "\n", + "To create our `PyTorchClassifier` class, we will be extendeding NeMo Curator's `DistributedDataClassifier` class.\n", + "\n", + "The goal of the base `DistributedDataClassifier` class is to enable multi-node multi-GPU data classification of your data. NeMo Curator provides several subclasses that focus on domain, quality, content safety, and educational content classification. However, the `DistributedDataClassifier` can be extended to fit any model; the only requirement is that the model can fit on a single GPU. See NeMo Curator's [Distributed Data Classification](https://docs.nvidia.com/nemo-framework/user-guide/latest/datacuration/distributeddataclassification.html) documentation for more information.\n", + "\n", + "First, let's create a `PyTorchModelConfig` class. Its purpose is to store some of the attributes that will be used by our model, including the base model of the classifier." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "@dataclass\n", + "class PyTorchModelConfig:\n", + " base_model: str = \"microsoft/deberta-v3-base\"\n", + " fc_dropout: float = 0.2\n", + " max_len: int = 512" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we create an `NCCustomModel` (for \"NeMo Curator Custom Model\") class. It inherits from `nn.Module`, the base class for all neural network modules in PyTorch.\n", + "\n", + "Inside `__init__`, the model loads the model configuration and model. 
The `autocast` boolean determines whether mixed precision (`torch.autocast`) is used during inference to speed up computations on CUDA devices. The `forward` method is required by `nn.Module` and runs the model's forward pass (the computation performed at every call)." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "class NCCustomModel(nn.Module):\n", + " def __init__(\n", + " self,\n", + " config: dataclass,\n", + " out_dim: int,\n", + " config_path: str = None,\n", + " pretrained: bool = False,\n", + " autocast: bool = False,\n", + " ):\n", + " super().__init__()\n", + " self.config = config\n", + " if config_path is None:\n", + " self.config = AutoConfig.from_pretrained(\n", + " config.base_model, output_hidden_states=True\n", + " )\n", + " else:\n", + " self.config = torch.load(config_path)\n", + "\n", + " if pretrained:\n", + " self.model = AutoModel.from_pretrained(\n", + " config.base_model, config=self.config\n", + " )\n", + " else:\n", + " self.model = AutoModel(self.config)\n", + "\n", + " self.fc_dropout = nn.Dropout(config.fc_dropout)\n", + " self.fc = nn.Linear(self.config.hidden_size, out_dim)\n", + " self.autocast = autocast\n", + "\n", + " def feature(self, input_ids, attention_mask):\n", + " outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)\n", + " last_hidden_states = outputs[0]\n", + " return last_hidden_states\n", + "\n", + " def _forward(self, batch):\n", + " feature = self.feature(batch[\"input_ids\"], batch[\"attention_mask\"])\n", + " output = self.fc(self.fc_dropout(feature))\n", + " output = output.to(torch.float32)\n", + " return torch.softmax(output[:, 0, :], dim=1)\n", + "\n", + " def forward(self, batch):\n", + " if self.autocast:\n", + " with torch.autocast(device_type=\"cuda\"):\n", + " return self._forward(batch)\n", + " else:\n", + " return self._forward(batch)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's 
create the `PyTorchModel` class, a model management class. It inherits from `HFModel`, a class created by NVIDIA's [CrossFit](https://github.com/rapidsai/crossfit) library, which enables multi-node and multi-GPU offline inference. In it, we create several methods which define how to load our model, its configuration, and its tokenizer." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "class PyTorchModel(HFModel):\n", + " def __init__(\n", + " self,\n", + " config: dataclass,\n", + " out_dim: int,\n", + " model_path: str,\n", + " autocast: bool = False,\n", + " ):\n", + " self.config = config\n", + " self.out_dim = out_dim\n", + " self.model_path = model_path\n", + " self.autocast = autocast\n", + " super().__init__(self.config.base_model)\n", + "\n", + " def load_model(self, device: str = \"cuda\"):\n", + " model = NCCustomModel(\n", + " self.config,\n", + " out_dim=self.out_dim,\n", + " config_path=None,\n", + " pretrained=True,\n", + " autocast=self.autocast,\n", + " )\n", + " model = model.to(device)\n", + "\n", + " if os.path.exists(self.model_path):\n", + " sd = torch.load(self.model_path, map_location=\"cpu\")\n", + " if \"model_state_dict\" in sd:\n", + " sd = sd[\"model_state_dict\"]\n", + " sd = {k[7:] if k.startswith(\"module.\") else k: sd[k] for k in sd.keys()}\n", + " model.load_state_dict(sd, strict=True)\n", + " else:\n", + " raise ValueError(f\"Model path {self.model_path} does not exist\")\n", + "\n", + " return model.eval()\n", + "\n", + " def load_tokenizer(self):\n", + " return DebertaV2TokenizerFast.from_pretrained(self.config.base_model)\n", + "\n", + " def load_config(self):\n", + " return AutoConfig.from_pretrained(self.path_or_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we create the `PyTorchClassifier` class. 
We start with the `__init__` method, which uses the `DistributedDataClassifier`, `PyTorchModelConfig`, and `PyTorchModel` classes described above. Next is the `_run_classifier` method, which is called by `DistributedDataClassifier`'s `__call__` method; it is required for all classes that inherit the `DistributedDataClassifier` class.\n", + "\n", + "Here is a quick rundown of all the attributes of the `PyTorchClassifier` class:\n", + "- `pretrained_model_name_or_path` (`str`): The path to your PyTorch model file.\n", + "- `labels` (`list[str]`): The classes output by the model classifier.\n", + "- `out_dim` (`list[str], optional`): Set to 1 for a binary classification task. Otherwise, defaults to `len(labels)`.\n", + "- `filter_by` (`list[str], optional`): The classes to filter the dataset by. If None, all classes will be included. Defaults to None.\n", + "- `batch_size` (`int`): The number of samples per batch for inference. Defaults to 256.\n", + "- `text_field` (`str`): The field in the dataset that should be classified.\n", + "- `pred_column` (`str`): The column name where predictions will be stored. Defaults to \"pred\".\n", + "- `prob_column` (`str`): The column name where prediction probabilities will be stored. Defaults to \"prob\".\n", + "- `max_chars` (`int`): The maximum number of characters in each document to consider for classification. Defaults to 6000.\n", + "- `device_type` (`str`): The type of device to use for inference, either \"cuda\" or \"cpu\". Defaults to \"cuda\".\n", + "- `autocast` (`bool`): Whether to use mixed precision for faster inference. Defaults to True.\n", + "- `base_model` (`str`): The base model on which your PyTorch model was trained. Defaults to \"microsoft/deberta-v3-base\".\n", + "- `fc_dropout` (`str`): Dropout rate used during training. Defaults to 0.2.\n", + "- `max_len` (`str`): Maximum sequence length used during training. Defaults to 512." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "class PyTorchClassifier(DistributedDataClassifier):\n", + " \"\"\"\n", + " PyTorchClassifier is a general classifier designed for running generic PTH model files.\n", + " This class is optimized for running on multi-node, multi-GPU setups to enable fast and efficient inference on large datasets.\n", + "\n", + " \"\"\"\n", + "\n", + " def __init__(\n", + " self,\n", + " pretrained_model_name_or_path: str,\n", + " labels: List[str],\n", + " out_dim: Optional[int] = None,\n", + " filter_by: Optional[List[str]] = None,\n", + " batch_size: int = 256,\n", + " text_field: str = \"text\",\n", + " pred_column: str = \"pred\",\n", + " prob_column: str = \"prob\",\n", + " max_chars: int = 6000,\n", + " device_type: str = \"cuda\",\n", + " autocast: bool = True,\n", + " base_model: str = \"microsoft/deberta-v3-base\",\n", + " fc_dropout: float = 0.2,\n", + " max_len: int = 512,\n", + " ):\n", + " config = PyTorchModelConfig(\n", + " base_model=base_model,\n", + " fc_dropout=fc_dropout,\n", + " max_len=max_len,\n", + " )\n", + "\n", + " self.labels = labels\n", + " if out_dim:\n", + " self.out_dim = out_dim\n", + " else:\n", + " self.out_dim = len(labels)\n", + "\n", + " self.text_field = text_field\n", + " self.prob_column = prob_column\n", + "\n", + " model = PyTorchModel(\n", + " config=config,\n", + " out_dim=self.out_dim,\n", + " model_path=pretrained_model_name_or_path,\n", + " autocast=autocast,\n", + " )\n", + "\n", + " super().__init__(\n", + " model=model,\n", + " labels=self.labels,\n", + " filter_by=filter_by,\n", + " batch_size=batch_size,\n", + " out_dim=self.out_dim,\n", + " pred_column=pred_column,\n", + " max_chars=max_chars,\n", + " device_type=device_type,\n", + " autocast=autocast,\n", + " )\n", + "\n", + " def _run_classifier(self, dataset: DocumentDataset):\n", + " print(\"Starting PyTorch classifier inference\", flush=True)\n", + " df = 
dataset.df\n", + " df = _run_classifier_helper(\n", + " df=df,\n", + " model=self.model,\n", + " labels=self.labels,\n", + " max_chars=self.max_chars,\n", + " batch_size=self.batch_size,\n", + " label_col=self.pred_column,\n", + " text_field=self.text_field,\n", + " prob_col=self.prob_column,\n", + " )\n", + " return DocumentDataset(df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We have successfully built our PyTorch classifier pipeline! Now, let's demonstrate how to use it with a simple example.\n", + "\n", "# Prepare Dataset and Set File Paths\n", "\n", - "Next, we need to create or read the dataset on which we want to run inference. In this notebook, we provide a sample dataset with 10 text sentences to evaluate. Alternatively, the user may read in their own existing data (e.g., JSON or Parquet files) as demonstrated by the commented code." + "For our demonstration, we need to create or read the dataset on which we want to run inference. In this notebook, we provide a sample dataset with 10 text sentences to evaluate. Alternatively, the user may read in their own existing data (e.g., JSON or Parquet files) as demonstrated by the commented code." ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -135,14 +412,12 @@ "source": [ "# Run Classification with Multiple Models\n", "\n", - "Now we can use our custom `PyTorchClassifier` class to load each of our PyTorch models and run inference. We will write the results to a JSON file.\n", - "\n", - "The `PyTorchClassifier` functions very similarly to the `DomainClassifier`, `QualityClassifier`, `AegisClassifier`, and `FineWebEduClassifier` as documented in the [user guide](https://docs.nvidia.com/nemo-framework/user-guide/latest/datacuration/distributeddataclassification.html). 
The main difference is that the `PyTorchClassifier` accepts a local file path to a pretrained model, whereas the other existing classifiers grab their pretrained models from Hugging Face. Feel free to check out the `pytorch_deberta.py` script in this directory to see how this class uses NeMo Curator's `DistributedDataClassifier` as well as the PyTorch, Transformers, and CrossFit libraries to perform extremely fast classification tasks using pretrained PyTorch model files." + "Now we can use the `PyTorchClassifier` class to load each of our PyTorch models and run inference. We will write the results to a JSON file." ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -179,18 +454,18 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "GPU: tcp://127.0.0.1:34075, Part: 0: 100%|██████████| 10/10 [00:08<00:00, 1.23it/s]\n", - "GPU: tcp://127.0.0.1:34075, Part: 0: 100%|██████████| 10/10 [00:05<00:00, 1.83it/s]\n", - "GPU: tcp://127.0.0.1:34075, Part: 0: 100%|██████████| 10/10 [00:05<00:00, 1.81it/s]\n", - "GPU: tcp://127.0.0.1:34075, Part: 0: 100%|██████████| 10/10 [00:05<00:00, 1.80it/s]\n", - "GPU: tcp://127.0.0.1:34075, Part: 0: 100%|██████████| 10/10 [00:04<00:00, 2.02it/s]" + "GPU: tcp://127.0.0.1:32893, Part: 0: 100%|██████████| 10/10 [00:21<00:00, 2.13s/it]\n", + "GPU: tcp://127.0.0.1:32893, Part: 0: 100%|██████████| 10/10 [00:05<00:00, 1.81it/s]\n", + "GPU: tcp://127.0.0.1:32893, Part: 0: 100%|██████████| 10/10 [00:05<00:00, 1.81it/s]\n", + "GPU: tcp://127.0.0.1:32893, Part: 0: 100%|██████████| 10/10 [00:05<00:00, 1.84it/s]\n", + "GPU: tcp://127.0.0.1:32893, Part: 0: 100%|██████████| 10/10 [00:04<00:00, 2.01it/s]" ] }, { @@ -198,15 +473,15 @@ "output_type": "stream", "text": [ "Writing to disk complete for 1 partitions\n", - "CPU times: user 5.39 s, sys: 3.1 s, total: 8.49 s\n", - "Wall time: 
48.8 s\n" + "CPU times: user 7.01 s, sys: 4.56 s, total: 11.6 s\n", + "Wall time: 1min 4s\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "GPU: tcp://127.0.0.1:34075, Part: 0: 100%|██████████| 10/10 [00:05<00:00, 1.80it/s]\n" + "GPU: tcp://127.0.0.1:32893, Part: 0: 100%|██████████| 10/10 [00:05<00:00, 1.81it/s]\n" ] } ], @@ -227,7 +502,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -397,7 +672,7 @@ "4 Traveling to Europe during the off-season can ... " ] }, - "execution_count": 8, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -411,7 +686,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Thank you for reading! In this tutorial, we demonstrated how to use the `PyTorchClassifier` to load locally-stored PyTorch models and run inference on our dataset.\n", + "Thank you for reading! In this tutorial, we demonstrated how to create and use the `PyTorchClassifier` class to load locally-stored PyTorch models and run inference on our dataset.\n", "\n", "For more information about NeMo Curator's `DistributedDataClassifier`, please reference the [documentation page](https://docs.nvidia.com/nemo-framework/user-guide/latest/datacuration/distributeddataclassification.html). For an example on how to run NeMo Curator's `DomainClassifier` and `QualityClassifier`, please see [this sample notebook](https://github.com/NVIDIA/NeMo-Curator/blob/main/tutorials/distributed_data_classification/distributed_data_classification.ipynb)." 
] @@ -419,7 +694,7 @@ ], "metadata": { "kernelspec": { - "display_name": "nemo_curator_dev", + "display_name": "nemo_curator", "language": "python", "name": "python3" }, diff --git a/tutorials/distributed_data_classification/pytorch_deberta.py b/tutorials/distributed_data_classification/pytorch_deberta.py deleted file mode 100644 index 215bac604..000000000 --- a/tutorials/distributed_data_classification/pytorch_deberta.py +++ /dev/null @@ -1,218 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os - -os.environ["RAPIDS_NO_INITIALIZE"] = "1" -from dataclasses import dataclass -from typing import List, Optional - -import torch -import torch.nn as nn -from crossfit.backend.torch.hf.model import HFModel -from transformers import AutoConfig, AutoModel -from transformers.models.deberta_v2 import DebertaV2TokenizerFast - -from nemo_curator.classifiers.base import ( - DistributedDataClassifier, - _run_classifier_helper, -) -from nemo_curator.datasets import DocumentDataset - - -@dataclass -class PyTorchModelConfig: - base_model: str = "microsoft/deberta-v3-base" - fc_dropout: float = 0.2 - max_len: int = 512 - - -class NCCustomModel(nn.Module): - def __init__( - self, - config: dataclass, - out_dim: int, - config_path: str = None, - pretrained: bool = False, - autocast: bool = False, - ): - super().__init__() - self.config = config - if config_path is None: - self.config = AutoConfig.from_pretrained( - config.base_model, output_hidden_states=True - ) - else: - self.config = torch.load(config_path) - - if pretrained: - self.model = AutoModel.from_pretrained( - config.base_model, config=self.config - ) - else: - self.model = AutoModel(self.config) - - self.fc_dropout = nn.Dropout(config.fc_dropout) - self.fc = nn.Linear(self.config.hidden_size, out_dim) - self.autocast = autocast - - def feature(self, input_ids, attention_mask): - outputs = self.model(input_ids=input_ids, attention_mask=attention_mask) - last_hidden_states = outputs[0] - return last_hidden_states - - def _forward(self, batch): - feature = self.feature(batch["input_ids"], batch["attention_mask"]) - output = self.fc(self.fc_dropout(feature)) - output = output.to(torch.float32) - return torch.softmax(output[:, 0, :], dim=1) - - def forward(self, batch): - if self.autocast: - with torch.autocast(device_type="cuda"): - return self._forward(batch) - else: - return self._forward(batch) - - -class PyTorchModel(HFModel): - def __init__( - self, - config: dataclass, - out_dim: int, - model_path: 
str, - autocast: bool = False, - ): - self.config = config - self.out_dim = out_dim - self.model_path = model_path - self.autocast = autocast - super().__init__(self.config.base_model) - - def load_model(self, device: str = "cuda"): - model = NCCustomModel( - self.config, - out_dim=self.out_dim, - config_path=None, - pretrained=True, - autocast=self.autocast, - ) - model = model.to(device) - - if os.path.exists(self.model_path): - sd = torch.load(self.model_path, map_location="cpu") - if "model_state_dict" in sd: - sd = sd["model_state_dict"] - sd = {k[7:] if k.startswith("module.") else k: sd[k] for k in sd.keys()} - model.load_state_dict(sd, strict=True) - else: - raise ValueError(f"Model path {self.model_path} does not exist") - - return model.eval() - - def load_tokenizer(self): - # TODO: Allow user to pass in their own tokenizer if base_model is not Deberta - return DebertaV2TokenizerFast.from_pretrained(self.config.base_model) - - def load_config(self): - return AutoConfig.from_pretrained(self.path_or_name) - - -class PyTorchClassifier(DistributedDataClassifier): - """ - PyTorchClassifier is a general classifier designed for running generic PTH model files. - This class is optimized for running on multi-node, multi-GPU setups to enable fast and efficient inference on large datasets. - - Attributes: - pretrained_model_name_or_path (str): The path to your PyTorch model file. - labels (list[str]): The classes output by the model classifier. - out_dim (list[str], optional): Set to 1 for a binary classification task. Otherwise, defaults to len(labels). - filter_by (list[str], optional): The classes to filter the dataset by. If None, all classes will be included. Defaults to None. - batch_size (int): The number of samples per batch for inference. Defaults to 256. - text_field (str): The field in the dataset that should be classified. - pred_column (str): The column name where predictions will be stored. Defaults to "pred". 
- prob_column (str): The column name where prediction probabilities will be stored. Defaults to "prob". - max_chars (int): The maximum number of characters in each document to consider for classification. Defaults to 6000. - device_type (str): The type of device to use for inference, either "cuda" or "cpu". Defaults to "cuda". - autocast (bool): Whether to use mixed precision for faster inference. Defaults to True. - base_model (str): The base model on which your PyTorch model was trained. Defaults to "microsoft/deberta-v3-base". - fc_dropout (str): Dropout rate used during training. Defaults to 0.2. - max_len (str): Maximum sequence length used during training. Defaults to 512. - """ - - def __init__( - self, - pretrained_model_name_or_path: str, - labels: List[str], - out_dim: Optional[int] = None, - filter_by: Optional[List[str]] = None, - batch_size: int = 256, - text_field: str = "text", - pred_column: str = "pred", - prob_column: str = "prob", - max_chars: int = 6000, - device_type: str = "cuda", - autocast: bool = True, - base_model: str = "microsoft/deberta-v3-base", - fc_dropout: float = 0.2, - max_len: int = 512, - ): - config = PyTorchModelConfig( - base_model=base_model, - fc_dropout=fc_dropout, - max_len=max_len, - ) - - self.labels = labels - if out_dim: - self.out_dim = out_dim - else: - self.out_dim = len(labels) - - self.text_field = text_field - self.prob_column = prob_column - - model = PyTorchModel( - config=config, - out_dim=self.out_dim, - model_path=pretrained_model_name_or_path, - autocast=autocast, - ) - - super().__init__( - model=model, - labels=self.labels, - filter_by=filter_by, - batch_size=batch_size, - out_dim=self.out_dim, - pred_column=pred_column, - max_chars=max_chars, - device_type=device_type, - autocast=autocast, - ) - - def _run_classifier(self, dataset: DocumentDataset): - print("Starting PyTorch classifier inference", flush=True) - df = dataset.df - df = _run_classifier_helper( - df=df, - model=self.model, - 
labels=self.labels, - max_chars=self.max_chars, - batch_size=self.batch_size, - label_col=self.pred_column, - text_field=self.text_field, - prob_col=self.prob_column, - ) - return DocumentDataset(df)