From cb49b1fd2f486dfaecd38773b908a9bfb2fec2bc Mon Sep 17 00:00:00 2001 From: Sarah Yurick <53962159+sarahyurick@users.noreply.github.com> Date: Thu, 21 Nov 2024 13:42:04 -0800 Subject: [PATCH] Remove lingering `DASK_DATAFRAME__QUERY_PLANNING` environment variables (#346) * remove extra DASK_DATAFRAME__QUERY_PLANNING Signed-off-by: Sarah Yurick * sort imports Signed-off-by: Sarah Yurick * last file Signed-off-by: Sarah Yurick --------- Signed-off-by: Sarah Yurick Signed-off-by: Vinay Raman --- tutorials/image-curation/image-curation.ipynb | 3 +-- .../start-distributed-notebook.sh | 1 - ...pretraining-vietnamese-data-curation.ipynb | 27 +++++-------------- .../0_processing/process_dclm.py | 5 +--- .../0_processing/process_dolma_cc.py | 5 +--- .../0_processing/process_fwe2.py | 5 +--- .../0_processing/process_zyda.py | 5 +--- .../zyda2-tutorial/1_fuzzy_dedup/0_minhash.py | 5 +--- .../zyda2-tutorial/1_fuzzy_dedup/1_lsh.py | 5 +--- .../1_fuzzy_dedup/2_buckets_to_edges.py | 5 +--- .../1_fuzzy_dedup/3_connected_components.py | 5 +--- .../2_dupes_removal/0_id_mapping.py | 5 +--- .../2_dupes_removal/1_id_conversion.py | 5 +--- .../2_dupes_removal/2_compute_counts.py | 5 +--- .../2_dupes_removal/3_prep_dupes.py | 5 +--- .../2_dupes_removal/4_get_dupes_dclm.py | 5 +--- .../2_dupes_removal/4_get_dupes_dolma-cc.py | 5 +--- .../2_dupes_removal/5_get_dupes_zyda.py | 5 +--- .../2_dupes_removal/remove_dupes.py | 5 +--- .../3_quality_model/run_quality_classifier.py | 5 +--- .../zyda2-tutorial/4_filtering/filter_fwe.py | 5 +--- .../4_filtering/filter_quality.py | 6 +---- 22 files changed, 26 insertions(+), 101 deletions(-) diff --git a/tutorials/image-curation/image-curation.ipynb b/tutorials/image-curation/image-curation.ipynb index 1ac3c1029..8e1b51706 100644 --- a/tutorials/image-curation/image-curation.ipynb +++ b/tutorials/image-curation/image-curation.ipynb @@ -51,8 +51,7 @@ "source": [ "!pip install ipywidgets aiofiles\n", "# Install from source by default\n", - "!pip install --extra-index-url https://pypi.nvidia.com ../../[image]\n", - "%env DASK_DATAFRAME__QUERY_PLANNING False" + "!pip install --extra-index-url https://pypi.nvidia.com ../../[image]" ] }, { diff --git a/tutorials/pretraining-data-curation/start-distributed-notebook.sh b/tutorials/pretraining-data-curation/start-distributed-notebook.sh index 0c1cd7ee3..4baaf02a1 100644 --- a/tutorials/pretraining-data-curation/start-distributed-notebook.sh +++ b/tutorials/pretraining-data-curation/start-distributed-notebook.sh @@ -62,7 +62,6 @@ export CUDF_SPILL="1" export RMM_SCHEDULER_POOL_SIZE="1GB" export RMM_WORKER_POOL_SIZE="72GiB" export LIBCUDF_CUFILE_POLICY=OFF -export DASK_DATAFRAME__QUERY_PLANNING=False # ================================================================= diff --git a/tutorials/pretraining-vietnamese-data-curation/pretraining-vietnamese-data-curation.ipynb b/tutorials/pretraining-vietnamese-data-curation/pretraining-vietnamese-data-curation.ipynb index ad226b034..dae11fa51 100644 --- a/tutorials/pretraining-vietnamese-data-curation/pretraining-vietnamese-data-curation.ipynb +++ b/tutorials/pretraining-vietnamese-data-curation/pretraining-vietnamese-data-curation.ipynb @@ -131,11 +131,10 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "import nemo_curator\n", "from dask.distributed import Client, LocalCluster\n", "\n", "# Start a Dask cluster with 12 workers, each limited at 64GB of memory. \n", @@ -708,17 +707,7 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "os.environ[\"DASK_DATAFRAME__QUERY_PLANNING\"] = \"False\"" - ] - }, - { - "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -734,7 +723,6 @@ ], "source": [ "from nemo_curator.utils.distributed_utils import get_client\n", - "import dask.dataframe\n", "\n", "def pre_imports():\n", " import cudf \n", @@ -1030,13 +1018,12 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", - "from datasets import load_dataset as load_hf_dataset\n", - "from datasets import DownloadConfig " + "from datasets import load_dataset as load_hf_dataset" ] }, { @@ -1133,14 +1120,12 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from nemo_curator import Modify\n", - "from nemo_curator.modifiers import UnicodeReformatter\n", - "from nemo_curator.utils.distributed_utils import read_data, write_to_disk\n", - "from nemo_curator.utils.file_utils import get_all_files_paths_under\n", + "from nemo_curator.utils.distributed_utils import write_to_disk\n", "from nemo_curator.datasets import DocumentDataset" ] }, diff --git a/tutorials/zyda2-tutorial/0_processing/process_dclm.py b/tutorials/zyda2-tutorial/0_processing/process_dclm.py index bcfe6e06b..9a65372e6 100644 --- a/tutorials/zyda2-tutorial/0_processing/process_dclm.py +++ b/tutorials/zyda2-tutorial/0_processing/process_dclm.py @@ -1,8 +1,5 @@ -import os - -os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False" - import logging +import os from dask.distributed import Client, LocalCluster from helper import process_data diff --git a/tutorials/zyda2-tutorial/0_processing/process_dolma_cc.py b/tutorials/zyda2-tutorial/0_processing/process_dolma_cc.py index c4f9a2872..33718b04a 100644 --- a/tutorials/zyda2-tutorial/0_processing/process_dolma_cc.py +++ b/tutorials/zyda2-tutorial/0_processing/process_dolma_cc.py @@ -1,8 +1,5 @@ -import os - -os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False" - import logging +import os from dask.distributed import Client, LocalCluster from helper import process_data diff --git a/tutorials/zyda2-tutorial/0_processing/process_fwe2.py b/tutorials/zyda2-tutorial/0_processing/process_fwe2.py index a425a18ec..3c3ebb234 100644 --- a/tutorials/zyda2-tutorial/0_processing/process_fwe2.py +++ b/tutorials/zyda2-tutorial/0_processing/process_fwe2.py @@ -1,10 +1,7 @@ -import os - -os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False" - import ctypes import gc import logging +import os from pathlib import Path from dask.distributed import Client, LocalCluster diff --git a/tutorials/zyda2-tutorial/0_processing/process_zyda.py b/tutorials/zyda2-tutorial/0_processing/process_zyda.py index 6cc951cde..c9097a684 100644 --- a/tutorials/zyda2-tutorial/0_processing/process_zyda.py +++ b/tutorials/zyda2-tutorial/0_processing/process_zyda.py @@ -1,8 +1,5 @@ -import os - -os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False" - import logging +import os from dask.distributed import Client, LocalCluster from helper import process_data diff --git a/tutorials/zyda2-tutorial/1_fuzzy_dedup/0_minhash.py b/tutorials/zyda2-tutorial/1_fuzzy_dedup/0_minhash.py index d3c732520..fcbbf9dde 100644 --- a/tutorials/zyda2-tutorial/1_fuzzy_dedup/0_minhash.py +++ b/tutorials/zyda2-tutorial/1_fuzzy_dedup/0_minhash.py @@ -1,8 +1,5 @@ -import os - -os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False" - import logging +import os import time import dask_cudf diff --git a/tutorials/zyda2-tutorial/1_fuzzy_dedup/1_lsh.py b/tutorials/zyda2-tutorial/1_fuzzy_dedup/1_lsh.py index b574c1e81..0df90c207 100644 --- a/tutorials/zyda2-tutorial/1_fuzzy_dedup/1_lsh.py +++ b/tutorials/zyda2-tutorial/1_fuzzy_dedup/1_lsh.py @@ -1,8 +1,5 @@ -import os - -os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False" - import logging +import os import time import cudf diff --git a/tutorials/zyda2-tutorial/1_fuzzy_dedup/2_buckets_to_edges.py b/tutorials/zyda2-tutorial/1_fuzzy_dedup/2_buckets_to_edges.py index 2ae7f408c..853fe6fda 100644 --- a/tutorials/zyda2-tutorial/1_fuzzy_dedup/2_buckets_to_edges.py +++ b/tutorials/zyda2-tutorial/1_fuzzy_dedup/2_buckets_to_edges.py @@ -1,8 +1,5 @@ -import os - -os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False" - import logging +import os import time import dask_cudf diff --git a/tutorials/zyda2-tutorial/1_fuzzy_dedup/3_connected_components.py b/tutorials/zyda2-tutorial/1_fuzzy_dedup/3_connected_components.py index 467e3c4e2..db76ce5b3 100644 --- a/tutorials/zyda2-tutorial/1_fuzzy_dedup/3_connected_components.py +++ b/tutorials/zyda2-tutorial/1_fuzzy_dedup/3_connected_components.py @@ -1,8 +1,5 @@ -import os - -os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False" - import logging +import os import time from nemo_curator.modules.fuzzy_dedup import ConnectedComponents diff --git a/tutorials/zyda2-tutorial/2_dupes_removal/0_id_mapping.py b/tutorials/zyda2-tutorial/2_dupes_removal/0_id_mapping.py index b2f95e259..cf30c3af9 100644 --- a/tutorials/zyda2-tutorial/2_dupes_removal/0_id_mapping.py +++ b/tutorials/zyda2-tutorial/2_dupes_removal/0_id_mapping.py @@ -1,9 +1,6 @@ -import os - -os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False" - import json import logging +import os import cudf import dask_cudf diff --git a/tutorials/zyda2-tutorial/2_dupes_removal/1_id_conversion.py b/tutorials/zyda2-tutorial/2_dupes_removal/1_id_conversion.py index 82b37cb3c..94aba73a9 100644 --- a/tutorials/zyda2-tutorial/2_dupes_removal/1_id_conversion.py +++ b/tutorials/zyda2-tutorial/2_dupes_removal/1_id_conversion.py @@ -1,9 +1,6 @@ import json -import os - -os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False" - import logging +import os import dask.dataframe as dd from dask.distributed import Client, LocalCluster diff --git a/tutorials/zyda2-tutorial/2_dupes_removal/2_compute_counts.py b/tutorials/zyda2-tutorial/2_dupes_removal/2_compute_counts.py index c59d0f893..f0b4d6b76 100644 --- a/tutorials/zyda2-tutorial/2_dupes_removal/2_compute_counts.py +++ b/tutorials/zyda2-tutorial/2_dupes_removal/2_compute_counts.py @@ -1,11 +1,8 @@ import json +import logging import os import time -os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False" - -import logging - import dask.dataframe as dd import pandas as pd from dask.distributed import Client, LocalCluster diff --git a/tutorials/zyda2-tutorial/2_dupes_removal/3_prep_dupes.py b/tutorials/zyda2-tutorial/2_dupes_removal/3_prep_dupes.py index d9d30f367..5532071ad 100644 --- a/tutorials/zyda2-tutorial/2_dupes_removal/3_prep_dupes.py +++ b/tutorials/zyda2-tutorial/2_dupes_removal/3_prep_dupes.py @@ -1,9 +1,6 @@ import json -import os - -os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False" - import logging +import os import dask.dataframe as dd import pandas as pd diff --git a/tutorials/zyda2-tutorial/2_dupes_removal/4_get_dupes_dclm.py b/tutorials/zyda2-tutorial/2_dupes_removal/4_get_dupes_dclm.py index 6de77cd36..61297fcfc 100644 --- a/tutorials/zyda2-tutorial/2_dupes_removal/4_get_dupes_dclm.py +++ b/tutorials/zyda2-tutorial/2_dupes_removal/4_get_dupes_dclm.py @@ -1,9 +1,6 @@ import json -import os - -os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False" - import logging +import os import dask.dataframe as dd from dask.distributed import Client, LocalCluster diff --git a/tutorials/zyda2-tutorial/2_dupes_removal/4_get_dupes_dolma-cc.py b/tutorials/zyda2-tutorial/2_dupes_removal/4_get_dupes_dolma-cc.py index f6749c572..ea3d24868 100644 --- a/tutorials/zyda2-tutorial/2_dupes_removal/4_get_dupes_dolma-cc.py +++ b/tutorials/zyda2-tutorial/2_dupes_removal/4_get_dupes_dolma-cc.py @@ -1,9 +1,6 @@ import json -import os - -os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False" - import logging +import os import dask.dataframe as dd from dask.distributed import Client, LocalCluster diff --git a/tutorials/zyda2-tutorial/2_dupes_removal/5_get_dupes_zyda.py b/tutorials/zyda2-tutorial/2_dupes_removal/5_get_dupes_zyda.py index cabb471e7..0675fa1c5 100644 --- a/tutorials/zyda2-tutorial/2_dupes_removal/5_get_dupes_zyda.py +++ b/tutorials/zyda2-tutorial/2_dupes_removal/5_get_dupes_zyda.py @@ -1,9 +1,6 @@ import json -import os - -os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False" - import logging +import os import dask.dataframe as dd from dask.distributed import Client, LocalCluster diff --git a/tutorials/zyda2-tutorial/2_dupes_removal/remove_dupes.py b/tutorials/zyda2-tutorial/2_dupes_removal/remove_dupes.py index 16238c790..20ca0fe21 100644 --- a/tutorials/zyda2-tutorial/2_dupes_removal/remove_dupes.py +++ b/tutorials/zyda2-tutorial/2_dupes_removal/remove_dupes.py @@ -1,12 +1,9 @@ import argparse import json +import logging import os import time -os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False" - -import logging - import dask.dataframe as dd from dask.distributed import Client, LocalCluster diff --git a/tutorials/zyda2-tutorial/3_quality_model/run_quality_classifier.py b/tutorials/zyda2-tutorial/3_quality_model/run_quality_classifier.py index c19720a75..6e4c9130a 100644 --- a/tutorials/zyda2-tutorial/3_quality_model/run_quality_classifier.py +++ b/tutorials/zyda2-tutorial/3_quality_model/run_quality_classifier.py @@ -1,11 +1,8 @@ import argparse +import logging import os import time -os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False" - -import logging - from nemo_curator.classifiers import QualityClassifier from nemo_curator.datasets import DocumentDataset from nemo_curator.utils.distributed_utils import get_client, get_num_workers diff --git a/tutorials/zyda2-tutorial/4_filtering/filter_fwe.py b/tutorials/zyda2-tutorial/4_filtering/filter_fwe.py index 90a1ea977..d30eefd09 100644 --- a/tutorials/zyda2-tutorial/4_filtering/filter_fwe.py +++ b/tutorials/zyda2-tutorial/4_filtering/filter_fwe.py @@ -1,10 +1,7 @@ +import logging import os import time -os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False" - -import logging - import dask.dataframe as dd import pyarrow as pa from dask.distributed import Client, LocalCluster diff --git a/tutorials/zyda2-tutorial/4_filtering/filter_quality.py b/tutorials/zyda2-tutorial/4_filtering/filter_quality.py index a027b9f43..f2cce92e9 100644 --- a/tutorials/zyda2-tutorial/4_filtering/filter_quality.py +++ b/tutorials/zyda2-tutorial/4_filtering/filter_quality.py @@ -1,10 +1,6 @@ import argparse -import os -import time - -os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False" - import logging +import time import dask.dataframe as dd import pyarrow as pa