Skip to content

Commit

Permalink
Remove lingering DASK_DATAFRAME__QUERY_PLANNING environment variables (NVIDIA#346)
Browse files Browse the repository at this point in the history

* remove extra DASK_DATAFRAME__QUERY_PLANNING

Signed-off-by: Sarah Yurick <sarahyurick@gmail.com>

* sort imports

Signed-off-by: Sarah Yurick <sarahyurick@gmail.com>

* last file

Signed-off-by: Sarah Yurick <sarahyurick@gmail.com>

---------

Signed-off-by: Sarah Yurick <sarahyurick@gmail.com>
Signed-off-by: Vinay Raman <viraman@nvidia.com>
  • Loading branch information
sarahyurick authored and vinay-raman committed Nov 26, 2024
1 parent 47aec91 commit cb49b1f
Show file tree
Hide file tree
Showing 22 changed files with 26 additions and 101 deletions.
3 changes: 1 addition & 2 deletions tutorials/image-curation/image-curation.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,7 @@
"source": [
"!pip install ipywidgets aiofiles\n",
"# Install from source by default\n",
"!pip install --extra-index-url https://pypi.nvidia.com ../../[image]\n",
"%env DASK_DATAFRAME__QUERY_PLANNING False"
"!pip install --extra-index-url https://pypi.nvidia.com ../../[image]"
]
},
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@ export CUDF_SPILL="1"
export RMM_SCHEDULER_POOL_SIZE="1GB"
export RMM_WORKER_POOL_SIZE="72GiB"
export LIBCUDF_CUFILE_POLICY=OFF
export DASK_DATAFRAME__QUERY_PLANNING=False


# =================================================================
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,11 +131,10 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import nemo_curator\n",
"from dask.distributed import Client, LocalCluster\n",
"\n",
"# Start a Dask cluster with 12 workers, each limited at 64GB of memory. \n",
Expand Down Expand Up @@ -708,17 +707,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"os.environ[\"DASK_DATAFRAME__QUERY_PLANNING\"] = \"False\""
]
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand All @@ -734,7 +723,6 @@
],
"source": [
"from nemo_curator.utils.distributed_utils import get_client\n",
"import dask.dataframe\n",
"\n",
"def pre_imports():\n",
" import cudf \n",
Expand Down Expand Up @@ -1030,13 +1018,12 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from datasets import load_dataset as load_hf_dataset\n",
"from datasets import DownloadConfig "
"from datasets import load_dataset as load_hf_dataset"
]
},
{
Expand Down Expand Up @@ -1133,14 +1120,12 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from nemo_curator import Modify\n",
"from nemo_curator.modifiers import UnicodeReformatter\n",
"from nemo_curator.utils.distributed_utils import read_data, write_to_disk\n",
"from nemo_curator.utils.file_utils import get_all_files_paths_under\n",
"from nemo_curator.utils.distributed_utils import write_to_disk\n",
"from nemo_curator.datasets import DocumentDataset"
]
},
Expand Down
5 changes: 1 addition & 4 deletions tutorials/zyda2-tutorial/0_processing/process_dclm.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
import os

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import logging
import os

from dask.distributed import Client, LocalCluster
from helper import process_data
Expand Down
5 changes: 1 addition & 4 deletions tutorials/zyda2-tutorial/0_processing/process_dolma_cc.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
import os

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import logging
import os

from dask.distributed import Client, LocalCluster
from helper import process_data
Expand Down
5 changes: 1 addition & 4 deletions tutorials/zyda2-tutorial/0_processing/process_fwe2.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
import os

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import ctypes
import gc
import logging
import os
from pathlib import Path

from dask.distributed import Client, LocalCluster
Expand Down
5 changes: 1 addition & 4 deletions tutorials/zyda2-tutorial/0_processing/process_zyda.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
import os

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import logging
import os

from dask.distributed import Client, LocalCluster
from helper import process_data
Expand Down
5 changes: 1 addition & 4 deletions tutorials/zyda2-tutorial/1_fuzzy_dedup/0_minhash.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
import os

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import logging
import os
import time

import dask_cudf
Expand Down
5 changes: 1 addition & 4 deletions tutorials/zyda2-tutorial/1_fuzzy_dedup/1_lsh.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
import os

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import logging
import os
import time

import cudf
Expand Down
5 changes: 1 addition & 4 deletions tutorials/zyda2-tutorial/1_fuzzy_dedup/2_buckets_to_edges.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
import os

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import logging
import os
import time

import dask_cudf
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
import os

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import logging
import os
import time

from nemo_curator.modules.fuzzy_dedup import ConnectedComponents
Expand Down
5 changes: 1 addition & 4 deletions tutorials/zyda2-tutorial/2_dupes_removal/0_id_mapping.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
import os

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import json
import logging
import os

import cudf
import dask_cudf
Expand Down
5 changes: 1 addition & 4 deletions tutorials/zyda2-tutorial/2_dupes_removal/1_id_conversion.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
import json
import os

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import logging
import os

import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
Expand Down
5 changes: 1 addition & 4 deletions tutorials/zyda2-tutorial/2_dupes_removal/2_compute_counts.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
import json
import logging
import os
import time

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import logging

import dask.dataframe as dd
import pandas as pd
from dask.distributed import Client, LocalCluster
Expand Down
5 changes: 1 addition & 4 deletions tutorials/zyda2-tutorial/2_dupes_removal/3_prep_dupes.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
import json
import os

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import logging
import os

import dask.dataframe as dd
import pandas as pd
Expand Down
5 changes: 1 addition & 4 deletions tutorials/zyda2-tutorial/2_dupes_removal/4_get_dupes_dclm.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
import json
import os

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import logging
import os

import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
import json
import os

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import logging
import os

import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
Expand Down
5 changes: 1 addition & 4 deletions tutorials/zyda2-tutorial/2_dupes_removal/5_get_dupes_zyda.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
import json
import os

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import logging
import os

import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
Expand Down
5 changes: 1 addition & 4 deletions tutorials/zyda2-tutorial/2_dupes_removal/remove_dupes.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
import argparse
import json
import logging
import os
import time

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import logging

import dask.dataframe as dd
from dask.distributed import Client, LocalCluster

Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
import argparse
import logging
import os
import time

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import logging

from nemo_curator.classifiers import QualityClassifier
from nemo_curator.datasets import DocumentDataset
from nemo_curator.utils.distributed_utils import get_client, get_num_workers
Expand Down
5 changes: 1 addition & 4 deletions tutorials/zyda2-tutorial/4_filtering/filter_fwe.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
import logging
import os
import time

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import logging

import dask.dataframe as dd
import pyarrow as pa
from dask.distributed import Client, LocalCluster
Expand Down
6 changes: 1 addition & 5 deletions tutorials/zyda2-tutorial/4_filtering/filter_quality.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
import argparse
import os
import time

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = "False"

import logging
import time

import dask.dataframe as dd
import pyarrow as pa
Expand Down

0 comments on commit cb49b1f

Please sign in to comment.