Skip to content

Commit

Permalink
fix import paths
Browse files: browse the repository at this point in the history
Signed-off-by: Sarah Yurick <sarahyurick@gmail.com>
  • Loading branch information
sarahyurick committed Nov 22, 2024
1 parent 6d0771a commit 275a20a
Show file tree
Hide file tree
Showing 8 changed files with 37 additions and 54 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import os
import time

from nemo_curator.modules.fuzzy_dedup import ConnectedComponents
from nemo_curator import ConnectedComponents
from nemo_curator.utils.distributed_utils import get_client
from nemo_curator.utils.script_utils import ArgumentHelper

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import os
import time

from nemo_curator.modules.fuzzy_dedup import JaccardSimilarity
from nemo_curator import JaccardSimilarity
from nemo_curator.utils.distributed_utils import get_client, get_num_workers
from nemo_curator.utils.script_utils import ArgumentHelper

Expand Down
4 changes: 2 additions & 2 deletions nemo_curator/scripts/fuzzy_deduplication/jaccard_shuffle.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import os
import time

from nemo_curator.modules.fuzzy_dedup import _Shuffle
from nemo_curator import _Shuffle
from nemo_curator.utils.distributed_utils import get_client, get_num_workers
from nemo_curator.utils.fuzzy_dedup_utils.io_utils import (
get_text_ddf_from_json_path_with_blocksize,
Expand All @@ -27,7 +27,7 @@
def func():
import cudf

from nemo_curator.modules.fuzzy_dedup import _Shuffle
from nemo_curator import _Shuffle


def main(args):
Expand Down
2 changes: 1 addition & 1 deletion nemo_curator/scripts/fuzzy_deduplication/map_buckets.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import os
import time

from nemo_curator.modules.fuzzy_dedup import _MapBuckets
from nemo_curator import _MapBuckets
from nemo_curator.utils.distributed_utils import get_client, get_num_workers
from nemo_curator.utils.fuzzy_dedup_utils.io_utils import (
get_bucket_ddf_from_parquet_path,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"id": "5de0fe93",
"metadata": {
"tags": []
Expand All @@ -121,31 +121,19 @@
"source": [
"import os\n",
"import time\n",
"from dask.distributed import Client\n",
"import warnings\n",
"import dask.dataframe as dd\n",
"import dask_cudf\n",
"import cudf\n",
"import gzip\n",
"import json\n",
"import dask.bag as db\n",
"import glob\n",
"from dask.distributed import wait\n",
"import numpy as np\n",
"\n",
"from nemo_curator import get_client\n",
"from nemo_curator.datasets import DocumentDataset\n",
"from nemo_curator.utils.distributed_utils import (\n",
" get_num_workers,\n",
" read_data,\n",
" write_to_disk,\n",
")\n",
"from nemo_curator.utils.file_utils import (\n",
" expand_outdir_and_mkdir, \n",
" get_all_files_paths_under, \n",
" separate_by_metadata,\n",
" get_batched_files,\n",
")\n",
"\n",
"warnings.filterwarnings('ignore')\n",
"base_dir = \"/path/to/data\""
Expand Down Expand Up @@ -495,21 +483,20 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"id": "7419a216-0dad-4d13-89ee-c3c1d009efa8",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from nemo_curator import ScoreFilter, Modify\n",
"from nemo_curator import ScoreFilter\n",
"from nemo_curator.filters import FastTextLangId\n",
"from nemo_curator.modifiers import UnicodeReformatter\n",
"from nemo_curator.utils.file_utils import get_all_files_paths_under, separate_by_metadata\n",
"\n",
"# Language ID path\n",
"language_output_path = expand_outdir_and_mkdir(os.path.join(base_dir,\"rpv2-2023-06-language\"))\n",
"language_data_output_path = expand_outdir_and_mkdir(os.path.join(language_output_path,\"data\"))\n",
"language_output_path = expand_outdir_and_mkdir(os.path.join(base_dir, \"rpv2-2023-06-language\"))\n",
"language_data_output_path = expand_outdir_and_mkdir(os.path.join(language_output_path, \"data\"))\n",
"\n",
"# Fasttext model path\n",
"model_path = language_output_path\n",
Expand Down Expand Up @@ -808,14 +795,13 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"id": "f6dc1754",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from nemo_curator.log import create_logger\n",
"from nemo_curator.modules import ExactDuplicates\n",
"\n",
"def pre_imports():\n",
Expand Down Expand Up @@ -1796,14 +1782,14 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"id": "7985cf1a-9d88-4844-8ce4-e68d9792118c",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from nemo_curator.modules.fuzzy_dedup import _MapBuckets\n",
"from nemo_curator import _MapBuckets\n",
"from nemo_curator.utils.fuzzy_dedup_utils.io_utils import (\n",
" get_bucket_ddf_from_parquet_path,\n",
" get_text_ddf_from_json_path_with_blocksize,\n",
Expand Down Expand Up @@ -2031,14 +2017,14 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": null,
"id": "11d7184d-4ca5-4b49-85b4-1264056f5c33",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from nemo_curator.modules.fuzzy_dedup import _Shuffle\n",
"from nemo_curator import _Shuffle\n",
"\n",
"log_dir = os.path.join(base_dir, \"logs\")\n",
"input_anchor_docs_with_bk_path = os.path.join(base_dir,\"fuzzy-dedup-output-2023-06/anchor_docs_with_bk.parquet\")\n",
Expand Down Expand Up @@ -2512,14 +2498,14 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"id": "573dccf7-2e23-4aae-a3ec-2b9e1a42d97d",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from nemo_curator.modules.fuzzy_dedup import JaccardSimilarity\n",
"from nemo_curator import JaccardSimilarity\n",
"\n",
"id_field = 'id'\n",
"text_field = 'raw_content'\n",
Expand Down Expand Up @@ -2670,14 +2656,14 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"id": "f9aeb619-3fab-4a18-b582-bccae3eefd17",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from nemo_curator.modules.fuzzy_dedup import ConnectedComponents\n",
"from nemo_curator import ConnectedComponents\n",
"\n",
"cache_dir = expand_outdir_and_mkdir(\n",
" os.path.join(base_dir, \"fuzzy-dedup-output-2023-06/cc-cache\")\n",
Expand Down Expand Up @@ -3255,7 +3241,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"id": "f1461b61-887c-4099-bd9f-32e79dc5fdbb",
"metadata": {
"tags": []
Expand All @@ -3264,10 +3250,10 @@
"source": [
"from nemo_curator import MinHash\n",
"from nemo_curator import LSH\n",
"from nemo_curator.modules.fuzzy_dedup import _MapBuckets\n",
"from nemo_curator.modules.fuzzy_dedup import _Shuffle\n",
"from nemo_curator.modules.fuzzy_dedup import ConnectedComponents\n",
"from nemo_curator.modules.fuzzy_dedup import JaccardSimilarity\n",
"from nemo_curator import _MapBuckets\n",
"from nemo_curator import _Shuffle\n",
"from nemo_curator import ConnectedComponents\n",
"from nemo_curator import JaccardSimilarity\n",
"\n",
"from nemo_curator.utils.file_utils import reshard_jsonl\n",
"from nemo_curator.utils.fuzzy_dedup_utils.id_mapping import convert_str_id_to_int\n",
Expand Down Expand Up @@ -4718,14 +4704,13 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"id": "49273a8b-848f-4f24-a0ba-3c0b478d17cc",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import nemo_curator\n",
"from nemo_curator.utils.config_utils import build_filter_pipeline\n",
"\n",
"filter_config_file = os.path.join(base_dir, \"config/heuristic_filter_en.yaml\")\n",
Expand Down
22 changes: 10 additions & 12 deletions tutorials/single_node_tutorial/single_gpu_tutorial.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -122,23 +122,21 @@
},
"outputs": [],
"source": [
"import argparse\n",
"import os\n",
"\n",
"from nemo_curator.utils.distributed_utils import get_client,get_num_workers\n",
"from nemo_curator.utils.distributed_utils import get_client, get_num_workers\n",
"from nemo_curator.utils.file_utils import get_all_files_paths_under, separate_by_metadata\n",
"from nemo_curator.utils.distributed_utils import read_data,write_to_disk\n",
"from nemo_curator.utils.distributed_utils import read_data, write_to_disk\n",
"from nemo_curator.datasets import DocumentDataset\n",
"\n",
"import sys\n",
"import pandas as pd\n",
"import time\n",
"import cudf\n",
"import dask_cudf\n",
"import dask\n",
"import numpy as np\n",
"from dask.distributed import Client, LocalCluster\n",
"import jsonlines\n"
"import jsonlines"
]
},
{
Expand Down Expand Up @@ -406,7 +404,7 @@
},
"outputs": [],
"source": [
"from nemo_curator import ScoreFilter,Modify\n",
"from nemo_curator import ScoreFilter, Modify\n",
"from nemo_curator.filters import FastTextLangId\n",
"from nemo_curator.modifiers import UnicodeReformatter"
]
Expand Down Expand Up @@ -1360,7 +1358,7 @@
" get_bucket_ddf_from_parquet_path,\n",
" get_text_ddf_from_json_path_with_blocksize,\n",
")\n",
"from nemo_curator.modules.fuzzy_dedup import _MapBuckets,_Shuffle"
"from nemo_curator import _MapBuckets, _Shuffle"
]
},
{
Expand Down Expand Up @@ -1572,7 +1570,7 @@
},
"outputs": [],
"source": [
"from nemo_curator.modules.fuzzy_dedup import JaccardSimilarity"
"from nemo_curator import JaccardSimilarity"
]
},
{
Expand Down Expand Up @@ -1691,7 +1689,7 @@
},
"outputs": [],
"source": [
"from nemo_curator.modules.fuzzy_dedup import ConnectedComponents"
"from nemo_curator import ConnectedComponents"
]
},
{
Expand Down Expand Up @@ -2258,8 +2256,8 @@
"outputs": [],
"source": [
"from nemo_curator.utils.config_utils import build_filter_pipeline\n",
"from nemo_curator import Score, Filter, ScoreFilter\n",
"from nemo_curator.utils.file_utils import get_batched_files,expand_outdir_and_mkdir"
"from nemo_curator import Score, ScoreFilter\n",
"from nemo_curator.utils.file_utils import expand_outdir_and_mkdir"
]
},
{
Expand All @@ -2282,7 +2280,7 @@
"import warnings\n",
"\n",
"# Disable the metadata warning\n",
"warnings.filterwarnings(\"ignore\",module=\"dask.dataframe.core\")"
"warnings.filterwarnings(\"ignore\", module=\"dask.dataframe.core\")"
]
},
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

import dask_cudf

from nemo_curator import BucketsToEdges
from nemo_curator.datasets import DocumentDataset
from nemo_curator.modules.fuzzy_dedup import BucketsToEdges
from nemo_curator.utils.distributed_utils import get_client, get_num_workers

logging.basicConfig(format="%(asctime)s: %(message)s", level=logging.INFO)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import os
import time

from nemo_curator.modules.fuzzy_dedup import ConnectedComponents
from nemo_curator import ConnectedComponents
from nemo_curator.utils.distributed_utils import get_client, get_num_workers

logging.basicConfig(format="%(asctime)s: %(message)s", level=logging.INFO)
Expand Down

0 comments on commit 275a20a

Please sign in to comment.