Skip to content

Commit

Permalink
create separate fuzzy dedup files
Browse files Browse the repository at this point in the history
Signed-off-by: Sarah Yurick <sarahyurick@gmail.com>
  • Loading branch information
sarahyurick committed Nov 22, 2024
1 parent 9173db3 commit 6d0771a
Show file tree
Hide file tree
Showing 11 changed files with 1,941 additions and 1,733 deletions.
50 changes: 33 additions & 17 deletions nemo_curator/modules/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,18 +31,29 @@
from .task import TaskDecontamination

# GPU packages
LSH = gpu_only_import_from("nemo_curator.modules.fuzzy_dedup", "LSH")
MinHash = gpu_only_import_from("nemo_curator.modules.fuzzy_dedup", "MinHash")
FuzzyDuplicates = gpu_only_import_from(
"nemo_curator.modules.fuzzy_dedup", "FuzzyDuplicates"
MinHash = gpu_only_import_from("nemo_curator.modules.fuzzy_dedup.minhash", "MinHash")
LSH = gpu_only_import_from("nemo_curator.modules.fuzzy_dedup.lsh", "LSH")
_MapBuckets = gpu_only_import_from(
"nemo_curator.modules.fuzzy_dedup._mapbuckets", "_MapBuckets"
)
_Shuffle = gpu_only_import_from(
"nemo_curator.modules.fuzzy_dedup._shuffle", "_Shuffle"
)
JaccardSimilarity = gpu_only_import_from(
"nemo_curator.modules.fuzzy_dedup.jaccardsimilarity", "JaccardSimilarity"
)
BucketsToEdges = gpu_only_import_from(
"nemo_curator.modules.fuzzy_dedup", "BucketsToEdges"
"nemo_curator.modules.fuzzy_dedup.bucketstoedges", "BucketsToEdges"
)
ConnectedComponents = gpu_only_import_from(
"nemo_curator.modules.fuzzy_dedup.connectedcomponents", "ConnectedComponents"
)
FuzzyDuplicates = gpu_only_import_from(
"nemo_curator.modules.fuzzy_dedup.fuzzyduplicates", "FuzzyDuplicates"
)
# PyTorch-related imports must come after all imports that require cuGraph,
# because of context cleanup issues between PyTorch and cuGraph.
# See this issue: https://github.com/rapidsai/cugraph/issues/2718
SemDedup = gpu_only_import_from("nemo_curator.modules.semantic_dedup", "SemDedup")
EmbeddingCreator = gpu_only_import_from(
"nemo_curator.modules.semantic_dedup", "EmbeddingCreator"
)
Expand All @@ -52,26 +63,31 @@
SemanticClusterLevelDedup = gpu_only_import_from(
"nemo_curator.modules.semantic_dedup", "SemanticClusterLevelDedup"
)
SemDedup = gpu_only_import_from("nemo_curator.modules.semantic_dedup", "SemDedup")

__all__ = [
"AddId",
"FuzzyDuplicatesConfig",
"SemDedupConfig",
"blend_datasets",
"Shuffle",
"ExactDuplicates",
"Filter",
"FuzzyDuplicatesConfig",
"FuzzyDuplicates",
"BucketsToEdges",
"LSH",
"MinHash",
"Modify",
"Score",
"ScoreFilter",
"Sequential",
"Modify",
"TaskDecontamination",
"AddId",
"blend_datasets",
"Shuffle",
"SemDedup",
"SemDedupConfig",
"MinHash",
"LSH",
"_MapBuckets",
"_Shuffle",
"JaccardSimilarity",
"BucketsToEdges",
"ConnectedComponents",
"FuzzyDuplicates",
"EmbeddingCreator",
"ClusteringModel",
"SemanticClusterLevelDedup",
"SemDedup",
]
Loading

0 comments on commit 6d0771a

Please sign in to comment.