LICENSE
README.md
pyproject.toml
setup.py
nemo_curator/__init__.py
nemo_curator/_compat.py
nemo_curator/log.py
nemo_curator/sample_dataframe.py
nemo_curator.egg-info/PKG-INFO
nemo_curator.egg-info/SOURCES.txt
nemo_curator.egg-info/dependency_links.txt
nemo_curator.egg-info/entry_points.txt
nemo_curator.egg-info/requires.txt
nemo_curator.egg-info/top_level.txt
nemo_curator/datasets/__init__.py
nemo_curator/datasets/doc_dataset.py
nemo_curator/distributed_data_classification/__init__.py
nemo_curator/distributed_data_classification/arg_utils.py
nemo_curator/distributed_data_classification/domain_classifier_inference.py
nemo_curator/distributed_data_classification/generate_statistics.py
nemo_curator/distributed_data_classification/pytorch_utils.py
nemo_curator/distributed_data_classification/quality_classifier_inference.py
nemo_curator/distributed_data_classification/quality_classifier_multiple_models_inference.py
nemo_curator/distributed_data_classification/verify_results.py
nemo_curator/download/__init__.py
nemo_curator/download/arxiv.py
nemo_curator/download/commoncrawl.py
nemo_curator/download/doc_builder.py
nemo_curator/download/wikipedia.py
nemo_curator/filters/__init__.py
nemo_curator/filters/classifier_filter.py
nemo_curator/filters/code.py
nemo_curator/filters/doc_filter.py
nemo_curator/filters/heuristic_filter.py
nemo_curator/modifiers/__init__.py
nemo_curator/modifiers/c4.py
nemo_curator/modifiers/doc_modifier.py
nemo_curator/modifiers/fasttext.py
nemo_curator/modifiers/pii_modifier.py
nemo_curator/modifiers/unicode_reformatter.py
nemo_curator/modules/__init__.py
nemo_curator/modules/add_id.py
nemo_curator/modules/config.py
nemo_curator/modules/dataset_ops.py
nemo_curator/modules/distributed_data_classifier.py
nemo_curator/modules/exact_dedup.py
nemo_curator/modules/filter.py
nemo_curator/modules/fuzzy_dedup.py
nemo_curator/modules/meta.py
nemo_curator/modules/modify.py
nemo_curator/modules/task.py
nemo_curator/pii/__init__.py
nemo_curator/pii/algorithm.py
nemo_curator/pii/constants.py
nemo_curator/pii/custom_batch_analyzer_engine.py
nemo_curator/pii/custom_nlp_engine.py
nemo_curator/pii/recognizers/__init__.py
nemo_curator/pii/recognizers/address_recognizer.py
nemo_curator/scripts/__init__.py
nemo_curator/scripts/add_id.py
nemo_curator/scripts/blend_datasets.py
nemo_curator/scripts/download_and_extract.py
nemo_curator/scripts/filter_documents.py
nemo_curator/scripts/find_exact_duplicates.py
nemo_curator/scripts/find_matching_ngrams.py
nemo_curator/scripts/find_pii_and_deidentify.py
nemo_curator/scripts/get_common_crawl_urls.py
nemo_curator/scripts/get_wikipedia_urls.py
nemo_curator/scripts/make_data_shards.py
nemo_curator/scripts/prepare_fasttext_training_data.py
nemo_curator/scripts/prepare_task_data.py
nemo_curator/scripts/remove_matching_ngrams.py
nemo_curator/scripts/separate_by_metadata.py
nemo_curator/scripts/text_cleaning.py
nemo_curator/scripts/train_fasttext.py
nemo_curator/scripts/fuzzy_deduplication/__init__.py
nemo_curator/scripts/fuzzy_deduplication/compute_minhashes.py
nemo_curator/scripts/fuzzy_deduplication/connected_components.py
nemo_curator/scripts/fuzzy_deduplication/jaccard_compute.py
nemo_curator/scripts/fuzzy_deduplication/jaccard_shuffle.py
nemo_curator/scripts/fuzzy_deduplication/map_buckets.py
nemo_curator/scripts/fuzzy_deduplication/minhash_lsh.py
nemo_curator/tasks/__init__.py
nemo_curator/tasks/downstream_task.py
nemo_curator/tasks/metrics.py
nemo_curator/utils/__init__.py
nemo_curator/utils/config_utils.py
nemo_curator/utils/constants.py
nemo_curator/utils/decorators.py
nemo_curator/utils/distributed_utils.py
nemo_curator/utils/download_utils.py
nemo_curator/utils/file_utils.py
nemo_curator/utils/gpu_utils.py
nemo_curator/utils/import_utils.py
nemo_curator/utils/module_utils.py
nemo_curator/utils/script_utils.py
nemo_curator/utils/text_utils.py
nemo_curator/utils/fuzzy_dedup_utils/__init__.py
nemo_curator/utils/fuzzy_dedup_utils/id_mapping.py
nemo_curator/utils/fuzzy_dedup_utils/io_utils.py
nemo_curator/utils/fuzzy_dedup_utils/merge_utils.py
nemo_curator/utils/fuzzy_dedup_utils/output_map_utils.py
nemo_curator/utils/fuzzy_dedup_utils/shuffle_utils.py
tests/__init__.py
tests/test_add_id.py
tests/test_blend_datasets.py
tests/test_config.py
tests/test_download.py
tests/test_exact_dedup.py
tests/test_filters.py
tests/test_fuzzy_dedup.py
tests/test_pii_accuracy.py
tests/test_shuffle.py
tests/test_task_decontamination.py
tests/test_unicode_reformatter.py