LICENSE
README.md
pyproject.toml
setup.py
nemo_curator/__init__.py
nemo_curator/_compat.py
nemo_curator/log.py
nemo_curator/sample_dataframe.py
nemo_curator.egg-info/PKG-INFO
nemo_curator.egg-info/SOURCES.txt
nemo_curator.egg-info/dependency_links.txt
nemo_curator.egg-info/entry_points.txt
nemo_curator.egg-info/requires.txt
nemo_curator.egg-info/top_level.txt
nemo_curator/classifiers/__init__.py
nemo_curator/classifiers/aegis.py
nemo_curator/classifiers/base.py
nemo_curator/classifiers/domain.py
nemo_curator/classifiers/fineweb_edu.py
nemo_curator/classifiers/quality.py
nemo_curator/datasets/__init__.py
nemo_curator/datasets/doc_dataset.py
nemo_curator/datasets/image_text_pair_dataset.py
nemo_curator/download/__init__.py
nemo_curator/download/arxiv.py
nemo_curator/download/commoncrawl.py
nemo_curator/download/doc_builder.py
nemo_curator/download/wikipedia.py
nemo_curator/filters/__init__.py
nemo_curator/filters/classifier_filter.py
nemo_curator/filters/code.py
nemo_curator/filters/doc_filter.py
nemo_curator/filters/heuristic_filter.py
nemo_curator/image/__init__.py
nemo_curator/image/classifiers/__init__.py
nemo_curator/image/classifiers/aesthetic.py
nemo_curator/image/classifiers/base.py
nemo_curator/image/classifiers/nsfw.py
nemo_curator/image/embedders/__init__.py
nemo_curator/image/embedders/base.py
nemo_curator/image/embedders/timm.py
nemo_curator/modifiers/__init__.py
nemo_curator/modifiers/c4.py
nemo_curator/modifiers/doc_modifier.py
nemo_curator/modifiers/fasttext.py
nemo_curator/modifiers/pii_modifier.py
nemo_curator/modifiers/unicode_reformatter.py
nemo_curator/modules/__init__.py
nemo_curator/modules/add_id.py
nemo_curator/modules/config.py
nemo_curator/modules/dataset_ops.py
nemo_curator/modules/exact_dedup.py
nemo_curator/modules/filter.py
nemo_curator/modules/fuzzy_dedup.py
nemo_curator/modules/meta.py
nemo_curator/modules/modify.py
nemo_curator/modules/semantic_dedup.py
nemo_curator/modules/task.py
nemo_curator/nemo_run/__init__.py
nemo_curator/nemo_run/slurm.py
nemo_curator/pii/__init__.py
nemo_curator/pii/algorithm.py
nemo_curator/pii/constants.py
nemo_curator/pii/custom_batch_analyzer_engine.py
nemo_curator/pii/custom_nlp_engine.py
nemo_curator/pii/recognizers/__init__.py
nemo_curator/pii/recognizers/address_recognizer.py
nemo_curator/scripts/__init__.py
nemo_curator/scripts/add_id.py
nemo_curator/scripts/blend_datasets.py
nemo_curator/scripts/download_and_extract.py
nemo_curator/scripts/filter_documents.py
nemo_curator/scripts/find_exact_duplicates.py
nemo_curator/scripts/find_matching_ngrams.py
nemo_curator/scripts/find_pii_and_deidentify.py
nemo_curator/scripts/get_common_crawl_urls.py
nemo_curator/scripts/get_wikipedia_urls.py
nemo_curator/scripts/make_data_shards.py
nemo_curator/scripts/prepare_fasttext_training_data.py
nemo_curator/scripts/prepare_task_data.py
nemo_curator/scripts/remove_matching_ngrams.py
nemo_curator/scripts/separate_by_metadata.py
nemo_curator/scripts/text_cleaning.py
nemo_curator/scripts/train_fasttext.py
nemo_curator/scripts/verify_classification_results.py
nemo_curator/scripts/classifiers/__init__.py
nemo_curator/scripts/classifiers/aegis_classifier_inference.py
nemo_curator/scripts/classifiers/domain_classifier_inference.py
nemo_curator/scripts/classifiers/fineweb_edu_classifier_inference.py
nemo_curator/scripts/classifiers/quality_classifier_inference.py
nemo_curator/scripts/fuzzy_deduplication/__init__.py
nemo_curator/scripts/fuzzy_deduplication/buckets_to_edges.py
nemo_curator/scripts/fuzzy_deduplication/compute_minhashes.py
nemo_curator/scripts/fuzzy_deduplication/connected_components.py
nemo_curator/scripts/fuzzy_deduplication/jaccard_compute.py
nemo_curator/scripts/fuzzy_deduplication/jaccard_shuffle.py
nemo_curator/scripts/fuzzy_deduplication/map_buckets.py
nemo_curator/scripts/fuzzy_deduplication/minhash_lsh.py
nemo_curator/scripts/semdedup/__init__.py
nemo_curator/scripts/semdedup/clustering.py
nemo_curator/scripts/semdedup/compute_embeddings.py
nemo_curator/scripts/semdedup/extract_dedup_data.py
nemo_curator/services/__init__.py
nemo_curator/services/conversation_formatter.py
nemo_curator/services/model_client.py
nemo_curator/services/nemo_client.py
nemo_curator/services/openai_client.py
nemo_curator/synthetic/__init__.py
nemo_curator/synthetic/async_nemotron.py
nemo_curator/synthetic/error.py
nemo_curator/synthetic/mixtral.py
nemo_curator/synthetic/nemotron.py
nemo_curator/synthetic/no_format.py
nemo_curator/synthetic/prompts.py
nemo_curator/tasks/__init__.py
nemo_curator/tasks/downstream_task.py
nemo_curator/tasks/metrics.py
nemo_curator/utils/__init__.py
nemo_curator/utils/aegis_utils.py
nemo_curator/utils/config_utils.py
nemo_curator/utils/constants.py
nemo_curator/utils/cudf_utils.py
nemo_curator/utils/decorators.py
nemo_curator/utils/distributed_utils.py
nemo_curator/utils/download_utils.py
nemo_curator/utils/file_utils.py
nemo_curator/utils/gpu_utils.py
nemo_curator/utils/import_utils.py
nemo_curator/utils/module_utils.py
nemo_curator/utils/script_utils.py
nemo_curator/utils/semdedup_utils.py
nemo_curator/utils/text_utils.py
nemo_curator/utils/fuzzy_dedup_utils/__init__.py
nemo_curator/utils/fuzzy_dedup_utils/id_mapping.py
nemo_curator/utils/fuzzy_dedup_utils/io_utils.py
nemo_curator/utils/fuzzy_dedup_utils/merge_utils.py
nemo_curator/utils/fuzzy_dedup_utils/output_map_utils.py
nemo_curator/utils/fuzzy_dedup_utils/shuffle_utils.py
nemo_curator/utils/image/__init__.py
nemo_curator/utils/image/transforms.py
tests/__init__.py
tests/test_add_id.py
tests/test_blend_datasets.py
tests/test_config.py
tests/test_dataset.py
tests/test_download.py
tests/test_exact_dedup.py
tests/test_filters.py
tests/test_fuzzy_dedup.py
tests/test_io.py
tests/test_pii_accuracy.py
tests/test_semdedup.py
tests/test_seperate_by_metadata.py
tests/test_shuffle.py
tests/test_task_decontamination.py
tests/test_unicode_reformatter.py