LICENSE
MANIFEST.in
README.md
pyproject.toml
examples/async_llm_pii_redaction.py
examples/blend_and_shuffle.py
examples/classifier_filtering.py
examples/download_arxiv.py
examples/download_common_crawl.py
examples/download_wikipedia.py
examples/exact_deduplication.py
examples/find_pii_and_deidentify.py
examples/fuzzy_deduplication.py
examples/identify_languages.py
examples/llm_pii_redaction.py
examples/raw_download_common_crawl.py
examples/semdedup_example.py
examples/task_decontamination.py
examples/classifiers/aegis_example.py
examples/classifiers/content_type_example.py
examples/classifiers/domain_example.py
examples/classifiers/fineweb_edu_example.py
examples/classifiers/fineweb_mixtral_edu_example.py
examples/classifiers/fineweb_nemotron_edu_example.py
examples/classifiers/instruction_data_guard_example.py
examples/classifiers/multilingual_domain_example.py
examples/classifiers/prompt_task_complexity_example.py
examples/classifiers/quality_example.py
examples/k8s/create_dask_cluster.py
examples/nemo_run/launch_slurm.py
nemo_curator/__init__.py
nemo_curator/_compat.py
nemo_curator/log.py
nemo_curator/package_info.py
nemo_curator.egg-info/PKG-INFO
nemo_curator.egg-info/SOURCES.txt
nemo_curator.egg-info/dependency_links.txt
nemo_curator.egg-info/entry_points.txt
nemo_curator.egg-info/requires.txt
nemo_curator.egg-info/top_level.txt
nemo_curator/classifiers/__init__.py
nemo_curator/classifiers/aegis.py
nemo_curator/classifiers/base.py
nemo_curator/classifiers/content_type.py
nemo_curator/classifiers/domain.py
nemo_curator/classifiers/fineweb_edu.py
nemo_curator/classifiers/prompt_task_complexity.py
nemo_curator/classifiers/quality.py
nemo_curator/datasets/__init__.py
nemo_curator/datasets/doc_dataset.py
nemo_curator/datasets/image_text_pair_dataset.py
nemo_curator/datasets/parallel_dataset.py
nemo_curator/download/__init__.py
nemo_curator/download/arxiv.py
nemo_curator/download/commoncrawl.py
nemo_curator/download/doc_builder.py
nemo_curator/download/ja_stopwords.py
nemo_curator/download/th_stopwords.py
nemo_curator/download/wikipedia.py
nemo_curator/download/zh_stopwords.py
nemo_curator/filters/__init__.py
nemo_curator/filters/bitext_filter.py
nemo_curator/filters/classifier_filter.py
nemo_curator/filters/code.py
nemo_curator/filters/doc_filter.py
nemo_curator/filters/heuristic_filter.py
nemo_curator/filters/synthetic.py
nemo_curator/filters/models/__init__.py
nemo_curator/filters/models/qe_models.py
nemo_curator/image/__init__.py
nemo_curator/image/classifiers/__init__.py
nemo_curator/image/classifiers/aesthetic.py
nemo_curator/image/classifiers/base.py
nemo_curator/image/classifiers/nsfw.py
nemo_curator/image/embedders/__init__.py
nemo_curator/image/embedders/base.py
nemo_curator/image/embedders/timm.py
nemo_curator/modifiers/__init__.py
nemo_curator/modifiers/async_llm_pii_modifier.py
nemo_curator/modifiers/c4.py
nemo_curator/modifiers/doc_modifier.py
nemo_curator/modifiers/fasttext.py
nemo_curator/modifiers/line_remover.py
nemo_curator/modifiers/llm_pii_modifier.py
nemo_curator/modifiers/markdown_remover.py
nemo_curator/modifiers/newline_normalizer.py
nemo_curator/modifiers/pii_modifier.py
nemo_curator/modifiers/quotation_remover.py
nemo_curator/modifiers/slicer.py
nemo_curator/modifiers/unicode_reformatter.py
nemo_curator/modifiers/url_remover.py
nemo_curator/modules/__init__.py
nemo_curator/modules/add_id.py
nemo_curator/modules/base.py
nemo_curator/modules/config.py
nemo_curator/modules/dataset_ops.py
nemo_curator/modules/exact_dedup.py
nemo_curator/modules/filter.py
nemo_curator/modules/joiner.py
nemo_curator/modules/meta.py
nemo_curator/modules/modify.py
nemo_curator/modules/splitter.py
nemo_curator/modules/task.py
nemo_curator/modules/to_backend.py
nemo_curator/modules/fuzzy_dedup/_mapbuckets.py
nemo_curator/modules/fuzzy_dedup/_shuffle.py
nemo_curator/modules/fuzzy_dedup/bucketstoedges.py
nemo_curator/modules/fuzzy_dedup/connectedcomponents.py
nemo_curator/modules/fuzzy_dedup/fuzzyduplicates.py
nemo_curator/modules/fuzzy_dedup/jaccardsimilarity.py
nemo_curator/modules/fuzzy_dedup/lsh.py
nemo_curator/modules/fuzzy_dedup/minhash.py
nemo_curator/modules/semantic_dedup/clusteringmodel.py
nemo_curator/modules/semantic_dedup/embeddings.py
nemo_curator/modules/semantic_dedup/semanticclusterleveldedup.py
nemo_curator/modules/semantic_dedup/semdedup.py
nemo_curator/nemo_run/__init__.py
nemo_curator/nemo_run/slurm.py
nemo_curator/pii/__init__.py
nemo_curator/pii/algorithm.py
nemo_curator/pii/constants.py
nemo_curator/pii/custom_batch_analyzer_engine.py
nemo_curator/pii/custom_nlp_engine.py
nemo_curator/pii/recognizers/__init__.py
nemo_curator/pii/recognizers/address_recognizer.py
nemo_curator/scripts/__init__.py
nemo_curator/scripts/add_id.py
nemo_curator/scripts/async_llm_pii_redaction.py
nemo_curator/scripts/blend_datasets.py
nemo_curator/scripts/download_and_extract.py
nemo_curator/scripts/filter_documents.py
nemo_curator/scripts/find_exact_duplicates.py
nemo_curator/scripts/find_matching_ngrams.py
nemo_curator/scripts/find_pii_and_deidentify.py
nemo_curator/scripts/get_common_crawl_urls.py
nemo_curator/scripts/get_wikipedia_urls.py
nemo_curator/scripts/llm_pii_redaction.py
nemo_curator/scripts/make_data_shards.py
nemo_curator/scripts/prepare_fasttext_training_data.py
nemo_curator/scripts/prepare_task_data.py
nemo_curator/scripts/remove_matching_ngrams.py
nemo_curator/scripts/separate_by_metadata.py
nemo_curator/scripts/text_cleaning.py
nemo_curator/scripts/train_fasttext.py
nemo_curator/scripts/verify_classification_results.py
nemo_curator/scripts/classifiers/__init__.py
nemo_curator/scripts/classifiers/aegis_classifier_inference.py
nemo_curator/scripts/classifiers/content_type_classifier_inference.py
nemo_curator/scripts/classifiers/domain_classifier_inference.py
nemo_curator/scripts/classifiers/fineweb_edu_classifier_inference.py
nemo_curator/scripts/classifiers/fineweb_mixtral_edu_classifier_inference.py
nemo_curator/scripts/classifiers/fineweb_nemotron_edu_classifier_inference.py
nemo_curator/scripts/classifiers/instruction_data_guard_classifier_inference.py
nemo_curator/scripts/classifiers/multilingual_domain_classifier_inference.py
nemo_curator/scripts/classifiers/prompt_task_complexity_classifier_inference.py
nemo_curator/scripts/classifiers/quality_classifier_inference.py
nemo_curator/scripts/fuzzy_deduplication/__init__.py
nemo_curator/scripts/fuzzy_deduplication/buckets_to_edges.py
nemo_curator/scripts/fuzzy_deduplication/compute_minhashes.py
nemo_curator/scripts/fuzzy_deduplication/connected_components.py
nemo_curator/scripts/fuzzy_deduplication/jaccard_compute.py
nemo_curator/scripts/fuzzy_deduplication/jaccard_shuffle.py
nemo_curator/scripts/fuzzy_deduplication/map_buckets.py
nemo_curator/scripts/fuzzy_deduplication/minhash_lsh.py
nemo_curator/scripts/semdedup/__init__.py
nemo_curator/scripts/semdedup/clustering.py
nemo_curator/scripts/semdedup/compute_embeddings.py
nemo_curator/scripts/semdedup/extract_dedup_data.py
nemo_curator/services/__init__.py
nemo_curator/services/conversation_formatter.py
nemo_curator/services/model_client.py
nemo_curator/services/nemo_client.py
nemo_curator/services/openai_client.py
nemo_curator/synthetic/__init__.py
nemo_curator/synthetic/async_nemotron.py
nemo_curator/synthetic/async_nemotron_cc.py
nemo_curator/synthetic/error.py
nemo_curator/synthetic/generator.py
nemo_curator/synthetic/mixtral.py
nemo_curator/synthetic/nemotron.py
nemo_curator/synthetic/nemotron_cc.py
nemo_curator/synthetic/no_format.py
nemo_curator/synthetic/prompts.py
nemo_curator/tasks/__init__.py
nemo_curator/tasks/downstream_task.py
nemo_curator/tasks/metrics.py
nemo_curator/utils/__init__.py
nemo_curator/utils/aegis_utils.py
nemo_curator/utils/config_utils.py
nemo_curator/utils/constants.py
nemo_curator/utils/decorators.py
nemo_curator/utils/distributed_utils.py
nemo_curator/utils/download_utils.py
nemo_curator/utils/duplicates_removal.py
nemo_curator/utils/file_utils.py
nemo_curator/utils/gpu_utils.py
nemo_curator/utils/import_utils.py
nemo_curator/utils/llm_pii_utils.py
nemo_curator/utils/module_utils.py
nemo_curator/utils/script_utils.py
nemo_curator/utils/semdedup_utils.py
nemo_curator/utils/text_utils.py
nemo_curator/utils/fuzzy_dedup_utils/__init__.py
nemo_curator/utils/fuzzy_dedup_utils/id_mapping.py
nemo_curator/utils/fuzzy_dedup_utils/io_utils.py
nemo_curator/utils/fuzzy_dedup_utils/merge_utils.py
nemo_curator/utils/fuzzy_dedup_utils/output_map_utils.py
nemo_curator/utils/fuzzy_dedup_utils/shuffle_utils.py
nemo_curator/utils/image/__init__.py
nemo_curator/utils/image/transforms.py
tests/test_add_id.py
tests/test_argument_helper.py
tests/test_backends.py
tests/test_blend_datasets.py
tests/test_classifiers.py
tests/test_cleaning.py
tests/test_config.py
tests/test_dataset.py
tests/test_download.py
tests/test_duplicates_removal.py
tests/test_exact_dedup.py
tests/test_filters.py
tests/test_fuzzy_dedup.py
tests/test_io.py
tests/test_joiner.py
tests/test_llm_pii.py
tests/test_metrics.py
tests/test_pii_accuracy.py
tests/test_read_data.py
tests/test_read_simple_bitext.py
tests/test_sdg_pipeline_filters.py
tests/test_semdedup.py
tests/test_separate_by_metadata.py
tests/test_shuffle.py
tests/test_slurm.py
tests/test_splitter.py
tests/test_splitter_joiner.py
tests/test_task_decontamination.py
tutorials/bitext_cleaning/docbuilder.py
tutorials/bitext_cleaning/main.py
tutorials/dapt-curation/code/docbuilder.py
tutorials/dapt-curation/code/downloaders.py
tutorials/dapt-curation/code/main.py
tutorials/dapt-curation/code/utils.py
tutorials/image-curation/helper.py
tutorials/nemo-retriever-synthetic-data-generation/__init__.py
tutorials/nemo-retriever-synthetic-data-generation/main.py
tutorials/nemo-retriever-synthetic-data-generation/mine_hard_negatives.py
tutorials/nemo-retriever-synthetic-data-generation/repartition.py
tutorials/nemo-retriever-synthetic-data-generation/retriever_evalset_generator.py
tutorials/nemo-retriever-synthetic-data-generation/retriever_hardnegative_miner.py
tutorials/nemo-retriever-synthetic-data-generation/config/__init__.py
tutorials/nemo-retriever-synthetic-data-generation/config/config.py
tutorials/peft-curation/docbuilder.py
tutorials/peft-curation/filters.py
tutorials/peft-curation/main.py
tutorials/peft-curation/modifiers.py
tutorials/peft-curation-with-sdg/docbuilder.py
tutorials/peft-curation-with-sdg/filters.py
tutorials/peft-curation-with-sdg/main.py
tutorials/peft-curation-with-sdg/modifiers.py
tutorials/peft-curation-with-sdg/synthetic_gen.py
tutorials/pretraining-data-curation/helper.py
tutorials/synthetic-retrieval-evaluation/DeDup.py
tutorials/synthetic-retrieval-evaluation/Endpoints.py
tutorials/synthetic-retrieval-evaluation/Generator.py
tutorials/synthetic-retrieval-evaluation/prompts.py
tutorials/tinystories/docbuilder.py
tutorials/tinystories/filters.py
tutorials/tinystories/helpers.py
tutorials/tinystories/main.py
tutorials/tinystories/modifiers.py
tutorials/zyda2-tutorial/0_processing/helper.py
tutorials/zyda2-tutorial/0_processing/process_dclm.py
tutorials/zyda2-tutorial/0_processing/process_dolma_cc.py
tutorials/zyda2-tutorial/0_processing/process_fwe2.py
tutorials/zyda2-tutorial/0_processing/process_zyda.py
tutorials/zyda2-tutorial/1_fuzzy_dedup/0_minhash.py
tutorials/zyda2-tutorial/1_fuzzy_dedup/1_lsh.py
tutorials/zyda2-tutorial/1_fuzzy_dedup/2_buckets_to_edges.py
tutorials/zyda2-tutorial/1_fuzzy_dedup/3_connected_components.py
tutorials/zyda2-tutorial/2_dupes_removal/0_id_mapping.py
tutorials/zyda2-tutorial/2_dupes_removal/1_id_conversion.py
tutorials/zyda2-tutorial/2_dupes_removal/2_compute_counts.py
tutorials/zyda2-tutorial/2_dupes_removal/3_prep_dupes.py
tutorials/zyda2-tutorial/2_dupes_removal/4_get_dupes_dclm.py
tutorials/zyda2-tutorial/2_dupes_removal/4_get_dupes_dolma-cc.py
tutorials/zyda2-tutorial/2_dupes_removal/5_get_dupes_zyda.py
tutorials/zyda2-tutorial/2_dupes_removal/remove_dupes.py
tutorials/zyda2-tutorial/3_quality_model/run_quality_classifier.py
tutorials/zyda2-tutorial/4_filtering/filter_fwe.py
tutorials/zyda2-tutorial/4_filtering/filter_quality.py