LICENSE
MANIFEST.in
README.md
pyproject.toml
nemo_curator/__init__.py
nemo_curator/config.py
nemo_curator/package_info.py
nemo_curator.egg-info/PKG-INFO
nemo_curator.egg-info/SOURCES.txt
nemo_curator.egg-info/dependency_links.txt
nemo_curator.egg-info/requires.txt
nemo_curator.egg-info/top_level.txt
nemo_curator/backends/__init__.py
nemo_curator/backends/base.py
nemo_curator/backends/utils.py
nemo_curator/backends/experimental/__init__.py
nemo_curator/backends/experimental/utils.py
nemo_curator/backends/experimental/ray_actor_pool/__init__.py
nemo_curator/backends/experimental/ray_actor_pool/adapter.py
nemo_curator/backends/experimental/ray_actor_pool/executor.py
nemo_curator/backends/experimental/ray_actor_pool/raft_adapter.py
nemo_curator/backends/experimental/ray_actor_pool/shuffle_adapter.py
nemo_curator/backends/experimental/ray_actor_pool/utils.py
nemo_curator/backends/experimental/ray_data/__init__.py
nemo_curator/backends/experimental/ray_data/adapter.py
nemo_curator/backends/experimental/ray_data/executor.py
nemo_curator/backends/experimental/ray_data/utils.py
nemo_curator/backends/internal/__init__.py
nemo_curator/backends/internal/raft/__init__.py
nemo_curator/backends/internal/raft/ray_comms.py
nemo_curator/backends/xenna/__init__.py
nemo_curator/backends/xenna/adapter.py
nemo_curator/backends/xenna/executor.py
nemo_curator/core/__init__.py
nemo_curator/core/client.py
nemo_curator/core/constants.py
nemo_curator/core/utils.py
nemo_curator/metrics/__init__.py
nemo_curator/metrics/constants.py
nemo_curator/metrics/start_prometheus_grafana.py
nemo_curator/metrics/utils.py
nemo_curator/models/__init__.py
nemo_curator/models/aesthetics.py
nemo_curator/models/base.py
nemo_curator/models/clip.py
nemo_curator/models/cosmos_embed1.py
nemo_curator/models/internvideo2_mm.py
nemo_curator/models/nsfw.py
nemo_curator/models/prompt_formatter.py
nemo_curator/models/qwen_lm.py
nemo_curator/models/qwen_vl.py
nemo_curator/models/transnetv2.py
nemo_curator/pipeline/__init__.py
nemo_curator/pipeline/pipeline.py
nemo_curator/stages/__init__.py
nemo_curator/stages/base.py
nemo_curator/stages/client_partitioning.py
nemo_curator/stages/file_partitioning.py
nemo_curator/stages/function_decorators.py
nemo_curator/stages/resources.py
nemo_curator/stages/audio/__init__.py
nemo_curator/stages/audio/common.py
nemo_curator/stages/audio/datasets/__init__.py
nemo_curator/stages/audio/datasets/file_utils.py
nemo_curator/stages/audio/datasets/fleurs/__init__.py
nemo_curator/stages/audio/datasets/fleurs/create_initial_manifest.py
nemo_curator/stages/audio/inference/__init__.py
nemo_curator/stages/audio/inference/asr_nemo.py
nemo_curator/stages/audio/io/__init__.py
nemo_curator/stages/audio/io/convert.py
nemo_curator/stages/audio/metrics/__init__.py
nemo_curator/stages/audio/metrics/get_wer.py
nemo_curator/stages/deduplication/__init__.py
nemo_curator/stages/deduplication/gpu_utils.py
nemo_curator/stages/deduplication/id_generator.py
nemo_curator/stages/deduplication/io_utils.py
nemo_curator/stages/deduplication/exact/__init__.py
nemo_curator/stages/deduplication/exact/identification.py
nemo_curator/stages/deduplication/exact/workflow.py
nemo_curator/stages/deduplication/fuzzy/__init__.py
nemo_curator/stages/deduplication/fuzzy/buckets_to_edges.py
nemo_curator/stages/deduplication/fuzzy/connected_components.py
nemo_curator/stages/deduplication/fuzzy/identify_duplicates.py
nemo_curator/stages/deduplication/fuzzy/minhash.py
nemo_curator/stages/deduplication/fuzzy/utils.py
nemo_curator/stages/deduplication/fuzzy/workflow.py
nemo_curator/stages/deduplication/fuzzy/lsh/__init__.py
nemo_curator/stages/deduplication/fuzzy/lsh/lsh.py
nemo_curator/stages/deduplication/fuzzy/lsh/stage.py
nemo_curator/stages/deduplication/semantic/__init__.py
nemo_curator/stages/deduplication/semantic/identify_duplicates.py
nemo_curator/stages/deduplication/semantic/kmeans.py
nemo_curator/stages/deduplication/semantic/pairwise.py
nemo_curator/stages/deduplication/semantic/pairwise_io.py
nemo_curator/stages/deduplication/semantic/ranking.py
nemo_curator/stages/deduplication/semantic/utils.py
nemo_curator/stages/deduplication/semantic/workflow.py
nemo_curator/stages/deduplication/shuffle_utils/__init__.py
nemo_curator/stages/deduplication/shuffle_utils/rapidsmpf_shuffler.py
nemo_curator/stages/deduplication/shuffle_utils/stage.py
nemo_curator/stages/image/__init__.py
nemo_curator/stages/image/deduplication/__init__.py
nemo_curator/stages/image/deduplication/removal.py
nemo_curator/stages/image/embedders/__init__.py
nemo_curator/stages/image/embedders/clip_embedder.py
nemo_curator/stages/image/filters/__init__.py
nemo_curator/stages/image/filters/aesthetic_filter.py
nemo_curator/stages/image/filters/base.py
nemo_curator/stages/image/filters/nsfw_filter.py
nemo_curator/stages/image/io/__init__.py
nemo_curator/stages/image/io/convert.py
nemo_curator/stages/image/io/image_reader.py
nemo_curator/stages/image/io/image_writer.py
nemo_curator/stages/text/__init__.py
nemo_curator/stages/text/classifiers/__init__.py
nemo_curator/stages/text/classifiers/aegis.py
nemo_curator/stages/text/classifiers/aegis_utils.py
nemo_curator/stages/text/classifiers/base.py
nemo_curator/stages/text/classifiers/constants.py
nemo_curator/stages/text/classifiers/content_type.py
nemo_curator/stages/text/classifiers/domain.py
nemo_curator/stages/text/classifiers/fineweb_edu.py
nemo_curator/stages/text/classifiers/prompt_task_complexity.py
nemo_curator/stages/text/classifiers/quality.py
nemo_curator/stages/text/deduplication/__init__.py
nemo_curator/stages/text/deduplication/removal.py
nemo_curator/stages/text/deduplication/removal_workflow.py
nemo_curator/stages/text/deduplication/semantic.py
nemo_curator/stages/text/download/__init__.py
nemo_curator/stages/text/download/utils.py
nemo_curator/stages/text/download/arxiv/__init__.py
nemo_curator/stages/text/download/arxiv/download.py
nemo_curator/stages/text/download/arxiv/extract.py
nemo_curator/stages/text/download/arxiv/iterator.py
nemo_curator/stages/text/download/arxiv/stage.py
nemo_curator/stages/text/download/arxiv/url_generation.py
nemo_curator/stages/text/download/base/__init__.py
nemo_curator/stages/text/download/base/download.py
nemo_curator/stages/text/download/base/extract.py
nemo_curator/stages/text/download/base/iterator.py
nemo_curator/stages/text/download/base/stage.py
nemo_curator/stages/text/download/base/url_generation.py
nemo_curator/stages/text/download/common_crawl/__init__.py
nemo_curator/stages/text/download/common_crawl/download.py
nemo_curator/stages/text/download/common_crawl/extract.py
nemo_curator/stages/text/download/common_crawl/stage.py
nemo_curator/stages/text/download/common_crawl/url_generation.py
nemo_curator/stages/text/download/common_crawl/warc_iterator.py
nemo_curator/stages/text/download/html_extractors/__init__.py
nemo_curator/stages/text/download/html_extractors/base.py
nemo_curator/stages/text/download/html_extractors/justext.py
nemo_curator/stages/text/download/html_extractors/resiliparse.py
nemo_curator/stages/text/download/html_extractors/trafilatura.py
nemo_curator/stages/text/download/html_extractors/utils/__init__.py
nemo_curator/stages/text/download/html_extractors/utils/ja_stopwords.py
nemo_curator/stages/text/download/html_extractors/utils/th_stopwords.py
nemo_curator/stages/text/download/html_extractors/utils/zh_stopwords.py
nemo_curator/stages/text/download/wikipedia/__init__.py
nemo_curator/stages/text/download/wikipedia/download.py
nemo_curator/stages/text/download/wikipedia/extract.py
nemo_curator/stages/text/download/wikipedia/iterator.py
nemo_curator/stages/text/download/wikipedia/stage.py
nemo_curator/stages/text/download/wikipedia/url_generation.py
nemo_curator/stages/text/embedders/__init__.py
nemo_curator/stages/text/embedders/base.py
nemo_curator/stages/text/embedders/utils.py
nemo_curator/stages/text/filters/__init__.py
nemo_curator/stages/text/filters/code.py
nemo_curator/stages/text/filters/doc_filter.py
nemo_curator/stages/text/filters/fasttext_filter.py
nemo_curator/stages/text/filters/heuristic_filter.py
nemo_curator/stages/text/io/__init__.py
nemo_curator/stages/text/io/reader/__init__.py
nemo_curator/stages/text/io/reader/base.py
nemo_curator/stages/text/io/reader/jsonl.py
nemo_curator/stages/text/io/reader/parquet.py
nemo_curator/stages/text/io/writer/__init__.py
nemo_curator/stages/text/io/writer/base.py
nemo_curator/stages/text/io/writer/jsonl.py
nemo_curator/stages/text/io/writer/parquet.py
nemo_curator/stages/text/io/writer/utils.py
nemo_curator/stages/text/models/__init__.py
nemo_curator/stages/text/models/model.py
nemo_curator/stages/text/models/tokenizer.py
nemo_curator/stages/text/models/utils.py
nemo_curator/stages/text/modifiers/__init__.py
nemo_curator/stages/text/modifiers/c4.py
nemo_curator/stages/text/modifiers/doc_modifier.py
nemo_curator/stages/text/modifiers/fasttext.py
nemo_curator/stages/text/modifiers/line_remover.py
nemo_curator/stages/text/modifiers/markdown_remover.py
nemo_curator/stages/text/modifiers/newline_normalizer.py
nemo_curator/stages/text/modifiers/quotation_remover.py
nemo_curator/stages/text/modifiers/slicer.py
nemo_curator/stages/text/modifiers/unicode_reformatter.py
nemo_curator/stages/text/modifiers/url_remover.py
nemo_curator/stages/text/modules/__init__.py
nemo_curator/stages/text/modules/add_id.py
nemo_curator/stages/text/modules/modifier.py
nemo_curator/stages/text/modules/score_filter.py
nemo_curator/stages/text/utils/__init__.py
nemo_curator/stages/text/utils/constants.py
nemo_curator/stages/text/utils/text_utils.py
nemo_curator/stages/video/__init__.py
nemo_curator/stages/video/caption/__init__.py
nemo_curator/stages/video/caption/caption_enhancement.py
nemo_curator/stages/video/caption/caption_generation.py
nemo_curator/stages/video/caption/caption_preparation.py
nemo_curator/stages/video/clipping/__init__.py
nemo_curator/stages/video/clipping/clip_extraction_stages.py
nemo_curator/stages/video/clipping/clip_frame_extraction.py
nemo_curator/stages/video/clipping/transnetv2_extraction.py
nemo_curator/stages/video/clipping/video_frame_extraction.py
nemo_curator/stages/video/embedding/__init__.py
nemo_curator/stages/video/embedding/cosmos_embed1.py
nemo_curator/stages/video/embedding/internvideo2.py
nemo_curator/stages/video/filtering/__init__.py
nemo_curator/stages/video/filtering/clip_aesthetic_filter.py
nemo_curator/stages/video/filtering/motion_filter.py
nemo_curator/stages/video/filtering/motion_vector_backend.py
nemo_curator/stages/video/io/__init__.py
nemo_curator/stages/video/io/clip_writer.py
nemo_curator/stages/video/io/video_reader.py
nemo_curator/stages/video/preview/__init__.py
nemo_curator/stages/video/preview/preview.py
nemo_curator/tasks/__init__.py
nemo_curator/tasks/audio_batch.py
nemo_curator/tasks/document.py
nemo_curator/tasks/file_group.py
nemo_curator/tasks/image.py
nemo_curator/tasks/tasks.py
nemo_curator/tasks/utils.py
nemo_curator/tasks/video.py
nemo_curator/utils/__init__.py
nemo_curator/utils/client_utils.py
nemo_curator/utils/column_utils.py
nemo_curator/utils/decoder_utils.py
nemo_curator/utils/file_utils.py
nemo_curator/utils/grouping.py
nemo_curator/utils/hf_download_utils.py
nemo_curator/utils/nvcodec_utils.py
nemo_curator/utils/operation_utils.py
nemo_curator/utils/performance_utils.py
nemo_curator/utils/storage_utils.py
nemo_curator/utils/windowing_utils.py
nemo_curator/utils/writer_utils.py
tests/test_cudf_placeholder.py