.gitignore
.pre-commit-config.yaml
.pypirc
LICENSE
README.md
pyproject.toml
requirements.txt
setup.py
.github/workflows/ci.yml
.github/workflows/publish_pypi.yml
.vscode/launch.json
.vscode/settings.json
src/lm_datasets/__init__.py
src/lm_datasets/chunkify_datasets.py
src/lm_datasets/collect_metrics.py
src/lm_datasets/compose_dataset.py
src/lm_datasets/convert_parquet_to_jsonl.py
src/lm_datasets/extract_text.py
src/lm_datasets/hf_tokenize_parquet_dataset.py
src/lm_datasets/print_stats.py
src/lm_datasets/shuffle_datasets.py
src/lm_datasets/train_sp_tokenizer.py
src/lm_datasets.egg-info/PKG-INFO
src/lm_datasets.egg-info/SOURCES.txt
src/lm_datasets.egg-info/dependency_links.txt
src/lm_datasets.egg-info/entry_points.txt
src/lm_datasets.egg-info/requires.txt
src/lm_datasets.egg-info/top_level.txt
src/lm_datasets/commands/__init__.py
src/lm_datasets/commands/chunkify_command.py
src/lm_datasets/commands/collect_metrics_command.py
src/lm_datasets/commands/compose_command.py
src/lm_datasets/commands/convert_parquet_to_jsonl_command.py
src/lm_datasets/commands/extract_text_command.py
src/lm_datasets/commands/hf_upload_command.py
src/lm_datasets/commands/lm_datasets_cli.py
src/lm_datasets/commands/print_stats_command.py
src/lm_datasets/commands/shuffle_command.py
src/lm_datasets/commands/train_tokenizer_command.py
src/lm_datasets/datasets/__init__.py
src/lm_datasets/datasets/base.py
src/lm_datasets/datasets/dataset_registry.py
src/lm_datasets/datasets/hf_dataset.py
src/lm_datasets/datasets/jsonl_dataset.py
src/lm_datasets/datasets/parquet_dataset.py
src/lm_datasets/datasets/bg/__init__.py
src/lm_datasets/datasets/bg/bgnc_admin_eur.py
src/lm_datasets/datasets/bg/bgnc_news_corpus.py
src/lm_datasets/datasets/bg/bulgarian_news.py
src/lm_datasets/datasets/bg/bulnc.py
src/lm_datasets/datasets/code/__init__.py
src/lm_datasets/datasets/code/starcoder.py
src/lm_datasets/datasets/cs/__init__.py
src/lm_datasets/datasets/cs/cs_en_parallel.py
src/lm_datasets/datasets/cs/syn_v9.py
src/lm_datasets/datasets/da/__init__.py
src/lm_datasets/datasets/da/danewsroom.py
src/lm_datasets/datasets/da/danish_gigaword.py
src/lm_datasets/datasets/da/danish_parliament_corpus.py
src/lm_datasets/datasets/da/dk_clarin.py
src/lm_datasets/datasets/de/__init__.py
src/lm_datasets/datasets/de/dewac.py
src/lm_datasets/datasets/de/openlegaldata.py
src/lm_datasets/datasets/el/__init__.py
src/lm_datasets/datasets/el/greek_legal_code.py
src/lm_datasets/datasets/el/greek_web_corpus.py
src/lm_datasets/datasets/en/__init__.py
src/lm_datasets/datasets/en/dialogstudio.py
src/lm_datasets/datasets/en/math_amps.py
src/lm_datasets/datasets/en/pes2o.py
src/lm_datasets/datasets/en/pile_of_law.py
src/lm_datasets/datasets/en/proof_pile.py
src/lm_datasets/datasets/en/wikihow.py
src/lm_datasets/datasets/es/__init__.py
src/lm_datasets/datasets/es/escorpius.py
src/lm_datasets/datasets/es/spanish_legal.py
src/lm_datasets/datasets/et/__init__.py
src/lm_datasets/datasets/et/ekspress.py
src/lm_datasets/datasets/et/enc.py
src/lm_datasets/datasets/et/estonian_reference_corpus.py
src/lm_datasets/datasets/eu/__init__.py
src/lm_datasets/datasets/eu/euscrawl.py
src/lm_datasets/datasets/fi/__init__.py
src/lm_datasets/datasets/fi/ylenews.py
src/lm_datasets/datasets/fr/__init__.py
src/lm_datasets/datasets/fr/cabernet.py
src/lm_datasets/datasets/ga/__init__.py
src/lm_datasets/datasets/ga/ga_bilingual_legistation.py
src/lm_datasets/datasets/ga/ga_universal_dependencies.py
src/lm_datasets/datasets/hr/__init__.py
src/lm_datasets/datasets/hr/croatian_news_engri.py
src/lm_datasets/datasets/hr/hrwac.py
src/lm_datasets/datasets/hr/styria_news.py
src/lm_datasets/datasets/it/__init__.py
src/lm_datasets/datasets/it/itwac.py
src/lm_datasets/datasets/lt/seimas_lt_en.py
src/lm_datasets/datasets/lv/__init__.py
src/lm_datasets/datasets/lv/state_related_latvian_web.py
src/lm_datasets/datasets/mt/__init__.py
src/lm_datasets/datasets/mt/korpus_malti.py
src/lm_datasets/datasets/multilingual/__init__.py
src/lm_datasets/datasets/multilingual/colossal_oscar.py
src/lm_datasets/datasets/multilingual/curlicat.py
src/lm_datasets/datasets/multilingual/eurlex.py
src/lm_datasets/datasets/multilingual/legal_mc4.py
src/lm_datasets/datasets/multilingual/macocu.py
src/lm_datasets/datasets/multilingual/redpajama.py
src/lm_datasets/datasets/multilingual/wikimedia.py
src/lm_datasets/datasets/nl/__init__.py
src/lm_datasets/datasets/nl/sonar.py
src/lm_datasets/datasets/nl/sonar_new_media.py
src/lm_datasets/datasets/no/__init__.py
src/lm_datasets/datasets/no/maalfrid_2021.py
src/lm_datasets/datasets/no/nak.py
src/lm_datasets/datasets/no/nbdigital.py
src/lm_datasets/datasets/no/norwegian_cc.py
src/lm_datasets/datasets/no/parlamint.py
src/lm_datasets/datasets/no/parliamentary_proceedings.py
src/lm_datasets/datasets/no/sakspapir_nno.py
src/lm_datasets/datasets/pl/luna_pl.py
src/lm_datasets/datasets/pl/pl_nkjp.py
src/lm_datasets/datasets/pl/pl_parliamentary_corpus.py
src/lm_datasets/datasets/pt/brwac.py
src/lm_datasets/datasets/pt/parlamento_pt.py
src/lm_datasets/datasets/ro/__init__.py
src/lm_datasets/datasets/ro/marcell_legislative_subcorpus_v2.py
src/lm_datasets/datasets/sk/__init__.py
src/lm_datasets/datasets/sk/sk_court_decisions.py
src/lm_datasets/datasets/sk/sk_laws.py
src/lm_datasets/datasets/sl/__init__.py
src/lm_datasets/datasets/sl/academic_slovene_kas.py
src/lm_datasets/datasets/sl/cc_gigafida.py
src/lm_datasets/datasets/sl/slwac_web.py
src/lm_datasets/datasets/sr/__init__.py
src/lm_datasets/datasets/sr/srpkor.py
src/lm_datasets/datasets/sv/__init__.py
src/lm_datasets/datasets/sv/sv_gigaword.py
src/lm_datasets/datasets/uk/__init__.py
src/lm_datasets/datasets/uk/uk_laws.py
src/lm_datasets/io/__init__.py
src/lm_datasets/io/conllu_file.py
src/lm_datasets/io/parquet.py
src/lm_datasets/io/prevert_file.py
src/lm_datasets/utils/__init__.py
src/lm_datasets/utils/config.py
src/lm_datasets/utils/dataframe.py
src/lm_datasets/utils/dataset_generator.py
src/lm_datasets/utils/languages.py
src/lm_datasets/utils/settings.py
src/lm_datasets/utils/shuffle_big_file.py
src/lm_datasets/utils/systems.py
src/lm_datasets/utils/wikimedia.py
src/lm_datasets/viewer/app.py
src/lm_datasets/viewer/ngrok-app.py
src/lm_datasets/viewer/viewer_utils.py
tests/__init__.py
tests/conftest.py
tests/dummy_datasets.py
tests/test_compose_dataset_benchmark.py
tests/test_config.py
tests/test_generate_texts_from_output.py
tests/test_interleave_datasets.py
tests/test_iterate_over_shuffled_datasets.py
tests/test_read_parquet.py
tests/test_split_dataset.py
tests/test_write_parquet.py
tests/test_write_parquet_chunks.py