datatrove/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
datatrove/data.py,sha256=V4RctBl3AJoml2cvr-SwqgEjiDl_IItu4n8HOHDlSAo,1145
datatrove/io.py,sha256=Rpecy_Z6fdv54RsG2hH-KF0V-4osGu5CSx03EXjjHsw,13915
datatrove/assets/banned_subwords.txt,sha256=R3BEXNjSsWSV796IvE78OlIL3A8xN9FNhc09ibGUojI,1673
datatrove/assets/banned_words.txt,sha256=eq5RCczE2DVxZxaPo6kTMfovr9jrjMnDjMkUfCYQ-rA,1732
datatrove/assets/soft_banned_words.txt,sha256=5QQwMKzMozsONoppOZdXa-6kxYJ52mQb_0C04CKBsGM,674
datatrove/assets/url_filterblacklists.tar.gz,sha256=llndpkDhZ0JvXk3Aw4ChHsbg5dnpsbV13i1Jo0nVQqc,17678810
datatrove/executor/__init__.py,sha256=nBMsc2-ySpEAO-ArqC0ny6Rn6NhfpUMvub4Xe4PM1zA,82
datatrove/executor/base.py,sha256=n1SMBPahXMP_YTigmUaI5iyWrzQRR45oiXoF1CS3R2k,6323
datatrove/executor/local.py,sha256=hHresURoLkZjzM7CxPEr38ldzhdvQJyBzbW2MHe1HPQ,6360
datatrove/executor/slurm.py,sha256=pPfVaeoegto5AaZwDNlgLzLAGe8D9qvE1avAZO6xoqY,14179
datatrove/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
datatrove/pipeline/base.py,sha256=z_25SkWyYMQ1fV6LZgNGCE8GmgAMWR7jmqwkKyHRBRc,5481
datatrove/pipeline/dedup/__init__.py,sha256=_PF3nVW0EYa1x4o6S79XsSqHBrYj0EL8QESDuNq1Gow,493
datatrove/pipeline/dedup/bloom_filter.py,sha256=-JcMyl19O5zkrViGsRBI_nk5SnBKY0zNtfUao7HVx8c,7372
datatrove/pipeline/dedup/exact_substrings.py,sha256=VxwK8Qyd8SmVhKd25CX2jbMiwXJVhka6ZqDzXbnunJA,13938
datatrove/pipeline/dedup/minhash.py,sha256=7USwxvz0T86NTHk7N2NYhYreK2DtR1e3pE4oap88He4,25545
datatrove/pipeline/dedup/sentence_dedup.py,sha256=6Q-5fIE39v2_uQUAU8e89v9WSp_HXed5pZUhZQwEoA0,20598
datatrove/pipeline/dedup/url_dedup.py,sha256=tbgYeqwvj9JclpaChuln_Ui2nBSVOfYTJoz_s2kBKFk,13735
datatrove/pipeline/extractors/__init__.py,sha256=sHzyoqkwven6X4XdKoWzvmHXq8BUbS8Kl-t1-kvrs8k,80
datatrove/pipeline/extractors/base.py,sha256=bFjE2shdf69td36HqqrOO9V-yWaWUZOuY33hbkB741Y,2251
datatrove/pipeline/extractors/modular.py,sha256=OzDkwS3vDQeTeNYcRImezh025qcelhcN8cYqE7u-z90,2049
datatrove/pipeline/extractors/trafilatura.py,sha256=1q0LVmEdx7p8q7SH9taOduz1-xLW1wilqsSDfHMG4-c,1754
datatrove/pipeline/filters/__init__.py,sha256=3QuGrZYLbjVjxqPwMZ0XBsiUIGL1put-cbcgGYeCR-8,543
datatrove/pipeline/filters/base_filter.py,sha256=IGGduv67HqFjl_Ky6dFbkpWF7TI7kK2z1M3AIf_A8lM,2255
datatrove/pipeline/filters/c4_quality_filter.py,sha256=YslreQaiLtM-UJhoCftHhAUoES8ecbf8kguZlIula-g,7301
datatrove/pipeline/filters/fasttext_filter.py,sha256=uzpPwvaFBfCWL0YjJWzynZnSBj3tARqRmmjgZ0jGkMM,5183
datatrove/pipeline/filters/fineweb_quality_filter.py,sha256=lojWggFEUaeg-wOjjSINXqAiFyLoMw6Ppf242Ma2mkU,1959
datatrove/pipeline/filters/gopher_quality_filter.py,sha256=IX1R3I7ZQAiqzweeyeKeocBO8VSf0eTkma0PtVgDZTg,5057
datatrove/pipeline/filters/gopher_repetition_filter.py,sha256=vymEtY860AK3SKFYMH7E_dyRDcG-KFo_dZ5ie6HQZlY,4794
datatrove/pipeline/filters/lambda_filter.py,sha256=TcEIMbpHB0pRecvoUDQ-2MnFwfpL8udGPgqZsbiIGs0,827
datatrove/pipeline/filters/language_filter.py,sha256=Mte3Mf03-YIO72aODm6LsW3rYR5zUMYoikZ3AZyyi3Q,2125
datatrove/pipeline/filters/regex_filter.py,sha256=0heUgDmmP98qXuxocMNCZy63WLsl6F_DD2bAMgtOp6A,737
datatrove/pipeline/filters/sampler_filter.py,sha256=QhDn-6_L_kVALYIDMWSNmSR0A2NkS3rrS3iu1sIf2OI,741
datatrove/pipeline/filters/unigram_log_probs.py,sha256=89WWE3R2WFoMT77lOZeeVjbDM4OmELORFLY3nzyZ0q4,2679
datatrove/pipeline/filters/url_filter.py,sha256=y3XvmLuIWCMMo1nOQTlMOmBC4TtQu7x5Uyl2WcejYQw,4495
datatrove/pipeline/formatters/__init__.py,sha256=9AexxNkQhphgOuMftuPpfLVxQCnxeg8mC4OjvloRd3g,117
datatrove/pipeline/formatters/base.py,sha256=F3-AojKl3ZA2nTr_IoXIqfW_mTRcBFRfwWw2odjlgdc,665
datatrove/pipeline/formatters/ftfy.py,sha256=Ln3CJYfbu6WrR83FxYw1CccWf1McuTaoB_x-US38mJw,228
datatrove/pipeline/formatters/pii.py,sha256=9AD-fGEy8O1YVMSTd7lTHR8IJ3c_u-zsxMmNDZeTP2A,3476
datatrove/pipeline/formatters/symbol_lines_remover.py,sha256=htmuLW3dTwLy4XNXfV5OCsbNgk7gkMHcN299_6aF7Cg,1237
datatrove/pipeline/readers/__init__.py,sha256=bTCTLGdjs7YduMD5sY6q7_WM1iy43q5FL5h7mt64j5E,199
datatrove/pipeline/readers/base.py,sha256=Fi1meygR1q7PIrqNo4dzG0m9hqRa_XCLAlWc_YIllUY,9521
datatrove/pipeline/readers/csv.py,sha256=ZTiVfiPeKbAPXK7_aZtvM69t5XFnrk0XzYbwyVvWrP8,2627
datatrove/pipeline/readers/huggingface.py,sha256=PFPiSLQhjcZSlkeWX8PSyP7IfU-X-CtIRa24TB_nFHI,3478
datatrove/pipeline/readers/ipc.py,sha256=CEpE0ydOIu6LwnBqfXSBvQ1ATuD_mhBX-2uGdiF1Apg,3332
datatrove/pipeline/readers/jsonl.py,sha256=zDau_bWk3sYbLQ5NM3o2w0bZRE1K8ABlkKbwKFxqimA,2982
datatrove/pipeline/readers/parquet.py,sha256=IJzfFOhozCmg6UJbU7Xp_LEgV55ei2CNF6P9TrokzzU,3069
datatrove/pipeline/readers/warc.py,sha256=NXC_iLFtbDZudSQy_PPqApTbiUZSsU0KGTe-b6pMsQE,4653
datatrove/pipeline/stats/__init__.py,sha256=dovnxyJw-AHoVr0zQAFx6Dwv1qV0b_3S0ORT6tX2Kbg,60
datatrove/pipeline/stats/doc_len.py,sha256=LW8V8LBRzAmmkvPFBg5wkELkKk3mZSLlrJT-_-tJvpk,554
datatrove/pipeline/stats/urls.py,sha256=nFcxW-p1wOzRq7kker6Kv8P8HChqh61hGoHabVb1aaU,3726
datatrove/pipeline/tokens/__init__.py,sha256=Av-j6fsikkwOpmo3PcawQxarEri_5JPXktTp2IrN4lo,198
datatrove/pipeline/tokens/context_shuffler.py,sha256=p5nteEa29SsjKNv2ToTd4uaco9mDK5qjdHmHos3mb7Y,3031
datatrove/pipeline/tokens/counter.py,sha256=rgnBsArgAxDIR7AFntFV_yGXkz4ziBjjDGEKJCxQj_A,3489
datatrove/pipeline/tokens/merger.py,sha256=WEJHYawA3eT-LRCco_frMCIRs_qLdKhPXxoFjPx8Nnc,8799
datatrove/pipeline/tokens/tokenizer.py,sha256=a_gGDpop1myRx-WQz12Ds0TDdt0aCYYzVfi3zLB5heg,19251
datatrove/pipeline/writers/__init__.py,sha256=4UPe-fekukvJ2cYpFACUlYd2Y0TTL-AS0Np6PcxJLq0,132
datatrove/pipeline/writers/disk_base.py,sha256=BoKrTewePan-2XhrKADoBwxuR10rV0kmOc_ErkP74Kc,6963
datatrove/pipeline/writers/huggingface.py,sha256=Zw31CLBTOFk28KkcUMtiEIK4isxKaobF1KWbJmpHReQ,5607
datatrove/pipeline/writers/jsonl.py,sha256=bzzan_M8nGcm-Dk1P_NXFipIO8BVrCX32SvzuDu9ZQE,1239
datatrove/pipeline/writers/parquet.py,sha256=O6Ov61R_BKFkY0qeMiip1V-zaE60UjBRbRmQfYZqb1g,2614
datatrove/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
datatrove/tools/check_dataset.py,sha256=vNys4pl0EkrWyUQNGrXh1_yuHB_-bBW0klrsC3sWZOQ,3632
datatrove/tools/failed_logs.py,sha256=N_ygW6aoDG-DKOBS6_8UgRKXj7GzNcOZiGLELak1etI,2622
datatrove/tools/inspect_data.py,sha256=mf4T0spOJGehci39PWDMVpakuqIda6Pxd9_u_L9LUEA,5741
datatrove/tools/jobs_status.py,sha256=YfAKwxUaSMTftMn0LQ4ivzejkn5_W9dd7b5eOjDoOIU,3160
datatrove/tools/launch_pickled_pipeline.py,sha256=HKHZ7cSEOmPA6KTldiFO20twAxY-5czGopi5PonMYX0,512
datatrove/tools/merge_stats.py,sha256=qyuS33IeiBdJFWXG__6tP6HM0T8LWIgxmS3uhsCRrNc,1158
datatrove/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
datatrove/utils/_import_utils.py,sha256=7GESCgAz5LmbxFrSQCJWjd7Tx0PLSed_-StaLwwDFO4,786
datatrove/utils/binaryio.py,sha256=Qasnh3KsdKy5PUco2gVc3Gio5BGfuiU39KZ73HdON1E,3049
datatrove/utils/logging.py,sha256=ZuwDm31uNvLSgVUmzHCMY4J34agzTknHO5BKQyZAfVM,1576
datatrove/utils/stats.py,sha256=OeH_GRujlqubrJfBNU0dFu9dWiA0Lg6ar30mbfNi_dQ,14029
datatrove/utils/text.py,sha256=HNEJI7vMhTZ_O992rWYLBQQmmMPF3zcsQXWlqcySGXw,4138
datatrove/utils/tokenization.py,sha256=uIjBpmUemiiNbuyi8l6vQbUogC83KiD_WgkaQzDI20E,1473
datatrove/utils/typeshelper.py,sha256=vtJLesxQm53cI2F9y7v-YMnZQOChWkJF2sVqZQuW-8Y,873
datatrove-0.2.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
datatrove-0.2.0.dist-info/METADATA,sha256=_nypfWMGxNLpMBi9m3w0OZgKgj5jLHnu9rSdXU0f1Eg,22966
datatrove-0.2.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
datatrove-0.2.0.dist-info/entry_points.txt,sha256=LMMMud15s4s6jucjsr57crbMppkvg6xRtN6unsgp2MM,330
datatrove-0.2.0.dist-info/top_level.txt,sha256=EA6CAg36D1YzT-oXcFMx0ImOvfCRDHZasZHwxcjFXSQ,10
datatrove-0.2.0.dist-info/RECORD,,
