# Base requirements: the entries in this leading, unnamed section are not
# tied to any extra.
# NOTE(review): the layout (bare specifiers followed by a "[name]" section)
# looks like a setuptools-generated requires.txt. If so, change the project's
# packaging configuration instead of this file; comments added here would be
# lost when the metadata is regenerated. TODO confirm.
#
# Lower bound only: any release at or above the stated version is accepted.
dask[complete]>=2021.7.1
distributed>=2021.7.1
dask-mpi>=2021.11.0
charset_normalizer>=3.1.0
awscli>=1.22.55
# Exact pins: only the stated version satisfies these.
fasttext==0.9.2
pycld2==0.41
justext==3.0.0
ftfy==6.1.1
warcio==1.7.4
zstandard==0.18.0
in-place==0.5.0
unidic-lite==1.0.8
jieba==0.42.1
# No version constraint: the installer is free to choose any release.
# NOTE(review): consider adding bounds for reproducible installs.
comment_parser
beautifulsoup4
# Exact pin.
mwparserfromhell==0.6.5
# Bounded range (>=3.6.0, <4.0.0). The same range is repeated for
# spacy[cuda12x] in the cuda12x section; keep the two in sync.
spacy<4.0.0,>=3.6.0
# The two presidio packages are pinned to the same version.
presidio-analyzer==2.2.351
presidio-anonymizer==2.2.351
# Exact pin.
usaddress==0.5.10
# Lower bound only, with the package's "nlp" extra requested.
nemo_toolkit[nlp]>=1.23.0
# No version constraint, with the package's "html_clean" extra requested.
lxml[html_clean]

# Extra "cuda12x": the requirements listed under this header apply only when
# the cuda12x extra is requested, in addition to the base requirements.
[cuda12x]
# These four packages are all pinned to the 24.4 release series ("24.4.*"
# matches any 24.4.x version). Update them together when moving series.
cudf-cu12==24.4.*
dask-cudf-cu12==24.4.*
cugraph-cu12==24.4.*
dask-cuda==24.4.*
# Same version range as the base spacy requirement, but with spacy's own
# "cuda12x" extra requested; keep the range in sync with the base entry.
spacy[cuda12x]<4.0.0,>=3.6.0
