[Length-MAX] vocab_size=32000
[Length-MAX] longest_tokens(top 20):
  #01 len= 47 id=  544 tok=',ĠwhichĠassignsĠaĠnormaliz...
  #02 len= 47 id= 1113 tok='1930ĠrenumberingĠofĠstateĠ...
  #03 len= 47 id= 4881 tok='InternationalĠUnionĠforĠCo...
  #04 len= 46 id=  577 tok='-ĠtoĠ49Ġ@-@ĠyearĠ@-@ĠoldsĠ...
  #05 len= 46 id= 1661 tok='6Ġ@.@Ġ8ĠkmĠ)ĠChampionshipĠ...
  #06 len= 46 id= 9056 tok='airedĠonĠtheĠFoxĠnetworkĠi...
  #07 len= 45 id=16516 tok='ieveĠ:ĠAĠCriticalĠGuideĠto...
  #08 len= 43 id= 2733 tok='BoatĠRaceĠisĠaĠsideĠ@-@Ġby...
  #09 len= 43 id=26459 tok='wentĠintoĠtheĠraceĠasĠreig...
  #10 len= 41 id=26782 tok='workĠonĠcasesĠlinkedĠtoĠth...
  #11 len= 40 id= 3405 tok='CreditsĠadaptedĠfromĠtheĠl...
  #12 len= 40 id=23160 tok='sheetĠmusicĠpublishedĠatĠM...
  #13 len= 39 id= 7926 tok='TheĠshowĠcentersĠonĠFBIĠsp...
  #14 len= 39 id=20767 tok='percentĠofĠallĠtelevisionĠ...
  #15 len= 38 id= 6899 tok='RecordingĠIndustryĠAssocia...
  #16 len= 38 id=14085 tok='erĠofĠtheĠOrderĠofĠtheĠBri...
  #17 len= 37 id= 7373 tok='ShearmanĠandĠLarsĠPearsonĠ...
  #18 len= 36 id= 2839 tok='BritishĠPhonographicĠIndus...
  #19 len= 36 id= 5089 tok='JointĠTyphoonĠWarningĠCent...
  #20 len= 36 id= 7454 tok='SiteĠofĠSpecialĠScientific...

[BPE] vocab_size=32000
[BPE] longest_tokens(top 20):
  #01 len= 53 id=30908 tok='byĠtheĠRecordingĠIndustryĠ...
  #02 len= 41 id=20607 tok='byĠtheĠRecordingĠIndustryĠ...
  #03 len= 34 id=23001 tok='NationalĠRegisterĠofĠHisto...
  #04 len= 34 id=18002 tok='RecordingĠIndustryĠAssocia...
  #05 len= 31 id=30092 tok='=Ġ=Ġ=ĠCriticalĠreceptionĠ=...
  #06 len= 31 id=22890 tok='=Ġ=ĠMeteorologicalĠhistory...
  #07 len= 31 id=24258 tok='WorldĠHeavyweightĠChampion...
  #08 len= 30 id=29151 tok='=Ġ=Ġ=ĠCriticalĠresponseĠ=Ġ...
  #09 len= 30 id=21973 tok='=Ġ=ĠCreditsĠandĠpersonnelĠ...
  #10 len= 29 id=25191 tok='DepartmentĠofĠTransportati...
  #11 len= 28 id=28453 tok='=Ġ=ĠCulturalĠreferencesĠ=Ġ=Ċ'
  #12 len= 28 id=22033 tok='=Ġ=ĠMajorĠintersectionsĠ=Ġ=Ċ'
  #13 len= 28 id=24942 tok='receivedĠmixedĠreviewsĠfromĠ'
  #14 len= 27 id=18372 tok='=Ġ=ĠCriticalĠreceptionĠ=Ġ=Ċ'
  #15 len= 27 id=29239 tok='intoĠaĠtropicalĠdepressionĠ'
  #16 len= 26 id=29255 tok='=Ġ=ĠChartĠperformanceĠ=Ġ=Ċ'
  #17 len= 26 id=21958 tok='=Ġ=ĠCreditsĠandĠpersonnelĠ'
  #18 len= 26 id=29940 tok='=Ġ=ĠLiveĠperformancesĠ=Ġ=Ċ'
  #19 len= 26 id=17992 tok='=Ġ=ĠRouteĠdescriptionĠ=Ġ=Ċ'
  #20 len= 26 id=29626 tok='NationalĠHurricaneĠCenterĠ'

[load] loading tokenizers ...
[load] done in 0.23s  len_rust_active=True


[done]
lines        = 1165029
chars        = 534799177
len_tokens   = 92579527  tpc=0.173111  chars/token=5.777
bpe_tokens   = 92226825  tpc=0.172451  chars/token=5.799
elapsed      = 61.8s
