[Length-MAX] vocab_size=32000
[Length-MAX] longest_tokens(top 20):
  #01 len=218 id= 5098 tok='TheĠBoatĠRaceĠisĠaĠsideĠ@-...
  #02 len=134 id= 4519 tok='RobertĠShearmanĠandĠLarsĠP...
  #03 len=129 id= 4108 tok='OxfordĠ(ĠsometimesĠreferre...
  #04 len= 83 id= 3064 tok='IĠspecialĠagentsĠFoxĠMulde...
  #05 len= 81 id= 5832 tok='aceĠbetweenĠcrewsĠfromĠthe...
  #06 len= 81 id=13912 tok='ingĠtoĠBelieveĠ:ĠAĠCritica...
  #07 len= 77 id=19714 tok='sometimesĠreferredĠtoĠasĠt...
  #08 len= 72 id=24579 tok="ĠCanĠ'tĠBelieveĠItĠ'sĠaĠBi...
  #09 len= 66 id=26159 tok='ĠpercentĠofĠhouseholdsĠwat...
  #10 len= 63 id=24903 tok='ĠallĠ18-ĠtoĠ49Ġ@-@ĠyearĠ@-...
  #11 len= 58 id=  822 tok=':ĠAĠCriticalĠGuideĠtoĠTheĠ...
  #12 len= 57 id=17674 tok='placeĠonĠtheĠ4Ġ@.@Ġ2Ġ@-@Ġm...
  #13 len= 53 id= 2984 tok='InternationalĠUnionĠforĠCo...
  #14 len= 48 id= 1302 tok='AustralianĠRecordingĠIndus...
  #15 len= 48 id= 4445 tok='RecordingĠIndustryĠAssocia...
  #16 len= 45 id=18455 tok='ratingĠoutĠofĠ100ĠtoĠrevie...
  #17 len= 44 id= 1658 tok="CanĠ'tĠBelieveĠItĠ'sĠaĠBig...
  #18 len= 42 id=16388 tok='oatĠRaceĠisĠaĠsideĠ@-@ĠbyĠ...
  #19 len= 41 id=24607 tok='ĠCourseĠonĠtheĠRiverĠThame...
  #20 len= 40 id= 2983 tok='InternationalĠUnionĠforĠCo...

[BPE] vocab_size=32000
[BPE] longest_tokens(top 20):
  #01 len= 17 id=22946 tok='ĠIntercontinental'
  #02 len= 17 id=19840 tok='Ġautobiographical'
  #03 len= 17 id=21024 tok='Ġcharacterization'
  #04 len= 17 id=28098 tok='Ġdisqualification'
  #05 len= 17 id=25734 tok='Ġextraterrestrial'
  #06 len= 17 id=15227 tok='Ġresponsibilities'
  #07 len= 17 id=23429 tok='Ġunconstitutional'
  #08 len= 16 id=25484 tok='ĠCharacteristics'
  #09 len= 16 id=23399 tok='ĠGloucestershire'
  #10 len= 16 id=11190 tok='ĠRepresentatives'
  #11 len= 16 id=19476 tok='Ġaccomplishments'
  #12 len= 16 id= 7258 tok='Ġcharacteristics'
  #13 len= 16 id=23276 tok='Ġcinematographer'
  #14 len= 16 id=24037 tok='Ġclassifications'
  #15 len= 16 id=26563 tok='Ġdissatisfaction'
  #16 len= 16 id=21151 tok='Ġelectromagnetic'
  #17 len= 16 id=22722 tok='Ġexperimentation'
  #18 len= 16 id=13036 tok='Ġinstrumentation'
  #19 len= 16 id=17370 tok='Ġintensification'
  #20 len= 16 id=12616 tok='Ġinternationally'

[load] loading tokenizers ...
[load] done in 0.23s  len_rust_active=True


[done]
lines        = 1165029
chars        = 534799177
len_tokens   = 126280828  tpc=0.236128  chars/token=4.235
bpe_tokens   = 113327064  tpc=0.211906  chars/token=4.719
elapsed      = 91.2s
