LICENSE
README.md
pyproject.toml
src/nanotron/__init__.py
src/nanotron/constants.py
src/nanotron/dataloader.py
src/nanotron/distributed.py
src/nanotron/helpers.py
src/nanotron/logging.py
src/nanotron/random.py
src/nanotron/sanity_checks.py
src/nanotron/trainer.py
src/nanotron/utils.py
src/nanotron.egg-info/PKG-INFO
src/nanotron.egg-info/SOURCES.txt
src/nanotron.egg-info/dependency_links.txt
src/nanotron.egg-info/requires.txt
src/nanotron.egg-info/top_level.txt
src/nanotron/config/__init__.py
src/nanotron/config/config.py
src/nanotron/config/lighteval_config.py
src/nanotron/config/models_config.py
src/nanotron/config/parallelism_config.py
src/nanotron/config/utils_config.py
src/nanotron/fp8/__init__.py
src/nanotron/fp8/constants.py
src/nanotron/fp8/dtypes.py
src/nanotron/fp8/kernel.py
src/nanotron/fp8/linear.py
src/nanotron/fp8/meta.py
src/nanotron/fp8/parameter.py
src/nanotron/fp8/tensor.py
src/nanotron/fp8/utils.py
src/nanotron/generation/__init__.py
src/nanotron/generation/decode.py
src/nanotron/generation/generate_store.py
src/nanotron/generation/sampler.py
src/nanotron/models/__init__.py
src/nanotron/models/base.py
src/nanotron/models/llama.py
src/nanotron/models/starcoder2.py
src/nanotron/nn/__init__.py
src/nanotron/nn/activations.py
src/nanotron/nn/layer_norm.py
src/nanotron/optim/__init__.py
src/nanotron/optim/base.py
src/nanotron/optim/clip_grads.py
src/nanotron/optim/gradient_accumulator.py
src/nanotron/optim/inherit_from_other_optimizer.py
src/nanotron/optim/named_optimizer.py
src/nanotron/optim/optimizer_from_gradient_accumulator.py
src/nanotron/optim/zero.py
src/nanotron/parallel/__init__.py
src/nanotron/parallel/context.py
src/nanotron/parallel/parameters.py
src/nanotron/parallel/sharded_parameters.py
src/nanotron/parallel/tied_parameters.py
src/nanotron/parallel/utils.py
src/nanotron/parallel/data_parallel/utils.py
src/nanotron/parallel/pipeline_parallel/block.py
src/nanotron/parallel/pipeline_parallel/context_manager.py
src/nanotron/parallel/pipeline_parallel/engine.py
src/nanotron/parallel/pipeline_parallel/functional.py
src/nanotron/parallel/pipeline_parallel/p2p.py
src/nanotron/parallel/pipeline_parallel/state.py
src/nanotron/parallel/pipeline_parallel/tensor_pointer.py
src/nanotron/parallel/pipeline_parallel/utils.py
src/nanotron/parallel/tensor_parallel/__init__.py
src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py
src/nanotron/parallel/tensor_parallel/enum.py
src/nanotron/parallel/tensor_parallel/functional.py
src/nanotron/parallel/tensor_parallel/nn.py
src/nanotron/serialize/__init__.py
src/nanotron/serialize/main.py
src/nanotron/serialize/metadata.py
src/nanotron/serialize/optimizer.py
src/nanotron/serialize/random.py
src/nanotron/serialize/utils.py
src/nanotron/serialize/weights.py
tests/test_checkpointing.py
tests/test_clip_grads.py
tests/test_data_parallel.py
tests/test_distributed.py
tests/test_p2p.py
tests/test_parameter.py
tests/test_parameters_accumulate_gradient_in_fp32.py
tests/test_pipeline_parallel.py
tests/test_random_state.py
tests/test_serialize.py
tests/test_tensor_parallel.py
tests/test_tie_weights.py
tests/test_zero.py