-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconfig_2060.yaml
More file actions
27 lines (27 loc) · 1.08 KB
/
config_2060.yaml
File metadata and controls
27 lines (27 loc) · 1.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# lightning.pytorch==2.5.5
#
# Reduced config for RTX 2060 (6GB VRAM):
# > uv run python -m tiny_recursive_model.main fit --config config_base.yaml --config config_2060.yaml
#
# Fast fail mode for testing:
# > --trainer.fast_dev_run=true
#
# Memory usage on RTX 2060 with batch_size=192, accumulate_grad_batches=4:
# - n=2, activation_checkpointing=false: ~4.9 GB, ~1.0 it/s
# - n=3-6, activation_checkpointing=true: ~5.1 GB, ~0.75-0.63 it/s (OOMs without checkpointing)
#
# NOTE(review): leading indentation was lost in extraction; nesting below is
# reconstructed per Lightning CLI conventions (trainer/data/model sections,
# top-level lr_scheduler) — verify against config_base.yaml.
compile: false  # No benefit with this setup
trainer:
  accumulate_grad_batches: 4  # Process 4×192=768 samples before optimizer step
  logger: null  # use default logger (TensorBoard)
data:
  batch_size: 192  # Smaller batches for 6GB VRAM
model:
  n_layers: 2
  T: 2  # Paper uses T=3, which doesn't increase memory but slows training
  n: 2  # Paper uses n=6, see above re memory usage
  N_supervision: 1  # Paper uses 16, which doesn't increase memory but slows training
  activation_checkpointing: false  # Set to true for n=3-6 (saves memory but slows training)
lr_scheduler:  # dummy, no warmup
  init_args:
    start_factor: 1.0