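fix: types (switch type checking from mypy to ty; move ruff settings under [tool.ruff.lint]; adjust annotations and checkpoint save/restore calls to satisfy the checker)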

walln · walln · commit 3b246498a102 · 2025-06-16T12:30:08.000-04:00
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
@@ -27,7 +27,7 @@ jobs:
               run: uvx ruff format --check
 
             - name: Typecheck
-              run: uvx mypy src --enable-incomplete-feature=NewGenericSyntax
+              run: uvx ty check
 
             - name: Test
               run: uv run pytest tests
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -21,7 +21,7 @@ repos:
         stages: [commit-msg]
       - id: typecheck
         name: typecheck
-        entry: uvx mypy src/scratch/
+        entry: uvx ty check
         language: system
         types: [python]
         pass_filenames: false
diff --git a/pyproject.toml b/pyproject.toml
@@ -42,6 +42,8 @@ dev = [
     "ipykernel>=6.29.5",
     "jupyter>=1.1.1",
     "ruff>=0.8.4",
+    "ty>=0.0.1a10",
+
 ]
 
 [tool.uv]
@@ -62,6 +64,8 @@ packages = ["src/scratch"]
 target-version = "py312"
 include = ["src/**", "tests/**"]
 line-length = 88
+
+[tool.ruff.lint]
 select = [
     "E",      # pycodestyle
     "W",      # pycodestyle
@@ -79,10 +83,10 @@ select = [
 ]
 ignore = ["F722"]
 
-[tool.ruff.pydocstyle]
+[tool.ruff.lint.pydocstyle]
 convention = "google"
 
-[tool.ruff.isort]
+[tool.ruff.lint.isort]
 known-first-party = ["src"]
 
 [tool.pytest.ini_options]
diff --git a/src/scratch/datasets/utils.py b/src/scratch/datasets/utils.py
@@ -4,7 +4,8 @@
 import warnings
 from dataclasses import dataclass, field
 
-from transformers import AutoTokenizer, PreTrainedTokenizerBase
+from transformers import AutoTokenizer
+from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 
 
 def patch_datasets_warning():
@@ -32,16 +33,6 @@ def filter_specific_warning(warning):
                 frame = frame.f_back
         return False
 
-    # Register the custom filter
-    warnings.filterwarnings("ignore", category=UserWarning, module=r".*")
-    warnings.showwarning = (
-        lambda message, category, filename, lineno, file=None, line=None: None
-        if filter_specific_warning(
-            warnings.WarningMessage(message, category, filename, lineno)
-        )
-        else warnings.showwarning(message, category, filename, lineno)
-    )
-
 
 @dataclass
 class TokenizerMetadata:
@@ -62,7 +53,7 @@ class TokenizerMetadata:
     def from_tokenizer(cls, tokenizer: PreTrainedTokenizerBase, max_length: int):
         """Create metadata from a tokenizer instance."""
         vocab_size = tokenizer.vocab_size  # type: ignore
-        if not vocab_size:
+        if not vocab_size or not isinstance(vocab_size, int):
             raise ValueError("The tokenizer does not have a vocab size.")
         return cls(
             vocab_size=vocab_size,
diff --git a/src/scratch/deep_learning/layers/attention/rope.py b/src/scratch/deep_learning/layers/attention/rope.py
@@ -92,6 +92,10 @@ def precompute_theta_pos_freqs(dim: int, end: int, theta: float = 10000.0):
     """
     freqs = 1.0 / (theta ** (jnp.arange(0, dim, 2)[: (dim // 2)] / dim))
     t = jnp.arange(end, dtype=jnp.float32)
+
+    assert isinstance(t, jnp.ndarray)
+    assert isinstance(freqs, jnp.ndarray)
+
     freqs = jnp.outer(t, freqs)
     freqs_cis = jnp.exp(1j * freqs)  # Using Euler's formula to create complex numbers
     return freqs_cis
diff --git a/src/scratch/image_classification/cnn.py b/src/scratch/image_classification/cnn.py
@@ -112,5 +112,7 @@ def __call__(self, x):
     # And comment out the following line
     logger = None
 
-    trainer = ImageClassificationParallelTrainer(model, trainer_config, logger=logger)
+    trainer = ImageClassificationParallelTrainer[CNN](
+        model, trainer_config, logger=logger
+    )
     trainer.train_and_evaluate(dataset.train, dataset.test)
diff --git a/src/scratch/image_classification/resnet.py b/src/scratch/image_classification/resnet.py
@@ -321,5 +321,5 @@ def __call__(self, x):
     trainer_config = ImageClassificationParallelTrainerConfig(
         batch_size=batch_size, learning_rate=0.01, epochs=3
     )
-    trainer = ImageClassificationParallelTrainer(model, trainer_config)
+    trainer = ImageClassificationParallelTrainer[ResNet](model, trainer_config)
     trainer.train_and_evaluate(dataset.train, dataset.test)
diff --git a/src/scratch/image_classification/swin_transformer.py b/src/scratch/image_classification/swin_transformer.py
@@ -741,5 +741,5 @@ def __call__(self, x: jnp.ndarray, train=True):
     trainer_config = ImageClassificationParallelTrainerConfig(
         batch_size=batch_size, epochs=5
     )
-    trainer = ImageClassificationParallelTrainer(model, trainer_config)
+    trainer = ImageClassificationParallelTrainer[SwinTransformer](model, trainer_config)
     trainer.train_and_evaluate(dataset.train, dataset.test)
diff --git a/src/scratch/image_classification/trainer.py b/src/scratch/image_classification/trainer.py
@@ -5,9 +5,8 @@
 devices.
 """
 
-from collections.abc import Callable
 from dataclasses import dataclass
-from typing import TypeVar
+from typing import Protocol, TypeVar
 
 import jax
 import jax.numpy as jnp
@@ -26,6 +25,14 @@
 M = TypeVar("M", bound=nnx.Module)
 
 
+class CallableModule(Protocol):
+    """Protocol for callable modules."""
+
+    def __call__(self, *args, **kwargs) -> jnp.ndarray:
+        """Call the module."""
+        ...  # pragma: no cover
+
+
 @dataclass
 class ImageClassificationParallelTrainerConfig(SupervisedTrainerConfig):
     """Configuration for the ImageClassificationParallelTrainer."""
@@ -83,8 +90,8 @@ def train(
         def train_step(
             model: M, train_state: TrainState, inputs: jnp.ndarray, targets: jnp.ndarray
         ):
-            def loss_fn(model: Callable):
-                logits = model(inputs)
+            def loss_fn(model: nnx.Module):
+                logits = model(inputs)  # type: ignore
                 assert logits.shape == targets.shape
                 loss = optax.softmax_cross_entropy(logits=logits, labels=targets).mean()
                 return loss, logits
diff --git a/src/scratch/image_classification/vision_transformer.py b/src/scratch/image_classification/vision_transformer.py
@@ -214,5 +214,7 @@ def img_to_patch(x: jnp.ndarray, patch_size: int):
     trainer_config = ImageClassificationParallelTrainerConfig(
         batch_size=batch_size, epochs=5
     )
-    trainer = ImageClassificationParallelTrainer(model, trainer_config)
+    trainer = ImageClassificationParallelTrainer[VisionTransformer](
+        model, trainer_config
+    )
     trainer.train_and_evaluate(dataset.train, dataset.test)
diff --git a/src/scratch/language_modeling/bert/question_answering.py b/src/scratch/language_modeling/bert/question_answering.py
@@ -64,6 +64,8 @@ def __call__(
     model = BertForQuestionAnswering(config, rngs=nnx.Rngs(0))
 
     trainer_config = QuestionAnsweringTrainerConfig(batch_size=2)
-    trainer = QuestionAnsweringTrainer(model, trainer_config=trainer_config)
+    trainer = QuestionAnsweringTrainer[BertForQuestionAnswering](
+        model, trainer_config=trainer_config
+    )
 
     trainer.train_and_evaluate(dataset.train, dataset.test)
diff --git a/src/scratch/language_modeling/bert/sequence_classification.py b/src/scratch/language_modeling/bert/sequence_classification.py
@@ -67,6 +67,8 @@ def __call__(
     trainer_config = SequenceClassificationTrainerConfig(
         batch_size=2, num_labels=dataset.metadata.num_classes
     )
-    trainer = SequenceClassificationTrainer(model, trainer_config=trainer_config)
+    trainer = SequenceClassificationTrainer[BertForSequenceClassification](
+        model, trainer_config=trainer_config
+    )
 
     trainer.train_and_evaluate(dataset.train, dataset.test)
diff --git a/src/scratch/language_modeling/bert/token_classification.py b/src/scratch/language_modeling/bert/token_classification.py
@@ -67,6 +67,8 @@ def __call__(
     trainer_config = TokenClassificationTrainerConfig(
         batch_size=dataset.batch_size, num_labels=dataset.metadata.num_labels
     )
-    trainer = TokenClassificationTrainer(model, trainer_config=trainer_config)
+    trainer = TokenClassificationTrainer[BertForTokenClassification](
+        model, trainer_config=trainer_config
+    )
 
     trainer.train_and_evaluate(dataset.train, dataset.test)
diff --git a/src/scratch/language_modeling/olmo/modeling/blocks/base.py b/src/scratch/language_modeling/olmo/modeling/blocks/base.py
@@ -92,8 +92,9 @@ def __init__(self, layer_id: int, config: OLMoConfig, *, rngs: nnx.Rngs):
         self.dropout = nnx.Dropout(config.residual_dropout, rngs=rngs)
 
         # Make sure QKV clip coefficient is positive, otherwise it's not well-defined.
-        if config.clip_qkv is not None:
-            assert config.clip_qkv > 0
+        clip_qkv = config.clip_qkv
+        if clip_qkv is not None:
+            assert clip_qkv > 0.0
 
         self.activation = SwiGLU()
         assert (self.activation_multiplier * self.hidden_size) % 1 == 0
diff --git a/src/scratch/language_modeling/olmo/modeling/model.py b/src/scratch/language_modeling/olmo/modeling/model.py
@@ -115,7 +115,7 @@ def __call__(
 
         # Attention masking
         if attention_mask is not None:
-            attention_mask = attention_mask.astype(dtype=jnp.float32).view(
+            attention_mask = attention_mask.astype(dtype=jnp.float32).reshape(  # type: ignore
                 batch_size, -1
             )[:, None, None, :]
             attention_mask = (1.0 - attention_mask) * jnp.finfo(
@@ -188,5 +188,7 @@ def mask_attention_bias(attention_bias):
 
         return OLMoForwardResult(
             logits=logits,
-            hidden_states=tuple(all_hidden_states) if output_hidden_states else None,
+            hidden_states=tuple(all_hidden_states)
+            if (output_hidden_states and all_hidden_states)
+            else None,  # type: ignore
         )
diff --git a/src/scratch/language_modeling/olmo/utils/numerical_stability.py b/src/scratch/language_modeling/olmo/utils/numerical_stability.py
@@ -21,7 +21,7 @@ def ensure_finite(
       The input tensor with the infinite values replaced.
     """
 
-    def replace_inf(x, value) -> jnp.ndarray:
+    def replace_inf(x, value):
         return jnp.where(jnp.isinf(x), value, x)
 
     if check_neg_inf:
diff --git a/src/scratch/language_modeling/trainers/question_answering.py b/src/scratch/language_modeling/trainers/question_answering.py
@@ -5,7 +5,6 @@
 multiple devices.
 """
 
-from collections.abc import Callable
 from dataclasses import dataclass
 from typing import TypeVar
 
@@ -56,8 +55,8 @@ def train(self, train_loader: DataLoader[QuestionAnsweringBatch]):
 
         @nnx.jit
         def train_step(model: M, train_state: TrainState, batch: dict):
-            def loss_fn(model: Callable):
-                start_logits, end_logits = model(
+            def loss_fn(model: nnx.Module):
+                start_logits, end_logits = model(  # type: ignore
                     input_ids=batch["input_ids"],
                     attention_mask=batch["attention_mask"],
                     train=True,
@@ -128,7 +127,7 @@ def eval(
 
         @nnx.jit
         def eval_step(model: M, train_state: TrainState, batch: dict):
-            start_logits, end_logits = model(
+            start_logits, end_logits = model(  # type: ignore
                 input_ids=batch["input_ids"],
                 attention_mask=batch["attention_mask"],
                 train=False,
diff --git a/src/scratch/language_modeling/trainers/token_classification.py b/src/scratch/language_modeling/trainers/token_classification.py
@@ -122,7 +122,7 @@ def eval(
 
         @nnx.jit
         def eval_step(model: M, train_state: TrainState, batch: dict):
-            logits = model(
+            logits = model(  # type: ignore
                 input_ids=batch["input_ids"],
                 attention_mask=batch["attention_mask"],
                 train=False,
diff --git a/src/scratch/trainer.py b/src/scratch/trainer.py
@@ -28,7 +28,7 @@
 M = TypeVar("M", bound=nnx.Module)
 
 
-class TrainState(Generic[M], nnx.Optimizer):
+class TrainState(nnx.Optimizer, Generic[M]):
     """Train state for training models.
 
     This class manages the training state, including the model, optimizer, and metrics.
@@ -212,7 +212,6 @@ def _setup_checkpoint_manager(self):
         opts = ocp.CheckpointManagerOptions(max_to_keep=3, cleanup_tmp_directories=True)
         return ocp.CheckpointManager(
             self.trainer_config.checkpoint_path,
-            item_names=("model", "opt_state"),
             options=opts,
         )
 
@@ -226,13 +225,17 @@ def save_checkpoint(self, step: int, metrics: dict):
         self.logger.log(f"Saving checkpoint at step {step}")
         state = nnx.state(self.model)
         opt_state = self.train_state.opt_state
+
+        # Create the checkpoint data structure
+        checkpoint_data = {
+            "model": state,
+            "opt_state": opt_state,
+        }
+
         self.checkpoint_manager.save(
             step=step,
+            items=checkpoint_data,
             metrics=metrics,
-            args=ocp.args.Composite(
-                model=ocp.args.PyTreeSave(state),
-                opt_state=ocp.args.PyTreeSave(opt_state),
-            ),
         )
         self.checkpoint_manager.wait_until_finished()
 
@@ -244,20 +247,24 @@ def load_checkpoint(self, step: int = 0):
         """
         model = nnx.eval_shape(lambda: self.model)
         state = nnx.state(model)
-
         opt_state = self.train_state.opt_state
 
-        self.checkpoint_manager.restore(
+        # Create the target structure for restoration
+        target_structure = {
+            "model": state,
+            "opt_state": opt_state,
+        }
+
+        restored = self.checkpoint_manager.restore(
             step=step,
-            args=ocp.args.Composite(
-                model=ocp.args.PyTreeRestore(state),
-                opt_state=ocp.args.PyTreeRestore(opt_state),
-            ),
+            items=target_structure,
         )
 
+        # Update the model and train state with restored data
+        nnx.update(model, restored["model"])
         self.model = model
         self.train_state = self._create_train_state()
-        self.train_state.opt_state = opt_state
+        self.train_state.opt_state = restored["opt_state"]
         self.global_step = step
 
         self.logger.log(f"Loaded checkpoint at step {step}")
diff --git a/tests/deep_learning/layers/attention/test_grouped_query_attention.py b/tests/deep_learning/layers/attention/test_grouped_query_attention.py
@@ -173,9 +173,9 @@ def loss_fn_rope(model):
     shape_check_rope = jax.tree_util.tree_map(
         lambda p, g: p.shape == g.shape, params, grads_rope
     )
-    assert jax.tree_util.tree_all(
-        shape_check_rope
-    ), f"Shapes don't match with RoPE: {shape_check_rope}"
+    assert jax.tree_util.tree_all(shape_check_rope), (
+        f"Shapes don't match with RoPE: {shape_check_rope}"
+    )
 
     # Test without RoPE
     def loss_fn_no_rope(model):
@@ -187,9 +187,9 @@ def loss_fn_no_rope(model):
     shape_check_no_rope = jax.tree_util.tree_map(
         lambda p, g: p.shape == g.shape, params, grads_no_rope
     )
-    assert jax.tree_util.tree_all(
-        shape_check_no_rope
-    ), f"Shapes don't match without RoPE: {shape_check_no_rope}"
+    assert jax.tree_util.tree_all(shape_check_no_rope), (
+        f"Shapes don't match without RoPE: {shape_check_no_rope}"
+    )
 
 
 @pytest.mark.parametrize("start_pos", [0, 8, 16])
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -321,5 +321,5 @@ def __call__(self, x):`
`321`	`321`	`trainer_config = ImageClassificationParallelTrainerConfig(`
`322`	`322`	`batch_size=batch_size, learning_rate=0.01, epochs=3`
`323`	`323`	`)`
`324`		`- trainer = ImageClassificationParallelTrainer(model, trainer_config)`
	`324`	`+ trainer = ImageClassificationParallelTrainer[ResNet](model, trainer_config)`
`325`	`325`	`trainer.train_and_evaluate(dataset.train, dataset.test)`
Original file line number	Diff line number	Diff line change
`@@ -741,5 +741,5 @@ def __call__(self, x: jnp.ndarray, train=True):`
`741`	`741`	`trainer_config = ImageClassificationParallelTrainerConfig(`
`742`	`742`	`batch_size=batch_size, epochs=5`
`743`	`743`	`)`
`744`		`- trainer = ImageClassificationParallelTrainer(model, trainer_config)`
	`744`	`+ trainer = ImageClassificationParallelTrainer[SwinTransformer](model, trainer_config)`
`745`	`745`	`trainer.train_and_evaluate(dataset.train, dataset.test)`
Original file line number	Diff line number	Diff line change
`@@ -214,5 +214,7 @@ def img_to_patch(x: jnp.ndarray, patch_size: int):`
`214`	`214`	`trainer_config = ImageClassificationParallelTrainerConfig(`
`215`	`215`	`batch_size=batch_size, epochs=5`
`216`	`216`	`)`
`217`		`- trainer = ImageClassificationParallelTrainer(model, trainer_config)`
	`217`	`+ trainer = ImageClassificationParallelTrainer[VisionTransformer](`
	`218`	`+ model, trainer_config`
	`219`	`+ )`
`218`	`220`	`trainer.train_and_evaluate(dataset.train, dataset.test)`