feat: add collate_dtype support and pin_memory config to PerturbationDataModule

abhinadduri · claude · abhinadduri · commit 28b7bb3f6ce5 · 2026-03-07T00:58:32.000Z
- Add collate_dtype param to PerturbationDataset for float16/bfloat16/float32 tensor casting
- Wire collate_dtype through PerturbationDataModule to all dataset constructors

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/src/cell_load/data_modules/perturbation_dataloader.py b/src/cell_load/data_modules/perturbation_dataloader.py
@@ -88,6 +88,7 @@ def __init__(
         use_consecutive_loading: bool = False,
         h5_open_kwargs: dict | None = None,
         show_progress: bool = True,
+        collate_dtype: str = "float16",
         **kwargs,  # missing perturbation_features_file  and store_raw_basal for backwards compatibility
     ):
         """
@@ -195,6 +196,7 @@ def __init__(
         self.additional_obs = additional_obs
         self.h5_open_kwargs = h5_open_kwargs
         self.show_progress = bool(show_progress)
+        self.collate_dtype = collate_dtype
         if self.use_consecutive_loading:
             self._set_h5_cache_env_defaults()
 
@@ -305,6 +307,7 @@ def save_state(self, filepath: str):
             "additional_obs": self.additional_obs,
             "use_consecutive_loading": self.use_consecutive_loading,
             "h5_open_kwargs": self.h5_open_kwargs,
+            "collate_dtype": self.collate_dtype,
         }
 
         torch.save(save_dict, filepath)
@@ -349,6 +352,7 @@ def load_state(cls, filepath: str):
             "barcode": save_dict.pop("barcode", True),
             "use_consecutive_loading": save_dict.pop("use_consecutive_loading", False),
             "h5_open_kwargs": save_dict.pop("h5_open_kwargs", None),
+            "collate_dtype": save_dict.pop("collate_dtype", "float16"),
         }
 
         # Create new instance with all the saved parameters
@@ -639,6 +643,7 @@ def _create_base_dataset(
             is_log1p=self.is_log1p,
             cell_sentence_len=self.cell_sentence_len,
             h5_open_kwargs=self.h5_open_kwargs,
+            collate_dtype=self.collate_dtype,
         )
 
     def _setup_datasets(self):
diff --git a/src/cell_load/dataset/_perturbation.py b/src/cell_load/dataset/_perturbation.py
@@ -51,6 +51,7 @@ def __init__(
         is_log1p: bool = True,
         cell_sentence_len: int | None = None,
         h5_open_kwargs: dict | None = None,
+        collate_dtype: str = "float16",
         **kwargs,
     ):
         """
@@ -78,6 +79,8 @@ def __init__(
             is_log1p: Whether raw counts in X are log1p-transformed (default True; affects downsampling)
             cell_sentence_len: Optional sentence length for consecutive loading batches
             h5_open_kwargs: Optional kwargs to pass to h5py.File (e.g., rdcc_nbytes)
+            collate_dtype: dtype for tensor outputs — "float16", "float32", or "bfloat16".
+                Casting to float16 before collation halves per-sample memory in workers and pinned memory.
             **kwargs: Additional options (e.g. output_space)
         """
         super().__init__()
@@ -121,6 +124,9 @@ def __init__(
         self.h5_open_kwargs = self._normalize_h5_open_kwargs(h5_open_kwargs)
         self.additional_obs = self._validate_additional_obs(additional_obs)
 
+        _dtype_map = {"float16": torch.float16, "float32": torch.float32, "bfloat16": torch.bfloat16}
+        self.collate_dtype = _dtype_map.get(collate_dtype, torch.float32)
+
         # Load metadata cache and open file
         self.metadata_cache = GlobalH5MetadataCache().get_cache(
             str(self.h5_path), pert_col, cell_type_key, control_pert, batch_col
@@ -346,6 +352,12 @@ def __getitem__(self, idx: int):
             elif self.output_space == "all":
                 sample["ctrl_cell_counts"] = self.fetch_gene_expression(ctrl_idx)
 
+        # Cast tensor values to collate_dtype to reduce worker/pinned memory
+        if self.collate_dtype != torch.float32:
+            for k in ("pert_cell_emb", "ctrl_cell_emb", "pert_cell_counts", "ctrl_cell_counts"):
+                if isinstance(sample.get(k), torch.Tensor):
+                    sample[k] = sample[k].to(self.collate_dtype)
+
         # Optionally include cell barcodes
         if self.barcode and self.cell_barcodes is not None:
             sample["pert_cell_barcode"] = self.cell_barcodes[file_idx]
@@ -483,6 +495,15 @@ def __getitems__(self, indices):
                 else:
                     ctrl_counts_batch = ctrl_expr_batch
 
+        # Cast batch tensors to collate_dtype to reduce worker/pinned memory
+        if self.collate_dtype != torch.float32:
+            pert_expr_batch = pert_expr_batch.to(self.collate_dtype)
+            ctrl_expr_batch = ctrl_expr_batch.to(self.collate_dtype)
+            if pert_counts_batch is not None:
+                pert_counts_batch = pert_counts_batch.to(self.collate_dtype)
+            if ctrl_counts_batch is not None:
+                ctrl_counts_batch = ctrl_counts_batch.to(self.collate_dtype)
+
         samples = []
         for i, file_idx in enumerate(file_indices):
             pert_expr = pert_expr_batch[i]