Skip to content

Commit c5d8bd1

Browse files
committed
Improve API flexibility, input validation, and robustness
- Accept lists of PSMs, Peptidoforms, or strings as input for predict and calibrate functions, not just PSMList objects - Rename psm_list to psm_list_reference in finetune/train for clarity - Add num_threads parameter to train, predict, and evaluate - Add automatic device detection if device is set to `None` - Add input validation for empty data loaders and dataset splitting - Handle empty predictions gracefully by returning empty tensor - Fix target tensor shape in dataset when no target is available - Rename _exceptions module to exceptions (public API) - Add unit tests for edge cases in model ops and data splitting
1 parent c6d0e9f commit c5d8bd1

6 files changed

Lines changed: 138 additions & 23 deletions

File tree

deeplc/_model_ops.py

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ def load_model(
3232
selected_device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu")
3333

3434
# Load model from file if a path is provided
35-
if isinstance(model, str | Path):
35+
if isinstance(model, (str, PathLike, Path)):
3636
loaded_model = torch.load(model, weights_only=False, map_location=selected_device)
3737
elif isinstance(model, torch.nn.Module):
3838
loaded_model = model
@@ -54,8 +54,9 @@ def train(
5454
model: torch.nn.Module | PathLike | str | None,
5555
train_dataset: DeepLCDataset | Subset[DeepLCDataset],
5656
validation_dataset: DeepLCDataset | Subset[DeepLCDataset],
57-
device: str = "cpu",
57+
device: str | None = None,
5858
num_workers: int = 0,
59+
num_threads: int | None = None,
5960
learning_rate: float = 0.001,
6061
epochs: int = 25,
6162
batch_size: int = 512,
@@ -77,6 +78,8 @@ def train(
7778
Device to train on ('cpu' or 'cuda').
7879
num_workers
7980
Number of worker processes for data loading.
81+
num_threads
82+
Number of threads for model operations on CPU (ignored if using GPU).
8083
learning_rate
8184
Learning rate for optimizer.
8285
epochs
@@ -94,6 +97,8 @@ def train(
9497
Trained model.
9598
9699
"""
100+
torch.set_num_threads(num_threads or torch.get_num_threads())
101+
device = device or ("cuda" if torch.cuda.is_available() else "cpu")
97102
model = load_model(model, device)
98103

99104
# Parse datasets; setup loaders
@@ -107,6 +112,13 @@ def train(
107112
num_workers=num_workers,
108113
)
109114

115+
if len(train_loader) == 0:
116+
raise ValueError("Training data loader is empty. Provide at least one training sample.")
117+
if len(val_loader) == 0:
118+
raise ValueError(
119+
"Validation data loader is empty. Adjust validation data or validation_split."
120+
)
121+
110122
optimizer = _get_optimizer(model, learning_rate)
111123
loss_fn = torch.nn.L1Loss()
112124

@@ -145,12 +157,15 @@ def train(
145157
def predict(
146158
model: torch.nn.Module | PathLike | str | None,
147159
data: Dataset,
148-
device: str = "cpu",
160+
device: str | None = None,
149161
batch_size: int = 512,
150162
num_workers: int = 0,
163+
num_threads: int | None = None,
151164
show_progress: bool = True,
152165
) -> torch.Tensor:
153166
"""Predict using the model for the given dataset."""
167+
torch.set_num_threads(num_threads or torch.get_num_threads())
168+
device = device or ("cuda" if torch.cuda.is_available() else "cpu")
154169
model = load_model(model, device)
155170
data_loader = DataLoader(data, batch_size=batch_size, shuffle=False, num_workers=num_workers)
156171
predictions = _predict_epoch(model, data_loader, device, show_progress=show_progress)
@@ -160,11 +175,14 @@ def predict(
160175
def evaluate(
161176
model: torch.nn.Module | PathLike | str | None,
162177
data: Dataset,
163-
device: str = "cpu",
178+
device: str | None = None,
164179
batch_size: int = 512,
165180
num_workers: int = 0,
181+
num_threads: int | None = None,
166182
) -> float:
167183
"""Evaluate the model on the given dataset."""
184+
torch.set_num_threads(num_threads or torch.get_num_threads())
185+
device = device or ("cuda" if torch.cuda.is_available() else "cpu")
168186
model = load_model(model, device)
169187
data_loader = DataLoader(data, batch_size=batch_size, shuffle=False, num_workers=num_workers)
170188
loss_fn = torch.nn.L1Loss()
@@ -235,6 +253,8 @@ def _predict_epoch(
235253
features = [feature_tensor.to(device) for feature_tensor in features]
236254
outputs = model(*features)
237255
predictions.append(outputs.cpu())
256+
if not predictions:
257+
return torch.empty(0, dtype=torch.float32)
238258
return torch.cat(predictions, dim=0).squeeze()
239259

240260

deeplc/calibration.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from sklearn.pipeline import Pipeline, make_pipeline # type: ignore[import]
1212
from sklearn.preprocessing import SplineTransformer # type: ignore[import]
1313

14-
from deeplc._exceptions import CalibrationError
14+
from deeplc.exceptions import CalibrationError
1515

1616
LOGGER = logging.getLogger(__name__)
1717

deeplc/core.py

Lines changed: 42 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
import numpy as np
1010
import torch
11-
from psm_utils.psm_list import PSMList
11+
from psm_utils import PSM, Peptidoform, PSMList
1212

1313
from deeplc import _model_ops
1414
from deeplc.calibration import (
@@ -25,7 +25,7 @@
2525

2626

2727
def predict(
28-
psm_list: PSMList,
28+
psm_list: PSMList | list[PSM | Peptidoform | str],
2929
model: torch.nn.Module | PathLike | str | None = None,
3030
predict_kwargs: dict | None = None,
3131
) -> np.ndarray:
@@ -49,7 +49,7 @@ def predict(
4949
"""
5050
return _model_ops.predict(
5151
model=model or DEFAULT_MODEL,
52-
data=DeepLCDataset.from_psm_list(psm_list),
52+
data=DeepLCDataset.from_psm_list(_parse_psms(psm_list)),
5353
**(predict_kwargs or {}),
5454
).numpy()
5555

@@ -116,7 +116,7 @@ def calibrate(
116116

117117

118118
def predict_and_calibrate(
119-
psm_list: PSMList,
119+
psm_list: PSMList | list[PSM | Peptidoform | str],
120120
psm_list_reference: PSMList,
121121
model: torch.nn.Module | PathLike | str | None = None,
122122
calibration: Calibration | None = None,
@@ -147,7 +147,7 @@ def predict_and_calibrate(
147147
# Predict initial retention times
148148
LOGGER.info("Predicting retention times...")
149149
predicted_rt = predict(
150-
psm_list=psm_list,
150+
psm_list=_parse_psms(psm_list),
151151
model=model,
152152
predict_kwargs=predict_kwargs,
153153
)
@@ -175,7 +175,7 @@ def predict_and_calibrate(
175175

176176

177177
def finetune_and_predict(
178-
psm_list: PSMList,
178+
psm_list: PSMList | list[PSM | Peptidoform | str],
179179
psm_list_reference: PSMList,
180180
model: torch.nn.Module | PathLike | str | None = None,
181181
train_kwargs: dict | None = None,
@@ -205,15 +205,15 @@ def finetune_and_predict(
205205
"""
206206
# Fine-tune the model
207207
finetuned_model = finetune(
208-
psm_list=psm_list_reference,
208+
psm_list_reference=psm_list_reference,
209209
model=model,
210210
train_kwargs=train_kwargs,
211211
)
212212

213213
# Predict retention times with fine-tuned model
214214
LOGGER.info("Predicting retention times with fine-tuned model...")
215215
predicted_rt = predict(
216-
psm_list=psm_list,
216+
psm_list=_parse_psms(psm_list),
217217
model=finetuned_model,
218218
predict_kwargs=predict_kwargs,
219219
)
@@ -233,7 +233,7 @@ def finetune_and_predict(
233233

234234

235235
def finetune(
236-
psm_list: PSMList,
236+
psm_list_reference: PSMList,
237237
psm_list_validation: PSMList | None = None,
238238
validation_split: float = 0.1,
239239
model: torch.nn.Module | PathLike | str | None = None,
@@ -244,7 +244,7 @@ def finetune(
244244
245245
Parameters
246246
----------
247-
psm_list
247+
psm_list_reference
248248
List of PSMs to use as reference for fine-tuning.
249249
psm_list_validation
250250
List of PSMs to use for validation during fine-tuning. If None, a split from psm_list_reference is
@@ -261,10 +261,10 @@ def finetune(
261261
262262
"""
263263
LOGGER.info("Fine-tuning model...")
264-
if any(psm_list["is_decoy"]):
264+
if any(psm_list_reference["is_decoy"]):
265265
# TODO: Move to reusable validation step?
266266
LOGGER.warning("PSM list contains decoy PSMs. These will be used for fine tuning.")
267-
training_data = DeepLCDataset.from_psm_list(psm_list)
267+
training_data = DeepLCDataset.from_psm_list(psm_list_reference)
268268
validation_data = (
269269
DeepLCDataset.from_psm_list(psm_list_validation) if psm_list_validation else None
270270
)
@@ -281,7 +281,7 @@ def finetune(
281281

282282

283283
def train(
284-
psm_list: PSMList,
284+
psm_list_reference: PSMList,
285285
psm_list_validation: PSMList | None = None,
286286
validation_split: float = 0.1,
287287
train_kwargs: dict | None = None,
@@ -291,8 +291,8 @@ def train(
291291
292292
Parameters
293293
----------
294-
psm_list
295-
List of PSMs to use for training.
294+
psm_list_reference
295+
List of PSMs to use as reference for training.
296296
psm_list_validation
297297
List of PSMs to use for validation. If None, a split from psm_list_reference is used.
298298
validation_split
@@ -306,7 +306,7 @@ def train(
306306
Trained model.
307307
308308
"""
309-
training_data = DeepLCDataset.from_psm_list(psm_list)
309+
training_data = DeepLCDataset.from_psm_list(psm_list_reference)
310310
validation_data = (
311311
DeepLCDataset.from_psm_list(psm_list_validation) if psm_list_validation else None
312312
)
@@ -321,3 +321,29 @@ def train(
321321
**(train_kwargs or {}),
322322
)
323323
return trained_model
324+
325+
326+
def _parse_psms(psm_list: PSMList | list[PSM | Peptidoform | str]) -> PSMList:
327+
"""
328+
Parse a list of PSMs, Peptidoforms, or strings into a PSMList.
329+
330+
Note that this function can only be used for inputs that do not require additional data,
331+
such as retention times or decoy status. It cannot be used for reference or validation
332+
data sets that require observed retention times for calibration or training.
333+
334+
"""
335+
if isinstance(psm_list, PSMList):
336+
return psm_list
337+
elif isinstance(psm_list, list):
338+
if all(isinstance(psm, PSM) for psm in psm_list):
339+
return PSMList(psm_list=psm_list)
340+
elif all(isinstance(psm, Peptidoform) for psm in psm_list) or all(
341+
isinstance(psm, str) for psm in psm_list
342+
):
343+
return PSMList(
344+
psm_list=[PSM(spectrum_id=i, peptidoform=pf) for i, pf in enumerate(psm_list)]
345+
)
346+
else:
347+
raise ValueError("List must contain either PSMs, Peptidoforms, or strings.")
348+
else:
349+
raise ValueError("Input must be a PSMList or a list of PSMs, Peptidoforms, or strings.")

deeplc/data.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ def __getitem__(self, idx) -> tuple:
7474
targets = (
7575
self.target_retention_times[idx]
7676
if self.target_retention_times is not None
77-
else torch.full_like(feature_tuples[0], fill_value=float("nan"), dtype=torch.float32)
77+
else torch.tensor(float("nan"), dtype=torch.float32)
7878
)
7979
return feature_tuples, targets
8080

@@ -160,10 +160,19 @@ def split_datasets(
160160
"""
161161
# TODO: Implement stratified splitting based on stripped sequence
162162
if validation_data is None:
163+
if not 0 < validation_split < 1:
164+
raise ValueError(
165+
f"validation_split must be between 0 and 1 (exclusive), got {validation_split}."
166+
)
163167
if not hasattr(train_data, "__len__"):
164168
raise ValueError("Dataset must implement __len__ method for automatic splitting")
165169
dataset_len = len(train_data) # type: ignore[arg-type]
166-
val_size = int(dataset_len * validation_split)
170+
if dataset_len < 2:
171+
raise ValueError(
172+
"Need at least 2 samples in train_data when validation_data is not provided."
173+
)
174+
val_size = max(1, int(dataset_len * validation_split))
175+
val_size = min(val_size, dataset_len - 1)
167176
train_size = dataset_len - val_size
168177
train_dataset, val_dataset = torch.utils.data.random_split(
169178
train_data, [train_size, val_size]

tests/test_model_ops.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
from __future__ import annotations
2+
3+
import pytest
4+
import torch
5+
from torch.utils.data import Dataset
6+
7+
from deeplc import _model_ops
8+
from deeplc.data import split_datasets
9+
10+
11+
class _TinyDeepLCDataset(Dataset):
12+
def __init__(self, length: int):
13+
self.length = length
14+
15+
def __len__(self) -> int:
16+
return self.length
17+
18+
def __getitem__(self, index: int):
19+
features = (
20+
torch.zeros((60, 6), dtype=torch.float32),
21+
torch.zeros((30, 6), dtype=torch.float32),
22+
torch.zeros((55,), dtype=torch.float32),
23+
torch.zeros((60, 20), dtype=torch.float32),
24+
)
25+
target = torch.tensor(0.0, dtype=torch.float32)
26+
return features, target
27+
28+
29+
class _DummyModel(torch.nn.Module):
30+
def forward(self, matrix, matrix_sum, matrix_global, matrix_hc): # noqa: ARG002
31+
batch_size = matrix.shape[0]
32+
return torch.zeros((batch_size, 1), dtype=torch.float32)
33+
34+
35+
def test_predict_returns_empty_tensor_for_empty_dataset():
36+
empty_data = _TinyDeepLCDataset(length=0)
37+
preds = _model_ops.predict(model=_DummyModel(), data=empty_data, show_progress=False)
38+
assert isinstance(preds, torch.Tensor)
39+
assert preds.numel() == 0
40+
41+
42+
def test_split_datasets_rejects_too_small_dataset_without_validation_data():
43+
with pytest.raises(ValueError, match="Need at least 2 samples"):
44+
split_datasets(
45+
train_data=_TinyDeepLCDataset(length=1),
46+
validation_data=None,
47+
validation_split=0.1,
48+
)
49+
50+
51+
def test_train_rejects_empty_validation_loader():
52+
with pytest.raises(ValueError, match="Validation data loader is empty"):
53+
_model_ops.train(
54+
model=_DummyModel(),
55+
train_dataset=_TinyDeepLCDataset(length=2),
56+
validation_dataset=_TinyDeepLCDataset(length=0),
57+
epochs=1,
58+
batch_size=2,
59+
show_progress=False,
60+
)

0 commit comments

Comments
 (0)