Skip to content

Commit 87f6acf

Browse files
authored
Merge pull request #264 from CompOmics/fix/predict-library-2
Cherry-pick fixes from fix/predict-library (PR #248)
2 parents 347da26 + b547f65 commit 87f6acf

5 files changed

Lines changed: 90 additions & 46 deletions

File tree

ms2pip/_spectrum_processing.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
get_ms2_spectra, # type: ignore[ty:unresolved-import]
2020
)
2121
from psm_utils import PSM, Peptidoform, PSMList
22+
from pyteomics.proforma import MassModification, ModificationBase
2223

2324
import ms2pip.exceptions as exceptions
2425
from ms2pip.constants import MODELS
@@ -92,7 +93,11 @@ def proforma_to_mass_shift(peptidoform: Peptidoform) -> str:
9293
parts.append(aa)
9394
if mods:
9495
for mod in mods:
95-
parts.append(f"[{mod.mass:+.4f}]") # type: ignore[ty:unresolved-attribute]
96+
if not isinstance(mod, (ModificationBase, MassModification)):
97+
raise ValueError(
98+
f"Unsupported ProForma tag type {type(mod)} in peptidoform {peptidoform}"
99+
)
100+
parts.append(f"[{mod.mass:+.4f}]")
96101
c_term = peptidoform.properties.get("c_term")
97102
if c_term:
98103
for mod in c_term:
@@ -224,7 +229,9 @@ def _load_and_match_spectra(
224229
preprocessed_cache[spec_id] = obs
225230

226231
results.append(
227-
MatchedSpectrum(psm_index, psm, preprocessed_cache[spec_id], annotated_spectra[batch_idx])
232+
MatchedSpectrum(
233+
psm_index, psm, preprocessed_cache[spec_id], annotated_spectra[batch_idx]
234+
)
228235
)
229236

230237
return results

ms2pip/_utils/xgb_models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ def load_xgb_models(
7777
Number of threads for XGBoost prediction. Capped internally.
7878
7979
"""
80+
os.environ.pop("CUDA_VISIBLE_DEVICES", None) # Workaround for dmlc/xgboost#11283
8081
nthread = min(
8182
processes if processes is not None else (os.cpu_count() or 1),
8283
_MAX_PREDICTION_THREADS,

ms2pip/result.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,6 @@
99
from psm_utils import PSM
1010
from pydantic import BaseModel, ConfigDict
1111

12-
try:
13-
import spectrum_utils.plot as sup
14-
except ImportError:
15-
sup = None # type: ignore[ty:invalid-assignment]
16-
1712
from ms2pip.correlation import pearson
1813
from ms2pip.spectrum import ObservedSpectrum, PredictedSpectrum
1914

@@ -105,6 +100,11 @@ def plot_spectra(self):
105100
Requires optional dependency ``spectrum_utils`` to be installed.
106101
107102
"""
103+
try:
104+
import spectrum_utils.plot as sup
105+
except ImportError as e:
106+
raise ImportError("Optional dependency spectrum_utils not installed.") from e
107+
108108
predicted, observed = (
109109
spec.to_spectrum_utils() if spec else None for spec in self.as_spectra()
110110
)

ms2pip/search_space.py

Lines changed: 37 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -184,8 +184,8 @@ class ProteomeSearchSpace(BaseModel):
184184
fasta_file: Path
185185
min_length: int = 8
186186
max_length: int = 30
187-
min_precursor_mz: float | None = 0
188-
max_precursor_mz: float | None = np.inf
187+
min_precursor_mz: float = 0
188+
max_precursor_mz: float = np.inf
189189
cleavage_rule: str = "trypsin"
190190
missed_cleavages: int = 2
191191
semi_specific: bool = False
@@ -194,7 +194,31 @@ class ProteomeSearchSpace(BaseModel):
194194
max_variable_modifications: int = 3
195195
charges: list[int] = [2, 3]
196196

197-
_peptidoform_spaces: list[_PeptidoformSearchSpace] = PrivateAttr(default_factory=list)
197+
_peptidoform_spaces: list[_PeptidoformSearchSpace] | None = PrivateAttr(default=None)
198+
199+
@field_validator("min_precursor_mz", mode="before")
@classmethod
def _coerce_min_precursor_mz(cls, v):
    """Replace a ``None`` lower precursor m/z bound with ``0.0``.

    Registered in ``"before"`` mode, so it sees the raw input value. Any value
    other than ``None`` is handed back untouched.
    """
    if v is None:
        return 0.0
    return v
203+
204+
@field_validator("max_precursor_mz", mode="before")
@classmethod
def _coerce_max_precursor_mz(cls, v):
    """Replace a ``None`` upper precursor m/z bound with positive infinity.

    Registered in ``"before"`` mode, so it sees the raw input value. Any value
    other than ``None`` is handed back untouched.
    """
    if v is None:
        return np.inf
    return v
208+
209+
@field_validator("min_length")
@classmethod
def _validate_min_length(cls, v):
    """Accept ``min_length`` only when it is strictly greater than 3.

    Raises
    ------
    ValueError
        If ``v`` is 3 or smaller.
    """
    if v <= 3:
        raise ValueError("Minimum peptide length must be greater than 3.")
    return v
215+
216+
@field_validator("max_length")
@classmethod
def _validate_max_length(cls, v):
    """Accept ``max_length`` only when it does not exceed 100.

    Raises
    ------
    ValueError
        If ``v`` is larger than 100.
    """
    if v > 100:
        raise ValueError("Maximum peptide length must be less than or equal to 100.")
    return v
198222

199223
@field_validator("modifications")
200224
@classmethod
@@ -219,7 +243,7 @@ def _validate_unspecific_cleavage(self):
219243
return self
220244

221245
def __len__(self):
222-
if not self._peptidoform_spaces:
246+
if self._peptidoform_spaces is None:
223247
raise ValueError("Search space must be built before length can be determined.")
224248
return sum(len(pep_space) for pep_space in self._peptidoform_spaces)
225249

@@ -276,7 +300,7 @@ def __iter__(self) -> Generator[PSM, None, None]: # type: ignore[ty:invalid-met
276300
277301
"""
278302
# Build search space if not already built
279-
if not self._peptidoform_spaces:
303+
if self._peptidoform_spaces is None:
280304
raise ValueError("Search space must be built before PSMs can be generated.")
281305

282306
spectrum_id = 0
@@ -295,25 +319,26 @@ def filter_psms_by_mz(self, psms: PSMList) -> PSMList:
295319
psm_list=[
296320
psm
297321
for psm in psms
298-
if self.min_precursor_mz <= psm.peptidoform.theoretical_mz <= self.max_precursor_mz # type: ignore[ty:unsupported-operator]
322+
if psm.peptidoform.theoretical_mz is not None
323+
and self.min_precursor_mz <= psm.peptidoform.theoretical_mz <= self.max_precursor_mz
299324
]
300325
)
301326

302327
def _digest_fasta(self, processes: int = 1):
303328
"""Digest FASTA file to peptides and populate search space."""
304329
# Convert to string to avoid issues with Path objects
305-
self.fasta_file = str(self.fasta_file) # type: ignore[ty:invalid-assignment]
330+
fasta_file = str(self.fasta_file)
306331
n_proteins = _count_fasta_entries(self.fasta_file)
307332
if self.add_decoys:
308333
fasta_db = pyteomics.fasta.decoy_db(
309-
self.fasta_file,
334+
fasta_file,
310335
mode="reverse",
311336
decoy_only=False,
312337
keep_nterm=True,
313338
)
314339
n_proteins *= 2
315340
else:
316-
fasta_db = pyteomics.fasta.FASTA(self.fasta_file)
341+
fasta_db = pyteomics.fasta.FASTA(fasta_file)
317342

318343
# Read proteins and digest to peptides
319344
with _get_pool(processes) as pool:
@@ -335,6 +360,7 @@ def _digest_fasta(self, processes: int = 1):
335360

336361
def _remove_redundancy(self):
337362
"""Remove redundancy in peptides and combine protein lists."""
363+
assert self._peptidoform_spaces is not None # for type checker
338364
peptide_dict = dict()
339365
for peptide in track(
340366
self._peptidoform_spaces,
@@ -351,6 +377,7 @@ def _remove_redundancy(self):
351377

352378
def _add_modifications(self, processes: int = 1):
353379
"""Add modifications to peptides in search space."""
380+
assert self._peptidoform_spaces is not None # for type checker
354381
modifications_by_target = _restructure_modifications_by_target(self.modifications)
355382
modification_options = []
356383
with _get_pool(processes) as pool:
@@ -373,6 +400,7 @@ def _add_modifications(self, processes: int = 1):
373400

374401
def _add_charges(self):
375402
"""Add charge permutations to peptides in search space."""
403+
assert self._peptidoform_spaces is not None # for type checker
376404
for peptide in track(
377405
self._peptidoform_spaces,
378406
description="Adding charge permutations...",

ms2pip/spectrum.py

Lines changed: 38 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,22 @@
33
from __future__ import annotations
44

55
import warnings
6+
from typing import Annotated
7+
68
import numpy as np
79
from psm_utils import Peptidoform
8-
from pydantic import model_validator, field_validator, ConfigDict, BaseModel
10+
from pydantic import BaseModel, BeforeValidator, ConfigDict, model_validator
11+
12+
13+
def _coerce_peptidoform(v):
    """Normalise a raw ``peptidoform`` input before field validation.

    ``None`` and existing ``Peptidoform`` instances are returned as they are,
    strings are passed to the ``Peptidoform`` constructor, and every other
    input is rejected.

    Raises
    ------
    ValueError
        If ``v`` is neither ``None``, a ``Peptidoform``, nor a string.
    """
    needs_no_conversion = v is None or isinstance(v, Peptidoform)
    if needs_no_conversion:
        return v
    if not isinstance(v, str):
        raise ValueError("Peptidoform must be a string, a Peptidoform object, or None.")
    return Peptidoform(v)
19+
920

10-
try:
11-
import spectrum_utils.spectrum as sus
12-
import spectrum_utils.plot as sup
13-
except ImportError:
14-
sus = None # type: ignore[ty:invalid-assignment]
15-
sup = None # type: ignore[ty:invalid-assignment]
21+
_PeptidoformField = Annotated[Peptidoform | None, BeforeValidator(_coerce_peptidoform)]
1622

1723

1824
class Spectrum(BaseModel):
@@ -47,7 +53,7 @@ class Spectrum(BaseModel):
4753
intensity: np.ndarray
4854
annotations: np.ndarray | None = None
4955
identifier: str | None = None
50-
peptidoform: Peptidoform | str | None = None
56+
peptidoform: _PeptidoformField = None
5157
precursor_mz: float | None = None
5258
precursor_charge: int | None = None
5359
retention_time: float | None = None
@@ -73,16 +79,6 @@ def check_array_lengths(cls, data):
7379
raise ValueError("Array lengths do not match.")
7480
return data
7581

76-
@field_validator("peptidoform")
77-
@classmethod
78-
def check_peptidoform(cls, value):
79-
if not value or isinstance(value, Peptidoform):
80-
return value
81-
elif isinstance(value, str):
82-
return Peptidoform(value)
83-
else:
84-
raise ValueError("Peptidoform must be a string, a Peptidoform object, or None.")
85-
8682
@property
8783
def tic(self):
8884
"""Total ion current."""
@@ -138,39 +134,51 @@ def to_spectrum_utils(self):
138134
Otherwise, ``ValueError`` is raised.
139135
140136
"""
141-
if not sus:
142-
raise ImportError("Optional dependency spectrum_utils not installed.")
137+
try:
138+
import spectrum_utils.spectrum as sus
139+
except ImportError as e:
140+
raise ImportError("Optional dependency spectrum_utils not installed.") from e
143141

144142
if self.precursor_charge:
145143
precursor_charge = self.precursor_charge
146144
else:
147145
if not self.peptidoform:
148146
raise ValueError("`precursor_charge` or `peptidoform` must be set.")
149-
else:
150-
precursor_charge = self.peptidoform.precursor_charge # type: ignore[ty:unresolved-attribute]
147+
precursor_charge = self.peptidoform.precursor_charge
148+
if precursor_charge is None:
149+
raise ValueError("Peptidoform charge state is not set.")
151150

152151
if self.precursor_mz:
153-
precursor_mz = self.precursor_mz
152+
precursor_mz_float = float(self.precursor_mz)
154153
else:
155154
if not self.peptidoform:
156155
raise ValueError("`precursor_mz` or `peptidoform` must be set.")
156+
elif not self.peptidoform.theoretical_mz:
157+
raise ValueError(
158+
"Peptidoform theoretical m/z could not be calculated; ensure the charge state "
159+
" is set."
160+
)
157161
else:
158162
warnings.warn("precursor_mz not set, using theoretical precursor m/z.")
159-
precursor_mz = self.peptidoform.theoretical_mz # type: ignore[ty:unresolved-attribute]
163+
precursor_mz_float = float(self.peptidoform.theoretical_mz)
160164

161165
spectrum = sus.MsmsSpectrum(
162166
identifier=self.identifier if self.identifier else "spectrum",
163-
precursor_mz=precursor_mz, # type: ignore[ty:invalid-argument-type]
164-
precursor_charge=precursor_charge, # type: ignore[ty:invalid-argument-type]
167+
precursor_mz=precursor_mz_float,
168+
precursor_charge=precursor_charge,
165169
mz=self.mz,
166170
intensity=self.intensity,
167-
retention_time=self.retention_time, # type: ignore[ty:invalid-argument-type]
171+
retention_time=self.retention_time if self.retention_time is not None else 0.0,
168172
)
169-
if self.peptidoform:
173+
if (
174+
self.peptidoform
175+
and self.mass_tolerance is not None
176+
and self.mass_tolerance_unit is not None
177+
):
170178
spectrum.annotate_proforma(
171179
str(self.peptidoform),
172-
self.mass_tolerance, # type: ignore[ty:invalid-argument-type]
173-
self.mass_tolerance_unit, # type: ignore[ty:invalid-argument-type]
180+
self.mass_tolerance,
181+
self.mass_tolerance_unit,
174182
)
175183
return spectrum
176184

0 commit comments

Comments
 (0)