Skip to content

Commit 945e11c

Browse files
authored
Merge pull request #49 from bigbio/dev
MS1Feature Finder output to parquet.
2 parents 7c24b6d + 0c87d41 commit 945e11c

6 files changed

Lines changed: 482 additions & 51 deletions

File tree

README.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,28 @@ For MS/MS signals, we have the following additional columns:
8181
8282
</details>
8383
84+
<details>
85+
<summary>MS1 Feature Maps</summary>
86+
87+
We use the `FeatureFinderMultiplexAlgorithm` from [OpenMS](https://pyopenms.readthedocs.io/en/latest/apidocs/_autosummary/pyopenms/pyopenms.FeatureFinderMultiplexAlgorithm.html)
88+
to extract the features from the MS1 spectra. We use an algorithm based on the original implementation by [Andy Lin](https://doi.org/10.1093/bioinformatics/btad058). The output of this algorithm is a feature map, which contains the following information:
89+
90+
- `feature_mz`: The m/z of the feature.
91+
- `feature_rt`: The retention time of the feature.
92+
- `feature_intensity`: The intensity of the feature.
93+
- `feature_charge`: The charge of the feature.
94+
- `feature_quality`: The quality of the feature.
95+
- `feature_percentile_tic`: The percentile of the feature in the total ion current.
96+
- `feature_id`: The unique identifier of the feature generated by OpenMS.
97+
- `feature_min_rt`: The minimum retention time of the feature within the feature map.
98+
- `feature_min_mz`: The minimum m/z of the feature within the feature map.
99+
- `feature_max_rt`: The maximum retention time of the feature within the feature map.
100+
- `feature_max_mz`: The maximum m/z of the feature within the feature map.
101+
- `feature_num_scans`: The number of scans that the feature is present in the feature map.
102+
- `feature_scans`: The scans where the feature is present in the feature map.
103+
104+
The tool will generate a gzip compressed parquet file with the extension `{file_name}_ms1_feature_info.parquet`.
105+
84106
## Contributions and issues
85107
86108
Contributions and issues are welcome. Please, open an issue in the [GitHub repository](https://github.com/bigbio/quantms) or PR in the [GitHub repository](https://github.com/bigbio/quantms-utils).
Lines changed: 324 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,324 @@
1+
"""
2+
MS1 Feature Detection, this algorithm is based on the OpenMS library and is used to detect MS1 features from mzML files.
3+
In addition, it adds some normalization and filtering steps from the
4+
previous algorithm by Andy Lim https://github.com/bmx8177/MS1Connect
5+
published in https://doi.org/10.1093/bioinformatics/btad058.
6+
7+
We improved the original algorithm by using FeatureFinderMultiplexAlgorithm instead of FeatureFinder as originally
8+
implemented by Andy Lim. Also, we annotated additional features such as min and max retention time and mz values.
9+
10+
This algorithm is used to detect MS1 features from mzML files and save them to parquet format.
11+
"""
12+
13+
import bisect
14+
import logging
15+
from pathlib import Path
16+
from typing import List, Optional, Dict, Any, Union
17+
import pandas as pd
18+
import pyopenms as oms
19+
20+
from quantmsutils.openms import extract_scan_id
21+
22+
logging.basicConfig(format="%(asctime)s [%(funcName)s] - %(message)s", level=logging.INFO)
23+
logger = logging.getLogger(__name__)
24+
25+
26+
class MS1FeatureDetector:
27+
"""
28+
Class for detecting MS1 features from mzML files and saving to parquet format.
29+
"""
30+
31+
def __init__(self, min_ptic: float = 0.05, max_ptic: float = 0.95, ms_level: int = 1):
32+
"""
33+
Initialize the MS1 feature detector.
34+
35+
Parameters
36+
----------
37+
min_ptic : float, optional
38+
Minimum percentile TIC to include features, by default 0.05
39+
max_ptic : float, optional
40+
Maximum percentile TIC to include features, by default 0.95
41+
ms_level : int, optional
42+
MS level to analyze, by default 1
43+
"""
44+
# Configure logging
45+
46+
self.min_ptic = min_ptic
47+
self.max_ptic = max_ptic
48+
self.ms_level = ms_level
49+
50+
# Initialize options for file loading
51+
self.options = oms.PeakFileOptions()
52+
self.options.setMSLevels([self.ms_level])
53+
54+
def _calc_tic(self, experiment: oms.MSExperiment) -> float:
55+
"""
56+
Calculate all the TIC in the MS experiment, summing all intensities from all scans.
57+
58+
Parameters
59+
----------
60+
experiment : MSExperiment
61+
The MS experiment containing spectra.
62+
63+
Returns
64+
-------
65+
float
66+
Total ion current.
67+
"""
68+
return sum(
69+
sum(intensity)
70+
for mz, intensity in (
71+
scan.get_peaks() for scan in experiment if scan.getMSLevel() == self.ms_level
72+
)
73+
)
74+
75+
def _get_ptic_data(self, experiment: oms.MSExperiment):
76+
"""
77+
Convert TIC to pTIC (percentile TIC) for all MS scans. The pTIC is the cumulative sum of TIC
78+
up to a given retention time, divided by the total TIC.
79+
80+
Parameters
81+
----------
82+
experiment : MSExperiment
83+
The MS experiment containing spectra.
84+
"""
85+
total_tic = self._calc_tic(experiment)
86+
if total_tic == 0:
87+
logger.warning("Total TIC is zero, check input data")
88+
return [], []
89+
90+
rt_list, ptic_list, scans = [], [], []
91+
sum_tic = 0.0
92+
93+
logger.info("Converting TIC to pTIC")
94+
for scan in experiment:
95+
if scan.getMSLevel() == self.ms_level:
96+
intensities = scan.get_peaks()[1]
97+
rt_list.append(scan.getRT())
98+
ptic_list.append(sum_tic / total_tic)
99+
sum_tic += sum(intensities)
100+
scans.append(extract_scan_id(scan))
101+
102+
return rt_list, ptic_list, scans
103+
104+
@staticmethod
105+
def _find_ptic_for_rt(rt: float, rt_list: List[float], ptic_list: List[float]) -> float:
106+
"""
107+
Find the pTIC value for a given retention time by interpolation.
108+
109+
Parameters
110+
----------
111+
rt : float
112+
Retention time to find pTIC for.
113+
rt_list : List[float]
114+
List of retention times.
115+
ptic_list : List[float]
116+
List of pTIC values corresponding to rt_list.
117+
118+
Returns
119+
-------
120+
float
121+
Interpolated pTIC value.
122+
"""
123+
if not rt_list or not ptic_list:
124+
return 0.0
125+
126+
index = bisect.bisect_left(rt_list, rt)
127+
128+
# Handle edge cases
129+
if index == 0:
130+
return ptic_list[0]
131+
if index >= len(rt_list):
132+
return ptic_list[-1]
133+
134+
# Linear interpolation between adjacent points
135+
rt_left = rt_list[index - 1]
136+
rt_right = rt_list[index]
137+
ptic_left = ptic_list[index - 1]
138+
ptic_right = ptic_list[index]
139+
140+
# Calculate interpolated pTIC
141+
rt_frac = (rt - rt_left) / (rt_right - rt_left) if rt_right != rt_left else 0
142+
return ptic_left + rt_frac * (ptic_right - ptic_left)
143+
144+
def _extract_features(
145+
self, features: oms.FeatureMap, rt_list: List[float], ptic_list: List[float], scans: List[str]
146+
) -> List[Dict[str, Any]]:
147+
"""
148+
Extract feature information and filter by pTIC.
149+
150+
Parameters
151+
----------
152+
features : FeatureMap
153+
Feature map from feature finder.
154+
rt_list : List[float]
155+
List of retention times.
156+
ptic_list : List[float]
157+
List of pTIC values corresponding to rt_list.
158+
159+
Returns
160+
-------
161+
List[Dict[str, Any]]
162+
List of feature dictionaries.
163+
"""
164+
feature_list = []
165+
166+
for feature in features:
167+
mz = round(feature.getMZ(), 4)
168+
intensity = feature.getIntensity()
169+
rt = round(feature.getRT(), 4)
170+
charge = feature.getCharge()
171+
172+
# Get quality metrics
173+
quality = feature.getOverallQuality()
174+
175+
# Interpolate pTIC for this retention time
176+
ptic = round(self._find_ptic_for_rt(rt, rt_list, ptic_list), 4)
177+
178+
convexhull = feature.getConvexHull().getBoundingBox()
179+
180+
minRT, minMZ = convexhull.minPosition()
181+
maxRT, maxMZ = convexhull.maxPosition()
182+
select_scans = self._get_selected_scans(scans, rt_list, minRT, maxRT)
183+
num_scans = len(select_scans)
184+
185+
186+
187+
# Filter by pTIC
188+
if self.min_ptic <= ptic <= self.max_ptic:
189+
feature_list.append(
190+
{
191+
"feature_mz": mz,
192+
"feature_intensity": intensity,
193+
"feature_rt": rt,
194+
"feature_charge": charge,
195+
"feature_percentile_tic": ptic,
196+
"feature_quality": quality,
197+
"feature_id": feature.getUniqueId(),
198+
"feature_min_rt": minRT,
199+
"feature_min_mz": minMZ,
200+
"feature_max_rt": maxRT,
201+
"feature_max_mz": maxMZ,
202+
"feature_num_scans": num_scans,
203+
"feature_scans": select_scans
204+
205+
}
206+
)
207+
else:
208+
logger.debug(f"Skipping feature at RT {rt} due to pTIC {ptic}")
209+
210+
return feature_list
211+
212+
def process_file(
213+
self,
214+
input_file: Union[str, Path],
215+
output_file: Union[str, Path],
216+
sort_by: str = "intensity",
217+
ascending: bool = False,
218+
) -> Optional[str]:
219+
"""
220+
Process an mzML file to detect features and save to parquet format.
221+
222+
Parameters
223+
----------
224+
input_file : str
225+
Path to mzML file to process.
226+
output_file : str
227+
Path to an output parquet file.
228+
sort_by : str, optional
229+
Column to sort by, by default "intensity"
230+
ascending : bool, optional
231+
Sort order, by default False (descending)
232+
233+
Returns
234+
-------
235+
Optional[str]
236+
Path to the output file if successful, None otherwise.
237+
"""
238+
try:
239+
# Validate inputs
240+
input_path = Path(input_file)
241+
if not input_path.exists():
242+
logger.error(f"Input file not found: {input_file}")
243+
return None
244+
245+
# Create output directory if needed
246+
output_path = Path(output_file)
247+
output_path.parent.mkdir(parents=True, exist_ok=True)
248+
249+
# Load MS data
250+
logger.info(f"Loading data from {input_file}")
251+
file_handler = oms.MzMLFile()
252+
experiment = oms.MSExperiment()
253+
picker = oms.PeakPickerHiRes()
254+
255+
filtered_experiment = oms.MSExperiment()
256+
257+
file_handler.setOptions(self.options)
258+
file_handler.load(str(input_path), experiment)
259+
picker.pickExperiment(experiment, filtered_experiment, False)
260+
experiment = filtered_experiment
261+
262+
filtered_experiment = oms.MSExperiment()
263+
for spec in experiment:
264+
if spec.getMSLevel() == self.ms_level:
265+
filtered_experiment.addSpectrum(spec)
266+
experiment = filtered_experiment
267+
268+
if experiment.size() == 0:
269+
logger.error("No spectra found in input file")
270+
return None
271+
272+
# Get pTIC data
273+
rt_list, ptic_list, scans = self._get_ptic_data(experiment)
274+
275+
# Run feature finder
276+
logger.info("Running feature detection")
277+
feature_finder = oms.FeatureFinderMultiplexAlgorithm()
278+
params = feature_finder.getParameters()
279+
params.setValue("algorithm:labels", "[]")
280+
feature_finder.setParameters(params)
281+
282+
feature_finder.run(exp=experiment, progress=True)
283+
features = feature_finder.getFeatureMap()
284+
features.setUniqueIds()
285+
286+
logger.info(f"Found {features.size()} features")
287+
288+
# Extract features
289+
feature_list = self._extract_features(features, rt_list, ptic_list, scans)
290+
291+
# Create DataFrame
292+
df = pd.DataFrame(feature_list)
293+
294+
# Sort and limit features
295+
if sort_by in df.columns:
296+
df = df.sort_values(by=sort_by, ascending=ascending)
297+
298+
# Save to Parquet
299+
df.to_parquet(output_path, index=False, compression="gzip")
300+
logger.info(f"Saved {len(df)} features to {output_file}")
301+
302+
return str(output_path)
303+
304+
except Exception as e:
305+
logger.exception(f"Error processing {input_file}: {str(e)}")
306+
return None
307+
308+
@staticmethod
309+
def _get_selected_scans(scans: List[str], rt_list: List[float], min_rt: float, max_rt: float):
310+
"""
311+
This function returns the scans that are within the RT range of the feature.
312+
The scans and rt_list are two lists that have the same length and the same order.
313+
:param scans: List of scans ids
314+
:param rt_list: List of retention times
315+
:param min_rt: Minimum retention time for the feature
316+
:param max_rt: Maximum retention time for the feature
317+
:return:
318+
"""
319+
selected_scans = []
320+
for i, rt in enumerate(rt_list):
321+
if min_rt <= rt <= max_rt:
322+
selected_scans.append(scans[i])
323+
return selected_scans
324+

0 commit comments

Comments
 (0)