|
"""
MS1 feature detection based on the OpenMS library, used to detect MS1 features
from mzML files. In addition, it applies the normalization and filtering steps
from the earlier algorithm by Andy Lim (https://github.com/bmx8177/MS1Connect),
published in https://doi.org/10.1093/bioinformatics/btad058.

We improved the original algorithm by using FeatureFinderMultiplexAlgorithm
instead of FeatureFinder as originally implemented by Andy Lim. We also annotate
additional feature attributes, such as the minimum and maximum retention time
and m/z values.

Detected MS1 features are saved to parquet format.
"""
| 12 | + |
| 13 | +import bisect |
| 14 | +import logging |
| 15 | +from pathlib import Path |
| 16 | +from typing import List, Optional, Dict, Any, Union |
| 17 | +import pandas as pd |
| 18 | +import pyopenms as oms |
| 19 | + |
| 20 | +from quantmsutils.openms import extract_scan_id |
| 21 | + |
| 22 | +logging.basicConfig(format="%(asctime)s [%(funcName)s] - %(message)s", level=logging.INFO) |
| 23 | +logger = logging.getLogger(__name__) |
| 24 | + |
| 25 | + |
class MS1FeatureDetector:
    """
    Detect MS1 features in mzML files and save them to parquet format.

    Feature detection uses OpenMS ``FeatureFinderMultiplexAlgorithm``; each
    detected feature is annotated with its percentile TIC (pTIC) and kept only
    if the pTIC falls within ``[min_ptic, max_ptic]``, following the MS1Connect
    approach (https://doi.org/10.1093/bioinformatics/btad058).
    """

    def __init__(self, min_ptic: float = 0.05, max_ptic: float = 0.95, ms_level: int = 1):
        """
        Initialize the MS1 feature detector.

        Parameters
        ----------
        min_ptic : float, optional
            Minimum percentile TIC to include features, by default 0.05
        max_ptic : float, optional
            Maximum percentile TIC to include features, by default 0.95
        ms_level : int, optional
            MS level to analyze, by default 1
        """
        self.min_ptic = min_ptic
        self.max_ptic = max_ptic
        self.ms_level = ms_level

        # Restrict file loading to the requested MS level only.
        self.options = oms.PeakFileOptions()
        self.options.setMSLevels([self.ms_level])

    def _calc_tic(self, experiment: oms.MSExperiment) -> float:
        """
        Calculate the total ion current (TIC) of the experiment, summing the
        intensities of all scans at the configured MS level.

        Parameters
        ----------
        experiment : oms.MSExperiment
            The MS experiment containing spectra.

        Returns
        -------
        float
            Total ion current.
        """
        return sum(
            sum(intensities)
            for _mz, intensities in (
                scan.get_peaks() for scan in experiment if scan.getMSLevel() == self.ms_level
            )
        )

    def _get_ptic_data(self, experiment: oms.MSExperiment):
        """
        Convert TIC to pTIC (percentile TIC) for all scans at the configured
        MS level. The pTIC of a scan is the cumulative TIC of all preceding
        scans divided by the total TIC of the experiment.

        Parameters
        ----------
        experiment : oms.MSExperiment
            The MS experiment containing spectra.

        Returns
        -------
        tuple[list, list, list]
            (rt_list, ptic_list, scans): retention times, pTIC values and scan
            ids, all in scan order. All three lists are empty when the total
            TIC is zero.
        """
        total_tic = self._calc_tic(experiment)
        if total_tic == 0:
            logger.warning("Total TIC is zero, check input data")
            # BUGFIX: callers unpack three values, so return three empty lists
            # (the original returned only two, crashing with ValueError).
            return [], [], []

        rt_list, ptic_list, scans = [], [], []
        sum_tic = 0.0

        logger.info("Converting TIC to pTIC")
        for scan in experiment:
            if scan.getMSLevel() == self.ms_level:
                intensities = scan.get_peaks()[1]
                rt_list.append(scan.getRT())
                # pTIC is the cumulative TIC *before* this scan, so append
                # first and add this scan's TIC afterwards.
                ptic_list.append(sum_tic / total_tic)
                sum_tic += sum(intensities)
                scans.append(extract_scan_id(scan))

        return rt_list, ptic_list, scans

    @staticmethod
    def _find_ptic_for_rt(rt: float, rt_list: List[float], ptic_list: List[float]) -> float:
        """
        Find the pTIC value for a given retention time by linear interpolation.

        Parameters
        ----------
        rt : float
            Retention time to find pTIC for.
        rt_list : List[float]
            Sorted list of retention times.
        ptic_list : List[float]
            List of pTIC values corresponding to rt_list.

        Returns
        -------
        float
            Interpolated pTIC value, or 0.0 if the lists are empty.
        """
        if not rt_list or not ptic_list:
            return 0.0

        index = bisect.bisect_left(rt_list, rt)

        # Clamp to the first/last known pTIC outside the covered RT range.
        if index == 0:
            return ptic_list[0]
        if index >= len(rt_list):
            return ptic_list[-1]

        # Linear interpolation between the two adjacent points.
        rt_left, rt_right = rt_list[index - 1], rt_list[index]
        ptic_left, ptic_right = ptic_list[index - 1], ptic_list[index]

        # Guard against a zero-width interval (duplicate retention times).
        rt_frac = (rt - rt_left) / (rt_right - rt_left) if rt_right != rt_left else 0
        return ptic_left + rt_frac * (ptic_right - ptic_left)

    @staticmethod
    def _get_selected_scans(scans: List[str], rt_list: List[float], min_rt: float, max_rt: float):
        """
        Return the scan ids whose retention time lies within [min_rt, max_rt].
        ``scans`` and ``rt_list`` have the same length and the same order.

        :param scans: List of scan ids
        :param rt_list: List of retention times
        :param min_rt: Minimum retention time for the feature
        :param max_rt: Maximum retention time for the feature
        :return: List of scan ids inside the RT window
        """
        return [scan for scan, rt in zip(scans, rt_list) if min_rt <= rt <= max_rt]

    def _extract_features(
        self, features: oms.FeatureMap, rt_list: List[float], ptic_list: List[float], scans: List[str]
    ) -> List[Dict[str, Any]]:
        """
        Extract feature information, annotate pTIC/bounding box/scan data,
        and filter features by pTIC.

        Parameters
        ----------
        features : oms.FeatureMap
            Feature map from the feature finder.
        rt_list : List[float]
            List of retention times (scan order).
        ptic_list : List[float]
            List of pTIC values corresponding to rt_list.
        scans : List[str]
            List of scan ids corresponding to rt_list.

        Returns
        -------
        List[Dict[str, Any]]
            List of feature dictionaries with pTIC in [min_ptic, max_ptic].
        """
        feature_list = []

        for feature in features:
            mz = round(feature.getMZ(), 4)
            intensity = feature.getIntensity()
            rt = round(feature.getRT(), 4)
            charge = feature.getCharge()
            quality = feature.getOverallQuality()

            # Interpolate pTIC for this feature's retention time.
            ptic = round(self._find_ptic_for_rt(rt, rt_list, ptic_list), 4)

            # Bounding box of the convex hull gives the RT/mz extent.
            bounding_box = feature.getConvexHull().getBoundingBox()
            min_rt, min_mz = bounding_box.minPosition()
            max_rt, max_mz = bounding_box.maxPosition()
            select_scans = self._get_selected_scans(scans, rt_list, min_rt, max_rt)

            # Keep only features inside the configured pTIC window.
            if self.min_ptic <= ptic <= self.max_ptic:
                feature_list.append(
                    {
                        "feature_mz": mz,
                        "feature_intensity": intensity,
                        "feature_rt": rt,
                        "feature_charge": charge,
                        "feature_percentile_tic": ptic,
                        "feature_quality": quality,
                        "feature_id": feature.getUniqueId(),
                        "feature_min_rt": min_rt,
                        "feature_min_mz": min_mz,
                        "feature_max_rt": max_rt,
                        "feature_max_mz": max_mz,
                        "feature_num_scans": len(select_scans),
                        "feature_scans": select_scans,
                    }
                )
            else:
                logger.debug(f"Skipping feature at RT {rt} due to pTIC {ptic}")

        return feature_list

    def process_file(
        self,
        input_file: Union[str, Path],
        output_file: Union[str, Path],
        sort_by: str = "intensity",
        ascending: bool = False,
    ) -> Optional[str]:
        """
        Process an mzML file to detect features and save them to parquet format.

        Parameters
        ----------
        input_file : Union[str, Path]
            Path to mzML file to process.
        output_file : Union[str, Path]
            Path to an output parquet file.
        sort_by : str, optional
            Column to sort by, by default "intensity". Bare names are mapped
            to their "feature_"-prefixed column (e.g. "feature_intensity").
        ascending : bool, optional
            Sort order, by default False (descending)

        Returns
        -------
        Optional[str]
            Path to the output file if successful, None otherwise.
        """
        try:
            # Validate inputs
            input_path = Path(input_file)
            if not input_path.exists():
                logger.error(f"Input file not found: {input_file}")
                return None

            # Create output directory if needed
            output_path = Path(output_file)
            output_path.parent.mkdir(parents=True, exist_ok=True)

            # Load MS data and centroid it with the high-resolution peak picker.
            logger.info(f"Loading data from {input_file}")
            file_handler = oms.MzMLFile()
            experiment = oms.MSExperiment()
            picker = oms.PeakPickerHiRes()

            filtered_experiment = oms.MSExperiment()

            file_handler.setOptions(self.options)
            file_handler.load(str(input_path), experiment)
            picker.pickExperiment(experiment, filtered_experiment, False)
            experiment = filtered_experiment

            # Keep only spectra at the configured MS level.
            filtered_experiment = oms.MSExperiment()
            for spec in experiment:
                if spec.getMSLevel() == self.ms_level:
                    filtered_experiment.addSpectrum(spec)
            experiment = filtered_experiment

            if experiment.size() == 0:
                logger.error("No spectra found in input file")
                return None

            # Get pTIC data
            rt_list, ptic_list, scans = self._get_ptic_data(experiment)

            # Run feature finder; "[]" disables multiplex labels (label-free).
            logger.info("Running feature detection")
            feature_finder = oms.FeatureFinderMultiplexAlgorithm()
            params = feature_finder.getParameters()
            params.setValue("algorithm:labels", "[]")
            feature_finder.setParameters(params)

            feature_finder.run(exp=experiment, progress=True)
            features = feature_finder.getFeatureMap()
            features.setUniqueIds()

            logger.info(f"Found {features.size()} features")

            # Extract and pTIC-filter features
            feature_list = self._extract_features(features, rt_list, ptic_list, scans)
            df = pd.DataFrame(feature_list)
            if df.empty:
                logger.warning("No features passed the pTIC filter")

            # BUGFIX: columns are "feature_"-prefixed, so the default
            # sort_by="intensity" never matched any column and sorting was
            # silently skipped. Map bare names to their prefixed column.
            if sort_by not in df.columns and f"feature_{sort_by}" in df.columns:
                sort_by = f"feature_{sort_by}"
            if sort_by in df.columns:
                df = df.sort_values(by=sort_by, ascending=ascending)
            elif not df.empty:
                logger.warning(f"Sort column '{sort_by}' not found; saving unsorted")

            # Save to Parquet
            df.to_parquet(output_path, index=False, compression="gzip")
            logger.info(f"Saved {len(df)} features to {output_file}")

            return str(output_path)

        except Exception as e:
            logger.exception(f"Error processing {input_file}: {str(e)}")
            return None
| 324 | + |
0 commit comments