CompOmics
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎.gitignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎config.py‎
Lines changed: 7 additions & 1 deletion b/‎config.py‎
Lines changed: 7 additions & 1 deletion
diff --git a/‎data_structures.py‎
Lines changed: 0 additions & 8 deletions b/‎data_structures.py‎
Lines changed: 0 additions & 8 deletions
diff --git a/‎feature_generators/diann_feature_generator.py‎
Lines changed: 59 additions & 7 deletions b/‎feature_generators/diann_feature_generator.py‎
Lines changed: 59 additions & 7 deletions
diff --git a/‎feature_generators/features_fragment_intensity.py‎
Lines changed: 26 additions & 4 deletions b/‎feature_generators/features_fragment_intensity.py‎
Lines changed: 26 additions & 4 deletions
@@ -13,6 +13,7 @@ test_results.txt
 mumdia.egg-info/
 mzml_files/
 notebook_helpers/
+test_data/
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
 
@@ -81,6 +81,7 @@ def convert_legacy_config(legacy_data: Dict[str, Any]) -> Dict[str, Any]:
         "read_initial_search_pickle": "read_initial_search_pickle",
         "remove_intermediate_files": "remove_intermediate_files",
         "fdr_init_search": "fdr_init_search",
+        "min_occurrences": "min_occurrences",
     }
 
     if "mumdia" in legacy_data:
@@ -217,6 +218,9 @@ class MuMDIAConfig:
     read_full_search_pickle: bool = False
     read_initial_search_pickle: bool = False
 
+    # Filtering settings
+    min_occurrences: int = 5  # Minimum PSMs per peptide to keep
+
     # Processing settings
     remove_intermediate_files: bool = False
     dlc_transfer_learn: bool = False
@@ -233,6 +237,7 @@ class MuMDIAConfig:
     clean: bool = False
     sage_only: bool = False
     skip_mokapot: bool = False
+    use_diann_features: bool = True
     verbose: bool = False
 
     # Feature settings
@@ -382,7 +387,8 @@ def get_mumdia_config(self) -> Dict[str, Any]:
             "clean": self.clean,
             "sage_only": self.sage_only,
             "skip_mokapot": self.skip_mokapot,
-            "verbose": self.verbose
+            "verbose": self.verbose,
+            "min_occurrences": self.min_occurrences
         }
 
     def to_legacy_format(self) -> Dict[str, Any]:
 
@@ -20,14 +20,6 @@ class CorrelationResults:
     sum_pred_frag_intens: np.ndarray
     correlation_matrix_psm_ids: np.ndarray
     correlation_matrix_frag_ids: np.ndarray
-    correlation_matrix_psm_ids_ignore_zeros: np.ndarray
-    correlation_matrix_psm_ids_ignore_zeros_counts: np.ndarray
-    correlation_matrix_psm_ids_missing: np.ndarray
-    correlation_matrix_psm_ids_missing_zeros_counts: np.ndarray
-    correlation_matrix_frag_ids_ignore_zeros: np.ndarray
-    correlation_matrix_frag_ids_ignore_zeros_counts: np.ndarray
-    correlation_matrix_frag_ids_missing: np.ndarray
-    correlation_matrix_frag_ids_missing_zeros_counts: np.ndarray
     most_intens_cor: float
     most_intens_cos: float
     mse_avg_pred_intens: float
 
@@ -47,6 +47,9 @@ class FeatureConfig:
     ms1_accuracy_factors: List[float] = None
     ms2_accuracy_factors: List[float] = None
 
+    # Feature toggles
+    enable_ms1_features: bool = False  # MS1-based features (groups 2-3); slow, disabled by default
+
     # Parallelization settings
     n_jobs: int = -1  # -1 means use all available CPU cores
 
@@ -101,8 +104,40 @@ def __init__(self, config: Optional[FeatureConfig] = None):
         self._pivot_cache = {}
         self._correlation_cache = {}
 
+        # Pre-processed MS1 data (set via prepare_ms1_dict)
+        self._ms1_prepared = None  # list of (rt, mz_array, intensity_array) sorted by RT
+
         logger.info("Initialized DIANNFeatureGenerator with built-in optimizations")
 
+    def prepare_ms1_dict(self, ms1_dict: Dict[str, Dict[str, Any]]) -> None:
+        """Pre-convert ms1_dict to sorted numpy arrays for fast elution profile building.
+
+        Call once before processing peptidoforms. Converts each scan's mz/intensity
+        lists to numpy arrays and sorts the scan list by RT. This avoids repeated
+        np.asarray + sort checks in build_elution_profile (~20ms -> ~2ms per call).
+        """
+        prepared = []
+        for scan_dict in ms1_dict.values():
+            mzs = scan_dict.get("mz", [])
+            intensities = scan_dict.get("intensity", [])
+            rt = scan_dict.get("retention_time", None)
+            if rt is None or len(mzs) == 0:
+                continue
+            # Convert RT from seconds to minutes if needed
+            if isinstance(rt, (int, float)) and rt > 1000:
+                rt = rt / 60
+            mz_arr = np.asarray(mzs)
+            int_arr = np.asarray(intensities)
+            # Ensure sorted by m/z
+            if len(mz_arr) > 1 and mz_arr[0] > mz_arr[-1]:
+                order = np.argsort(mz_arr)
+                mz_arr = mz_arr[order]
+                int_arr = int_arr[order]
+            prepared.append((rt, mz_arr, int_arr))
+        # Sort by RT for potential windowed access
+        prepared.sort(key=lambda x: x[0])
+        self._ms1_prepared = prepared
+
     def _setup_parallelization(self):
         """Set up parallelization parameters."""
         import os
@@ -158,7 +193,7 @@ def clear_cache(self):
         self._cache.clear()
         self._pivot_cache.clear()
         self._correlation_cache.clear()
-        logger.info("Cleared all caches")
+        logger.debug("Cleared all caches")
 
     def get_cache_stats(self) -> Dict[str, int]:
         """Get cache statistics for monitoring."""
@@ -657,9 +692,27 @@ def build_elution_profile(
         if tolerance_ppm is None:
             tolerance_ppm = self.config.precursor_mass_tolerance
 
-        elution_profile = {}
         tol_mz = target_mz * tolerance_ppm / 1e6 * acc_factor
 
+        # Fast path: use pre-processed arrays (avoids np.asarray + sort per scan)
+        if self._ms1_prepared is not None:
+            elution_profile = {}
+            for rt, mz_arr, int_arr in self._ms1_prepared:
+                idx = np.searchsorted(mz_arr, target_mz)
+                best_idx = None
+                best_diff = tol_mz
+                for check_idx in (idx - 1, idx, idx + 1):
+                    if 0 <= check_idx < len(mz_arr):
+                        diff = abs(mz_arr[check_idx] - target_mz)
+                        if diff < best_diff:
+                            best_diff = diff
+                            best_idx = check_idx
+                if best_idx is not None:
+                    elution_profile[rt] = int_arr[best_idx]
+            return elution_profile
+
+        # Slow fallback: original dict-based path
+        elution_profile = {}
         for scan, scan_dict in ms1_dict.items():
             mzs = scan_dict.get("mz", [])
             intensities = scan_dict.get("intensity", [])
@@ -668,8 +721,7 @@ def build_elution_profile(
             if rt is None or len(mzs) == 0 or len(intensities) == 0:
                 continue
 
-            # Convert RT from seconds to minutes if needed
-            if isinstance(rt, (int, float)) and rt > 1000:  # Likely in seconds
+            if isinstance(rt, (int, float)) and rt > 1000:
                 rt = rt / 60
 
             best_idx, best_val = self._search_sorted_with_tolerance(
@@ -3272,7 +3324,7 @@ def _calculate_all_features_parallel(
                 except Exception as e:
                     logger.error(f"Error calculating {feature_name}: {e}")
 
-        logger.info(f"Calculated {len(features)} feature groups in parallel")
+        logger.debug(f"Calculated {len(features)} feature groups in parallel")
         return features
 
     def _safe_feature_calculation(self, func, args):
@@ -3317,7 +3369,7 @@ def _calculate_all_features_sequential(
         """
         features = {}
 
-        logger.info("Calculating DIA-NN features...")
+        logger.debug("Calculating DIA-NN features...")
 
         # Group 1: Ion co-elution (MS2 level)
         try:
@@ -3436,7 +3488,7 @@ def _calculate_all_features_sequential(
         except Exception as e:
             logger.error(f"Error in group 10 features: {e}")
 
-        logger.info(f"Calculated {len(features)} feature groups")
+        logger.debug(f"Calculated {len(features)} feature groups")
         return features
 
 
 
@@ -840,10 +840,32 @@ def get_features_fragment_intensity(
 
     df_fragment = df_fragment.join(df_precursor_rt, on="precursor", how="left")
 
-    df_fragment = df_fragment.filter(
-        (pl.col("rt_max_peptide_sub").is_not_null())
-        & (abs(pl.col("rt") - pl.col("rt_max_peptide_sub")) < filter_max_apex_rt)
-    )
+    # Filter fragments to the retention time window around the apex.
+    # If calibrated RT margins are available (rt_lower_margin / rt_higher_margin),
+    # use them for per-peptidoform adaptive windows. Otherwise fall back to the
+    # fixed ±filter_max_apex_rt seconds window.
+    if "rt_lower_margin" in df_fragment.columns and "rt_higher_margin" in df_fragment.columns:
+        df_fragment = df_fragment.filter(
+            (pl.col("rt_max_peptide_sub").is_not_null())
+            & (
+                # Use calibrated margins where available, fall back to fixed window where NaN
+                pl.when(pl.col("rt_lower_margin").is_not_null())
+                .then(
+                    (pl.col("rt") >= pl.col("rt_lower_margin"))
+                    & (pl.col("rt") <= pl.col("rt_higher_margin"))
+                )
+                .otherwise(
+                    abs(pl.col("rt") - pl.col("rt_max_peptide_sub")) < filter_max_apex_rt
+                )
+            )
+        )
+        log_info("Fragment filtering: using calibrated RT margins (with fixed fallback)")
+    else:
+        df_fragment = df_fragment.filter(
+            (pl.col("rt_max_peptide_sub").is_not_null())
+            & (abs(pl.col("rt") - pl.col("rt_max_peptide_sub")) < filter_max_apex_rt)
+        )
+        log_info(f"Fragment filtering: using fixed ±{filter_max_apex_rt}s window (no margins available)")
 
     for (peptidoform, charge), df_fragment_sub_peptidoform in tqdm(
         df_fragment.group_by(["peptide", "charge"])