bigbio
diff --git a/‎.DS_Store‎
8 KB b/‎.DS_Store‎
8 KB
diff --git a/‎.github/workflows/python-package.yml‎
Lines changed: 4 additions & 5 deletions b/‎.github/workflows/python-package.yml‎
Lines changed: 4 additions & 5 deletions
diff --git a/‎.gitignore‎
Lines changed: 4 additions & 5 deletions b/‎.gitignore‎
Lines changed: 4 additions & 5 deletions
diff --git a/‎.markdownlint.json‎
Lines changed: 0 additions & 5 deletions b/‎.markdownlint.json‎
Lines changed: 0 additions & 5 deletions
diff --git a/‎README.md‎
Lines changed: 6 additions & 4 deletions b/‎README.md‎
Lines changed: 6 additions & 4 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 1 addition & 1 deletion b/‎pyproject.toml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎quantmsutils/.DS_Store‎
6 KB b/‎quantmsutils/.DS_Store‎
6 KB
diff --git a/‎quantmsutils/__init__.py‎
Lines changed: 1 addition & 1 deletion b/‎quantmsutils/__init__.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎quantmsutils/diann/diann2msstats.py‎
Lines changed: 159 additions & 0 deletions b/‎quantmsutils/diann/diann2msstats.py‎
Lines changed: 159 additions & 0 deletions
@@ -16,12 +16,12 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.9", "3.10"]
+        python-version: ["3.10", "3.11"]
 
     steps:
     - uses: actions/checkout@v4
     - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v3
+      uses: actions/setup-python@v5
       with:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
@@ -51,8 +51,7 @@ jobs:
     - name: Test package online TMT
       run: |
         quantmsutilsc mzmlstats --ms_path TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.mzML --ms2_file --feature_detection
-    - name: Text package online DIA
+    - name: Test package online DIA
       run: |
         quantmsutilsc mzmlstats --ms_path RD139_Narrow_UPS1_0_1fmol_inj1.mzML --ms2_file --feature_detection
-        
-        
+
@@ -160,12 +160,11 @@ cython_debug/
 .idea/
 *.csv
 *_df.csv
-*.tsv
 /tests/test_data/hMICAL1_coiPAnP-N2-200_3Murea-1Mthiourea-200mMtcep_14733.d/
-/tests/test_data/diann2mztab/RD139_Narrow_UPS1_0_1fmol_inj1.mzML
-/tests/test_data/diann2mztab/RD139_Narrow_UPS1_0_1fmol_inj2.mzML
-/tests/test_data/diann2mztab/RD139_Narrow_UPS1_0_25fmol_inj1.mzML
-/tests/test_data/diann2mztab/RD139_Narrow_UPS1_0_25fmol_inj2.mzML
+/tests/test_data/diann2msstats/RD139_Narrow_UPS1_0_1fmol_inj1.mzML
+/tests/test_data/diann2msstats/RD139_Narrow_UPS1_0_1fmol_inj2.mzML
+/tests/test_data/diann2msstats/RD139_Narrow_UPS1_0_25fmol_inj1.mzML
+/tests/test_data/diann2msstats/RD139_Narrow_UPS1_0_25fmol_inj2.mzML
 
 .qodo
 /tests/test_data/RD139_Narrow_UPS1_0_1fmol_inj1.mzML
 
@@ -19,8 +19,8 @@ The following functionalities are available in the package:
 ### Diann scripts
 
 - `dianncfg` - Create a configuration file for Diann including enzymes, modifications, and other parameters.
-- `diann2mztab` - Convert Diann output to mzTab format. In addition, convert DIA-NN output to MSstats, Triqler or mzTab.
-    The output formats are used for quality control and downstream analysis in quantms.
+- `diann2msstats` - Convert DIA-NN output to MSstats format.
+    The output is used for quality control and downstream analysis in quantms.
 
 ### SDRF scripts
 
@@ -29,8 +29,8 @@ The following functionalities are available in the package:
 
 ### Other scripts
 
-- `psmconvert` - The convert_psm function converts peptide spectrum matches (PSMs) from an idXML file to a CSV file, optionally filtering out decoy matches. It extracts and processes data from both the idXML and an associated spectra file, handling multiple search engines and scoring systems.
-- `mzmlstats` - The `mzmlstats` processes mass spectrometry data files in either `.mzML` or `Bruker .d` formats to extract and compile statistics about the spectra. It supports generating detailed or ID-only CSV files based on the spectra data.
+- `psmconvert` - The `convert_psm` function converts peptide spectrum matches (PSMs) from an idXML file to a parquet file, optionally filtering out decoy matches. It extracts and processes data from both the idXML and an associated spectra file, handling multiple search engines and scoring systems.
+- `mzmlstats` - The `mzmlstats` command processes `.mzML` mass spectrometry data files to extract and compile statistics about the spectra. It generates detailed parquet files with spectrum metadata and MS2 peak data.
 
 #### mzml statistics 
 
@@ -101,6 +101,8 @@ to extract the features from the MS1 spectra. We use an algorithm based on the o
 
 The tool will generate a gzip compressed parquet file with the extension `{file_name}_ms1_feature_info.parquet`.
 
+</details>
+
 ## Contributions and issues
 
 Contributions and issues are welcome. Please, open an issue in the [GitHub repository](https://github.com/bigbio/quantms) or PR in the [GitHub repository](https://github.com/bigbio/quantms-utils).
@@ -3,7 +3,7 @@ name = "quantms-utils"
 description = "Python scripts and helpers for the quantMS workflow"
 readme = "README.md"
 license = "MIT"
-version = "0.0.24"
+version = "0.0.25"
 authors = [
     "Yasset Perez-Riverol <ypriverol@gmail.com>",
     "Dai Chengxin <chengxin2024@126.com>",
 
@@ -1 +1 @@
-__version__ = "0.0.24"
+__version__ = "0.0.25"
@@ -0,0 +1,159 @@
+"""
+Convert DIA-NN output to MSstats format.
+License: Apache 2.0
+Authors: Hong Wong, Yasset Perez-Riverol
+Revisions:
+    2023-Aug-05: J. Sebastian Paez
+"""
+
+import logging
+from pathlib import Path
+
+import click
+import pandas as pd
+from pyopenms import AASequence
+
+# Help-flag aliases in click's context-settings format.
+CONTEXT_SETTINGS = {"help_option_names": ["-h", "--help"]}
+# Revision string written to the debug log when the command starts.
+REVISION = "0.1.1"
+
+# Root logger is configured at import time: timestamp, calling function, message.
+logging.basicConfig(
+    level=logging.DEBUG,
+    format="%(asctime)s [%(funcName)s] - %(message)s",
+)
+logger = logging.getLogger(__name__)
+
+
+@click.command("diann2msstats", short_help="Convert DIA-NN output to MSstats")
+@click.option("--report", "-r", "report_path", required=True, type=click.Path(exists=True))
+@click.option("--exp_design", "-d", required=True, type=click.Path(exists=True))
+@click.option("--qvalue_threshold", "-q", type=float, required=True)
+def diann2msstats(
+    report_path,
+    exp_design,
+    qvalue_threshold,
+):
+    """
+    Convert DIA-NN output to MSstats format for downstream analysis.
+
+    :param report_path: DIA-NN main report file (.tsv or .parquet)
+    :param exp_design: Experimental design file
+    :param qvalue_threshold: Q-value filter threshold
+    """
+    # NOTE: the docstring above doubles as the click --help text; keep it stable.
+    logger.debug(f"Revision {REVISION}")
+    logger.debug("Reading input files...")
+    report = load_report(report_path, qvalue_threshold)
+    s_data_frame, f_table = get_exp_design_dfs(exp_design)
+
+    # DIA-NN report column -> MSstats column, listed in output order.
+    column_map = {
+        "Protein.Names": "ProteinName",
+        "Modified.Sequence": "PeptideSequence",
+        "Precursor.Charge": "PrecursorCharge",
+        "Precursor.Quantity": "Intensity",
+        "Run": "Run",
+    }
+
+    logger.debug("Converting to MSstats format...")
+    # Rows flagged Decoy == 1 are excluded when the report carries that column.
+    targets = report[report["Decoy"] != 1] if "Decoy" in report.columns else report
+    out_msstats = targets[list(column_map)].copy()
+    out_msstats.columns = list(column_map.values())
+
+    # Zero-intensity precursors are dropped from the output.
+    has_signal = out_msstats["Intensity"] != 0
+    out_msstats = out_msstats[has_signal]
+
+    def _normalise_peptide(row):
+        # Round-trip the sequence through pyopenms; a "^" is removed before
+        # parsing and a single "^" is put back at the front afterwards.
+        sequence = row["PeptideSequence"]
+        if "^" in sequence:
+            return "^" + AASequence.fromString(sequence.replace("^", "")).toString()
+        return AASequence.fromString(sequence).toString()
+
+    out_msstats.loc[:, "PeptideSequence"] = out_msstats.apply(_normalise_peptide, axis=1)
+
+    # Constant columns, added in this order.
+    for column, constant in (
+        ("FragmentIon", "NA"),
+        ("ProductCharge", "0"),
+        ("IsotopeLabelType", "L"),
+    ):
+        out_msstats[column] = constant
+
+    logger.debug(f"out_msstats ({out_msstats.shape}) >>>")
+    logger.debug("Adding Fraction, BioReplicate, Condition columns")
+
+    # One design row per run: sample annotations joined to the file table.
+    sample_info = s_data_frame[["Sample", "MSstats_Condition", "MSstats_BioReplicate"]]
+    file_info = f_table[["Fraction", "Sample", "run"]]
+    design_lookup = sample_info.merge(file_info, on="Sample")
+    design_lookup = design_lookup.rename(
+        columns={
+            "run": "Run",
+            "MSstats_BioReplicate": "BioReplicate",
+            "MSstats_Condition": "Condition",
+        }
+    )
+    design_lookup = design_lookup.drop(columns=["Sample"])
+
+    # validate="many_to_one" makes the merge fail if a run appears twice in the design.
+    out_msstats = out_msstats.merge(design_lookup, on="Run", how="left", validate="many_to_one")
+
+    # After the left join, a missing BioReplicate marks a run absent from the design.
+    no_design = out_msstats["BioReplicate"].isna()
+    if no_design.any():
+        missing_runs = out_msstats.loc[no_design, "Run"].unique().tolist()
+        logger.warning(
+            "Run(s) in DIA-NN report have no match in experimental design: %s. "
+            "These rows will be dropped. Check that Run names (spectra file stems) match Spectra_Filepath in the design.",
+            missing_runs,
+        )
+        out_msstats = out_msstats.dropna(subset=["BioReplicate"])
+
+    # Output goes to the current working directory, named after the design file.
+    output_name = Path(exp_design).stem + "_msstats_in.csv"
+    out_msstats.to_csv(output_name, sep=",", index=False)
+    logger.info(f"MSstats input file is saved as {output_name}")
+
+
+def _true_stem(x):
+    """Return the final path component of ``x`` without its last suffix."""
+    file_path = Path(x)
+    return file_path.stem
+
+
+def get_exp_design_dfs(exp_design_file):
+    """
+    Parse a two-table experimental design file into DataFrames.
+
+    The file is read as tab-separated text: a file table (header plus rows),
+    a blank separator line, then a sample table (header plus rows). A line
+    holding only whitespace counts as blank, and blank lines after the
+    separator are ignored.
+
+    :param exp_design_file: Path to the experimental design file
+    :return: ``(s_data_frame, f_table)``: the sample table, and the file table
+        with an added ``run`` column holding the stem of ``Spectra_Filepath``
+    :raises ValueError: if the blank separator or the sample table is missing
+    """
+    logger.info(f"Reading experimental design file: {exp_design_file}")
+    with open(exp_design_file, "r") as f:
+        lines = [line.rstrip("\r\n") for line in f]
+
+    # Spreadsheet exports often leave stray tabs/spaces on the separator line,
+    # so test for "only whitespace" instead of an exact empty line.
+    empty_row = next((i for i, line in enumerate(lines) if not line.strip()), None)
+    if empty_row is None:
+        raise ValueError(
+            f"Could not find blank separator row in {exp_design_file}. "
+            "Ensure the file contains a blank line between the file and sample tables."
+        )
+
+    f_header = lines[0].split("\t")
+    f_rows = [line.split("\t") for line in lines[1:empty_row]]
+    f_table = pd.DataFrame(f_rows, columns=f_header)
+    f_table["run"] = f_table["Spectra_Filepath"].map(_true_stem)
+
+    # Skip blank lines so extra separators or trailing newlines do not become
+    # a bogus header or junk sample rows.
+    sample_lines = [line for line in lines[empty_row + 1 :] if line.strip()]
+    if not sample_lines:
+        raise ValueError(
+            f"Could not find sample table after the blank separator row in {exp_design_file}."
+        )
+    s_header, *s_rows = (line.split("\t") for line in sample_lines)
+    s_data_frame = pd.DataFrame(s_rows, columns=s_header)
+
+    return s_data_frame, f_table
+
+
+def load_report(report_path, qvalue_threshold: float) -> pd.DataFrame:
+    """
+    Load the needed columns of a DIA-NN report and filter it by q-value.
+
+    Files with a ``.parquet`` suffix (any letter case) are read as Parquet;
+    everything else is read as tab-separated text. The ``Decoy`` column is
+    loaded as well when the report has one.
+
+    :param report_path: Path to the DIA-NN report (.parquet or TSV)
+    :param qvalue_threshold: Rows are kept only when ``Q.Value`` is strictly
+        below this value
+    :return: Filtered report restricted to the loaded columns
+    """
+    path = Path(report_path)
+    remain_cols = [
+        "Run",
+        "Protein.Names",
+        "Modified.Sequence",
+        "Precursor.Charge",
+        "Precursor.Quantity",
+        "Q.Value",
+    ]
+    if path.suffix.lower() == ".parquet":
+        # read_parquet(columns=[]) selects zero columns, so it cannot be used
+        # to probe for "Decoy". Ask for the column and fall back when the
+        # reader rejects it. If a required column is the one that is missing,
+        # the fallback read raises the real error.
+        try:
+            report = pd.read_parquet(path, columns=remain_cols + ["Decoy"])
+        except (KeyError, ValueError):
+            report = pd.read_parquet(path, columns=remain_cols)
+    else:
+        # nrows=0 reads only the header line, which is enough to see the columns.
+        tsv_header = pd.read_csv(path, sep="\t", header=0, nrows=0).columns
+        tsv_cols = remain_cols + (["Decoy"] if "Decoy" in tsv_header else [])
+        report = pd.read_csv(path, sep="\t", header=0, usecols=tsv_cols)
+
+    return report[report["Q.Value"] < qvalue_threshold]
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.0.24"`
	`1`	`+__version__ = "0.0.25"`