Skip to content

Commit deff52d

Browse files
authored
Merge pull request #78 from bigbio/dev
Minor changes for plexDIA and msstats
2 parents 39d82a1 + 3416884 commit deff52d

1 file changed

Lines changed: 59 additions & 16 deletions

File tree

quantmsutils/diann/diann2msstats.py

Lines changed: 59 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
import click
1313
import pandas as pd
14+
import pyarrow.parquet as pq
1415
from pyopenms import AASequence
1516

1617
CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
@@ -49,21 +50,29 @@ def diann2msstats(
4950
"Run",
5051
]
5152

52-
logger.debug("Converting to MSstats format...")
53-
if "Decoy" in report.columns:
54-
out_msstats = report[report["Decoy"] != 1][msstats_columns_keep].copy()
55-
else:
56-
out_msstats = report[msstats_columns_keep].copy()
57-
58-
out_msstats.columns = [
53+
out_msstats_columns = [
5954
"ProteinName",
6055
"PeptideSequence",
6156
"PrecursorCharge",
6257
"Intensity",
6358
"Run",
6459
]
60+
61+
if "Channel" in report.columns and report["Channel"].nunique() > 1:
62+
logger.info("Multiplexing detected. Applying label mapping...")
63+
msstats_columns_keep.append("Channel")
64+
out_msstats_columns.append("IsotopeLabelType")
65+
66+
logger.debug("Converting to MSstats format...")
67+
if "Decoy" in report.columns:
68+
out_msstats = report[report["Decoy"] != 1][msstats_columns_keep].copy()
69+
else:
70+
out_msstats = report[msstats_columns_keep].copy()
71+
72+
out_msstats.columns = out_msstats_columns
6573
out_msstats = out_msstats[out_msstats["Intensity"] != 0]
6674

75+
out_msstats["PeptideSequence"] = out_msstats["PeptideSequence"].apply(_sanitize_sequence)
6776
out_msstats.loc[:, "PeptideSequence"] = out_msstats.apply(
6877
lambda x: (
6978
AASequence.fromString(x["PeptideSequence"]).toString()
@@ -74,24 +83,38 @@ def diann2msstats(
7483
)
7584
out_msstats["FragmentIon"] = "NA"
7685
out_msstats["ProductCharge"] = "0"
77-
out_msstats["IsotopeLabelType"] = "L"
86+
87+
if "IsotopeLabelType" in out_msstats.columns:
88+
out_msstats = out_msstats[out_msstats["IsotopeLabelType"].notna()]
89+
out_msstats = out_msstats[out_msstats["IsotopeLabelType"].str.strip() != ""]
90+
91+
# is multiplexing
92+
f_table_cols = ["Fraction", "Sample", "run", "Label"]
93+
merge_keys = ["Run", "IsotopeLabelType"]
94+
else:
95+
out_msstats["IsotopeLabelType"] = "L"
96+
97+
f_table_cols = ["Fraction", "Sample", "run"]
98+
merge_keys = ["Run"]
7899

79100
logger.debug(f"out_msstats ({out_msstats.shape}) >>>")
80101
logger.debug("Adding Fraction, BioReplicate, Condition columns")
81102

82103
design_lookup = (
83104
s_data_frame[["Sample", "MSstats_Condition", "MSstats_BioReplicate"]]
84-
.merge(f_table[["Fraction", "Sample", "run"]], on="Sample")
105+
.merge(f_table[f_table_cols], on="Sample")
85106
.rename(
86107
columns={
87108
"run": "Run",
88109
"MSstats_BioReplicate": "BioReplicate",
89110
"MSstats_Condition": "Condition",
90-
}
111+
"Label": "IsotopeLabelType",
112+
},
113+
errors="ignore"
91114
)
92115
.drop(columns=["Sample"])
93116
)
94-
out_msstats = out_msstats.merge(design_lookup, on="Run", how="left", validate="many_to_one")
117+
out_msstats = out_msstats.merge(design_lookup, on=merge_keys, how="left", validate="many_to_one")
95118

96119
unmatched = out_msstats["BioReplicate"].isna()
97120
if unmatched.any():
@@ -142,7 +165,10 @@ def _parse_unified_design(exp_design_file):
142165
df = pd.read_csv(exp_design_file, sep="\t")
143166

144167
# Validate required columns
145-
required_cols = {"Filename", "Fraction", "Sample", "Condition", "BioReplicate"}
168+
required_cols = {
169+
"Filename", "Fraction", "Sample", "Condition", "BioReplicate", "Label", "LabelType"
170+
}
171+
146172
missing = required_cols - set(df.columns)
147173
if missing:
148174
raise ValueError(
@@ -152,8 +178,20 @@ def _parse_unified_design(exp_design_file):
152178

153179
df["run"] = df["Filename"].apply(_true_stem)
154180

155-
# Build f_table (file table): one row per file/channel
156-
f_table = df[["Filename", "Fraction", "Sample", "run"]].copy()
181+
# Multiplexing
182+
if df["Label"].nunique() > 1:
183+
184+
if "silac" in df["LabelType"].values:
185+
silac_dict = {
186+
"SILAC light": "L",
187+
"SILAC medium": "M",
188+
"SILAC heavy": "H",
189+
}
190+
df["Label"] = df["Label"].replace(silac_dict)
191+
192+
f_table = df[["Filename", "Fraction", "Sample", "run", "Label"]].copy()
193+
else:
194+
f_table = df[["Filename", "Fraction", "Sample", "run"]].copy()
157195

158196
# Validate each Sample maps to exactly one (Condition, BioReplicate) pair
159197
unique_mapping = df[["Sample", "Condition", "BioReplicate"]].drop_duplicates()
@@ -205,8 +243,8 @@ def load_report(report_path, qvalue_threshold: float) -> pd.DataFrame:
205243
"Q.Value",
206244
]
207245
if path.suffix == ".parquet":
208-
pq_columns = pd.read_parquet(path, columns=[]).columns.tolist()
209-
use_cols = remain_cols + (["Decoy"] if "Decoy" in pq_columns else [])
246+
pq_schema = pq.read_schema(path)
247+
use_cols = remain_cols + [col for col in ["Decoy", "Channel"] if col in pq_schema.names]
210248
report = pd.read_parquet(path, columns=use_cols)
211249
else:
212250
tsv_header = pd.read_csv(path, sep="\t", header=0, nrows=0).columns.tolist()
@@ -215,3 +253,8 @@ def load_report(report_path, qvalue_threshold: float) -> pd.DataFrame:
215253

216254
report = report[report["Q.Value"] < qvalue_threshold]
217255
return report
256+
257+
258+
def _sanitize_sequence(seq):
259+
seq = seq.replace("(SILAC)", "")
260+
return seq

0 commit comments

Comments
 (0)