1111
1212import click
1313import pandas as pd
14+ import pyarrow .parquet as pq
1415from pyopenms import AASequence
1516
1617CONTEXT_SETTINGS = dict (help_option_names = ["-h" , "--help" ])
@@ -49,21 +50,29 @@ def diann2msstats(
4950 "Run" ,
5051 ]
5152
52- logger .debug ("Converting to MSstats format..." )
53- if "Decoy" in report .columns :
54- out_msstats = report [report ["Decoy" ] != 1 ][msstats_columns_keep ].copy ()
55- else :
56- out_msstats = report [msstats_columns_keep ].copy ()
57-
58- out_msstats .columns = [
53+ out_msstats_columns = [
5954 "ProteinName" ,
6055 "PeptideSequence" ,
6156 "PrecursorCharge" ,
6257 "Intensity" ,
6358 "Run" ,
6459 ]
60+
61+ if "Channel" in report .columns and report ["Channel" ].nunique () > 1 :
62+ logger .info ("Multiplexing detected. Applying label mapping..." )
63+ msstats_columns_keep .append ("Channel" )
64+ out_msstats_columns .append ("IsotopeLabelType" )
65+
66+ logger .debug ("Converting to MSstats format..." )
67+ if "Decoy" in report .columns :
68+ out_msstats = report [report ["Decoy" ] != 1 ][msstats_columns_keep ].copy ()
69+ else :
70+ out_msstats = report [msstats_columns_keep ].copy ()
71+
72+ out_msstats .columns = out_msstats_columns
6573 out_msstats = out_msstats [out_msstats ["Intensity" ] != 0 ]
6674
75+ out_msstats ["PeptideSequence" ] = out_msstats ["PeptideSequence" ].apply (_sanitize_sequence )
6776 out_msstats .loc [:, "PeptideSequence" ] = out_msstats .apply (
6877 lambda x : (
6978 AASequence .fromString (x ["PeptideSequence" ]).toString ()
@@ -74,24 +83,38 @@ def diann2msstats(
7483 )
7584 out_msstats ["FragmentIon" ] = "NA"
7685 out_msstats ["ProductCharge" ] = "0"
77- out_msstats ["IsotopeLabelType" ] = "L"
86+
87+ if "IsotopeLabelType" in out_msstats .columns :
88+ out_msstats = out_msstats [out_msstats ["IsotopeLabelType" ].notna ()]
89+ out_msstats = out_msstats [out_msstats ["IsotopeLabelType" ].str .strip () != "" ]
90+
91+ # is multiplexing
92+ f_table_cols = ["Fraction" , "Sample" , "run" , "Label" ]
93+ merge_keys = ["Run" , "IsotopeLabelType" ]
94+ else :
95+ out_msstats ["IsotopeLabelType" ] = "L"
96+
97+ f_table_cols = ["Fraction" , "Sample" , "run" ]
98+ merge_keys = ["Run" ]
7899
79100 logger .debug (f"out_msstats ({ out_msstats .shape } ) >>>" )
80101 logger .debug ("Adding Fraction, BioReplicate, Condition columns" )
81102
82103 design_lookup = (
83104 s_data_frame [["Sample" , "MSstats_Condition" , "MSstats_BioReplicate" ]]
84- .merge (f_table [[ "Fraction" , "Sample" , "run" ] ], on = "Sample" )
105+ .merge (f_table [f_table_cols ], on = "Sample" )
85106 .rename (
86107 columns = {
87108 "run" : "Run" ,
88109 "MSstats_BioReplicate" : "BioReplicate" ,
89110 "MSstats_Condition" : "Condition" ,
90- }
111+ "Label" : "IsotopeLabelType" ,
112+ },
113+ errors = "ignore"
91114 )
92115 .drop (columns = ["Sample" ])
93116 )
94- out_msstats = out_msstats .merge (design_lookup , on = "Run" , how = "left" , validate = "many_to_one" )
117+ out_msstats = out_msstats .merge (design_lookup , on = merge_keys , how = "left" , validate = "many_to_one" )
95118
96119 unmatched = out_msstats ["BioReplicate" ].isna ()
97120 if unmatched .any ():
@@ -142,7 +165,10 @@ def _parse_unified_design(exp_design_file):
142165 df = pd .read_csv (exp_design_file , sep = "\t " )
143166
144167 # Validate required columns
145- required_cols = {"Filename" , "Fraction" , "Sample" , "Condition" , "BioReplicate" }
168+ required_cols = {
169+ "Filename" , "Fraction" , "Sample" , "Condition" , "BioReplicate" , "Label" , "LabelType"
170+ }
171+
146172 missing = required_cols - set (df .columns )
147173 if missing :
148174 raise ValueError (
@@ -152,8 +178,20 @@ def _parse_unified_design(exp_design_file):
152178
153179 df ["run" ] = df ["Filename" ].apply (_true_stem )
154180
155- # Build f_table (file table): one row per file/channel
156- f_table = df [["Filename" , "Fraction" , "Sample" , "run" ]].copy ()
181+ # Multiplexing
182+ if df ["Label" ].nunique () > 1 :
183+
184+ if "silac" in df ["LabelType" ].values :
185+ silac_dict = {
186+ "SILAC light" : "L" ,
187+ "SILAC medium" : "M" ,
188+ "SILAC heavy" : "H" ,
189+ }
190+ df ["Label" ] = df ["Label" ].replace (silac_dict )
191+
192+ f_table = df [["Filename" , "Fraction" , "Sample" , "run" , "Label" ]].copy ()
193+ else :
194+ f_table = df [["Filename" , "Fraction" , "Sample" , "run" ]].copy ()
157195
158196 # Validate each Sample maps to exactly one (Condition, BioReplicate) pair
159197 unique_mapping = df [["Sample" , "Condition" , "BioReplicate" ]].drop_duplicates ()
@@ -205,8 +243,8 @@ def load_report(report_path, qvalue_threshold: float) -> pd.DataFrame:
205243 "Q.Value" ,
206244 ]
207245 if path .suffix == ".parquet" :
208- pq_columns = pd . read_parquet (path , columns = []). columns . tolist ( )
209- use_cols = remain_cols + ([ "Decoy" ] if "Decoy" in pq_columns else [])
246+ pq_schema = pq . read_schema (path )
247+ use_cols = remain_cols + [ col for col in [ "Decoy" , "Channel" ] if col in pq_schema . names ]
210248 report = pd .read_parquet (path , columns = use_cols )
211249 else :
212250 tsv_header = pd .read_csv (path , sep = "\t " , header = 0 , nrows = 0 ).columns .tolist ()
@@ -215,3 +253,8 @@ def load_report(report_path, qvalue_threshold: float) -> pd.DataFrame:
215253
216254 report = report [report ["Q.Value" ] < qvalue_threshold ]
217255 return report
256+
257+
258+ def _sanitize_sequence (seq ):
259+ seq = seq .replace ("(SILAC)" , "" )
260+ return seq
0 commit comments