1+ """Runs specific pathway model combinatorial experiment analysis (linear modelling)"""
2+ import pandas as pd
3+ import numpy as np
4+ import matplotlib .pyplot as plt
5+ import os
6+ from source .visualization import plot_dbtl_cycles
7+ from matplotlib .colors import ListedColormap
8+ from sklearn .linear_model import LinearRegression
9+ from sklearn .preprocessing import PolynomialFeatures
10+ from sklearn .linear_model import LinearRegression
11+ from sklearn .pipeline import make_pipeline
12+ import statsmodels .formula .api as smf
13+ import pandas as pd
14+ import statsmodels .formula .api as smf
15+
16+ name = "pathwayA"
17+ target = 'Y_product_A'
18+ base_dir = f"results/optimization_results/stratified_sampling/{ name } /experiment1"
19+ output_dir = f"results/statistics_combinatorial_exp/stratified_sampling/{ name } /"
20+ dirs = os .listdir (base_dir )
21+
22+
23+ #pathway A settings
24+ Number_of_Samples = [50 , 100 , 200 ]
25+ Screening_Ratio = [1 , 2 , 4 , 6 ]
26+ Positions = [4 , 6 , 8 , 10 ]
27+ Features = [6 , 12 , 19 ]
28+
29+ # # # pathway B settings
30+ # Number_of_Samples = [50, 100,200]
31+ # Screening_Ratio = [2, 4, 6]
32+ # Positions = [4, 6, 8, 10]
33+ # Features = [6, 10,12]
34+ #
35+ # # #pathway C settings
36+ # Number_of_Samples = [50, 100,200]
37+ # Screening_Ratio = [2, 4, 6]
38+ # Positions = [4, 6, 8, 10]
39+ # Features = [6, 10, 12, 17]
40+ # #
41+ # # #pathway D settings
42+ # Number_of_Samples = [50, 100,200,300]
43+ # Screening_Ratio = [2, 4, 6]
44+ # Positions = [4, 6, 8, 10]
45+ # Features = [5, 10, 15, 21]
46+ # #
47+ # # pathway E settings
48+ # Number_of_Samples = [50, 100, 200, 300]
49+ # Screening_Ratio = [1, 2, 4, 6]
50+ # Positions = [4, 6, 8, 10]
51+ # Features = [5, 10, 15, 20, 25]
52+
53+
54+ option_combinations = []
55+ list_of_list_combinations = []
56+ for F in Features :
57+ for P in Positions :
58+ for N in Number_of_Samples :
59+ for R in Screening_Ratio :
60+ grid_str = f"S{ R * N } X4N{ N } F{ F } P{ P } "
61+ option_combinations .append (grid_str )
62+ list_of_list_combinations .append ([R * N , N , F , P ])
63+
64+ print ("number of combinations" , len (list_of_list_combinations ))
65+
66+ results = {}
67+ for k , option in enumerate (option_combinations ):
68+ option_values = []
69+ for dir in dirs :
70+ working_dir = os .path .join (base_dir , dir )
71+ files = os .listdir (working_dir )
72+ for file in files :
73+
74+ if option in file :
75+ file_path = os .path .join (working_dir , file )
76+ process = pd .read_csv (file_path , index_col = 0 )
77+
78+ max_val = process [process .index .str .startswith ('cycle4' )][target ].median () # calculates the maximum after 5 rounds
79+
80+ #calculates an auc
81+ test = process .copy ()
82+ test .index = test .index .astype (str )
83+ test ["cycle" ] = test .index .str .extract (r"(cycle\d+)" ,expand = False )
84+ cycle_auc = test .groupby ("cycle" )[target ].mean ().sum ()
85+ max_val = cycle_auc
86+
87+ option_values .append (max_val )
88+ results [option ]= option_values
89+
90+ gridsearch_results = pd .DataFrame .from_dict (results ,
91+ orient = 'index' ).iloc [:,:10 ] #only takes 10 repeats (sometimes more experiments were done)
92+
93+
94+ print ("number of missing experiments" , sum (gridsearch_results .isna ().sum (axis = 1 )))
95+
96+
97+
98+
99+ # First, make a DataFrame with column names
100+ mean_performance = gridsearch_results .mean (axis = 1 )
101+ scenarios = pd .DataFrame (np .array (list_of_list_combinations ), columns = ['S' ,'N' ,'F' ,'P' ])
102+ scenarios ['mean_performance' ] = np .array (mean_performance )
103+ scenarios = scenarios .sort_values ('mean_performance' , ascending = False )
104+ scenarios ['SNratio' ] = scenarios ['S' ]/ scenarios ['N' ]
105+
106+
107+ df = scenarios .copy ()
108+
109+ # Build formula with interactions
110+ # Example: 'target ~ A + B + C + A:B + A:C + B:C'
111+ features = ['SNratio' ,'F' ,'P' ]
112+ target = 'mean_performance'
113+ # formula = f"{target} ~ " + " + ".join(features) + " + " + " + ".join([f"{a}:{b}" for i, a in enumerate(features) for b in features[i+1:]])
114+ formula = 'mean_performance ~ SNratio + F + P'
115+
116+
117+ model = smf .ols (formula , data = df ).fit ()
118+ print (model .summary ())
119+ with open (f"{ output_dir } /{ name } _lm_SNratio_interaction.txt" , "w" ) as f :
120+ f .write (model .summary ().as_text ())
121+
122+ model .summary ()
123+ from statsmodels .stats .multitest import multipletests
124+
125+ # Assume you already have a fitted model
126+ pvals = model .pvalues
127+
128+ # Apply Benjamini-Hochberg (FDR) or Bonferroni correction
129+ corrected = multipletests (pvals , alpha = 0.05 , method = 'fdr_bh' ) # or method='bonferroni'
130+
131+ # Output adjusted p-values
132+
133+ adjusted_pvals = pd .DataFrame ({
134+ 'coef' : model .params ,
135+ 'raw_pval' : pvals ,
136+ 'adj_pval' : corrected [1 ],
137+ 'significant (FDR)' : corrected [0 ]
138+ })
139+
140+ adjusted_pvals .to_csv (f"{ output_dir } /mht_{ name } _lm_with_interaction.csv" )
141+
142+
143+ import statsmodels .formula .api as smf
144+
145+ # First, make a DataFrame with column names
146+ df = scenarios .copy ()
147+
148+ # Build formula with interactions
149+ # Example: 'target ~ A + B + C + A:B + A:C + B:C'
150+ features = df .columns [:- 1 ]
151+ target = df .columns [- 1 ]
152+ formula = 'mean_performance ~ S + N + F + P'
153+
154+
155+ model = smf .ols (formula , data = df ).fit ()
156+ print (model .summary ())
157+ with open (f"{ output_dir } /{ name } _lm_SandN_interaction.txt" , "w" ) as f :
158+ f .write (model .summary ().as_text ())
159+
160+ model .summary ()
161+ from statsmodels .stats .multitest import multipletests
162+
163+ # Assume you already have a fitted model
164+ pvals = model .pvalues
165+
166+ # Apply Benjamini-Hochberg (FDR) or Bonferroni correction
167+ corrected = multipletests (pvals , alpha = 0.05 , method = 'fdr_bh' ) # or method='bonferroni'
168+
169+ # Output adjusted p-values
170+ import pandas as pd
171+ adjusted_pvals = pd .DataFrame ({
172+ 'coef' : model .params ,
173+ 'raw_pval' : pvals ,
174+ 'adj_pval' : corrected [1 ],
175+ 'significant (FDR)' : corrected [0 ]
176+ })
177+
178+
179+ adjusted_pvals .to_csv (f"{ output_dir } /mht_{ name } _lm_without_interaction.csv" )
0 commit comments