Skip to content

Commit b74c317

Browse files
committed
ml_assisted_library_transform implementation (work in progress)
1 parent 0cd96fb commit b74c317

5 files changed

Lines changed: 1527 additions & 451 deletions

File tree

scripts/210325_design_methods_simulation_dev.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
logger = getLogger(__name__)
1212

13-
configs_filenames = ["config_best_sampling"]
13+
configs_filenames = ["config_batch_model_v2_n_rounds"]
1414

1515
start = time.time()
1616
fig, ax = plt.subplots(figsize=(4, 4))

scripts/develop_library_transformation.ipynb

Lines changed: 1417 additions & 438 deletions
Large diffs are not rendered by default.

scripts/setupconfigfile.py

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,22 +11,23 @@
1111
output_name = "config_batch_model_v2_n_rounds"
1212
target = "product"
1313
run_id = 1
14-
n_cycles = 5
15-
n_experiments = [10, 10, 10, 10, 10]
16-
n_screened= 60
14+
n_cycles = 2
15+
n_experiments = [10, 10]
16+
n_screened = 30
1717

18-
n_engineered_positions = [6, 6, 6, 6, 6]
19-
design_method_per_cycle = ["library_transform", "library_transform",
20-
"library_transform", "library_transform", "library_transform"]
18+
n_engineered_positions = [6, 6]
19+
design_method_per_cycle = ["library_transform", "ml_assisted_library_transform"]
2120
noise_percentage = 0.1 # not a percentage
2221
noise_type = "heteroscedastic"
2322

24-
recommendation_method = ["greedy_baseline", "greedy_baseline",
25-
"greedy_baseline", "greedy_baseline",
26-
"greedy_baseline"]
23+
recommendation_method = ["greedy_baseline", "greedy_baseline"]
2724
hyperparams = {'library_transform': {"n_screened_strains": n_screened,
28-
"sequencing_selection_method": "best_sampling"},}
29-
# 'library_ml_assisted':{}} #includes
25+
"sequencing_selection_method": "best_sampling"},
26+
'ml_assisted_library_transform': {"n_screened_strains": n_screened,
27+
"ml_method" : "xgboost",
28+
"beta": 2**(2.5),
29+
"data_strategy" : "all", # there are multiple strategies here,
30+
"sequencing_selection_method": "best_sampling",},}
3031

3132
hyperparams = [hyperparams[design_method] for design_method in design_method_per_cycle]
3233

source/optimization_process.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
1+
import itertools
2+
from collections import defaultdict
13
from jaxkineticmodel.simulated_dbtl.dbtl import DesignBuildTestLearnCycle
24
from logging import getLogger
35
from source.design_strategies import stratified_sampling, best_sampling
46
import pandas as pd
7+
import xgboost as xgb
8+
from scipy.special import softmax
59

610
logger = getLogger(__name__)
711

@@ -43,6 +47,13 @@ def update_dbtl(self):
4347
data = self.dbtl.test_format_dataset(designs, production_values, self.dbtl.parameters)
4448
self.data[self.cycle_id] = data
4549
elif dbtl_config['design_build_test']['design_method'] == "ml_assisted_library_transform":
50+
#this method cannot be done as the first round, since it requires a model
51+
assert self.cycle_status != 0
52+
53+
dbtl_config_dbt = dbtl_config['design_build_test']
54+
designs, production_values, strain_promoters = self.ml_assisted_library_transform(dbtl_config_dbt)
55+
56+
4657
logger.error("Not implemented yet")
4758
else:
4859
logger.error("This design scenario is not implemented. Choose from options"
@@ -57,6 +68,71 @@ def update_dbtl(self):
5768
"[greedy_baseline, ..]")
5869
return best_design, best_producer
5970

71+
def ml_assisted_library_transform(self,
72+
dbtl_config_dbt):
73+
parameter_perturbations = self.config['optimization_settings']['parameters_perturbation_values']
74+
n_strains_screened = dbtl_config_dbt['design_method_hyperparams']['n_screened_strains']
75+
n_engineered_positions = dbtl_config_dbt['n_engineered_positions']
76+
n_strains = dbtl_config_dbt['n_strains']
77+
beta = dbtl_config_dbt['beta'] #exploration/exploitation
78+
79+
## train a model based on previous data
80+
if dbtl_config_dbt['ml_method']=="xgboost":
81+
xgbparameters = {'tree_method': 'auto', 'reg_lambda': 1, 'max_depth': 2, "disable_default_eval_metric": 0}
82+
alternative_params = {'num_boost_round': 10, 'early_stopping_rounds': 40}
83+
84+
if dbtl_config_dbt['data_strategy'] == "all":
85+
cycle_names = self.data.keys()
86+
data = pd.concat([self.data[i] for i in cycle_names])
87+
else:
88+
logger.error("data_strategy not supported yet. Choose all data strategy")
89+
bst, r2_scores = self.dbtl.learn_train_model(data=self.data,
90+
target=self.target,
91+
model_type="XGBoost",
92+
args=(xgbparameters, alternative_params), test_size=0.20)
93+
else:
94+
logger.error(f"This ml_method {dbtl_config_dbt['ml_method']} is not implemented. Choose from"
95+
f"options xgboost, or implement your own method")
96+
97+
#construct library (we do this always)
98+
self.dbtl.design_establish_library_elements(parameter_perturbations=parameter_perturbations)
99+
_ = self.dbtl.design_assign_positions(n_positions=n_engineered_positions)
100+
101+
## construct probability distribution
102+
position_elements = []
103+
for position in self.dbtl.library.columns.get_level_values(0).unique():
104+
temp = list(zip(self.dbtl.library[position]['parameter_name'].values, self.dbtl.library[position]['promoter_value'].values))
105+
position_elements.append(temp)
106+
combinatorial_designs = list(itertools.product(*position_elements))
107+
108+
all_designs = []
109+
for k, design in enumerate(combinatorial_designs):
110+
design = combine_duplicate_keys(design)
111+
all_designs.append(design)
112+
113+
## we now need to assign the probabilities determined using the XGBoost model. This requires that we save the data from the previous cycle
114+
all_designs = pd.DataFrame(all_designs).fillna(0) + 1
115+
all_designs_xgb = xgb.DMatrix(all_designs)
116+
y_predicted = bst.predict(all_designs_xgb)
117+
softmax_distribution = softmax(y_predicted * beta)
118+
119+
positions = [f"pos_{i}" for i in range(n_engineered_positions)]
120+
pandas_combinatorial_designs = pd.DataFrame(combinatorial_designs, columns=positions)
121+
122+
pandas_combinatorial_designs['softmax'] = softmax_distribution
123+
124+
probability_dist_per_position = {}
125+
for position in positions:
126+
probabilities = pandas_combinatorial_designs.groupby(position)['softmax'].sum()
127+
probability_dist_per_position[position] = probabilities.to_dict()
128+
129+
130+
131+
_ = self.dbtl.design_assign_probabilities(probabilities_per_position=probability_dist_per_position) #this now needs to be done
132+
133+
logger.info("ml_assisted_library_transform")
134+
135+
60136
def library_transform(self,
61137
dbtl_config_dbt):
62138
"""Generates a random set of designs.
@@ -119,3 +195,11 @@ def greedy_baseline(self):
119195

120196
return best_design, best_producer
121197

198+
199+
200+
def combine_duplicate_keys(tuples):
201+
"""Combines similar keys by adding up their promoter values"""
202+
combined = defaultdict(float)
203+
for key, value in tuples:
204+
combined[str(key)] += value
205+
return combined

todos.md

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,16 @@ Overview of the update_dbtl function
2222

2323
11-04
2424
Development of the ML-assisted recommendation: develop_library_transformation.ipynb
25-
Now we just need to add it the probability distribution to assign probabilities
25+
Now we just need to add the probability distribution used to assign probabilities
26+
27+
22-04-2025
28+
There are several parameters necessary here. We will fix the method to XGBoost; there is a beta parameter
29+
scheme that could be set per round. The strategy for including data in the model. Perhaps at some point in
30+
the optimization, "forgetting" old data may work better than continuing to stack data on top.
31+
32+
33+
{"n_screened_strains": n_screened,
34+
"ml_method" : "xgboost",
35+
"beta": 2**(2.5),
36+
"data_strategy" : "all", # there are multiple strategies here,
37+
"sequencing_selection_method": "best_sampling",}

0 commit comments

Comments
 (0)