Refactor: Clean up unused variables and improve code style

SamoraHunter · SamoraHunter · commit 23f41867c6bf · 2026-02-17T17:05:17.000Z
This commit focuses on improving code quality by removing unused variables and adhering to more Pythonic style conventions.

Key changes include:

- Removed numerous unused local variables across various modules, including model classes, pipeline steps, and utility functions.
- Replaced lambda functions with standard `def` statements for better readability and clarity in keras_classifier_class.py and embeddings.py.
- Updated boolean comparisons from `== False` to the more idiomatic `not ...` in main.py and notebooks.
- Simplified boolean DataFrame indexing in plot_feature_categories.py and notebooks.
- Removed unnecessary assignments where the return value of a function was not used.
These changes are intended to leave functionality unchanged while improving code maintainability and reducing clutter.
diff --git a/ml_grid/model_classes/keras_classifier_class.py b/ml_grid/model_classes/keras_classifier_class.py
@@ -120,20 +120,22 @@ def __init__(
             l1_reg=0.0,  # Register l1_reg with a default value
             l2_reg=0.0,  # Register l2_reg with a default value
         )
-        X_data = self.X
-        y_data = self.y
 
         # vals = np.linspace(2, 750, 6)
         vals = np.logspace(1, 2.0, 3)
 
-        floorer = lambda t: math.floor(t)
+        def floorer(t):
+            return math.floor(t)
+
         floored_width = np.array([floorer(xi) for xi in vals])
         floored_width = np.insert(floored_width, 0, 1, axis=None)
         floored_width
 
         vals = np.logspace(1, 2.0, 3)
 
-        floorer = lambda t: math.floor(t)
+        def floorer(t):
+            return math.floor(t)
+
         floored_depth = np.array([floorer(xi) for xi in vals])
         floored_depth = np.insert(floored_depth, 0, 1, axis=None)
         floored_depth
diff --git a/ml_grid/model_classes_time_series/CNNClassifier_module.py b/ml_grid/model_classes_time_series/CNNClassifier_module.py
@@ -29,9 +29,6 @@ def __init__(self, ml_grid_object: pipe):
         Args:
             ml_grid_object (pipe): An instance of the main data pipeline object.
         """
-        time_limit_param = ml_grid_object.global_params.time_limit_param
-
-        n_jobs_model_val = ml_grid_object.global_params.n_jobs_model_val
 
         random_state_val = ml_grid_object.global_params.random_state_val
 
diff --git a/ml_grid/model_classes_time_series/Catch22Classifer_module.py b/ml_grid/model_classes_time_series/Catch22Classifer_module.py
@@ -31,7 +31,6 @@ def __init__(self, ml_grid_object: pipe):
             ml_grid_object (pipe): An instance of the main data pipeline object.
         """
 
-        verbose_param = ml_grid_object.verbose
         random_state_val = ml_grid_object.global_params.random_state_val
         n_jobs_model_val = ml_grid_object.global_params.n_jobs_model_val
 
diff --git a/ml_grid/model_classes_time_series/ResNetClassifier_module.py b/ml_grid/model_classes_time_series/ResNetClassifier_module.py
@@ -31,10 +31,6 @@ def __init__(self, ml_grid_object: pipe):
             ml_grid_object (pipe): An instance of the main data pipeline object.
         """
 
-        random_state_val = ml_grid_object.global_params.random_state_val
-
-        n_jobs_model_val = ml_grid_object.global_params.n_jobs_model_val
-
         verbose_param = ml_grid_object.verbose
 
         param_space = ParamSpace(
diff --git a/ml_grid/model_classes_time_series/TemporalDictionaryEnsembleClassifier_module.py b/ml_grid/model_classes_time_series/TemporalDictionaryEnsembleClassifier_module.py
@@ -32,8 +32,6 @@ def __init__(self, ml_grid_object: pipe):
             ml_grid_object (pipe): An instance of the main data pipeline object.
         """
 
-        verbose_param = ml_grid_object.verbose
-
         random_state_val = ml_grid_object.global_params.random_state_val
 
         time_limit_param = global_parameters.time_limit_param
diff --git a/ml_grid/pipeline/embeddings.py b/ml_grid/pipeline/embeddings.py
@@ -152,7 +152,10 @@ def create_embedding_pipeline(
     elif method_lower == "select_kbest_mi":
         default_params = {"random_state": 42}
         default_params.update(kwargs)
-        score_func = lambda X, y: mutual_info_classif(X, y, **default_params)
+
+        def score_func(X, y):
+            return mutual_info_classif(X, y, **default_params)
+
         steps.append(("embed", SelectKBest(score_func=score_func, k=n_components)))
 
     else:
diff --git a/ml_grid/pipeline/grid_search_cross_validate.py b/ml_grid/pipeline/grid_search_cross_validate.py
@@ -593,8 +593,6 @@ def __init__(
         if not failed and self.global_parameters.verbose >= 3:
             self.logger.debug("Fitting final model")
 
-        metric_list = self.metric_list
-
         if not failed and self.y_train.nunique() < 2:
             raise ValueError(
                 "Only one class present in y_train. ROC AUC score is not defined "
@@ -902,8 +900,6 @@ def __init__(
                     f"Cross-validation for {method_name} completed in {elapsed_time:.2f} seconds."
                 )
 
-        current_algorithm_scores = scores
-
         if self.global_parameters.verbose >= 4:
 
             debug_print_statements_class(scores).debug_print_scores()
@@ -975,7 +971,7 @@ def _adjust_knn_parameters(self, parameter_space: Union[Dict, List[Dict]]):
         Dynamically adjusts the 'n_neighbors' parameter for KNN-based models
         to prevent errors on small datasets during cross-validation.
         """
-        n_splits = self.cv.get_n_splits()
+        self.cv.get_n_splits()
 
         # Correctly calculate the training fold size
         dummy_indices = np.arange(len(self.X_train))
diff --git a/ml_grid/pipeline/main.py b/ml_grid/pipeline/main.py
@@ -361,7 +361,6 @@ def execute(self) -> Tuple[List[List[Any]], float]:
 
         self.model_error_list = []
         self.highest_score = 0
-        highest_score = 0  # for optimisation
 
         if self.multiprocess:
 
@@ -373,11 +372,11 @@ def multi_run_wrapper(args: Tuple) -> Any:
                 from multiprocessing import Pool
 
                 pool = Pool(8)
-                results = pool.map(multi_run_wrapper, self.arg_list)
+                pool.map(multi_run_wrapper, self.arg_list)
                 # print(results)
                 pool.close()  # exp
 
-        elif self.multiprocess == False:
+        elif not self.multiprocess:
             for k in range(0, len(self.arg_list)):
                 try:
                     self.logger.info(
diff --git a/ml_grid/results_processing/plot_feature_categories.py b/ml_grid/results_processing/plot_feature_categories.py
@@ -187,8 +187,8 @@ def plot_category_impact_on_metric(
             if plot_data[category].nunique() < 2:
                 continue
 
-            mean_with = plot_data[plot_data[category] == True][metric].mean()
-            mean_without = plot_data[plot_data[category] == False][metric].mean()
+            mean_with = plot_data[plot_data[category].eq(True)][metric].mean()
+            mean_without = plot_data[plot_data[category].eq(False)][metric].mean()
 
             impact = mean_with - mean_without
 
diff --git a/ml_grid/util/project_score_save.py b/ml_grid/util/project_score_save.py
@@ -165,16 +165,12 @@ def update_score_log(
 
         X_train = ml_grid_object_iter.X_train
 
-        y_train = ml_grid_object_iter.y_train
-
         X_test = ml_grid_object_iter.X_test
 
         y_test = ml_grid_object_iter.y_test
 
         X_test_orig = ml_grid_object_iter.X_test_orig
 
-        y_test_orig = ml_grid_object_iter.y_test_orig
-
         param_space_index = ml_grid_object.param_space_index
 
         bayessearch = global_params.bayessearch
diff --git a/notebooks/grid_score_reader_ML_Grid_V_1.8_unit_test.ipynb b/notebooks/grid_score_reader_ML_Grid_V_1.8_unit_test.ipynb
@@ -495,10 +495,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df[(df['annotation_n']==False) &\n",
-    "   (df['meta_sp_annotation_n']==False) &\n",
-    "   (df['meta_sp_annotation_mrc_n']==False) &\n",
-    "   (df['annotation_mrc_n']==False)\n",
+    "df[(df['annotation_n'].eq(False)) &\n",
+    "   (df['meta_sp_annotation_n'].eq(False)) &\n",
+    "   (df['meta_sp_annotation_mrc_n'].eq(False)) &\n",
+    "   (df['annotation_mrc_n'].eq(False))\n",
     "  ]"
    ]
   },
@@ -520,11 +520,11 @@
    "outputs": [],
    "source": [
     "try:\n",
-    "    df[(df['annotation_n']==False) &\n",
-    "       (df['meta_sp_annotation_n']==False) &\n",
-    "       (df['meta_sp_annotation_mrc_n']==False) &\n",
-    "       (df['annotation_mrc_n']==False) &\n",
-    "       (df['date_time_stamp']==False)\n",
+    "    df[(df['annotation_n'].eq(False)) &\n",
+    "       (df['meta_sp_annotation_n'].eq(False)) &\n",
+    "       (df['meta_sp_annotation_mrc_n'].eq(False)) &\n",
+    "       (df['annotation_mrc_n'].eq(False)) &\n",
+    "       (df['date_time_stamp'].eq(False))\n",
     "      ].sort_values(by='auc', ascending=False)\n",
     "except Exception as e:\n",
     "    print(e)"
@@ -547,7 +547,7 @@
    "source": [
     "try:\n",
     "    df[\n",
-    "       (df['date_time_stamp']==False)\n",
+    "       (df['date_time_stamp'].eq(False))\n",
     "      ].sort_values(by='auc', ascending=False)\n",
     "except:\n",
     "    pass"
@@ -1935,7 +1935,7 @@
     "#df.iloc[0]\n",
     "pd.set_option('display.width', None)\n",
     "res  =df[\n",
-    "   (df['date_time_stamp']==False)\n",
+    "   (df['date_time_stamp'].eq(False))\n",
     "  ].sort_values(by='f1_m', ascending=False).iloc[0]\n",
     "\n",
     "pd.DataFrame(res)"
@@ -1951,7 +1951,7 @@
     "#df.iloc[0]\n",
     "pd.set_option('display.width', None)\n",
     "res  =df[\n",
-    "   (df['date_time_stamp']==False)\n",
+    "   (df['date_time_stamp'].eq(False))\n",
     "  ].sort_values(by='auc', ascending=False).iloc[0]\n",
     "\n",
     "pd.DataFrame(res)"
@@ -1975,7 +1975,7 @@
    "outputs": [],
    "source": [
     "sv = df[\n",
-    "   (df['date_time_stamp']==False)\n",
+    "   (df['date_time_stamp'].eq(False))\n",
     "  ].sort_values(by='auc', ascending=False)['f_list'].iloc[0]\n",
     "\n",
     "svcn = get_column_names(sv)"
@@ -2175,9 +2175,9 @@
    "outputs": [],
    "source": [
     "df[\n",
-    "   (df['date_time_stamp']==False)\n",
+    "   (df['date_time_stamp'].eq(False))\n",
     "  ].sort_values(by='auc', ascending=False)[df[\n",
-    "   (df['date_time_stamp']==False)\n",
+    "   (df['date_time_stamp'].eq(False))\n",
     "  ].sort_values(by='auc', ascending=False)['method_name']=='xbg']"
    ]
   },
@@ -2200,9 +2200,9 @@
    "source": [
     "try:\n",
     "    print(df[\n",
-    "   (df['date_time_stamp']==False)\n",
+    "   (df['date_time_stamp'].eq(False))\n",
     "  ].sort_values(by='auc', ascending=False)[df[\n",
-    "   (df['date_time_stamp']==False)\n",
+    "   (df['date_time_stamp'].eq(False))\n",
     "  ].sort_values(by='auc', ascending=False)['method_name']=='xbg']['algorithm_implementation'].iloc[0])\n",
     "\n",
     "except:\n",
@@ -2227,9 +2227,9 @@
    "outputs": [],
    "source": [
     "eval(str(df[\n",
-    "   (df['date_time_stamp']==False)\n",
+    "   (df['date_time_stamp'].eq(False))\n",
     "  ].sort_values(by='auc', ascending=False)[df[\n",
-    "   (df['date_time_stamp']==False)\n",
+    "   (df['date_time_stamp'].eq(False))\n",
     "  ].sort_values(by='auc', ascending=False)['method_name']=='xbg']['algorithm_implementation'].iloc[0]).replace(\"nan\", \"np.nan\").replace(\", ...\", \"\"))"
    ]
   },
@@ -2246,9 +2246,9 @@
     "try:\n",
     "    break\n",
     "    model = eval(str(df[\n",
-    "   (df['date_time_stamp']==False)\n",
+    "   (df['date_time_stamp'].eq(False))\n",
     "  ].sort_values(by='auc', ascending=False)[df[\n",
-    "   (df['date_time_stamp']==False)\n",
+    "   (df['date_time_stamp'].eq(False))\n",
     "  ].sort_values(by='auc', ascending=False)['method_name']=='xbg']['algorithm_implementation'].iloc[0]).replace(\"nan\", \"np.nan\").replace(\", ...\", \"\"))\n",
     "    print(\"set best model params\")\n",
     "except Exception as e:\n",
@@ -2410,7 +2410,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df[(df['method_name']=='XGBClassifier') & (df['date_time_stamp']==False)]"
+    "df[(df['method_name']=='XGBClassifier') & (df['date_time_stamp'].eq(False))]"
    ]
   },
   {
@@ -2441,7 +2441,7 @@
    "outputs": [],
    "source": [
     "try:\n",
-    "    print(df[(df['method_name']=='XGBClassifier') & (df['date_time_stamp']==False)]['algorithm_implementation'].iloc[0])\n",
+    "    print(df[(df['method_name']=='XGBClassifier') & (df['date_time_stamp'].eq(False))]['algorithm_implementation'].iloc[0])\n",
     "\n",
     "except:\n",
     "    pass"
@@ -2463,7 +2463,7 @@
    "outputs": [],
    "source": [
     "try:\n",
-    "    sv = df[(df['method_name']=='XGBClassifier') & (df['date_time_stamp']==False)]['f_list'].iloc[0]\n",
+    "    sv = df[(df['method_name']=='XGBClassifier') & (df['date_time_stamp'].eq(False))]['f_list'].iloc[0]\n",
     "\n",
     "    svcn = get_column_names(sv)\n",
     "\n",
diff --git a/notebooks/unit_test_synthetic.ipynb b/notebooks/unit_test_synthetic.ipynb
@@ -839,7 +839,7 @@
    "outputs": [],
    "source": [
     "#%%prun\n",
-    "if( multiple_outcomes_example == False):\n",
+    "if( not multiple_outcomes_example):\n",
     "\n",
     "    # Fix the additional argument (outcome_var) using partial\n",
     "    # Define the single outcome to test\n",
@@ -868,7 +868,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "if( multiple_outcomes_example == False):\n",
+    "if( not multiple_outcomes_example):\n",
     "    best"
    ]
   },
@@ -878,7 +878,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "if( multiple_outcomes_example == False):\n",
+    "if( not multiple_outcomes_example):\n",
     "    # Assuming the log file is in the current experiment_dir\n",
     "    results_df = pd.read_csv(experiment_dir / 'final_grid_score_log.csv')"
    ]
@@ -889,7 +889,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "if( multiple_outcomes_example == False):\n",
+    "if( not multiple_outcomes_example):\n",
     "    if 'results_df' in locals():\n",
     "        display(results_df.sort_values('auc', ascending=False).iloc[0])\n",
     "    else:\n",
@@ -902,7 +902,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "if( multiple_outcomes_example == False):\n",
+    "if( not multiple_outcomes_example):\n",
     "    if 'results_df' in locals():\n",
     "        display(results_df.sort_values('auc', ascending=False))\n",
     "    else:\n",
@@ -915,7 +915,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "if( multiple_outcomes_example == True):\n",
+    "if( multiple_outcomes_example):\n",
     "\n",
     "    dft = pd.read_csv(input_csv_path.resolve(), nrows=1)\n",
     "    dft"
@@ -929,7 +929,7 @@
    "source": [
     "# get outcome variables by finding prefix \"outcome_var_\" in column list\n",
     "\n",
-    "if( multiple_outcomes_example == True):\n",
+    "if( multiple_outcomes_example):\n",
     "    outcome_var_list = [dft.columns[i] for i in range(len(dft.columns)) if \"outcome_var_\" in dft.columns[i]]\n",
     "\n",
     "    outcome_var_list"
@@ -1008,7 +1008,7 @@
     "    global_parameters.force_second_cv = config['global_params'].get('force_second_cv', False)\n",
     "    global_parameters.model_eval_time_limit = config['global_params']['model_eval_time_limit']\n",
     "    \n",
-    "    print(f\"Global parameters set from hyperopt config:\")\n",
+    "    print(\"Global parameters set from hyperopt config:\")\n",
     "    print(f\"  - verbose: {global_parameters.verbose}\")\n",
     "    print(f\"  - error_raise: {global_parameters.error_raise}\")\n",
     "    print(f\"  - h2o_show_progress: {global_parameters.h2o_show_progress}\")\n",
diff --git a/tests/test_model_classes_param_spaces.py b/tests/test_model_classes_param_spaces.py
@@ -84,11 +84,9 @@ def test_all_classifier_param_spaces(self):
                 # Determine which object to use for testing
                 # Try algorithm_implementation first (for wrappers), then fall back to the instance itself
                 if hasattr(class_instance, "algorithm_implementation"):
-                    test_object = class_instance.algorithm_implementation
                     object_type = "wrapped sklearn estimator"
                 else:
                     # Use the instance itself if it has the sklearn interface
-                    test_object = class_instance
                     object_type = "direct estimator"
 
                 # Check if the object has parameter_space attribute
@@ -173,9 +171,9 @@ def _validate_parameter_space(self, classifier_class_def, module_name, is_bayes)
 
             # Determine which object to use for validation
             if hasattr(class_instance, "algorithm_implementation"):
-                base_estimator = class_instance.algorithm_implementation
+                pass
             else:
-                base_estimator = class_instance
+                pass
 
             if is_bayes:
                 # Normalize the parameter space to a list of dictionaries,
diff --git a/tests/test_project_score_save.py b/tests/test_project_score_save.py