
Commit 74ab16c

Author: SamoraHunter (committed)
Optimize pipeline performance and H2O stability
- **Workflow:** Reuse `HyperparameterSearch` CV results instead of re-running cross-validation, significantly reducing runtime. Added `force_second_cv` option to override.
- **H2O Performance:** Disable `return_train_score` during search and remove `h2o.assign` in `predict` to eliminate expensive garbage-collection overhead. Optimize `H2OFrame` creation by passing column types directly.
- **Stability:** Add fallback logic for `BayesSearchCV` result parsing. Sanitize H2O parameters (e.g., removing `HGLM`) and handle backend crashes (NPEs) gracefully.
- **Diagrams:** Add diagrams for the grid search CV execution flow.
1 parent 51cbd1e commit 74ab16c

12 files changed

Lines changed: 457 additions & 114 deletions
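The headline workflow change is reusing the scores the hyperparameter search has already computed instead of running a second cross-validation. A minimal sketch of that idea, assuming a fitted scikit-learn-style search object (with `cv_results_` and `best_index_`) and the new `force_second_cv` flag; this is illustrative, not the repository's exact code:

```python
# Minimal sketch of the CV-reuse idea (not the repository's exact implementation).
# Assumes a fitted sklearn-style search (GridSearchCV / RandomizedSearchCV /
# BayesSearchCV) whose cv_results_ already holds per-fold test scores.
import numpy as np
from sklearn.model_selection import cross_validate

def get_cv_scores(search, X_train, y_train, cv, force_second_cv=False):
    """Reuse scores already computed during the hyperparameter search,
    unless a second, independent cross-validation run is forced."""
    if not force_second_cv:
        try:
            results = search.cv_results_
            best = search.best_index_
            # Mean test score of the best candidate, already computed by the search.
            return {"roc_auc": results["mean_test_score"][best]}
        except (AttributeError, KeyError):
            pass  # Fall through to a fresh CV run if parsing fails.
    scores = cross_validate(search.best_estimator_, X_train, y_train,
                            cv=cv, scoring=["roc_auc"], return_train_score=False)
    return {"roc_auc": np.mean(scores["test_roc_auc"])}
```

Skipping the redundant CV is what drives the runtime reduction; `force_second_cv` restores the old behaviour when an independent estimate is wanted.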

README.md

Lines changed: 2 additions & 1 deletion
@@ -35,7 +35,7 @@ Binary classification is a common machine learning task where the goal is to cat
 
 This framework is designed to be a comprehensive toolkit for binary classification experiments, offering a wide range of configurable options:
 
-- **Diverse Model Support:** Includes a collection of standard classifiers (e.g., Logistic Regression, SVM, RandomForest, XGBoost, LightGBM, CatBoost) and specialized time-series models from the `aeon` library (e.g., HIVE-COTE v2, MUSE, OrdinalTDE).
+- **Diverse Model Support:** Includes a collection of standard classifiers (e.g., Logistic Regression, SVM, RandomForest, XGBoost, LightGBM, CatBoost, H2O AutoML/GLM/GBM) and specialized time-series models from the `aeon` library (e.g., HIVE-COTE v2, MUSE, OrdinalTDE).
 - **Advanced Hyperparameter Tuning:** Supports multiple search strategies:
   - **Grid Search:** Exhaustively search a defined parameter grid.
   - **Random Search:** Randomly sample from the parameter space.
@@ -270,3 +270,4 @@ This project is licensed under the MIT License - see the LICENSE file for detail
 ## Acknowledgments
 scikit-learn
 hyperopt
+H2O.ai
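For context, the search strategies the README lists map onto standard scikit-learn and scikit-optimize APIs. A generic sketch using those libraries directly (not this framework's own `HyperparameterSearch` wrappers, which are assumptions outside this diff):

```python
# Generic illustration of the search strategies named in the README;
# plain scikit-learn / scikit-optimize, not ml_grid's internal wrappers.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from skopt import BayesSearchCV

param_grid = {"n_estimators": [50, 100], "max_depth": [3, 5, 10]}
est = RandomForestClassifier(random_state=0)

grid = GridSearchCV(est, param_grid, scoring="roc_auc", cv=2)                   # exhaustive
rand = RandomizedSearchCV(est, param_grid, n_iter=4, scoring="roc_auc", cv=2)   # random sampling
bayes = BayesSearchCV(est, param_grid, n_iter=4, scoring="roc_auc", cv=2)       # Bayesian (skopt)
```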
Lines changed: 143 additions & 0 deletions
@@ -0,0 +1,143 @@
graph TB
    Start([Grid Search CV Initialization]) --> Init[Initialize Parameters<br/>- Algorithm<br/>- Parameter Space<br/>- CV Strategy<br/>- Global Config]

    Init --> DataPrep[Data Preparation]

    DataPrep --> CheckDF{X_train is<br/>DataFrame?}
    CheckDF -->|No| ConvertDF[Convert to DataFrame]
    CheckDF -->|Yes| CheckSeries{y_train is<br/>Series?}
    ConvertDF --> CheckSeries

    CheckSeries -->|No| ConvertSeries[Convert to Series<br/>Align with X_train index]
    CheckSeries -->|Yes| SetCategory[Set y_train as category<br/>Name = 'outcome']
    ConvertSeries --> SetCategory

    SetCategory --> ModelCheck{Model Type<br/>Detection}

    ModelCheck -->|GPU Model| GPUConfig[Configure GPU<br/>n_jobs=1<br/>TF Memory Growth]
    ModelCheck -->|SVC| ScaleData[Apply StandardScaler]
    ModelCheck -->|KNN/SimBSig| AdjustKNN[Adjust n_neighbors<br/>for small datasets]
    ModelCheck -->|CatBoost| CheckSize{Dataset<br/>Size OK?}
    ModelCheck -->|Other| CVSetup

    CheckSize -->|Too Small| ReturnDefault[Return Default Score 0.5]
    CheckSize -->|OK| AdjustCatBoost[Adjust subsample/rsm<br/>parameters]

    GPUConfig --> CVSetup[CV Strategy Setup]
    ScaleData --> CVSetup
    AdjustKNN --> CVSetup
    AdjustCatBoost --> CVSetup

    CVSetup --> TestMode{Test Mode<br/>Enabled?}
    TestMode -->|Yes| FastCV[KFold n_splits=2]
    TestMode -->|No| ProductionCV[RepeatedKFold<br/>n_splits=2, n_repeats=2]

    FastCV --> ParamValidation
    ProductionCV --> ParamValidation

    ParamValidation[Parameter Validation] --> BayesCheck{Bayesian<br/>Search?}

    BayesCheck -->|Yes| WrapCategorical[Wrap lists in<br/>Categorical for skopt]
    BayesCheck -->|No| ValidateParams[Validate parameters<br/>against estimator]

    WrapCategorical --> ConfigNIter
    ValidateParams --> ConfigNIter

    ConfigNIter[Configure n_iter] --> LocalOverride{Local<br/>Override?}
    LocalOverride -->|Yes| UseLocal[Use local n_iter]
    LocalOverride -->|No| UseGlobal[Use global n_iter]

    UseLocal --> CapIter{Exceeds<br/>max_iter?}
    UseGlobal --> CapIter
    CapIter -->|Yes| CapValue[Cap to max value]
    CapIter -->|No| Search
    CapValue --> Search

    Search[HyperparameterSearch<br/>Instantiation] --> ResetIndices[Reset DataFrame indices<br/>to integer-based]

    ResetIndices --> IndexCheck{Index<br/>Aligned?}
    IndexCheck -->|No| RaiseError[Raise AssertionError]
    IndexCheck -->|Yes| RunSearch[search.run_search]

    RunSearch --> SearchError{Search<br/>Error?}
    SearchError -->|SVC Dual Coef| SVCDefault[Return default 0.5]
    SearchError -->|Other Error| LogRaise[Log error & re-raise]
    SearchError -->|Success| TestModeCheck2{Test Mode?}

    TestModeCheck2 -->|Yes| SkipCV[Skip final CV<br/>Return 0.5]
    TestModeCheck2 -->|No| CheckClasses{Classes >= 2?}

    CheckClasses -->|No| RaiseValueError[Raise ValueError<br/>AUC not defined]
    CheckClasses -->|Yes| H2OCheck{H2O or<br/>Keras Model?}

    H2OCheck -->|Yes| SingleThread[Set n_jobs=1<br/>for CV]
    H2OCheck -->|No| MultiThread[Use grid_n_jobs]

    SingleThread --> CheckCache{Can reuse<br/>cached CV<br/>results?}
    MultiThread --> CheckCache

    CheckCache -->|Yes & Not Forced| ExtractCache[Extract scores from<br/>cv_results_]
    CheckCache -->|No or Forced| FreshCV[Run fresh<br/>cross_validate]

    ExtractCache --> CacheError{Extraction<br/>Error?}
    CacheError -->|Yes| FreshCV
    CacheError -->|No| ProcessScores

    FreshCV --> CVType{Model<br/>Type?}
    CVType -->|Keras| KerasCV[Internal CV handling<br/>in fit method]
    CVType -->|Other| StandardCV[cross_validate with<br/>multiple metrics]

    KerasCV --> CVErrors{CV<br/>Errors?}
    StandardCV --> CVErrors

    CVErrors -->|XGBoost GPU Error| FallbackCPU[Fallback to CPU<br/>tree_method='hist']
    CVErrors -->|AdaBoost Poor| AdaBoostDefault[Use default scores]
    CVErrors -->|H2O RuntimeError| H2ODefault[Use default scores]
    CVErrors -->|Other Error| GenericDefault[Use default scores<br/>Log error]
    CVErrors -->|Success| ProcessScores[Process Scores]

    FallbackCPU --> Retry[Retry cross_validate]
    Retry --> RetryError{Retry<br/>Error?}
    RetryError -->|Yes| GenericDefault
    RetryError -->|No| ProcessScores

    ProcessScores --> TimeCheck{CV time ><br/>threshold?}
    TimeCheck -->|Yes| WarnSlow[Warn about slow CV]
    TimeCheck -->|No| LogTime[Log CV completion time]

    WarnSlow --> Predict
    LogTime --> Predict

    Predict[Predict on X_test] --> UpdateLog{Score logging<br/>enabled?}

    UpdateLog -->|Yes| SaveScores[Update score log with:<br/>- CV scores<br/>- predictions<br/>- best estimator<br/>- timing info]
    UpdateLog -->|No| WarnNoLog[Warn: no logging]

    SaveScores --> CalcAUC[Calculate final AUC<br/>on test set]
    WarnNoLog --> CalcAUC

    CalcAUC --> H2OCleanup{H2O<br/>Model?}
    H2OCleanup -->|Yes| LeaveRunning[Leave H2O cluster running<br/>for next model]
    H2OCleanup -->|No| End

    LeaveRunning --> End([Return AUC Score])

    SVCDefault --> End
    SkipCV --> H2OCleanup
    ReturnDefault --> End
    RaiseError --> End
    LogRaise --> End
    RaiseValueError --> End
    AdaBoostDefault --> CalcAUC
    H2ODefault --> CalcAUC
    GenericDefault --> CalcAUC

    style Start fill:#e1f5e1
    style End fill:#ffe1e1
    style SearchError fill:#fff3cd
    style CVErrors fill:#fff3cd
    style TestMode fill:#d1ecf1
    style TestModeCheck2 fill:#d1ecf1
    style H2OCheck fill:#f8d7da
    style BayesCheck fill:#d1ecf1
    style CheckCache fill:#d4edda

assets/grid_search_cross_validate.svg

Lines changed: 102 additions & 0 deletions

config_hyperopt.yml

Lines changed: 1 addition & 0 deletions
@@ -10,6 +10,7 @@ global_params:
   # Number of iterations for RandomizedSearchCV and BayesSearchCV
   n_iter: 2
   max_param_space_iter_value : 10
+  force_second_cv: false # If True, forces a second cross-validation run even if cached results are available. Defaults to False.
 
   # Experiment settings for the hyperopt run
   experiment:
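A small sketch of how such a flag could be read from the YAML. The file name and key come from the diff above; the loader shown is generic PyYAML, not necessarily the project's own config reader:

```python
# Generic way to read the new flag from config_hyperopt.yml with PyYAML;
# the project's own config loading code may differ.
import yaml

with open("config_hyperopt.yml") as fh:
    config = yaml.safe_load(fh)

force_second_cv = config["global_params"].get("force_second_cv", False)
print(f"force_second_cv={force_second_cv}")  # False unless overridden
```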

ml_grid/model_classes/H2OAutoMLClassifier.py

Lines changed: 2 additions & 0 deletions
@@ -78,6 +78,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series, **kwargs) -> "H2OAutoMLClassifier":
             self.model_ = H2OGeneralizedLinearEstimator(
                 family="binomial", ignore_const_cols=False
             )
+            self._sanitize_model_params()
             self.model_.train(y=outcome_var, x=x_vars, training_frame=train_h2o)
             self._using_dummy_model = True  # Set flag for reference
 
@@ -101,6 +102,7 @@ def _finalize_dummy_fit(self, X, y):
         self.model_ = H2OGeneralizedLinearEstimator(
             family="binomial", ignore_const_cols=False
         )
+        self._sanitize_model_params()
         # We need to create a minimal H2OFrame to train on
         train_h2o, x_vars, outcome_var, _ = self._prepare_fit(X, y)
         self.model_.train(y=outcome_var, x=x_vars, training_frame=train_h2o)

ml_grid/model_classes/H2OBaseClassifier.py

Lines changed: 29 additions & 36 deletions
@@ -409,6 +409,17 @@ def _handle_small_data_fallback(self, X: pd.DataFrame, y: pd.Series) -> bool:
             return True
         return False
 
+    def _sanitize_model_params(self):
+        """Removes problematic parameters from the H2O model instance before training.
+
+        This handles version mismatches where the Python client sends parameters
+        (like HGLM) that the H2O backend does not recognize.
+        """
+        if self.model_ and hasattr(self.model_, "_parms"):
+            if "HGLM" in self.model_._parms:
+                self.logger.debug("Removing 'HGLM' parameter from H2O model to prevent backend error.")
+                del self.model_._parms["HGLM"]
+
     def fit(self, X: pd.DataFrame, y: pd.Series, **kwargs) -> "H2OBaseClassifier":
         """Fits the H2O model.
 
@@ -447,6 +458,9 @@ def fit(self, X: pd.DataFrame, y: pd.Series, **kwargs) -> "H2OBaseClassifier":
         self.logger.debug(f"Creating H2O model with params: {model_params}")
         self.model_ = self.estimator_class(**model_params)
 
+        # Sanitize parameters to prevent backend errors (e.g. HGLM)
+        self._sanitize_model_params()
+
         # Call the train() method with ONLY the data-related arguments
         self.logger.debug("Calling H2O model.train()...")
         self.model_.train(x=x_vars, y=outcome_var, training_frame=train_h2o)
 
@@ -550,30 +564,18 @@ def predict(self, X: pd.DataFrame) -> np.ndarray:
             # This seems to create a more 'stable' frame in the H2O cluster, preventing
             # internal errors during prediction with some models like GLM.
 
-            # Create a temporary H2OFrame by uploading the pandas DataFrame.
-            # FIX: Do not pass column_types to constructor as it can be flaky.
-            # Instead, create frame and explicitly cast columns.
-            tmp_frame = h2o.H2OFrame(X, column_names=self.feature_names_)
-
-            # Enforce types explicitly to match training schema
+            # Optimization: Pass column_types directly to constructor to avoid
+            # expensive column-by-column casting loop (which triggers GC overhead).
+            # We filter feature_types_ to ensure only present columns are passed.
+            col_types = None
             if self.feature_types_:
-                for col in self.feature_names_:
-                    if col in self.feature_types_ and col in tmp_frame.columns:
-                        t_type = self.feature_types_[col]
-                        if t_type == "enum":
-                            tmp_frame[col] = tmp_frame[col].asfactor()
-                        elif t_type in ["int", "real", "numeric"]:
-                            tmp_frame[col] = tmp_frame[col].asnumeric()
-                        elif t_type == "string":
-                            tmp_frame[col] = tmp_frame[col].ascharacter()
-
-            # Assign it to a unique key in the H2O cluster. This is more reliable.
-            # Add PID and ID to ensure uniqueness across processes
-            frame_id = f"pred_{os.getpid()}_{id(self)}_{pd.Timestamp.now().strftime('%H%M%S%f')}"
-            h2o.assign(tmp_frame, frame_id)
-
-            # Get a handle to the newly created frame
-            test_h2o = h2o.get_frame(frame_id)
+                col_types = {k: v for k, v in self.feature_types_.items() if k in X.columns}
+
+            tmp_frame = h2o.H2OFrame(X, column_names=self.feature_names_, column_types=col_types)
+
+            # Optimization: Use the temporary frame directly.
+            # Explicitly assigning a key (h2o.assign) triggers expensive GC checks.
+            test_h2o = tmp_frame
 
         except Exception as e:
             raise RuntimeError(f"Failed to create H2O frame for prediction: {e}")
 
@@ -654,21 +656,12 @@ def predict_proba(self, X: pd.DataFrame) -> np.ndarray:
 
         # Create H2O frame with explicit column names
         try:
-            # FIX: Explicit type enforcement for predict_proba as well
-            tmp_frame = h2o.H2OFrame(X, column_names=self.feature_names_)
-
+            # Optimization: Pass column_types directly to constructor
+            col_types = None
             if self.feature_types_:
-                for col in self.feature_names_:
-                    if col in self.feature_types_ and col in tmp_frame.columns:
-                        t_type = self.feature_types_[col]
-                        if t_type == "enum":
-                            tmp_frame[col] = tmp_frame[col].asfactor()
-                        elif t_type in ["int", "real", "numeric"]:
-                            tmp_frame[col] = tmp_frame[col].asnumeric()
-                        elif t_type == "string":
-                            tmp_frame[col] = tmp_frame[col].ascharacter()
+                col_types = {k: v for k, v in self.feature_types_.items() if k in X.columns}
 
-            test_h2o = tmp_frame
+            test_h2o = h2o.H2OFrame(X, column_names=self.feature_names_, column_types=col_types)
         except Exception as e:
             raise RuntimeError(f"Failed to create H2O frame for prediction: {e}")
 
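Taken together, the new prediction path reduces to roughly the following. This is a condensed sketch, not the class itself, using only documented `h2o.H2OFrame` constructor arguments; `feature_names`/`feature_types` stand in for the attributes the classifier stores at fit time:

```python
# Condensed sketch of the optimized prediction path: a single H2OFrame
# construction with column_types, no per-column casting and no h2o.assign round-trip.
import h2o
import pandas as pd

def predict_sketch(model, X: pd.DataFrame, feature_names, feature_types):
    col_types = None
    if feature_types:
        # Only pass types for columns actually present in X.
        col_types = {k: v for k, v in feature_types.items() if k in X.columns}
    test_h2o = h2o.H2OFrame(X, column_names=feature_names, column_types=col_types)
    preds = model.predict(test_h2o)          # H2OFrame with a 'predict' column
    return preds.as_data_frame()["predict"].to_numpy()
```

The design point is that both `h2o.assign` and the column-by-column `asfactor()`/`asnumeric()` casts force extra round-trips and garbage-collection checks in the H2O cluster, so declaring the schema once at frame creation is cheaper.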

ml_grid/model_classes/H2OGAMClassifier.py

Lines changed: 2 additions & 0 deletions
@@ -171,6 +171,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series, **kwargs) -> "H2OGAMClassifier":
         estimator_cls = self.estimator_class
 
         self.model_ = estimator_cls(**model_params)
+        self._sanitize_model_params()
 
         # --- RUNTIME TRAIN WITH FALLBACK ---
         try:
 
@@ -200,6 +201,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series, **kwargs) -> "H2OGAMClassifier":
                 glm_params["lambda_search"] = False
 
                 self.model_ = H2OGeneralizedLinearEstimator(**glm_params)
+                self._sanitize_model_params()
                 self.model_.train(x=x_vars, y=outcome_var, training_frame=train_h2o)
             else:
                 raise e