Skip to content

Commit a371302

Browse files
author
SamoraHunter
committed
GLM GAM stability fixes
1 parent 5a6bde9 commit a371302

7 files changed

Lines changed: 691 additions & 492 deletions

ml_grid/model_classes/H2OGAMClassifier.py

Lines changed: 113 additions & 205 deletions
Large diffs are not rendered by default.
Lines changed: 83 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,54 +1,106 @@
1-
1+
import numpy as np
22
import pandas as pd
33
from h2o.estimators import H2OGeneralizedLinearEstimator
4+
from skopt.space import Real, Categorical, Integer
45

56
from .H2OBaseClassifier import H2OBaseClassifier
67

78

89
class H2OGLMClassifier(H2OBaseClassifier):
    """Scikit-learn compatible wrapper around H2O's Generalized Linear Models.

    Handles training, prediction, and all interaction with the H2O backend.
    A fixed set of stability-critical parameters is pinned at construction
    time and pinned again at fit time, so that hyper-parameter search
    (which calls ``set_params``) can never override them.
    """

    def __init__(self, **kwargs):
        """Initialize the wrapper, normalizing and pinning H2O parameters.

        Args:
            **kwargs: Keyword arguments forwarded to
                ``H2OGeneralizedLinearEstimator`` via the base class.
                Both ``lambda`` and ``lambda_`` spellings are accepted.
        """
        # Accept the H2O-native spelling 'lambda' but store it under the
        # sklearn-safe name 'lambda_' ('lambda' is a Python keyword).
        if "lambda" in kwargs and "lambda_" not in kwargs:
            kwargs["lambda_"] = kwargs.pop("lambda")

        # 'estimator_class' can leak back into kwargs during sklearn
        # clone(); drop it so it is not passed to super().__init__ twice.
        kwargs.pop("estimator_class", None)

        # Defensive default: standardize inputs unless the caller opts out.
        kwargs.setdefault("standardize", True)

        # Stability-critical settings (always forced, never merely defaulted):
        # - L_BFGS: the only solver robust against the Java NPE on this data.
        # - remove_collinear_columns=True resizes the coefficient vector and
        #   triggers index-mismatch crashes.
        # - lambda_search=True makes H2O ignore 'solver' and fall back to
        #   coordinate descent, which crashes.
        for key, value in (
            ("solver", "L_BFGS"),
            ("remove_collinear_columns", False),
            ("lambda_search", False),
        ):
            kwargs[key] = value

        # Hand the concrete estimator class to the shared base wrapper.
        super().__init__(estimator_class=H2OGeneralizedLinearEstimator, **kwargs)

    def fit(self, X: pd.DataFrame, y: pd.Series, **kwargs) -> "H2OGLMClassifier":
        """Fit the H2O GLM model, re-pinning the stability parameters first.

        Args:
            X: Training features.
            y: Training target.
            **kwargs: Extra fit-time parameters; the pinned stability keys
                in here are overwritten regardless of what the caller set.

        Returns:
            H2OGLMClassifier: The fitted classifier instance (``self``).
        """
        pinned = {
            "solver": "L_BFGS",
            "remove_collinear_columns": False,
            "lambda_search": False,
        }

        # Grid search may have called set_params() and overwritten the safe
        # defaults; force them back before training.
        kwargs.update(pinned)

        # Mirror the pinned values into the raw H2O parameter dict when one
        # exists on this wrapper (presumably populated by the base class —
        # TODO(review): confirm against H2OBaseClassifier).
        if hasattr(self, "_parms"):
            self._parms.update(pinned)

        # Delegate the actual training to the base class.
        super().fit(X, y, **kwargs)

        # Finally, make sure the trained H2O model object itself agrees, so
        # predict-time code sees the same stable configuration.
        fitted = getattr(self, "model_", None)
        if fitted is not None:
            fitted._parms.update(pinned)

        return self
65+
66+
67+
class H2O_GLM_class:
    """Model-definition class consumed by the grid-search framework.

    Exposes the estimator instance (``algorithm_implementation``) and a
    hyperparameter search space sized by ``parameter_space_size``.
    """

    def __init__(self, X=None, y=None, parameter_space_size="small"):
        """Build the estimator and its hyperparameter space.

        Args:
            X: Optional training features; stored for the framework.
            y: Optional training target; stored for the framework.
            parameter_space_size (str): One of 'xsmall', 'small', or anything
                else for the medium/large space. Defaults to 'small'.
        """
        # FIX: store X/y instead of silently discarding them — consistent
        # with the other model-definition classes (e.g. H2OGAMClass), which
        # keep them on the instance for the framework to use.
        self.X = X
        self.y = y

        self.method_name = "H2OGLMClassifier"

        # Instantiate the actual estimator wrapper.
        self.algorithm_implementation = H2OGLMClassifier()

        # Hyperparameter space.
        # CRITICAL: only L_BFGS is offered to the optimizer — it is the
        # solver the wrapper pins for stability, so searching any other
        # value would be wasted (and unstable).
        if parameter_space_size == "xsmall":
            self.parameter_space = {
                "alpha": Real(0.0, 1.0),
                "lambda_": Real(1e-3, 1e-1, prior="log-uniform"),
                "family": Categorical(["binomial"]),
                "solver": Categorical(["L_BFGS"]),
                "standardize": Categorical([True]),
            }
        elif parameter_space_size == "small":
            self.parameter_space = {
                "alpha": Real(0.0, 1.0),
                "lambda_": Real(1e-4, 1e-1, prior="log-uniform"),
                "family": Categorical(["binomial"]),
                "solver": Categorical(["L_BFGS"]),
                "standardize": Categorical([True]),
            }
        else:
            # Medium/large space: widest lambda range, and also searches
            # standardization and class balancing.
            self.parameter_space = {
                "alpha": Real(0.0, 1.0),
                "lambda_": Real(1e-6, 10.0, prior="log-uniform"),
                "family": Categorical(["binomial"]),
                "solver": Categorical(["L_BFGS"]),
                "standardize": Categorical([True, False]),
                "balance_classes": Categorical([True, False]),
            }

ml_grid/model_classes/h2o_gam_classifier_class.py

Lines changed: 31 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -15,67 +15,70 @@
1515
from ml_grid.util.global_params import global_parameters
1616

1717
logger = logging.getLogger(__name__)
18-
logger.debug("Imported h2o_gam_classifier_class")
1918

2019

2120
class H2OGAMClass:
2221
"""A configuration class for the H2OGAMClassifier.
2322
2423
Provides parameter spaces for grid search and Bayesian optimization.
2524
The parameter space is dynamically generated to include columns from the
26-
input data `X` for the `gam_columns` parameter.
25+
input data `X` for the `gam_columns` parameter, filtering out columns
26+
unsuitable for smoothing (e.g., low cardinality).
2727
"""
2828

2929
def __init__(
3030
self,
3131
X: Optional[pd.DataFrame] = None,
32-
y: Optional[pd.Series] = None, # type: ignore
33-
parameter_space_size: str = "small", # Added for consistency
32+
y: Optional[pd.Series] = None,
33+
parameter_space_size: str = "small",
3434
) -> None:
35-
"""Initializes the H2OGAMClass.
36-
37-
Args:
38-
X: The input features. This is used to
39-
dynamically populate the `gam_columns` in the parameter space.
40-
y: The target variable. # type: ignore
41-
parameter_space_size (str): The size of the parameter space to use
42-
('xsmall', 'small', 'medium'). Defaults to 'small'.
43-
44-
Raises:
45-
ValueError: If `parameter_space_size` is not a valid key (though current
46-
implementation does not explicitly raise this for GAM).
47-
"""
35+
"""Initializes the H2OGAMClass."""
4836
self.X: Optional[pd.DataFrame] = X
4937
self.y: Optional[pd.Series] = y
38+
39+
# Instantiate the Estimator
5040
self.algorithm_implementation: H2OGAMClassifier = H2OGAMClassifier()
41+
5142
self.method_name: str = "H2OGAMClassifier"
5243
self.parameter_space: Union[List[Dict[str, Any]], Dict[str, Any]]
5344

54-
# Define the available columns for the hyperparameter search.
55-
gam_cols = list(X.columns) if X is not None else []
45+
# --- SMART GAM COLUMN SELECTION ---
46+
# Filter X to find only numeric columns with sufficient cardinality (>10).
47+
gam_cols = []
48+
if X is not None:
49+
for col in X.columns:
50+
if pd.api.types.is_numeric_dtype(X[col]):
51+
# Check cardinality (>10 unique values)
52+
if X[col].nunique() > 10:
53+
gam_cols.append(col)
5654

57-
# Conditionally define the parameter space based on the search method.
55+
if not gam_cols and X is not None:
56+
logger.warning(
57+
"No high-cardinality numeric columns found for GAM splines. Search will likely fallback to GLM."
58+
)
59+
60+
# Define Parameter Space
5861
if global_parameters.bayessearch:
59-
# For Bayesian search, use skopt distribution objects.
62+
# Bayesian Search Space
6063
param_space = {
61-
"num_knots": Integer(5, 15),
64+
"num_knots": Integer(5, 10),
6265
"bs": Categorical(["cs", "tp"]),
6366
"scale": Real(0.01, 1.0, "log-uniform"),
6467
"seed": Integer(1, 1000),
68+
"solver": Categorical(["COORDINATE_DESCENT"]),
6569
}
6670
if gam_cols:
67-
# H2O GAM can take a list of lists for gam_columns, but for
68-
# hyperparameter search, we let it pick one column to focus on.
69-
# This could be extended to search for combinations.
7071
param_space["gam_columns"] = Categorical(gam_cols)
72+
7173
self.parameter_space = param_space
7274
else:
73-
# For Grid/Random search, use standard lists.
75+
# Grid/Random Search Space
7476
param_space = {
75-
"num_knots": [5, 8, 10, 12, 15],
77+
"num_knots": [5, 8, 10],
7678
"bs": ["cs", "tp"],
7779
"scale": [0.01, 0.1, 0.5, 1.0],
78-
"seed": [1, 42, 123, 500, 1000],
80+
"seed": [1, 42, 123],
81+
"solver": ["COORDINATE_DESCENT"],
7982
}
8083
if gam_cols:
8184
param_space["gam_columns"] = gam_cols

0 commit comments

Comments
 (0)