Skip to content

Commit a371302

Browse files
author
SamoraHunter
committed
GLM GAM stability fixes
1 parent 5a6bde9 commit a371302

7 files changed

Lines changed: 691 additions & 492 deletions

ml_grid/model_classes/H2OGAMClassifier.py

Lines changed: 113 additions & 205 deletions
Large diffs are not rendered by default.
Lines changed: 83 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,54 +1,106 @@
1-
1+
import numpy as np
22
import pandas as pd
33
from h2o.estimators import H2OGeneralizedLinearEstimator
4+
from skopt.space import Real, Categorical, Integer
45

56
from .H2OBaseClassifier import H2OBaseClassifier
67

78

89
class H2OGLMClassifier(H2OBaseClassifier):
    """Scikit-learn compatible wrapper around H2O's Generalized Linear Models.

    Handles training, prediction, and all interaction with the H2O backend.
    A fixed set of stability-critical parameters is pinned at construction
    time and pinned again at fit time, so that hyper-parameter search
    (which calls ``set_params``) can never override them.
    """

    def __init__(self, **kwargs):
        """Initialize the wrapper, normalizing and pinning H2O parameters.

        Args:
            **kwargs: Keyword arguments forwarded to
                ``H2OGeneralizedLinearEstimator`` via the base class.
                Both ``lambda`` and ``lambda_`` spellings are accepted.
        """
        # Accept the H2O-native spelling 'lambda' but store it under the
        # sklearn-safe name 'lambda_' ('lambda' is a Python keyword).
        if "lambda" in kwargs and "lambda_" not in kwargs:
            kwargs["lambda_"] = kwargs.pop("lambda")

        # 'estimator_class' can leak back into kwargs during sklearn
        # clone(); drop it so it is not passed to super().__init__ twice.
        kwargs.pop("estimator_class", None)

        # Defensive default: standardize inputs unless the caller opts out.
        kwargs.setdefault("standardize", True)

        # Stability-critical settings (always forced, never merely defaulted):
        # - L_BFGS: the only solver robust against the Java NPE on this data.
        # - remove_collinear_columns=True resizes the coefficient vector and
        #   triggers index-mismatch crashes.
        # - lambda_search=True makes H2O ignore 'solver' and fall back to
        #   coordinate descent, which crashes.
        for key, value in (
            ("solver", "L_BFGS"),
            ("remove_collinear_columns", False),
            ("lambda_search", False),
        ):
            kwargs[key] = value

        # Hand the concrete estimator class to the shared base wrapper.
        super().__init__(estimator_class=H2OGeneralizedLinearEstimator, **kwargs)

    def fit(self, X: pd.DataFrame, y: pd.Series, **kwargs) -> "H2OGLMClassifier":
        """Fit the H2O GLM model, re-pinning the stability parameters first.

        Args:
            X: Training features.
            y: Training target.
            **kwargs: Extra fit-time parameters; the pinned stability keys
                in here are overwritten regardless of what the caller set.

        Returns:
            H2OGLMClassifier: The fitted classifier instance (``self``).
        """
        pinned = {
            "solver": "L_BFGS",
            "remove_collinear_columns": False,
            "lambda_search": False,
        }

        # Grid search may have called set_params() and overwritten the safe
        # defaults; force them back before training.
        kwargs.update(pinned)

        # Mirror the pinned values into the raw H2O parameter dict when one
        # exists on this wrapper (presumably populated by the base class —
        # TODO(review): confirm against H2OBaseClassifier).
        if hasattr(self, "_parms"):
            self._parms.update(pinned)

        # Delegate the actual training to the base class.
        super().fit(X, y, **kwargs)

        # Finally, make sure the trained H2O model object itself agrees, so
        # predict-time code sees the same stable configuration.
        fitted = getattr(self, "model_", None)
        if fitted is not None:
            fitted._parms.update(pinned)

        return self
65+
66+
67+
class H2O_GLM_class:
    """Model-definition class consumed by the grid-search framework.

    Exposes the estimator instance (``algorithm_implementation``) and a
    hyperparameter search space sized by ``parameter_space_size``.
    """

    def __init__(self, X=None, y=None, parameter_space_size="small"):
        """Build the estimator and its hyperparameter space.

        Args:
            X: Optional training features; stored for the framework.
            y: Optional training target; stored for the framework.
            parameter_space_size (str): One of 'xsmall', 'small', or anything
                else for the medium/large space. Defaults to 'small'.
        """
        # FIX: store X/y instead of silently discarding them — consistent
        # with the other model-definition classes (e.g. H2OGAMClass), which
        # keep them on the instance for the framework to use.
        self.X = X
        self.y = y

        self.method_name = "H2OGLMClassifier"

        # Instantiate the actual estimator wrapper.
        self.algorithm_implementation = H2OGLMClassifier()

        # Hyperparameter space.
        # CRITICAL: only L_BFGS is offered to the optimizer — it is the
        # solver the wrapper pins for stability, so searching any other
        # value would be wasted (and unstable).
        if parameter_space_size == "xsmall":
            self.parameter_space = {
                "alpha": Real(0.0, 1.0),
                "lambda_": Real(1e-3, 1e-1, prior="log-uniform"),
                "family": Categorical(["binomial"]),
                "solver": Categorical(["L_BFGS"]),
                "standardize": Categorical([True]),
            }
        elif parameter_space_size == "small":
            self.parameter_space = {
                "alpha": Real(0.0, 1.0),
                "lambda_": Real(1e-4, 1e-1, prior="log-uniform"),
                "family": Categorical(["binomial"]),
                "solver": Categorical(["L_BFGS"]),
                "standardize": Categorical([True]),
            }
        else:
            # Medium/large space: widest lambda range, and also searches
            # standardization and class balancing.
            self.parameter_space = {
                "alpha": Real(0.0, 1.0),
                "lambda_": Real(1e-6, 10.0, prior="log-uniform"),
                "family": Categorical(["binomial"]),
                "solver": Categorical(["L_BFGS"]),
                "standardize": Categorical([True, False]),
                "balance_classes": Categorical([True, False]),
            }

ml_grid/model_classes/h2o_gam_classifier_class.py

Lines changed: 31 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -15,67 +15,70 @@
1515
from ml_grid.util.global_params import global_parameters
1616

1717
logger = logging.getLogger(__name__)
18-
logger.debug("Imported h2o_gam_classifier_class")
1918

2019

2120
class H2OGAMClass:
2221
"""A configuration class for the H2OGAMClassifier.
2322
2423
Provides parameter spaces for grid search and Bayesian optimization.
2524
The parameter space is dynamically generated to include columns from the
26-
input data `X` for the `gam_columns` parameter.
25+
input data `X` for the `gam_columns` parameter, filtering out columns
26+
unsuitable for smoothing (e.g., low cardinality).
2727
"""
2828

2929
def __init__(
3030
self,
3131
X: Optional[pd.DataFrame] = None,
32-
y: Optional[pd.Series] = None, # type: ignore
33-
parameter_space_size: str = "small", # Added for consistency
32+
y: Optional[pd.Series] = None,
33+
parameter_space_size: str = "small",
3434
) -> None:
35-
"""Initializes the H2OGAMClass.
36-
37-
Args:
38-
X: The input features. This is used to
39-
dynamically populate the `gam_columns` in the parameter space.
40-
y: The target variable. # type: ignore
41-
parameter_space_size (str): The size of the parameter space to use
42-
('xsmall', 'small', 'medium'). Defaults to 'small'.
43-
44-
Raises:
45-
ValueError: If `parameter_space_size` is not a valid key (though current
46-
implementation does not explicitly raise this for GAM).
47-
"""
35+
"""Initializes the H2OGAMClass."""
4836
self.X: Optional[pd.DataFrame] = X
4937
self.y: Optional[pd.Series] = y
38+
39+
# Instantiate the Estimator
5040
self.algorithm_implementation: H2OGAMClassifier = H2OGAMClassifier()
41+
5142
self.method_name: str = "H2OGAMClassifier"
5243
self.parameter_space: Union[List[Dict[str, Any]], Dict[str, Any]]
5344

54-
# Define the available columns for the hyperparameter search.
55-
gam_cols = list(X.columns) if X is not None else []
45+
# --- SMART GAM COLUMN SELECTION ---
46+
# Filter X to find only numeric columns with sufficient cardinality (>10).
47+
gam_cols = []
48+
if X is not None:
49+
for col in X.columns:
50+
if pd.api.types.is_numeric_dtype(X[col]):
51+
# Check cardinality (>10 unique values)
52+
if X[col].nunique() > 10:
53+
gam_cols.append(col)
5654

57-
# Conditionally define the parameter space based on the search method.
55+
if not gam_cols and X is not None:
56+
logger.warning(
57+
"No high-cardinality numeric columns found for GAM splines. Search will likely fallback to GLM."
58+
)
59+
60+
# Define Parameter Space
5861
if global_parameters.bayessearch:
59-
# For Bayesian search, use skopt distribution objects.
62+
# Bayesian Search Space
6063
param_space = {
61-
"num_knots": Integer(5, 15),
64+
"num_knots": Integer(5, 10),
6265
"bs": Categorical(["cs", "tp"]),
6366
"scale": Real(0.01, 1.0, "log-uniform"),
6467
"seed": Integer(1, 1000),
68+
"solver": Categorical(["COORDINATE_DESCENT"]),
6569
}
6670
if gam_cols:
67-
# H2O GAM can take a list of lists for gam_columns, but for
68-
# hyperparameter search, we let it pick one column to focus on.
69-
# This could be extended to search for combinations.
7071
param_space["gam_columns"] = Categorical(gam_cols)
72+
7173
self.parameter_space = param_space
7274
else:
73-
# For Grid/Random search, use standard lists.
75+
# Grid/Random Search Space
7476
param_space = {
75-
"num_knots": [5, 8, 10, 12, 15],
77+
"num_knots": [5, 8, 10],
7678
"bs": ["cs", "tp"],
7779
"scale": [0.01, 0.1, 0.5, 1.0],
78-
"seed": [1, 42, 123, 500, 1000],
80+
"seed": [1, 42, 123],
81+
"solver": ["COORDINATE_DESCENT"],
7982
}
8083
if gam_cols:
8184
param_space["gam_columns"] = gam_cols

0 commit comments

Comments
 (0)