|
1 | | - |
| 1 | +import numpy as np |
2 | 2 | import pandas as pd |
3 | 3 | from h2o.estimators import H2OGeneralizedLinearEstimator |
| 4 | +from skopt.space import Real, Categorical, Integer |
4 | 5 |
|
5 | 6 | from .H2OBaseClassifier import H2OBaseClassifier |
6 | 7 |
|
7 | 8 |
|
class H2OGLMClassifier(H2OBaseClassifier):
    """
    The actual scikit-learn compatible wrapper for H2O's Generalized Linear Models.
    This class handles the training, prediction, and H2O interaction.
    """

    # Parameters forced at every stage to keep the H2O backend from crashing:
    # - solver: L_BFGS is the only solver robust against the Java NPE on this data.
    # - remove_collinear_columns: changing the coefficient-vector size causes
    #   index-mismatch crashes.
    # - lambda_search: if True, H2O ignores 'solver' and uses Coordinate
    #   Descent, causing crashes.
    # Keeping them in one place prevents the four enforcement sites from
    # drifting out of sync.
    _STABILITY_OVERRIDES = {
        "solver": "L_BFGS",
        "remove_collinear_columns": False,
        "lambda_search": False,
    }

    def __init__(self, **kwargs):
        """Initializes the H2OGLMClassifier.

        Args:
            **kwargs: Keyword arguments forwarded to
                ``H2OGeneralizedLinearEstimator`` (e.g. ``family='binomial'``,
                ``alpha``, ``lambda_``). ``lambda`` is accepted as an alias
                for ``lambda_``; stability-critical parameters are always
                overridden (see ``_STABILITY_OVERRIDES``).
        """
        # --- FIX 1: Normalize lambda parameter name ---
        # Users/grids may pass H2O's native 'lambda'; sklearn get_params()
        # round-trips 'lambda_'. Accept both, store one.
        if "lambda" in kwargs and "lambda_" not in kwargs:
            kwargs["lambda_"] = kwargs.pop("lambda")

        # 'estimator_class' reappears in kwargs during sklearn clone(); it
        # must not be forwarded twice to the base constructor.
        kwargs.pop("estimator_class", None)

        # --- DEFENSIVE DEFAULTS ---
        kwargs.setdefault("standardize", True)

        # --- CRITICAL FIXES FOR STABILITY ---
        self._apply_stability_overrides(kwargs)

        # Pass the specific estimator class
        super().__init__(estimator_class=H2OGeneralizedLinearEstimator, **kwargs)

    @classmethod
    def _apply_stability_overrides(cls, params: dict) -> None:
        """Force the crash-avoidance parameters into *params* in place."""
        params.update(cls._STABILITY_OVERRIDES)

    def fit(self, X: pd.DataFrame, y: pd.Series, **kwargs) -> "H2OGLMClassifier":
        """Fits the H2O GLM model.

        Re-applies the stability overrides at every layer before and after
        training, because GridSearch calls set_params() and may overwrite
        the safe defaults chosen in __init__.

        Returns:
            H2OGLMClassifier: The fitted classifier instance.
        """
        # --- DOUBLE-LOCK: Enforce stable parameters at fit time ---
        self._apply_stability_overrides(kwargs)

        # Update internal H2O parameter dictionary if it exists
        if hasattr(self, "_parms"):
            self._apply_stability_overrides(self._parms)

        # Proceed with standard fit
        super().fit(X, y, **kwargs)

        # --- TRIPLE-LOCK: Ensure the internal model object respects this ---
        if getattr(self, "model_", None) is not None:
            self._apply_stability_overrides(self.model_._parms)

        return self
| 65 | + |
| 66 | + |
class H2O_GLM_class:
    """
    The Model Definition class used by the Grid Search framework.

    Exposes ``algorithm_implementation`` (the estimator wrapper) and
    ``parameter_space`` (a skopt search space) for the optimizer.
    """

    # lambda_ regularization bounds (log-uniform) for the compact spaces.
    # The xsmall/small dicts previously differed ONLY in these bounds, so
    # the rest of the space is built once below instead of copy-pasted.
    _LAMBDA_BOUNDS = {
        "xsmall": (1e-3, 1e-1),
        "small": (1e-4, 1e-1),
    }

    def __init__(self, X=None, y=None, parameter_space_size="small"):
        self.method_name = "H2OGLMClassifier"

        # Instantiate the actual estimator wrapper
        self.algorithm_implementation = H2OGLMClassifier()

        # Define the Hyperparameter Space.
        # CRITICAL: We only offer L_BFGS to the optimizer (the wrapper forces
        # it anyway for stability); family is fixed to binomial.
        if parameter_space_size in self._LAMBDA_BOUNDS:
            low, high = self._LAMBDA_BOUNDS[parameter_space_size]
            self.parameter_space = {
                "alpha": Real(0.0, 1.0),
                "lambda_": Real(low, high, prior="log-uniform"),
                "family": Categorical(["binomial"]),
                "solver": Categorical(["L_BFGS"]),
                "standardize": Categorical([True]),
            }
        else:
            # Medium/Large space (also the fallback for unknown size names,
            # matching the original if/elif/else behavior).
            self.parameter_space = {
                "alpha": Real(0.0, 1.0),
                "lambda_": Real(1e-6, 10.0, prior="log-uniform"),
                "family": Categorical(["binomial"]),
                "solver": Categorical(["L_BFGS"]),
                "standardize": Categorical([True, False]),
                "balance_classes": Categorical([True, False]),
            }
0 commit comments