Skip to content

Commit c75c681

Browse files
committed
feat(automl): Add AutoKeras classifier wrapper and tests
This commit introduces a scikit-learn compatible wrapper for the AutoKeras library, allowing it to be integrated into the ml_grid framework. Key additions include: - `AutoKerasClassifierWrapper`: A new wrapper class that provides a standard sklearn interface (`fit`, `predict`, `predict_proba`) for `autokeras.StructuredDataClassifier`. - Automatically handles the conversion of pandas DataFrames to NumPy arrays. - Manages the creation and cleanup of temporary directories required by AutoKeras. - Includes robust error handling to raise a `RuntimeError` if AutoKeras fails to find a usable model within the given trials. - Correctly reshapes the output of `predict_proba` for binary classification to the standard `(n_samples, 2)` format. - `AutoKerasClassifierClass`: A configuration class that defines the parameter spaces for the wrapper. It supports different configurations for grid search, Bayesian optimization, and a minimal `test_mode` for faster testing. - `tests/test_auto_keras_classifier.py`: Comprehensive unit tests for both the wrapper and the configuration class. These tests use `unittest.mock` to validate the integration logic without requiring actual model training, ensuring fast and reliable test execution.
1 parent b694978 commit c75c681

3 files changed

Lines changed: 313 additions & 0 deletions

File tree

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
"""AutoKeras Classifier Wrapper.
2+
3+
This module provides a scikit-learn compatible wrapper for AutoKeras StructuredDataClassifier.
4+
"""
5+
6+
import logging
7+
import os
8+
import shutil
9+
import tempfile
10+
from typing import Optional, Union
11+
12+
import numpy as np
13+
import pandas as pd
14+
from sklearn.base import BaseEstimator, ClassifierMixin
15+
from sklearn.utils.validation import check_is_fitted
16+
17+
# Attempt to import AutoKeras and TensorFlow
18+
try:
19+
import autokeras as ak
20+
import tensorflow as tf
21+
except ImportError:
22+
ak = None
23+
tf = None
24+
25+
logger = logging.getLogger(__name__)
26+
27+
28+
class AutoKerasClassifierWrapper(BaseEstimator, ClassifierMixin):
    """A scikit-learn compatible wrapper for AutoKeras StructuredDataClassifier.

    The wrapper exposes ``fit``, ``predict`` and ``predict_proba`` on top of
    ``ak.StructuredDataClassifier``. pandas inputs are converted to NumPy
    arrays before being handed to AutoKeras, and when no ``directory`` is
    given a temporary working directory is created for the search and removed
    again when it is superseded by a new ``fit`` or the wrapper is collected.

    Parameters
    ----------
    max_trials:
        Forwarded to ``ak.StructuredDataClassifier(max_trials=...)``.
    epochs:
        Forwarded to the AutoKeras ``fit`` call.
    validation_split:
        Forwarded to the AutoKeras ``fit`` call.
    directory:
        Working directory passed to AutoKeras. ``None`` means a temporary
        directory (prefix ``autokeras_``) is created on every ``fit``.
    seed:
        Passed to AutoKeras and, when TensorFlow is importable, to
        ``tf.random.set_seed``.
    verbose:
        Forwarded to the AutoKeras ``fit`` call.
    overwrite:
        Forwarded to ``ak.StructuredDataClassifier(overwrite=...)``.

    Attributes
    ----------
    model_:
        The fitted ``ak.StructuredDataClassifier``; ``None`` until ``fit``
        has succeeded.
    classes_:
        Unique labels seen in ``y`` during ``fit``.
    """

    def __init__(
        self,
        max_trials: int = 3,
        epochs: int = 10,
        validation_split: float = 0.2,
        directory: Optional[str] = None,
        seed: int = 42,
        verbose: int = 1,
        overwrite: bool = True,
    ):
        self.max_trials = max_trials
        self.epochs = epochs
        self.validation_split = validation_split
        self.directory = directory
        self.seed = seed
        self.verbose = verbose
        self.overwrite = overwrite

        # Kept as None (rather than unset) for backward compatibility with
        # callers that inspect ``model_`` before fitting; ``_check_fitted``
        # treats None as "not fitted".
        self.model_ = None
        self._temp_dir = None

    def fit(
        self,
        X: Union[np.ndarray, pd.DataFrame],
        y: Union[np.ndarray, pd.Series],
        **kwargs,
    ) -> "AutoKerasClassifierWrapper":
        """Run the AutoKeras search and keep the resulting model.

        Parameters
        ----------
        X:
            Training features; DataFrames are converted with ``.values``.
        y:
            Training labels; Series/DataFrames are converted with ``.values``.
        **kwargs:
            Extra keyword arguments forwarded to the AutoKeras ``fit`` call.

        Returns
        -------
        AutoKerasClassifierWrapper
            ``self``.

        Raises
        ------
        ImportError
            If AutoKeras could not be imported.
        RuntimeError
            If the fitted AutoKeras object cannot export a model.
        """
        if ak is None:
            raise ImportError(
                "AutoKeras is not installed. Please install it to use AutoKerasClassifierWrapper."
            )

        # Ensure input is numpy array to avoid AutoKeras ValueError with DataFrames
        if isinstance(X, pd.DataFrame):
            X = X.values
        if isinstance(y, (pd.Series, pd.DataFrame)):
            y = y.values

        if self.directory is None:
            # A previous fit may have left its own temporary directory behind;
            # remove it before its path is overwritten, otherwise it leaks.
            self._cleanup_temp_dir()
            self._temp_dir = tempfile.mkdtemp(prefix="autokeras_")
            dir_path = self._temp_dir
        else:
            dir_path = self.directory

        if tf:
            tf.random.set_seed(self.seed)

        self.model_ = ak.StructuredDataClassifier(
            max_trials=self.max_trials,
            directory=dir_path,
            seed=self.seed,
            overwrite=self.overwrite,
        )

        self.model_.fit(
            x=X,
            y=y,
            epochs=self.epochs,
            validation_split=self.validation_split,
            verbose=self.verbose,
            **kwargs,
        )

        # Check if a model was actually found and can be exported.
        try:
            self.model_.export_model()
        except Exception as e:
            # This typically happens if max_trials is too low and no model is found.
            msg = f"AutoKeras failed to find a usable model (max_trials={self.max_trials}, epochs={self.epochs}). Original error: {e}"
            logger.error(msg)
            # Do not leave an unusable model behind looking like a fitted one.
            self.model_ = None
            raise RuntimeError(msg) from e

        # AutoKeras does not explicitly expose classes_, so we infer them from y
        # (y has already been converted to a NumPy array above).
        self.classes_ = np.unique(y)

        return self

    def predict(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
        """Return the AutoKeras predictions for ``X`` as a flat array.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not completed successfully.
        """
        self._check_fitted()
        if isinstance(X, pd.DataFrame):
            X = X.values
        return self.model_.predict(X).flatten()

    def predict_proba(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
        """Return class probabilities from the exported Keras model.

        When the exported model yields a single column (or a 1-D vector) the
        result is expanded to two columns ``[1 - p, p]``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not completed successfully.
        """
        self._check_fitted()
        if isinstance(X, pd.DataFrame):
            X = X.values
        # Export the underlying Keras model to get probabilities
        keras_model = self.model_.export_model()
        probs = np.asarray(keras_model.predict(X, verbose=0))

        # Guard against a flat (N,) output before looking at the column count.
        if probs.ndim == 1:
            probs = probs.reshape(-1, 1)

        # Handle binary classification case where output is (N, 1)
        if probs.shape[1] == 1:
            return np.hstack([1 - probs, probs])
        return probs

    def _check_fitted(self) -> None:
        """Raise ``NotFittedError`` unless a model is available.

        ``check_is_fitted`` alone is not enough here: ``model_`` is assigned
        (to ``None``) in ``__init__``, so the attribute always exists.
        """
        from sklearn.exceptions import NotFittedError

        check_is_fitted(self, "model_")
        if self.model_ is None:
            raise NotFittedError(
                "This AutoKerasClassifierWrapper instance is not fitted yet. "
                "Call 'fit' with appropriate arguments before using this estimator."
            )

    def _cleanup_temp_dir(self) -> None:
        """Remove the temporary directory created by ``fit``, if any."""
        # getattr: __del__ may run on an instance whose __init__ never finished.
        temp_dir = getattr(self, "_temp_dir", None)
        if temp_dir and os.path.exists(temp_dir):
            shutil.rmtree(temp_dir, ignore_errors=True)
        self._temp_dir = None

    def __del__(self):
        # Cleanup temporary directory. This is deliberately best-effort: a
        # finalizer must never raise, and module globals may already be gone
        # when it runs at interpreter shutdown.
        try:
            self._cleanup_temp_dir()
        except Exception:
            pass
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
"""AutoKeras Classifier Configuration.
2+
3+
This module contains the AutoKerasClassifierClass, which is a configuration
4+
class for the AutoKerasClassifierWrapper.
5+
"""
6+
7+
import logging
8+
from typing import Any, Dict, List, Optional, Union
9+
10+
import pandas as pd
11+
from skopt.space import Integer
12+
13+
from ml_grid.model_classes.AutoKerasClassifierWrapper import AutoKerasClassifierWrapper
14+
from ml_grid.util.global_params import global_parameters
15+
16+
logger = logging.getLogger(__name__)
17+
18+
19+
class AutoKerasClassifierClass:
    """Configuration holder for AutoKerasClassifierWrapper.

    Bundles a default wrapper instance, a method name and the hyperparameter
    space to search. The shape of the space depends on ``global_parameters``:
    a single-point list in test mode, a dict of ``skopt`` dimensions when
    ``bayessearch`` is set, and a small grid otherwise.
    """

    def __init__(
        self,
        X: Optional[pd.DataFrame] = None,
        y: Optional[pd.Series] = None,
        parameter_space_size: Optional[str] = None,
    ):
        """Store the data references and select the parameter space.

        Args:
            X: Feature data; only stored on the instance.
            y: Target data; only stored on the instance.
            parameter_space_size: Accepted but not used by this constructor.
        """
        self.X = X
        self.y = y
        self.algorithm_implementation = AutoKerasClassifierWrapper()
        self.method_name = "AutoKerasClassifier"

        self.parameter_space: Union[List[Dict[str, Any]], Dict[str, Any]]

        # Test mode wins over every other setting.
        if getattr(global_parameters, "test_mode", False):
            # Extremely small parameter space for fast unit testing
            logger.info("Using test_mode parameter space for AutoKerasClassifier")
            self.parameter_space = [{"max_trials": [2], "epochs": [3]}]
            return

        # Bayesian optimisation expects skopt dimensions in a flat dict.
        if global_parameters.bayessearch:
            bayes_space = {}
            bayes_space["max_trials"] = Integer(3, 10)
            bayes_space["epochs"] = Integer(10, 30)
            self.parameter_space = bayes_space
            return

        # Default: exhaustive grid over a handful of candidate values.
        grid_point = {"max_trials": [3, 5], "epochs": [10, 20]}
        self.parameter_space = [grid_point]
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
import unittest
2+
from unittest.mock import MagicMock, patch
3+
import pandas as pd
4+
import numpy as np
5+
6+
from ml_grid.model_classes.AutoKerasClassifierWrapper import AutoKerasClassifierWrapper
7+
from ml_grid.model_classes.auto_keras_classifier_class import AutoKerasClassifierClass
8+
9+
10+
class TestAutoKerasClassifier(unittest.TestCase):
    """Unit tests for AutoKerasClassifierWrapper with AutoKeras fully mocked.

    The wrapper module's ``ak`` handle is replaced as a whole, so these tests
    run whether or not AutoKeras is installed (the module sets ``ak = None``
    when the import fails, and patching an attribute of ``None`` would error).
    ``tempfile.mkdtemp`` and ``shutil.rmtree`` are patched for every test so
    that no real directories are created or removed.
    """

    WRAPPER_MODULE = "ml_grid.model_classes.AutoKerasClassifierWrapper"
    MOCK_DIR = "/tmp/mock_autokeras_dir"

    def setUp(self):
        self.X = pd.DataFrame(
            {"feature_0": [1.0, 2.0, 3.0, 4.0], "feature_1": [4.0, 3.0, 2.0, 1.0]}
        )
        self.y = pd.Series([0, 1, 0, 1], name="target")

        # Swap the module-level AutoKeras handle for a mock.
        self.mock_ak = self._start_patch(f"{self.WRAPPER_MODULE}.ak")
        self.mock_ak_cls = self.mock_ak.StructuredDataClassifier
        self.mock_ak_instance = MagicMock()
        self.mock_ak_cls.return_value = self.mock_ak_instance

        # Keep the filesystem untouched.
        self.mock_mkdtemp = self._start_patch(
            f"{self.WRAPPER_MODULE}.tempfile.mkdtemp"
        )
        self.mock_mkdtemp.return_value = self.MOCK_DIR
        self.mock_rmtree = self._start_patch(f"{self.WRAPPER_MODULE}.shutil.rmtree")

    def _start_patch(self, target):
        """Start a patch on ``target`` and undo it when the test finishes."""
        patcher = patch(target)
        mocked = patcher.start()
        self.addCleanup(patcher.stop)
        return mocked

    def test_init(self):
        clf = AutoKerasClassifierWrapper(max_trials=5, epochs=20)
        self.assertEqual(clf.max_trials, 5)
        self.assertEqual(clf.epochs, 20)
        self.assertIsNone(clf.model_)

    def test_fit(self):
        clf = AutoKerasClassifierWrapper(max_trials=2)

        # Test fit
        clf.fit(self.X, self.y)

        # Verify StructuredDataClassifier init
        self.mock_ak_cls.assert_called_once()
        _, kwargs = self.mock_ak_cls.call_args
        self.assertEqual(kwargs["max_trials"], 2)
        self.assertEqual(kwargs["directory"], self.MOCK_DIR)

        # Verify fit call
        self.mock_ak_instance.fit.assert_called_once()
        _, fit_kwargs = self.mock_ak_instance.fit.call_args
        self.assertEqual(fit_kwargs["epochs"], 10)
        np.testing.assert_array_equal(fit_kwargs["x"], self.X.values)
        np.testing.assert_array_equal(fit_kwargs["y"], self.y.values)

        # Verify attributes set
        self.assertIsNotNone(clf.model_)

    def test_predict(self):
        # Mock predict return (AutoKeras returns array of shape (N, 1))
        self.mock_ak_instance.predict.return_value = np.array([[0], [1], [0], [1]])

        clf = AutoKerasClassifierWrapper()
        clf.fit(self.X, self.y)

        preds = clf.predict(self.X)

        self.assertIsInstance(preds, np.ndarray)
        self.assertEqual(preds.shape, (4,))
        np.testing.assert_array_equal(preds, np.array([0, 1, 0, 1]))

    def test_predict_proba(self):
        # Mock export_model and its predict method
        mock_keras_model = MagicMock()
        self.mock_ak_instance.export_model.return_value = mock_keras_model
        # Return (N, 1) probabilities for binary classification
        mock_keras_model.predict.return_value = np.array([[0.1], [0.9], [0.2], [0.8]])

        clf = AutoKerasClassifierWrapper()
        clf.fit(self.X, self.y)

        probas = clf.predict_proba(self.X)

        self.assertIsInstance(probas, np.ndarray)
        self.assertEqual(probas.shape, (4, 2))  # Should be converted to (N, 2)
        self.assertAlmostEqual(probas[0, 0], 0.9)  # 1 - 0.1
        self.assertAlmostEqual(probas[0, 1], 0.1)

        # Verify predict called on internal keras model with numpy array
        mock_keras_model.predict.assert_called_once()
        np.testing.assert_array_equal(
            mock_keras_model.predict.call_args[0][0], self.X.values
        )
104+
105+
106+
class TestAutoKerasClassifierClass(unittest.TestCase):
    """Checks the parameter-space shape chosen by AutoKerasClassifierClass."""

    def test_structure(self):
        """Grid search yields a list; Bayesian search yields a dict."""
        globals_target = (
            "ml_grid.model_classes.auto_keras_classifier_class.global_parameters"
        )
        with patch(globals_target) as fake_globals:
            # Case 1: Grid Search
            fake_globals.test_mode = False
            fake_globals.bayessearch = False
            grid_config = AutoKerasClassifierClass()
            self.assertEqual(grid_config.method_name, "AutoKerasClassifier")
            self.assertIsInstance(grid_config.parameter_space, list)

            # Case 2: Bayes Search
            fake_globals.bayessearch = True
            bayes_space = AutoKerasClassifierClass().parameter_space
            self.assertIsInstance(bayes_space, dict)
            self.assertIn("max_trials", bayes_space)
123+
124+
125+
if __name__ == "__main__":
126+
unittest.main()

0 commit comments

Comments
 (0)