Skip to content

Commit b694978

Browse files
committed
feat: add FLAML classifier wrapper and configuration
This commit introduces support for FLAML (Fast and Lightweight AutoML) within the ml_grid framework. Key changes:

- Added `FLAMLClassifierWrapper`: a scikit-learn compatible wrapper for `flaml.AutoML`. It handles model fitting, prediction, and probability estimation, with robust error handling for missing dependencies and runtime exceptions.
- Added `FLAMLClassifierClass`: a configuration class that defines the parameter space (specifically `time_budget`) for both grid search and Bayesian optimization modes.
- Added `tests/test_flaml_classifier.py`: comprehensive unit tests covering initialization, fitting, prediction, and configuration logic, using mocks for external dependencies.

This enables FLAML to be used as a standard classifier in the existing grid search and hyperparameter optimization pipelines.
1 parent 54fb97b commit b694978

3 files changed

Lines changed: 296 additions & 0 deletions

File tree

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
"""FLAML Classifier Wrapper.
2+
3+
This module provides a scikit-learn compatible wrapper for FLAML's AutoML.
4+
"""
5+
6+
import logging
7+
from typing import Union, List
8+
9+
import numpy as np
10+
import pandas as pd
11+
from sklearn.base import BaseEstimator, ClassifierMixin
12+
from sklearn.utils.validation import check_is_fitted
13+
14+
# Attempt to import FLAML
15+
try:
16+
from flaml import AutoML
17+
except ImportError:
18+
AutoML = None
19+
20+
logger = logging.getLogger(__name__)
21+
22+
23+
class FLAMLClassifierWrapper(BaseEstimator, ClassifierMixin):
    """A scikit-learn compatible wrapper for FLAML's AutoML classifier.

    Exposes ``flaml.AutoML`` through the standard estimator interface
    (``fit`` / ``predict`` / ``predict_proba``) so it can be used inside
    sklearn pipelines, grid searches, and cross-validation loops.

    Parameters
    ----------
    time_budget : int, default=60
        Search time budget, in seconds.
    metric : str, default="auto"
        Optimization metric forwarded to FLAML.
    task : str, default="classification"
        FLAML task type.
    n_jobs : int, default=-1
        Number of parallel jobs used by FLAML.
    eval_method : str, default="auto"
        Resampling strategy forwarded to FLAML ("holdout", "cv", or "auto").
    split_ratio : float, default=0.2
        Holdout validation fraction (used with holdout evaluation).
    n_splits : int, default=5
        Number of CV folds (used with cross-validation evaluation).
    log_file_name : str, default="flaml.log"
        Path of FLAML's search log file.
    seed : int, default=42
        Random seed for reproducibility.
    verbose : int, default=0
        FLAML verbosity level.
    estimator_list : str or list of str, default="auto"
        Candidate estimator names FLAML may search over.
    """

    def __init__(
        self,
        time_budget: int = 60,
        metric: str = "auto",
        task: str = "classification",
        n_jobs: int = -1,
        eval_method: str = "auto",
        split_ratio: float = 0.2,
        n_splits: int = 5,
        log_file_name: str = "flaml.log",
        seed: int = 42,
        verbose: int = 0,
        estimator_list: Union[str, List[str]] = "auto",
    ):
        # sklearn convention: __init__ only records hyperparameters verbatim
        # (no validation, no work) so get_params()/set_params() round-trip.
        self.time_budget = time_budget
        self.metric = metric
        self.task = task
        self.n_jobs = n_jobs
        self.eval_method = eval_method
        self.split_ratio = split_ratio
        self.n_splits = n_splits
        self.log_file_name = log_file_name
        self.seed = seed
        self.verbose = verbose
        self.estimator_list = estimator_list

        # Fitted AutoML instance; remains None until fit() is called.
        self.model_ = None

    def fit(
        self,
        X: Union[np.ndarray, pd.DataFrame],
        y: Union[np.ndarray, pd.Series],
        **kwargs,
    ) -> "FLAMLClassifierWrapper":
        """Run the FLAML AutoML search on the training data.

        Parameters
        ----------
        X : array-like or DataFrame
            Training features.
        y : array-like or Series
            Training labels.
        **kwargs
            Extra keyword arguments forwarded to ``AutoML.fit``.

        Returns
        -------
        FLAMLClassifierWrapper
            The fitted wrapper (``self``).

        Raises
        ------
        ImportError
            If the optional ``flaml`` dependency is not installed.
        RuntimeError
            If the FLAML search fails, or finds no usable model within
            the given time budget.
        """
        if AutoML is None:
            raise ImportError(
                "FLAML is not installed. Please install it to use FLAMLClassifierWrapper."
            )

        self.model_ = AutoML()

        try:
            self.model_.fit(
                X_train=X,
                y_train=y,
                time_budget=self.time_budget,
                metric=self.metric,
                task=self.task,
                n_jobs=self.n_jobs,
                eval_method=self.eval_method,
                split_ratio=self.split_ratio,
                n_splits=self.n_splits,
                log_file_name=self.log_file_name,
                seed=self.seed,
                verbose=self.verbose,
                estimator_list=self.estimator_list,
                **kwargs,
            )
        except StopIteration:
            # FLAML can raise StopIteration internally when used within
            # scikit-learn's cross-validation framework. Swallow it so it
            # does not crash the joblib parallel backend; the model is
            # still fitted at this point.
            logger.debug(
                "Caught StopIteration from FLAML, which is expected in some CV scenarios."
            )
        except Exception as e:
            # Catch any other errors during fit (e.g. AttributeError from
            # FLAML's runner) and surface them as a RuntimeError, keeping
            # the original exception chained for debugging.
            logger.error(f"FLAML fit failed: {e}")
            raise RuntimeError(f"FLAML fit failed: {e}") from e

        # After fitting, check if a model was actually found. This is crucial
        # because if the time_budget is too short, FLAML may not find any
        # valid model; fail loudly rather than predict garbage later.
        if self.model_.best_estimator is None:
            msg = (
                "FLAML failed to find a usable model within the given time_budget. "
                "This may be due to a time limit that is too short, or very complex data."
            )
            logger.error(msg)
            raise RuntimeError(msg)

        if hasattr(self.model_, "classes_"):
            self.classes_ = self.model_.classes_
        else:
            # If fit ended early (e.g. via the StopIteration path) before
            # FLAML set classes_, infer them from y for sklearn compatibility.
            # np.asarray handles ndarray, Series, and DataFrame inputs alike.
            self.classes_ = np.unique(np.asarray(y))
        return self

    def _fallback_predictions(self, X) -> np.ndarray:
        """Return a constant prediction of ``classes_[0]`` for every row.

        NOTE(review): this is the first (lowest-sorted) class label, not
        necessarily the true majority class, despite the log wording.
        """
        return np.full(len(X), self.classes_[0], dtype=self.classes_.dtype)

    def _uniform_probabilities(self, X) -> np.ndarray:
        """Return an (n_samples, n_classes) matrix of uniform probabilities."""
        n_classes = len(self.classes_)
        return np.full((len(X), n_classes), 1 / n_classes)

    def predict(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
        """Predict class labels; fall back to a constant prediction on failure.

        Parameters
        ----------
        X : array-like or DataFrame
            Samples to classify.

        Returns
        -------
        np.ndarray
            Predicted class labels, one per row of X.
        """
        check_is_fitted(self, ["model_", "classes_"])
        try:
            predictions = self.model_.predict(X)
        except Exception as e:
            logger.error(f"FLAML predict failed: {e}. Returning dummy predictions.")
            return self._fallback_predictions(X)
        if predictions is None:
            logger.warning(
                "FLAML predict() returned None. Returning dummy predictions (majority class)."
            )
            return self._fallback_predictions(X)
        return predictions

    def predict_proba(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
        """Predict class probabilities; fall back to uniform on failure.

        Parameters
        ----------
        X : array-like or DataFrame
            Samples to classify.

        Returns
        -------
        np.ndarray
            Array of shape (n_samples, n_classes) of class probabilities.
        """
        check_is_fitted(self, ["model_", "classes_"])
        try:
            probas = self.model_.predict_proba(X)
        except Exception as e:
            logger.error(
                f"FLAML predict_proba failed: {e}. Returning dummy probabilities."
            )
            return self._uniform_probabilities(X)
        if probas is None:
            logger.warning(
                "FLAML predict_proba() returned None. Returning dummy probabilities."
            )
            return self._uniform_probabilities(X)
        return probas
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
"""FLAML Classifier Configuration.
2+
3+
This module contains the FLAMLClassifierClass, which is a configuration
4+
class for the FLAMLClassifierWrapper.
5+
"""
6+
7+
import logging
8+
from typing import Any, Dict, List, Optional, Union
9+
10+
import pandas as pd
11+
from skopt.space import Integer
12+
13+
from ml_grid.model_classes.FLAMLClassifierWrapper import FLAMLClassifierWrapper
14+
from ml_grid.util.global_params import global_parameters
15+
16+
logger = logging.getLogger(__name__)
17+
18+
19+
class FLAMLClassifierClass:
    """Configuration class for FLAMLClassifierWrapper.

    Bundles the estimator instance, its display name, and the
    hyperparameter search space in the shape expected by the ml_grid
    search pipelines.
    """

    def __init__(
        self,
        X: Optional[pd.DataFrame] = None,
        y: Optional[pd.Series] = None,
        parameter_space_size: Optional[str] = None,
    ):
        """Build the configuration.

        Args:
            X: Optional feature frame (stored for interface parity).
            y: Optional target series (stored for interface parity).
            parameter_space_size: Accepted for interface parity; not used
                by this configuration.
        """
        self.X = X
        self.y = y
        self.algorithm_implementation = FLAMLClassifierWrapper()
        self.method_name = "FLAMLClassifier"

        self.parameter_space: Union[List[Dict[str, Any]], Dict[str, Any]]

        if global_parameters.bayessearch:
            # Bayesian optimisation expects a single dict of skopt dimensions.
            search_space: Dict[str, Any] = {"time_budget": Integer(1, 5)}
            self.parameter_space = search_space
        else:
            # Grid search expects a list of dicts mapping names to value lists.
            self.parameter_space = [{"time_budget": [1, 2]}]

tests/test_flaml_classifier.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
import unittest
2+
from unittest.mock import MagicMock, patch
3+
import pandas as pd
4+
import numpy as np
5+
6+
from ml_grid.model_classes.FLAMLClassifierWrapper import FLAMLClassifierWrapper
7+
from ml_grid.model_classes.flaml_classifier_class import FLAMLClassifierClass
8+
9+
10+
class TestFLAMLClassifier(unittest.TestCase):
    """Unit tests for FLAMLClassifierWrapper with FLAML mocked out."""

    def setUp(self):
        # Small, deterministic toy dataset shared across the tests.
        features = {
            "feature_0": [1.0, 2.0, 3.0, 4.0],
            "feature_1": [4.0, 3.0, 2.0, 1.0],
        }
        self.X = pd.DataFrame(features)
        self.y = pd.Series([0, 1, 0, 1], name="target")

    def test_init(self):
        # The constructor should record hyperparameters without fitting.
        wrapper = FLAMLClassifierWrapper(time_budget=120, metric="roc_auc")
        self.assertEqual(wrapper.time_budget, 120)
        self.assertEqual(wrapper.metric, "roc_auc")
        self.assertIsNone(wrapper.model_)

    @patch("ml_grid.model_classes.FLAMLClassifierWrapper.AutoML")
    def test_fit(self, automl_factory):
        # Replace the AutoML class with a factory returning a mock instance.
        automl = MagicMock()
        automl_factory.return_value = automl

        wrapper = FLAMLClassifierWrapper(time_budget=60)
        wrapper.fit(self.X, self.y)

        # Exactly one AutoML instance should be created ...
        automl_factory.assert_called_once()
        # ... and fitted exactly once, with our hyperparameters forwarded.
        automl.fit.assert_called_once()
        _, fit_kwargs = automl.fit.call_args
        self.assertEqual(fit_kwargs["time_budget"], 60)
        self.assertEqual(fit_kwargs["task"], "classification")

        # The fitted model must be retained on the wrapper.
        self.assertIsNotNone(wrapper.model_)

    @patch("ml_grid.model_classes.FLAMLClassifierWrapper.AutoML")
    def test_predict(self, automl_factory):
        automl = MagicMock()
        automl_factory.return_value = automl
        automl.predict.return_value = np.array([0, 1, 0, 1])

        wrapper = FLAMLClassifierWrapper()
        wrapper.fit(self.X, self.y)
        predictions = wrapper.predict(self.X)

        self.assertIsInstance(predictions, np.ndarray)
        np.testing.assert_array_equal(predictions, np.array([0, 1, 0, 1]))
        # The wrapper must delegate to the underlying model exactly once.
        automl.predict.assert_called_once_with(self.X)

    def test_missing_flaml(self):
        # With AutoML patched to None (simulating a missing install),
        # fit() must raise ImportError rather than fail obscurely.
        with patch("ml_grid.model_classes.FLAMLClassifierWrapper.AutoML", None):
            wrapper = FLAMLClassifierWrapper()
            with self.assertRaises(ImportError):
                wrapper.fit(self.X, self.y)
class TestFLAMLClassifierClass(unittest.TestCase):
    """Tests for the FLAMLClassifierClass configuration object."""

    def test_structure(self):
        # Patch global_parameters so the test controls the bayessearch flag.
        target = "ml_grid.model_classes.flaml_classifier_class.global_parameters"
        with patch(target) as fake_globals:
            # Grid-search mode: the parameter space is a list of dicts.
            fake_globals.bayessearch = False
            config = FLAMLClassifierClass()
            self.assertEqual(config.method_name, "FLAMLClassifier")
            self.assertIsInstance(
                config.algorithm_implementation, FLAMLClassifierWrapper
            )
            self.assertIsInstance(config.parameter_space, list)

            # Bayesian mode: the parameter space is a dict of skopt dimensions.
            fake_globals.bayessearch = True
            config = FLAMLClassifierClass()
            self.assertIsInstance(config.parameter_space, dict)
            self.assertIn("time_budget", config.parameter_space)
# Allow running this test module directly (e.g. `python tests/test_flaml_classifier.py`).
if __name__ == "__main__":
    unittest.main()

0 commit comments

Comments
 (0)