Skip to content

Commit c8a81d8

Browse files
author
SamoraHunter
committed
Fix plotting failures and robustify data cleaning
Addressed issues causing empty plots and crashes during result visualization:

- `ml_grid/results_processing/core.py`: Updated `get_clean_data` and `validate_data_structure` to robustly handle mixed data types in the 'failed' column (e.g., "Timeout" strings vs. 0/False). Updated `get_outcome_summary` to use this robust cleaning logic.
- `ml_grid/results_processing/plot_algorithms.py`:
  - Removed an erroneous `.reset_index()` call on a Seaborn heatmap object (which returns an Axes, not a DataFrame).
  - Added a check for empty stability data to prevent crashes when calculating standard deviation for algorithms with only a single run.
- `ml_grid/results_processing/filters.py`: Replaced brittle `data["failed"] == 0` checks with `get_clean_data()` to ensure failed runs are correctly filtered across all analysis methods.
1 parent 05266b1 commit c8a81d8

4 files changed

Lines changed: 29 additions & 10 deletions

File tree

ml_grid/results_processing/core.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -416,7 +416,7 @@ def get_outcome_summary(self, data: Optional[pd.DataFrame] = None) -> pd.DataFra
416416
available_metrics = [col for col in metrics if col in data.columns]
417417

418418
# Clean data (remove failed runs)
419-
clean_data = data[data["failed"] == 0] if "failed" in data.columns else data
419+
clean_data = get_clean_data(data)
420420

421421
# Group by outcome variable and calculate summary stats
422422
outcome_summary = (
@@ -479,7 +479,10 @@ def validate_data_structure(df: pd.DataFrame) -> Dict[str, Any]:
479479

480480
# Check for data quality issues
481481
if "failed" in df.columns:
482-
failed_count = (df["failed"] == 1).sum()
482+
# Robust check for failures handling mixed types (str/bool/int)
483+
failed_as_str = df["failed"].astype(str).str.lower()
484+
success_values = ["false", "0", "0.0"]
485+
failed_count = (~failed_as_str.isin(success_values)).sum()
483486
if failed_count > 0:
484487
validation_report["data_quality_issues"].append(
485488
f"{failed_count} failed runs detected"
@@ -558,7 +561,11 @@ def get_clean_data(df: pd.DataFrame, remove_failed: bool = True) -> pd.DataFrame
558561
pd.DataFrame: The cleaned DataFrame.
559562
"""
560563
if remove_failed and "failed" in df.columns:
561-
return df[df["failed"] == 0].copy()
564+
# Robustly identify failed rows handling mixed types (bool, int, str)
565+
# We consider the row failed if it is NOT explicitly False, 0, "False", or "0"
566+
failed_as_str = df["failed"].astype(str).str.lower()
567+
success_values = ["false", "0", "0.0"]
568+
return df[failed_as_str.isin(success_values)].copy()
562569
return df.copy()
563570

564571

ml_grid/results_processing/filters.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -504,9 +504,8 @@ def get_outcome_characteristics(self) -> pd.DataFrame:
504504

505505
# Basic counts
506506
char_dict["total_experiments"] = len(outcome_data)
507-
char_dict["successful_experiments"] = len(
508-
outcome_data[outcome_data["failed"] == 0]
509-
)
507+
successful_data = get_clean_data(outcome_data)
508+
char_dict["successful_experiments"] = len(successful_data)
510509
char_dict["success_rate"] = (
511510
char_dict["successful_experiments"] / char_dict["total_experiments"]
512511
)
@@ -518,7 +517,6 @@ def get_outcome_characteristics(self) -> pd.DataFrame:
518517
char_dict[char] = outcome_data[char].median()
519518

520519
# Performance characteristics
521-
successful_data = outcome_data[outcome_data["failed"] == 0]
522520
if len(successful_data) > 0:
523521
for metric in ["auc", "f1", "precision", "recall", "accuracy"]:
524522
if metric in successful_data.columns:
@@ -551,7 +549,7 @@ def find_similar_outcomes(
551549
similarity_metrics = ["auc", "f1"]
552550

553551
# Get successful runs only
554-
successful_data = self.data[self.data["failed"] == 0]
552+
successful_data = get_clean_data(self.data)
555553

556554
# Get reference outcome performance
557555
ref_data = successful_data[

ml_grid/results_processing/plot_algorithms.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -535,9 +535,18 @@ def plot_algorithm_stability(
535535
stability = (
536536
self.clean_data.groupby("method_name")[metric]
537537
.std()
538+
.dropna()
538539
.sort_values(ascending=True)
539540
)
540541

542+
if stability.empty:
543+
warnings.warn(
544+
f"No stability data available for metric '{metric}'. "
545+
"This usually happens if each algorithm was run only once (std is undefined).",
546+
stacklevel=2,
547+
)
548+
return
549+
541550
# Select top N most stable
542551
stability = stability.head(top_n)
543552

@@ -560,7 +569,8 @@ def plot_algorithm_stability(
560569
)
561570
ax.set_ylabel("Algorithm", fontsize=12)
562571

563-
ax.bar_label(ax.containers[0], fmt="%.4f", padding=3)
572+
if ax.containers:
573+
ax.bar_label(ax.containers[0], fmt="%.4f", padding=3)
564574
plt.tight_layout()
565575
plt.show()
566576

@@ -813,6 +823,6 @@ def plot_statistical_significance_heatmap(
813823
cmap="coolwarm_r",
814824
center=0.05,
815825
cbar_kws={"label": "P-value"},
816-
).reset_index()
826+
)
817827
plt.title(title, fontsize=14, fontweight="bold")
818828
plt.show()

ml_grid/util/project_score_save.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,10 @@ def update_score_log(
192192
y_test_np = y_test.values if hasattr(y_test, "values") else y_test
193193
best_pred_np = best_pred_orig.values if hasattr(best_pred_orig, "values") else best_pred_orig
194194

195+
# Ensure 1D arrays to prevent shape mismatch errors
196+
y_test_np = np.ravel(y_test_np)
197+
best_pred_np = np.ravel(best_pred_np)
198+
195199
# Attempt to convert to integers (e.g. "0"/"1" strings from H2O) for faster np.unique
196200
try:
197201
y_test_np = y_test_np.astype(int)

0 commit comments

Comments
 (0)