Skip to content

Commit 5b2c708

Browse files
Add script for statistical analysis WIP (#1155)
* Add script for statistical analysis WIP * Fix dataset pool names * Update scripts for statistical analysis * Add doc and more cosmetic changes --------- Co-authored-by: Anwai Archit <anwai.archit@gmail.com>
1 parent 504a7e9 commit 5b2c708

4 files changed

Lines changed: 269 additions & 44 deletions

File tree

scripts/apg_experiments/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,4 @@ The top-level folder contains scripts to evaluate other models with `micro-sam`,
2121
- `plot_qualitative.py`: Scripts to display qualitative results over all datasets.
2222
- `plot_quantitative.py`: Scripts to display quantitative results over all datasets.
2323
- `plot_util.py`: Stores related information helpful for plotting.
24+
- `statistical_analysis`: Scripts for performing statistical analysis on quantitative results computed per image.

scripts/apg_experiments/prepare_baselines.py

Lines changed: 28 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -65,16 +65,18 @@ def run_baseline_engine(image, method, **kwargs):
6565

6666
def run_default_baselines(dataset_name, method, model_type, experiment_folder, target=None):
6767
# Prepare the results folder.
68-
res_folder = os.path.join(experiment_folder, "results")
68+
res_folder = os.path.join(experiment_folder, "results", method, model_type)
6969
inference_folder = os.path.join(experiment_folder, "inference", f"{dataset_name}_{method}_{model_type}")
7070
os.makedirs(res_folder, exist_ok=True)
7171
os.makedirs(inference_folder, exist_ok=True)
7272

73-
fnext = (target if model_type == "sam3" else model_type)
74-
csv_path = os.path.join(res_folder, f"{dataset_name}_{method}_{fnext}.csv")
73+
csv_path = os.path.join(res_folder, f"{dataset_name}.csv")
7574
if os.path.exists(csv_path):
76-
print(pd.read_csv(csv_path))
77-
print(f"The results are computed and stored at '{csv_path}'.")
75+
df = pd.read_csv(csv_path)
76+
print(df)
77+
mean_msa = df["msa"].mean()
78+
print(f"\nThe results are computed and stored at '{csv_path}'.")
79+
print(f"Mean MSA for {dataset_name}: {mean_msa:.4f}")
7880
return
7981

8082
# Get the image and label paths.
@@ -83,7 +85,7 @@ def run_default_baselines(dataset_name, method, model_type, experiment_folder, t
8385
assert isinstance(method, str)
8486
kwargs = {}
8587
if method in ["ais", "amg"]:
86-
predictor, segmenter = get_predictor_and_segmenter(model_type=model_type, segmentation_mode="amg")
88+
predictor, segmenter = get_predictor_and_segmenter(model_type=model_type, segmentation_mode=method)
8789
kwargs["predictor"] = predictor
8890
kwargs["segmenter"] = segmenter
8991
elif method == "apg":
@@ -105,7 +107,7 @@ def run_default_baselines(dataset_name, method, model_type, experiment_folder, t
105107
kwargs["processor"] = Sam3Processor(model)
106108
kwargs["prompt"] = target
107109

108-
msas, sa50s, precisions, recalls, f1s = [], [], [], [], []
110+
per_image_results = []
109111
for curr_image_path, curr_label_path in tqdm(
110112
zip(image_paths, label_paths), total=len(image_paths),
111113
desc=f"Run '{method}' baseline for '{model_type}' on '{dataset_name}'",
@@ -124,23 +126,25 @@ def run_default_baselines(dataset_name, method, model_type, experiment_folder, t
124126
fname = os.path.join(inference_folder, f"{Path(curr_image_path).stem}.tif")
125127
imageio.imwrite(fname, segmentation, compression="zlib")
126128

127-
msas.append(msa)
128-
sa50s.append(sas[0])
129-
precisions.append(stats["precision"])
130-
recalls.append(stats["recall"])
131-
f1s.append(stats["f1"])
132-
133-
results = {
134-
"mSA": np.mean(msas),
135-
"SA50": np.mean(sa50s),
136-
"Precision": np.mean(precisions),
137-
"Recall": np.mean(recalls),
138-
"F1": np.mean(f1s),
139-
}
140-
results = pd.DataFrame.from_dict([results])
141-
results.to_csv(csv_path)
142-
print(results)
143-
print(f"The results above are stored at '{csv_path}'.")
129+
# Store per-image metrics
130+
per_image_results.append({
131+
"image": os.path.basename(curr_image_path),
132+
"label": os.path.basename(curr_label_path),
133+
"msa": msa,
134+
"sa50": sas[0],
135+
"precision": stats["precision"],
136+
"recall": stats["recall"],
137+
"f1": stats["f1"],
138+
})
139+
140+
# Create DataFrame with per-image results
141+
results_df = pd.DataFrame(per_image_results)
142+
results_df.to_csv(csv_path, index=False)
143+
print(results_df)
144+
145+
# Compute and print mean MSA
146+
mean_msa = results_df["msa"].mean()
147+
print(f"\nMean MSA for {dataset_name}: {mean_msa:.4f}")
144148

145149

146150
def main(args):
Lines changed: 222 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,222 @@
1+
import numpy as np
2+
import pandas as pd
3+
import seaborn as sns
4+
import matplotlib.pyplot as plt
5+
from scipy.stats import shapiro, wilcoxon
6+
7+
8+
METRIC = "msa"
CRITERION = 0.05


def statistical_analysis_dataset(dataset, method1_path, method2_path, verbose=True):
    """Run a paired statistical test comparing two methods on one dataset.

    Loads the per-image metric values (column ``METRIC``) for both methods
    from ``./results/<method_path>/<dataset>.csv`` and applies a one-sided
    Wilcoxon signed-rank test to the paired differences.

    Args:
        dataset: Name of the dataset (the CSV filename stem).
        method1_path: Path fragment below './results' for the first method.
        method2_path: Path fragment below './results' for the second method.
        verbose: Whether to print the normality check and the test outcome.

    Returns:
        Tuple ``(is_better, is_significant)``: ``is_better`` is True if
        method1 scores higher overall, ``is_significant`` is True if the
        difference passes the significance criterion.
    """
    res1 = pd.read_csv(f"./results/{method1_path}/{dataset}.csv")[METRIC].values
    res2 = pd.read_csv(f"./results/{method2_path}/{dataset}.csv")[METRIC].values
    assert res1.shape == res2.shape

    diff = res1 - res2

    # Edge case: identical per-image results. 'wilcoxon' raises a ValueError
    # for an all-zero difference vector, so report "no difference" directly.
    if not np.any(diff):
        if verbose:
            print("Both methods have identical results; no test performed.")
        return False, False

    # Normality check is informational only; the Wilcoxon test below does
    # not assume a gaussian distribution of the differences.
    _, p_gauss = shapiro(diff)
    if verbose:
        print("P-value for gaussian distribution:", p_gauss)

    # Direction of the hypothesis: whichever method has the larger total score.
    is_better = diff.sum() > 0
    _, p = wilcoxon(diff, alternative="greater" if is_better else "less")
    is_significant = p < CRITERION

    if verbose:
        print(
            "Hypothesis:", method1_path if is_better else method2_path, "is better than",
            method2_path if is_better else method1_path
        )
        print("Result:", "True" if is_significant else "False", f"(p = {p:.4f})")

    return is_better, is_significant
35+
36+
37+
def statistical_analysis_pair(datasets, method1_path, method2_path, verbose=False):
    """Compare two methods over a collection of datasets.

    Runs the per-dataset statistical test for every dataset and tallies how
    often each method wins significantly, or whether no significant
    difference was found.

    Args:
        datasets: Sequence of dataset names to compare on.
        method1_path: Path fragment below './results' for the first method.
        method2_path: Path fragment below './results' for the second method.
        verbose: Whether to print per-dataset details and the final tally.

    Returns:
        Tuple ``(better1, better2, neutral)`` with the significant win counts
        of each method and the number of datasets without a significant
        difference.
    """
    wins_first, wins_second, ties = 0, 0, 0

    for dataset in datasets:
        is_better, is_significant = statistical_analysis_dataset(
            dataset, method1_path, method2_path, verbose=verbose
        )
        if not is_significant:
            ties += 1
        elif is_better:
            wins_first += 1
        else:
            wins_second += 1

    # Sanity check: every dataset landed in exactly one bucket.
    assert wins_first + wins_second + ties == len(datasets)
    if verbose:
        print(method1_path, "better than", method2_path, ":", wins_first)
        print(method2_path, "better than", method1_path, ":", wins_second)
        print("No difference:", ties)
    return wins_first, wins_second, ties
57+
58+
59+
def get_datasets(domain):
    """Return the benchmark datasets belonging to one imaging domain.

    Args:
        domain: One of "fluo_cells", "fluo_nuclei", "label_free",
            "histopatho". An unknown domain raises a ``KeyError``.

    Returns:
        List of dataset names for the requested domain (always nine).
    """
    fluo_cell_pool = [
        "cellpose", "covid_if", "hpa", "plantseg_root", "plantseg_ovules",
        "pnas_arabidopsis", "tissuenet", "cellbindb", "mouse_embryo",
    ]
    fluo_nuclei_pool = [
        "arvidsson", "bitdepth_nucseg", "dsb", "dynamicnuclearnet",
        "gonuclear", "ifnuclei", "nis3d", "parhyale_regen", "u20s",
    ]
    label_free_pool = [
        "deepbacs", "deepseas", "livecell", "omnipose", "usiigaci",
        "vicar", "toiam", "yeaz", "segpc",
    ]
    histopatho_pool = [
        "cytodark0", "ihc_tma", "monuseg", "lynsec", "nuinsseg",
        "pannuke", "puma", "tnbc", "cryonuseg",
    ]

    domain_to_ds = {
        "fluo_cells": fluo_cell_pool,
        "fluo_nuclei": fluo_nuclei_pool,
        "label_free": label_free_pool,
        "histopatho": histopatho_pool,
    }
    datasets = domain_to_ds[domain]
    # Every pool is expected to contain exactly nine datasets.
    assert len(datasets) == 9
    return datasets
109+
110+
111+
def _plot_comparison_heatmap(domain, comparison_df, title=None):
    """Render the pairwise comparison table as an annotated heatmap.

    Saves 'comparison_heatmap_<domain>.png' and '.svg' in the working
    directory. ``comparison_df`` is a square DataFrame whose off-diagonal
    cells are strings of the form "wins_row / wins_col / neutral" and whose
    diagonal cells are "-".
    """
    # Extract wins for method in row vs method in column
    n = len(comparison_df)
    win_matrix = np.zeros((n, n))

    for i in range(n):
        for j in range(n):
            if i != j:
                # Cell format is "a / b / c"; the first part is the row
                # method's significant win count, which drives the cell color.
                parts = comparison_df.iloc[i, j].split(' / ')
                win_matrix[i, j] = int(parts[0])  # wins for row method

    # Masking the diagonal to exclude it from coloring.
    mask = np.eye(n, dtype=bool)

    fig, ax = plt.subplots(figsize=(10, 8))
    # Color by win count but annotate with the full "a / b / c" strings.
    # Centering the diverging colormap at half the dataset count makes
    # "wins on more than half the datasets" read as green.
    sns.heatmap(
        win_matrix, annot=comparison_df.values, fmt='',
        cmap='RdYlGn', center=len(get_datasets(domain))/2,
        xticklabels=comparison_df.columns,
        yticklabels=comparison_df.index,
        cbar_kws={'label': 'Wins'}, ax=ax,
        mask=mask, linewidths=0.5, linecolor='#A9A9A9'
    )

    plt.title(title)
    plt.tight_layout()
    plt.savefig(f'comparison_heatmap_{domain}.png', dpi=400, bbox_inches='tight')
    plt.savefig(f'comparison_heatmap_{domain}.svg', dpi=400, bbox_inches='tight')
    plt.close()
140+
141+
142+
def compare_all():
    """Run all pairwise method comparisons per domain and plot heatmaps.

    For every imaging domain, compares each pair of methods across the
    domain's datasets via the Wilcoxon-based analysis and renders an
    annotated win/loss/tie heatmap ('comparison_heatmap_<domain>.png/.svg').
    """
    # Sorting out the paths where the methods' results exist.
    method_configs = {
        "amg": "amg/vit_b",
        "ais_lm": "ais/vit_b_lm",
        "ais_histo": "ais/vit_b_histopathology",
        "cellpose3": "cellpose/cyto3",
        "cellpose4": "cellpose/cpsam",
        "cellsam": "cellsam/cellsam",
        "sam3": "sam3/cell",
        "apg_lm": "apg/vit_b_lm",
        "apg_histo": "apg/vit_b_histopathology",
    }

    # Methods to compare within each domain (histopathology uses the
    # pathology-specific model variants).
    domain_methods = {
        "fluo_cells": ["amg", "ais_lm", "cellpose3", "cellpose4", "cellsam", "sam3", "apg_lm"],
        "fluo_nuclei": ["amg", "ais_lm", "cellpose3", "cellpose4", "cellsam", "sam3", "apg_lm"],
        "label_free": ["amg", "ais_lm", "cellpose3", "cellpose4", "cellsam", "sam3", "apg_lm"],
        "histopatho": ["amg", "ais_histo", "cellpose3", "cellpose4", "cellsam", "sam3", "apg_histo"],
    }

    # Human-readable method names used as heatmap tick labels.
    display_names = {
        "amg": "AMG (SAM)",
        "ais_lm": "AIS (μSAM)",
        "ais_histo": "AIS\n(PathoSAM)",
        "cellsam": "CellSAM",
        "cellpose3": "Cellpose 3",
        "cellpose4": "CellposeSAM",
        "sam3": "SAM3",
        "apg_lm": r"$\mathbf{APG}$" + r" $\mathbf{(μSAM)}$",
        "apg_histo": r"$\mathbf{APG}$" + "\n" + r"$\mathbf{(PathoSAM)}$",
    }

    # Plot titles per domain.
    custom_titles = {
        "fluo_cells": "Fluorescence Microscopy (Cell Segmentation)",
        "fluo_nuclei": "Fluorescence Microscopy (Nucleus Segmentation)",
        "label_free": "Label-Free Microscopy (Cell Segmentation)",
        "histopatho": "Histopathology (Nucleus Segmentation)",
    }

    for domain in ["fluo_cells", "fluo_nuclei", "label_free", "histopatho"]:
        datasets = get_datasets(domain)
        methods = domain_methods[domain]

        # Build the pairwise comparison table row by row; diagonal cells
        # (a method against itself) are marked "-".
        table = []
        for row_idx, row_method in enumerate(methods):
            row_cells = []
            for col_idx, col_method in enumerate(methods):
                if row_idx == col_idx:
                    row_cells.append("-")
                    continue

                wins_row, wins_col, no_diff = statistical_analysis_pair(
                    datasets, method_configs[row_method], method_configs[col_method]
                )
                row_cells.append(f"{wins_row} / {wins_col} / {no_diff}")
            table.append(row_cells)

        # Let's use expected display names.
        labels = [display_names[m] for m in methods]
        comparison = pd.DataFrame(table, index=labels, columns=labels)

        # Let's visualize the results
        _plot_comparison_heatmap(domain, comparison, title=custom_titles[domain])
        print(f"Generated heatmap for {domain}: comparison_heatmap_{domain}.png")
215+
216+
217+
def main():
    """Entry point: run all pairwise method comparisons and create heatmaps."""
    compare_all()


if __name__ == "__main__":
    main()

scripts/apg_experiments/submit_evaluation.py

Lines changed: 18 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -9,26 +9,21 @@ def write_batch_script(
99
):
1010
"""Writing scripts to submit multiple evaluations relevant for APG.
1111
"""
12-
if method == "cellpose":
13-
if model_type == "cyto3":
14-
env = "cp3"
15-
elif model_type == "cpsam":
16-
env = "cp4"
17-
else:
18-
raise ValueError
12+
if method == "cellpose" and model_type == "cyto3":
13+
env = "cp3"
1914
else:
2015
env = "super"
2116

2217
batch_script = f"""#!/bin/bash
2318
#SBATCH -t 2-00:00:00
2419
#SBATCH --nodes=1
2520
#SBATCH --ntasks=1
26-
#SBATCH -p grete:shared
27-
#SBATCH -G A100:1
21+
#SBATCH -p grete-h100:shared
22+
#SBATCH -G H100:1
2823
#SBATCH -A nim00007
2924
#SBATCH -c 16
3025
#SBATCH --mem 64G
31-
#SBATCH --constraint=inet,80gb
26+
#SBATCH --constraint=inet
3227
#SBATCH --job-name=apg_evaluation
3328
3429
source ~/.bashrc
@@ -83,33 +78,36 @@ def submit_slurm(args):
8378

8479
method_combinations = [
8580
# SAM-based models
86-
# ["amg", "vit_b"],
87-
# ["amg", generalist_model],
88-
# ["ais", generalist_model],
81+
["amg", "vit_b"],
82+
["ais", generalist_model],
8983
["apg", generalist_model],
9084
# SAM3
91-
# ["sam3", "cells"],
85+
["sam3", "cell"],
9286
# And other external methods.
93-
# ["cellpose", "cyto3"],
94-
# ["cellpose", "cpsam"],
95-
# ["cellsam", "cellsam"],
87+
["cellpose", "cyto3"],
88+
["cellpose", "cpsam"],
89+
["cellsam", "cellsam"],
9690
]
9791

9892
if dataset_name is None:
9993
if generalist_model == "vit_b_lm":
10094
datasets = [
10195
# Label-free
102-
"livecell", "omnipose", "deepbacs", "usiigaci", "vicar", "deepseas", "toiam",
96+
"livecell", "omnipose", "deepbacs", "usiigaci", "vicar",
97+
"deepseas", "toiam", "yeaz", "segpc",
10398
# Fluo (nuclei)
10499
"dynamicnuclearnet", "u20s", "arvidsson", "ifnuclei",
105100
"gonuclear", "nis3d", "parhyale_regen", "dsb", "bitdepth_nucseg",
106101
# Fluo (cells)
107102
"cellpose", "cellbindb", "tissuenet", "plantseg_root", "covid_if",
108-
"hpa", "plantseg_ovules", "pnas_arabidopsis",
103+
"hpa", "plantseg_ovules", "pnas_arabidopsis", "mouse_embryo",
109104
]
110105
else: # Histopathology
111106
assert generalist_model == "vit_b_histopathology"
112-
datasets = ["ihc_tma", "lynsec", "pannuke", "monuseg", "tnbc", "nuinsseg", "puma", "cytodark0"]
107+
datasets = [
108+
"ihc_tma", "lynsec", "pannuke", "monuseg", "tnbc",
109+
"nuinsseg", "puma", "cytodark0", "cryonuseg"
110+
]
113111
else:
114112
datasets = [dataset_name]
115113

0 commit comments

Comments
 (0)