Report raw scores for the sem and vc metrics

janursa · janursa · commit 2cac736e54e2 · 2025-11-26T07:07:50.000+01:00
diff --git a/scripts/configs/labels_tw.config b/scripts/configs/labels_tw.config
@@ -46,11 +46,11 @@ process {
     
 
   // Resource labels
-  withLabel: {lowtime: 1.h}
-  withLabel: {midtime: 4.h}
-  withLabel: {hightime: 8.h}
-  withLabel: {onedaytime: 24.h}
-  withLabel: {twodaytime: 48.h}
+  withLabel: lowtime {time = 1.h}
+  withLabel: midtime {time = 4.h}
+  withLabel: hightime {time = 8.h}
+  withLabel: onedaytime {time = 24.h}
+  withLabel: twodaytime {time = 48.h}
 
   withLabel: lowcpu { cpus = 5 }
   withLabel: midcpu { cpus = 15 }
diff --git a/scripts/run_all.sh b/scripts/run_all.sh
@@ -1,7 +1,7 @@
 set -e
 
 # datasets=( 'replogle' 'op' 'nakatake' 'adamson' 'norman'  'xaira_HEK293T' 'xaira_HCT116'  'parsebioscience' 'ibd_uc' 'ibd_cd' '300BCG' ) #'replogle' 'op' 'nakatake' 'adamson' 'norman'  'xaira_HEK293T' 'xaira_HCT116'  'parsebioscience' 'ibd_uc' 'ibd_cd'  '300BCG') #
-datasets=( 'op'  ) #'replogle' 'op' 'nakatake' 'adamson' 'norman'  'xaira_HEK293T' 'xaira_HCT116'  'parsebioscience' 'ibd_uc' 'ibd_cd' '300BCG') #
+datasets=( 'op' 'replogle'  ) #'replogle' 'op' 'nakatake' 'adamson' 'norman'  'xaira_HEK293T' 'xaira_HCT116'  'parsebioscience' 'ibd_uc' 'ibd_cd' '300BCG') #
 run_local=false # set to true to run locally, false to run on AWS
 
 run_grn_inference=false
diff --git a/src/metrics/sem/helper.py b/src/metrics/sem/helper.py
@@ -383,7 +383,9 @@ def main(par):
     gene_mask = np.logical_or(np.any(A, axis=1), np.any(A, axis=0))
     in_degrees = np.sum(A != 0, axis=0)
     out_degrees = np.sum(A != 0, axis=1)
-    idx = np.argsort(np.maximum(out_degrees, in_degrees))[:-par['n_top_genes']]
+    # TODO(review): n_genes is hard-coded below instead of n_genes = par['n_top_genes']
+    n_genes = 3000
+    idx = np.argsort(np.maximum(out_degrees, in_degrees))[:-n_genes]
     gene_mask[idx] = False
     X = X[:, gene_mask]
     X = X.toarray() if isinstance(X, csr_matrix) else X
@@ -445,52 +447,67 @@ def main(par):
     # Evaluate inferred GRN
     print("\n======== Evaluate inferred GRN ========")
     scores = evaluate_grn(X_controls, delta_X, is_train, is_reporter, A, signed=use_signs)
-
-    # Evaluate baseline GRN
-    print("\n======== Evaluate shuffled GRN ========")
-    scores_baseline = evaluate_grn(X_controls, delta_X, is_train, is_reporter, A_baseline, signed=use_signs)
-
-    # Keep only the genes for which both GRNs got a score
-    mask = ~np.logical_or(np.isnan(scores), np.isnan(scores_baseline))
-    scores = scores[mask]
-    scores_baseline = scores_baseline[mask]
-
-    rr_all = {}
-    # Perform rank test between actual scores and baseline
-    rr_all['spearman'] = float(np.mean(scores))
-    rr_all['spearman_shuffled'] = float(np.mean(scores_baseline))
-    if len(scores) == 0:
+    
+    # Keep only valid scores (non-NaN)
+    valid_scores = scores[~np.isnan(scores)]
+    
+    if len(valid_scores) == 0:
         # No valid genes to evaluate
-        df_results = pd.DataFrame({'sem_precision': [np.nan], 'sem_balanced': [np.nan]})
-    elif np.all(scores - scores_baseline == 0):
-        # Identical performance (suspicious - likely an error)
-        print("WARNING: Identical scores detected - possible evaluation error!")
-        df_results = pd.DataFrame({'sem_precision': [1.0], 'sem_balanced': [0.0]})
+        print("WARNING: No valid genes to evaluate!")
+        results = {'sem': [0.0]}
     else:
-        res = wilcoxon(scores - scores_baseline, zero_method='wilcox', alternative='greater')
-        rr_all['Wilcoxon pvalue'] = float(res.pvalue)
-
-        print(rr_all)
+        # Final score is mean of valid R² scores
+        final_score = float(np.mean(valid_scores))
         
-        eps = 1e-300  # very small number to avoid log(0)
-        pval_clipped = max(res.pvalue, eps)
+        print(f"\nMethod: {method_id}")
+        print(f"SEM score (mean R²): {final_score:.4f}")
+        print(f"Valid genes evaluated: {len(valid_scores)}/{len(scores)}")
+        print(f"SEM score (min): {np.min(valid_scores):.4f}")
+        print(f"SEM score (max): {np.max(valid_scores):.4f}")
         
-        # Set to 0 if not significant (p >= 0.05)
-        if res.pvalue >= 0.05:
-            score = 0.0
-            print(f"p-value: {res.pvalue:.6f} (not significant, p >= 0.05)")
-            print(f"SEM score set to 0")
+        results = {'sem': [float(final_score)]}
+    
+    # Evaluate baseline GRN (currently disabled by the `if False` guard below)
+    if False:
+        print("\n======== Evaluate shuffled GRN ========")
+        scores_baseline = evaluate_grn(X_controls, delta_X, is_train, is_reporter, A_baseline, signed=use_signs)
+
+        # Keep only the genes for which both GRNs got a score
+        mask = ~np.logical_or(np.isnan(scores), np.isnan(scores_baseline))
+        scores = scores[mask]
+        scores_baseline = scores_baseline[mask]
+
+        rr_all = {}
+        # Perform rank test between actual scores and baseline
+        rr_all['spearman'] = float(np.mean(scores))
+        rr_all['spearman_shuffled'] = float(np.mean(scores_baseline))
+        if len(scores) == 0:
+            raise ValueError("No valid scores to compare between inferred GRN and baseline GRN.")
+        elif np.all(scores - scores_baseline == 0):
+            # Identical performance (suspicious - likely an error)
+            raise ValueError("Identical performance between inferred GRN and baseline GRN - likely an error.")
         else:
-            # Compute final score
-            score = -np.log10(pval_clipped)
-            print(f"p-value: {res.pvalue:.6f} (significant)")
-        
-        print(f"Final score: {score}")
-
-        results = {
-            'sem_precision': [float(np.log2(np.mean(scores) / (np.mean(scores_baseline) + 1e-6)))],
-            'sem': [float(score)]
-        }
-
-        df_results = pd.DataFrame(results)
+            res = wilcoxon(scores - scores_baseline, zero_method='wilcox', alternative='greater')
+            rr_all['Wilcoxon pvalue'] = float(res.pvalue)
+
+            print(rr_all)
+            
+            eps = 1e-300  # very small number to avoid log(0)
+            pval_clipped = max(res.pvalue, eps)
+            
+            # Set to 0 if not significant (p >= 0.05)
+            if res.pvalue >= 0.05:
+                score = 0.0
+                print(f"p-value: {res.pvalue:.6f} (not significant, p >= 0.05)")
+                print(f"SEM score set to 0")
+            else:
+                # Compute final score
+                score = -np.log10(pval_clipped)
+                print(f"p-value: {res.pvalue:.6f} (significant)")
+            
+            print(f"Final score: {score}")
+        results['sem_precision'] = [float(np.log2(np.mean(scores) / (np.mean(scores_baseline) + 1e-6)))]
+        results['sem_n'] = [float(score)]
+
+    df_results = pd.DataFrame(results)
     return df_results
diff --git a/src/metrics/sem/script.py b/src/metrics/sem/script.py
@@ -18,18 +18,11 @@
     'genes_n': 5000
 }
 ## VIASH END
-
-import argparse
-parser = argparse.ArgumentParser()
-parser.add_argument('--prediction', type=str, help='Path to the predicted GRN in h5ad format')
-parser.add_argument('--evaluation_data', type=str, help='Path to the evaluation data in h5ad format')
-parser.add_argument('--score', type=str)
-parser.add_argument('--layer', type=str, default='lognorm', help='Layer in the h5ad file to use')
-parser.add_argument('--num_workers', type=int, default=20, help='Number of workers to use')
-
+run_local = False
 try:
     sys.path.append(meta["resources_dir"])
 except:
+    run_local=True
     meta = {
     "resources_dir":'src/metrics/sem/',
     "util_dir": 'src/utils',
@@ -39,14 +32,11 @@
     sys.path.append(meta["util_dir"])
     sys.path.append(meta["helper_dir"])
 from helper import main as main_sem 
-from util import format_save_score
-
+from util import format_save_score, parse_args
 
+if run_local:
+    par = parse_args(par)
 
-args = parser.parse_args()
-for key, value in vars(args).items():
-    if value is not None:
-        par[key] = value
 
 if __name__ == "__main__":
     
diff --git a/src/metrics/vc/helper.py b/src/metrics/vc/helper.py
@@ -486,45 +486,42 @@ def main(par):
             j = gene_dict[target]
             A[i, j] = float(weight)
 
-    # Only consider the genes that are actually present in the inferred GRN
-    gene_mask = np.logical_or(np.any(A, axis=1), np.any(A, axis=0))
-    X = X[:, gene_mask]
-    A = A[gene_mask, :][:, gene_mask]
-    gene_names = gene_names[gene_mask]
-
-    print(f"Using {len(gene_names)} genes present in the GRN")
-    
-    # Additional memory-aware gene filtering for very large GRNs
-    MAX_GENES_FOR_MEMORY = par['n_top_genes']  # Reduced further to avoid memory issues
-    if len(gene_names) > MAX_GENES_FOR_MEMORY:
-        print(f"Too many genes ({len(gene_names)}) for memory. Selecting top {MAX_GENES_FOR_MEMORY} by GRN connectivity.")
+    # Gene filtering based on n_top_genes parameter
+    if par['n_top_genes'] == -1:
+        # Use all genes from evaluation data
+        # A already has zeros for genes without GRN connections
+        print(f"Using all {len(gene_names)} genes from evaluation data (including those without GRN connections)")
+    else:
+        # Filter to genes present in the inferred GRN
+        gene_mask = np.logical_or(np.any(A, axis=1), np.any(A, axis=0))
         
-        # Select genes with highest connectivity in the GRN
-        gene_connectivity = np.sum(np.abs(A), axis=0) + np.sum(np.abs(A), axis=1)
-        top_gene_indices = np.argsort(gene_connectivity)[-MAX_GENES_FOR_MEMORY:]
+        # Additional memory-aware gene filtering for very large GRNs
+        MAX_GENES_FOR_MEMORY = par['n_top_genes']
+        if np.sum(gene_mask) > MAX_GENES_FOR_MEMORY:
+            print(f"Too many genes with GRN connections ({np.sum(gene_mask)}) for memory. Selecting top {MAX_GENES_FOR_MEMORY} by GRN connectivity.")
+            
+            # Select genes with highest connectivity in the GRN
+            gene_connectivity = np.sum(np.abs(A), axis=0) + np.sum(np.abs(A), axis=1)
+            # Set connectivity to 0 for genes not in mask
+            gene_connectivity[~gene_mask] = 0
+            top_gene_indices = np.argsort(gene_connectivity)[-MAX_GENES_FOR_MEMORY:]
+            gene_mask = np.zeros(len(gene_names), dtype=bool)
+            gene_mask[top_gene_indices] = True
         
-        X = X[:, top_gene_indices]
-        A = A[top_gene_indices, :][:, top_gene_indices]
-        gene_names = gene_names[top_gene_indices]
+        # Apply the gene mask
+        X = X[:, gene_mask]
+        A = A[gene_mask, :][:, gene_mask]
+        gene_names = gene_names[gene_mask]
         
-        print(f"Final: Using {len(gene_names)} most connected genes for evaluation")
+        print(f"Using {len(gene_names)} genes (filtered by GRN connectivity)")
 
     # Remove self-regulations
     np.fill_diagonal(A, 0)
 
-    # Create baseline model
-    A_baseline = np.copy(A)
-    for j in range(A.shape[1]):
-        np.random.shuffle(A[:j, j])
-        np.random.shuffle(A[j+1:, j])
-    assert np.any(A_baseline != A)
-
     # Mapping between gene expression profiles and their matched negative controls
-
     control_map, _ = create_control_matching(are_controls, match_groups)
     loose_control_map, _ = create_control_matching(are_controls, loose_match_groups)
 
-
     ss_res = 0
     ss_tot = 0
     cv = GroupKFold(n_splits=5)
@@ -561,22 +558,26 @@ def main(par):
         test_data_loader = torch.utils.data.DataLoader(test_dataset, batch_size=512)
 
         # Evaluate inferred GRN
+        print(f"\n======== Fold {i+1}: Evaluate inferred GRN ========")
         res = evaluate(A, train_data_loader, test_data_loader, n_perturbations)
         ss_res = ss_res + res[0]
         ss_tot = ss_tot + res[1]
 
-        # Evaluate baseline GRN (shuffled target genes)
-        #ss_tot = ss_tot + evaluate(A_baseline, train_data_loader, test_data_loader, n_perturbations)
-
     r2 = 1 - ss_res / ss_tot
 
-    final_score = np.mean(np.clip(r2, 0, 1))
-    print(f"Method: {method_id}")
-    print(f"R2: {final_score}")
+    # Compute scores per gene: clip R2 to the [0, 1] range
+    r2_per_gene = np.clip(r2, 0, 1)
+    
+    # Final score is mean R2 across genes
+    final_score = np.mean(r2_per_gene)
+    
+    print(f"\nMethod: {method_id}")
+    print(f"R2 (mean): {final_score:.4f}")
+    print(f"R2 (min): {np.min(r2_per_gene):.4f}")
+    print(f"R2 (max): {np.max(r2_per_gene):.4f}")
 
     results = {
         'vc': [float(final_score)]
-
     }
 
     df_results = pd.DataFrame(results)
diff --git a/src/metrics/vc/run_local.sh b/src/metrics/vc/run_local.sh
@@ -17,7 +17,7 @@ mkdir -p "$save_dir"
 
 # datasets to process
 datasets=( "replogle" "xaira_HEK293T" "xaira_HCT116" "nakatake" "norman" "adamson" 'parsebioscience' 'op' "300BCG" 'ibd_uc' 'ibd_cd') #"300BCG" "ibd" 'parsebioscience', 'xaira_HEK293T'
-# datasets=( "op" "300BCG" "parsebioscience" "ibd" )
+datasets=( "op" "300BCG" "parsebioscience" "ibd" )
 # methods to process
 methods=( "pearson_corr" "positive_control" "negative_control" "ppcor" "portia" "scenic" "grnboost" "scprint" "scenicplus" "celloracle" "scglue" "figr" "granie")
 # methods=( "grnboost")
diff --git a/src/utils/config.py b/src/utils/config.py
@@ -93,7 +93,6 @@
     'norman': ['regression', 'ws_distance', 'tf_binding', 'sem', 'gs_recovery'],
     'nakatake': ['regression', 'sem', 'gs_recovery'],
     'op': ['regression', 'vc', 'rc_tf_act', 'tf_binding', 'sem',  'gs_recovery'],
-    # 'op': ['regression', 'tf_binding', 'gs_recovery'],
     '300BCG': ['regression', 'vc', 'rc_tf_act', 'tf_binding', 'sem',  'gs_recovery'],
     'ibd_uc': ['regression', 'vc', 'tf_binding', 'sem',  'gs_recovery'],
     'ibd_cd': ['regression', 'vc', 'tf_binding', 'sem',  'gs_recovery'],
@@ -111,28 +110,21 @@
 METRICS = [
        'r_precision', 'r_recall', 'r_f1',
        'ws_precision', 'ws_recall', 'ws_f1',
-       'vc', 
-       'sem', 
+       'vc', 'vc_raw', 'vc_precision', 
+       'sem', 'sem_precision', 'sem_raw',
        't_rec_precision', 't_rec_recall', 't_rec_f1',
-       
-       'rc_tf_act',
-       
-       'anchor_regression_raw',
-       
+       'rc_tf_act',       
        'tfb_precision', 'tfb_recall',  'tfb_f1',
        'gs_precision', 'gs_recall', 'gs_f1',
        ]
     
 FINAL_METRICS = [
        'r_precision', 'r_recall', 
-       'ws_precision', 'ws_recall', 
        'vc', 
-       'sem', 
+       'sem',
+       'ws_precision', 'ws_recall', 
        't_rec_precision', 't_rec_recall', 
-        
        'rc_tf_act',
-       
-       'anchor_regression_raw',
        'tfb_f1', 
        'gs_f1', 
        ]
diff --git a/src/utils/dataset_config.env b/src/utils/dataset_config.env
@@ -19,7 +19,7 @@ METRICS_replogle="regression,ws_distance,tf_recovery,tf_binding,sem,gs_recovery"
 METRICS_adamson="regression,tf_binding,sem,gs_recovery"
 METRICS_norman="regression,ws_distance,tf_binding,sem,gs_recovery"
 METRICS_nakatake="regression,sem,gs_recovery"
-METRICS_op="tf_binding,gs_recovery"
+METRICS_op="regression,vc,rc_tf_act,tf_binding,sem,gs_recovery"
 METRICS_300BCG="regression,vc,rc_tf_act,tf_binding,sem,gs_recovery"
 METRICS_ibd_uc="regression,vc,tf_binding,sem,gs_recovery"
 METRICS_ibd_cd="regression,vc,tf_binding,sem,gs_recovery"
diff --git a/test.ipynb b/test.ipynb