Add CLR normalization for ADT counts

Marius1311 · Marius1311 · commit 17ef5c6add04 · 2025-07-31T08:44:39.000+02:00
diff --git a/src/methods/cellmapper_scvi/config.vsh.yaml b/src/methods/cellmapper_scvi/config.vsh.yaml
@@ -21,15 +21,23 @@ info:
     cellmapper_hnoca_hvg:
       kernel_method: hnoca
       use_hvg: true
+      adt_normalization: clr
     cellmapper_hnoca_all_genes:
       kernel_method: hnoca
       use_hvg: false
+      adt_normalization: clr
     cellmapper_gauss_hvg:
       kernel_method: gauss
       use_hvg: true
+      adt_normalization: clr
+    cellmapper_gauss_hvg_log_cp10k:
+      kernel_method: gauss
+      use_hvg: true
+      adt_normalization: log_cp10k
     cellmapper_gauss_all_genes:
       kernel_method: gauss
       use_hvg: false
+      adt_normalization: clr
 
 arguments:
   - name: "--kernel_method"
@@ -45,6 +53,11 @@ arguments:
     type: boolean
     default: true
     description: Whether to use highly variable genes (HVG) for the mapping (Generic analysis parameter).
+  - name: "--adt_normalization"
+    type: "string"
+    choices: ["clr", "log_cp10k"]
+    default: "clr"
+    description: Normalization method for ADT data; clr = centered log-ratio, log_cp10k = log-transformed counts per 10k.
 resources:
   - type: python_script
     path: script.py
@@ -56,6 +69,7 @@ engines:
         packages: 
           - cellmapper>=0.2.2
           - scvi-tools>=1.3.0
+          - muon>=0.1.6
 
 runners:
   - type: executable
diff --git a/src/methods/cellmapper_scvi/script.py b/src/methods/cellmapper_scvi/script.py
@@ -7,13 +7,14 @@
 # Note: this section is auto-generated by viash at runtime. To edit it, make changes
 # in config.vsh.yaml and then run `viash config inject config.vsh.yaml`.
 par = {
-    'input_train_mod1': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_multiome/normal/train_mod1.h5ad',
-    'input_train_mod2': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_multiome/normal/train_mod2.h5ad',
-    'input_test_mod1': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_multiome/normal/test_mod1.h5ad',
+    'input_train_mod1': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_cite/swap/train_mod1.h5ad',
+    'input_train_mod2': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_cite/swap/train_mod2.h5ad',
+    'input_test_mod1': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_cite/swap/test_mod1.h5ad',
     'output': 'output.h5ad',
     'n_neighbors': 30, 
     'kernel_method': 'hnoca',
     'use_hvg': True,
+    'adt_normalization': 'clr',
 
 }
 meta = {
@@ -42,7 +43,7 @@
 
 # Compute a latent representation using an appropriate model based on the modality
 print("Get latent representation", flush=True)
-adata = get_representation(adata=adata, modality=mod1, use_hvg=par['use_hvg'])
+adata = get_representation(adata=adata, modality=mod1, use_hvg=par['use_hvg'], adt_normalization=par['adt_normalization'])
 
 # Place the representation back into individual objects
 input_train_mod1.obsm["X_scvi"] = adata[adata.obs["split"] == "train"].obsm["X_scvi"].copy()
diff --git a/src/methods/cellmapper_scvi/utils.py b/src/methods/cellmapper_scvi/utils.py
@@ -1,10 +1,12 @@
 from typing import Literal
 import anndata as ad
 import scvi 
-from scipy.sparse import issparse
+from scipy.sparse import issparse, csr_matrix, csc_matrix
+import muon
 
 
-def get_representation(adata: ad.AnnData, modality: Literal["GEX", "ADT", "ATAC"], use_hvg: bool = True) -> ad.AnnData:
+def get_representation(
+        adata: ad.AnnData, modality: Literal["GEX", "ADT", "ATAC"], use_hvg: bool = True, adt_normalization: Literal["clr", "log_cp10k"] = "clr") -> ad.AnnData:
     """
     Get a joint latent space representation of the data based on the modality.
     
@@ -23,6 +25,10 @@ def get_representation(adata: ad.AnnData, modality: Literal["GEX", "ADT", "ATAC"
         (e.g. UMI counts for GEX and peak counts for ATAC), and the normalized data in the `normalized` layer.
     use_hvg
         Whether to subset the data to highly variable genes (HVGs) before training the model
+    adt_normalization
+        Normalization method for ADT data. Options are:
+         - "clr" (centered log-ratio transformation)
+         - "log_cp10k" (normalization to 10k counts per cell and logarithm transformation)
 
     Returns
     -------
@@ -43,7 +49,17 @@ def get_representation(adata: ad.AnnData, modality: Literal["GEX", "ADT", "ATAC"
         scvi.model.SCVI.setup_anndata(adata, batch_key="batch", layer=layer)
         model = scvi.model.SCVI(adata, gene_likelihood="nb", n_layers=2, n_latent=30)
     elif modality == "ADT":
-        layer = "normalized"
+        print(f"Normalizing the ADT data using method '{adt_normalization}'")
+        if adt_normalization == "clr":
+            adata.X = csc_matrix(adata.layers["counts"]) # Use raw counts for ADT
+            muon.prot.pp.clr(adata)
+            adata.layers["adt_normalized"] = csr_matrix(adata.X)
+        elif adt_normalization == "log_cp10k":
+            adata.layers["adt_normalized"] = adata.layers["normalized"]
+        else:
+            raise ValueError(f"Unknown ADT normalization method: {adt_normalization}")
+        
+        layer = "adt_normalized"
         scvi.model.SCVI.setup_anndata(adata, batch_key="batch", layer=layer)
         model = scvi.model.SCVI(adata, gene_likelihood="normal", n_layers=1, n_latent=10)
     elif modality == "ATAC":