-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathpipeline.yml
More file actions
130 lines (100 loc) · 7.53 KB
/
pipeline.yml
File metadata and controls
130 lines (100 loc) · 7.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# =============================================================================================
# Deconvolution, spatial transcriptomics workflow Panpipes (pipeline_deconvolution_spatial.py)
# =============================================================================================
# Written by Sarah Ouologuem
# ---------------------------
# 0. Compute resource options
# ---------------------------
resources:
# Number of threads used for parallel jobs
threads_high: 1 # Number of threads used for high intensity computing tasks
threads_medium: 1 # Must be enough memory to load your mudata and do computationally light tasks
threads_low: 1 # Must be enough memory to load text files and do plotting, requires much less memory
# Path to conda env, leave blank if running native or your cluster automatically inherits the login node environment
condaenv:
# ----------------------
# 1. Specify input
# ----------------------
# One or multiple slides can be deconvoluted with the same reference in one run. For that, one MuData object for each slide is expected.
input:
  # Path to folder containing one or multiple MuDatas of spatial data. The pipeline is reading in all MuData files in that folder and assuming that they are MuDatas of spatial slides.
  # For all MuData files in that folder, deconvolution is run by the pipeline.
  # In the MuData objects, the spatial data is expected to be saved in mudata.mod["spatial"]. For each spatial MuData, deconvolution is run with "singlecell" (see below) as a reference
  spatial: ./data/spatial_data
  # Path to the MuData file of the reference single-cell data, reference data expected to be saved in mudata.mod["rna"]
  singlecell: ./data/Human_Heart_reference.h5mu
# ----------------------
# 2. Cell2Location
# ----------------------
Cell2Location:
  run: true  # Whether to run Cell2Location
  # -------------------------------
  # Feature selection parameters
  # -------------------------------
  feature_selection:
    # Reduced feature set can either be given a) via a csv file of genes or b) feature selection will be performed à la Cell2Location, i.e. via the function: cell2location.utils.filtering.filter_genes()
    # If no file is given in a), b) will be run, i.e. feature selection is not optional.
    # a) Path to a csv file containing a reduced feature set
    gene_list:  # A header in the csv is expected in the first row
    # b) Parameters for Cell2Location's feature selection, leave empty to use defaults
    # Whether to remove mitochondrial genes before feature selection
    remove_mt:  # Default True
    # All genes detected in less than cell_count_cutoff cells will be excluded.
    cell_count_cutoff:  # Default 15, parameter of function cell2location.utils.filtering.filter_genes()
    # All genes detected in at least this percentage of cells will be included.
    cell_percentage_cutoff2:  # Default 0.05, parameter of function cell2location.utils.filtering.filter_genes()
    # Genes detected in the number of cells between the above-mentioned cutoffs are selected only when their average expression in non-zero cells is above this cutoff
    nonz_mean_cutoff:  # Default 1.12, parameter of function cell2location.utils.filtering.filter_genes()
  # -------------------------------
  # Reference model parameters
  # Leave empty to use defaults
  # -------------------------------
  reference:
    labels_key: cell_type_original  # Default None, key in adata.obs for label (cell type) information
    batch_key:  # Default None, key in adata.obs for batch information
    layer:  # Default None (if None, X will be used), Layer of the raw (!) counts
    categorical_covariate_keys:  # Comma-separated without spaces; default None; keys in adata.obs that correspond to categorical data. These covariates can be added in addition to the batch covariate and are also treated as nuisance factors (i.e., the model tries to minimize their effects on the latent space)
    continuous_covariate_keys:  # Comma-separated without spaces; default None; keys in adata.obs that correspond to continuous data. These covariates can be added in addition to the batch covariate and are also treated as nuisance factors (i.e., the model tries to minimize their effects on the latent space)
    max_epochs: 400  # Default np.min([round((20000 / n_cells) * 400), 400])
    use_gpu:  # Default True; whether to use GPU for training
  # -------------------------------
  # Spatial mapping model parameters
  # Leave empty to use defaults
  # -------------------------------
  spatial:
    batch_key:  # Default None, key in adata.obs for batch information
    layer:  # Default None (if None, X will be used), Layer of the raw (!) counts
    categorical_covariate_keys:  # Comma-separated without spaces; default None; keys in adata.obs that correspond to categorical data. These covariates can be added in addition to the batch covariate and are also treated as nuisance factors (i.e., the model tries to minimize their effects on the latent space)
    continuous_covariate_keys:  # Comma-separated without spaces; default None; keys in adata.obs that correspond to continuous data. These covariates can be added in addition to the batch covariate and are also treated as nuisance factors (i.e., the model tries to minimize their effects on the latent space)
    # The following two parameters must be specified (cannot leave empty), otherwise an error will be thrown:
    N_cells_per_location: 8  # Expected cell abundance per voxel
    detection_alpha: 20  # Regularization of within-experiment variation in RNA detection sensitivity
    max_epochs: 400  # Default np.min([round((20000 / n_cells) * 400), 400])
    use_gpu:  # Default True; whether to use GPU for training
  # -------------------------------
  save_models: false  # Default False; whether to save the reference and spatial mapping models
# -------------
# 3. Tangram
# -------------
Tangram:
  run: false  # Whether to run Tangram
  # -------------------------------
  # Feature selection parameters
  # -------------------------------
  # Reduced feature set can either be given a) via a csv file of genes or b) sc.tl.rank_genes_groups() is run and the top n markers of each group are selected
  # If no file is given in a), b) will be run, i.e. feature selection is not optional.
  feature_selection:
    # a) Path to a csv file containing a reduced feature set
    gene_list:  # A header in the csv is expected in the first row
    # b) Parameters for sc.tl.rank_genes_groups() gene selection.
    rank_genes:
      labels_key: cell_type  # Which column in .obs of the reference to use for the 'groupby' parameter of sc.tl.rank_genes_groups()
      layer: null  # Default None, which layer to use of the reference for sc.tl.rank_genes_groups(). if null (i.e. None), uses .X
      n_genes: 100  # Default 100, how many top genes to select of each 'groupby' group
      test_method: wilcoxon  # Default t-test_overestim_var, which test method to use. one of ['logreg', 't-test', 'wilcoxon', 't-test_overestim_var']
      correction_method: benjamini-hochberg  # Default benjamini-hochberg, which p-value correction method to use. one of ['benjamini-hochberg', 'bonferroni']. Used only for 't-test', 't-test_overestim_var', and 'wilcoxon'
# Tangram mapping model parameters, passed to tangram.mapping_utils.map_cells_to_space()
# NOTE(review): this stanza appears truncated here (the kwargs mapping likely continues
# past the visible end of this excerpt) and its indentation looks stripped — presumably
# `model:` belongs under the `Tangram:` section above; verify against the full file.
model:
labels_key: # Default None, cell type key in the reference .obs
num_epochs: # Default 1000. Number of epochs for tangram.mapping_utils.map_cells_to_space()
device: cpu # Default cpu. Device to use for deconvolution
kwargs: # Parameters for tangram.mapping_utils.map_cells_to_space(), feel free to add or remove parameters below
learning_rate: 0.1
lambda_d: 0