Description of feature
Dear all,
I would like to ask for a feature in the Milo submodule: Grouping Neighborhoods, as implemented in MiloR (https://github.com/MarioniLab/miloR/blob/master/R/groupNhoods.R).
@emdann suggested that I request the feature for the pertpy implementation (see emdann/milopy#49).
As I am already using the feature to analyse my data, I have code suggestions, should you be interested.
I adjusted the code and example to work with pertpy-Milo using the tutorial data, compared the neighborhood grouping results to the MiloR implementation, and included a suggestion for neighborhood group markers based on the pertpy differential expression module.
I would be very thankful for any help to correct/verify my code.
Best regards,
max


Marker genes from find_nhood_group_markers for neighborhood groups 5 vs 4 in "Python Neighborhood groups":

MiloR neighborhood groups:

Neighborhood group functions:
def find_nhood_groups(
    mdata,
    SpatialFDR_threshold = 0.1,
    merge_discord = False,
    max_lfc_delta=None,
    overlap=1,
    subset_nhoods: None | str = None,
):
    """Group neighbourhoods by community detection on the neighbourhood adjacency graph.

    Python counterpart of miloR's ``groupNhoods``: edges of the neighbourhood
    adjacency matrix are pruned according to the rules below, the matrix is
    binarised and the resulting graph is clustered with igraph's multilevel
    (Louvain) algorithm. The group labels are written to
    ``mdata["milo"].var["nhood_groups"]`` as strings.

    Args:
        mdata: Object exposing ``mdata["milo"].var`` (with ``SpatialFDR`` and
            ``logFC`` columns) and ``mdata["milo"].varp["nhood_connectivities"]``.
        SpatialFDR_threshold: Neighbourhoods with ``SpatialFDR`` at or below this
            value are treated as differentially abundant (DA).
        merge_discord: If falsy, edges between DA neighbourhoods whose ``logFC``
            values have opposite signs are removed.
        max_lfc_delta: If given, edges between neighbourhoods whose absolute
            ``logFC`` difference exceeds this value are removed.
        overlap: If greater than 1, edges with an adjacency value below
            ``overlap`` are removed.
        subset_nhoods: Optional ``DataFrame.query`` expression evaluated on
            ``mdata["milo"].var``; only matching neighbourhoods are grouped and
            all others receive ``NaN`` in ``nhood_groups``.

    Returns:
        None. The result is stored in ``mdata["milo"].var["nhood_groups"]``.

    Raises:
        ValueError: If ``subset_nhoods`` matches no neighbourhood.
    """
    milo_var = mdata["milo"].var
    da_res = milo_var.copy()

    # Work on a dense copy: the pruning steps below assign into sub-matrices and
    # compare against dense n x n masks, and the stored matrix must not be modified.
    nhood_adj = mdata["milo"].varp["nhood_connectivities"]
    if hasattr(nhood_adj, "toarray"):
        nhood_adj = nhood_adj.toarray()
    else:
        nhood_adj = np.array(nhood_adj)

    is_da = np.asarray(da_res["SpatialFDR"] <= SpatialFDR_threshold)

    mask = None
    if subset_nhoods is not None:
        mask = np.asarray(da_res.index.isin(da_res.query(subset_nhoods).index))
        if not mask.any():
            raise ValueError(f"subset_nhoods={subset_nhoods!r} matched no neighbourhoods.")
        da_res = da_res.loc[mask]
        nhood_adj = nhood_adj[np.ix_(mask, mask)]
        is_da = is_da[mask]

    logfc = da_res["logFC"].to_numpy()

    if not merge_discord:
        # Remove edges between DA neighbourhoods with opposite logFC signs.
        # The outer product yields the pairwise sign agreement; np.ix_ selects
        # the DA x DA sub-matrix so the assignment reaches nhood_adj itself.
        da_idx = np.flatnonzero(is_da)
        if da_idx.size:
            signs = np.sign(logfc[da_idx])
            discordant = np.outer(signs, signs) < 0
            da_block = nhood_adj[np.ix_(da_idx, da_idx)]
            da_block[discordant] = 0
            nhood_adj[np.ix_(da_idx, da_idx)] = da_block

    if overlap > 1:
        nhood_adj[nhood_adj < overlap] = 0

    if max_lfc_delta is not None:
        lfc_diff = logfc[None, :] - logfc[:, None]
        nhood_adj[np.abs(lfc_diff) > max_lfc_delta] = 0

    # Binarise: only the presence of an edge matters for the clustering.
    nhood_adj = (nhood_adj > 0).astype(int)

    g = sc._utils.get_igraph_from_adjacency(nhood_adj, directed=False)
    part = g.community_multilevel(weights=None)
    groups = np.asarray(part.membership).astype(int).astype(str)

    if mask is not None:
        # Object dtype so that string labels and NaN can share one column.
        labels = np.full(milo_var.shape[0], np.nan, dtype=object)
        labels[mask] = groups
        mdata["milo"].var["nhood_groups"] = labels
        return None

    mdata["milo"].var["nhood_groups"] = groups
    return None
Nhood group markers:
from typing import Sequence
from lamin_utils import logger
class Limma(pt.tl._differential_gene_expression._base.LinearModelBase):
    """Differential expression with limma (optionally limma-voom) through rpy2.

    The expression matrix (``self.adata.X`` or ``self.adata.layers[self.layer]``)
    and ``self.design`` are handed to R, where ``limma::lmFit`` is run. Contrasts
    are then tested with ``contrasts.fit`` + ``eBayes`` + ``topTable``.

    Note:
        After ``fit()`` / ``voom_fit()`` the instance attribute ``fit`` holds the
        R fit object and therefore shadows the ``fit`` method on that instance.
        ``_test_single_contrast`` reads the fit from this attribute.
    """

    def _check_counts(self):
        # The integer-count validation is intentionally skipped for this model.
        pass

    def _prepare_r_inputs(self):
        """Return ``(limma, expr_r, r_design)``: the limma package handle, the variables x observations expression table and the design matrix as R objects."""
        from scipy.sparse import issparse

        try:
            from rpy2 import robjects as ro
            from rpy2.robjects import pandas2ri
            from rpy2.robjects.conversion import localconverter
            from rpy2.robjects.packages import importr
        except ImportError:
            raise ImportError("limma requires rpy2 to be installed.") from None

        try:
            limma = importr("limma")
        except ImportError as e:
            raise ImportError(
                "limma requires a valid R installation with the following packages:\n"
                "limma"
            ) from e

        expr = self.adata.X if self.layer is None else self.adata.layers[self.layer]
        # R expects variables in rows and observations in columns.
        expr = expr.T.toarray() if issparse(expr) else expr.T
        expr_df = pd.DataFrame(expr, index=self.adata.var_names, columns=self.adata.obs_names)

        # pandas objects can only be converted with the pandas2ri converter active.
        with localconverter(ro.default_converter + pandas2ri.converter):
            expr_r = ro.conversion.py2rpy(expr_df)
            r_design = ro.conversion.py2rpy(self.design)

        return limma, expr_r, r_design

    def fit(self, voom = False, **kwargs):
        """Fit the linear model with ``limma::lmFit``.

        Args:
            voom: If True, delegate to :meth:`voom_fit` (``kwargs`` are passed on to it).
            **kwargs: Keyword arguments forwarded to ``limma::lmFit``
                (or to :meth:`voom_fit` when ``voom`` is True).
        """
        if voom:
            self.voom_fit(**kwargs)
            return None

        limma, expr_r, r_design = self._prepare_r_inputs()
        # Stored under the name `fit`; see the class-level note.
        self.fit = limma.lmFit(expr_r, r_design, **kwargs)

    def voom_fit(self, voom_kwargs = None, **lmFit_kwargs):
        """Fit the linear model on ``limma::voom``-transformed data.

        Args:
            voom_kwargs: Optional dict of keyword arguments for ``limma::voom``.
            **lmFit_kwargs: Keyword arguments forwarded to ``limma::lmFit``.
        """
        voom_kwargs = {} if voom_kwargs is None else voom_kwargs

        limma, expr_r, r_design = self._prepare_r_inputs()
        v = limma.voom(expr_r, r_design, **voom_kwargs)
        self.fit = limma.lmFit(v, r_design, **lmFit_kwargs)

    def _test_single_contrast(self, contrast, trend = False, robust = True, **kwargs):
        """Test one contrast on the fitted model and return the result table.

        Args:
            contrast: Numeric contrast vector (anything ``numpy.asarray`` can turn
                into a 1-D float array).
            trend: Currently unused.
            robust: Currently unused.
            **kwargs: Currently unused.

        Returns:
            pandas DataFrame built from ``limma::topTable`` for all variables, with
            the index moved into a ``variable`` column and ``P.Value``, ``logFC``
            and ``adj.P.Val`` renamed to ``p_value``, ``log_fc`` and ``adj_p_value``.
        """
        import numpy as np
        from rpy2 import robjects as ro
        from rpy2.robjects import default_converter, numpy2ri, pandas2ri
        from rpy2.robjects.conversion import localconverter
        from rpy2.robjects.packages import STAP

        # Convert explicitly so the call does not depend on a globally
        # activated converter.
        with localconverter(default_converter + numpy2ri.converter):
            r_contrast = ro.conversion.py2rpy(np.asarray(contrast, dtype=float))

        r_string = '''
        FUN = function(fit, contrast) {
            fit2 <- limma::contrasts.fit(fit, as.numeric(contrast))
            efit <- limma::eBayes(fit2)
            limma::topTable(efit, coef = 1, number = Inf)
        }
        '''
        r_pkg = STAP(r_string, "r_pkg")
        res = r_pkg.FUN(self.fit, r_contrast)

        with localconverter(default_converter + pandas2ri.converter):
            de_res = pandas2ri.rpy2py(res)

        de_res.index.name = "variable"
        de_res = de_res.reset_index()
        de_res = de_res.rename(columns={"P.Value": "p_value", "logFC": "log_fc", "adj.P.Val": "adj_p_value"})
        return de_res
def get_cells_in_nhoods(mdata, nhood_ids):
    """Return a copy of ``mdata["rna"]`` restricted to cells in the given neighbourhoods.

    Args:
        mdata: Object exposing ``mdata["rna"]`` with a cells x neighbourhoods
            membership matrix in ``.obsm["nhoods"]``.
        nhood_ids: Column positions of the neighbourhoods of interest. Any
            sequence or array whose entries can be cast to ``int`` is accepted.

    Returns:
        A copy of ``mdata["rna"]`` with the cells that belong to at least one of
        the selected neighbourhoods.
    """
    nhood_idx = np.asarray(nhood_ids).astype(int)
    membership = mdata["rna"].obsm["nhoods"][:, nhood_idx]
    # Row sums count how many selected neighbourhoods contain each cell;
    # flatten to a 1-D boolean mask for indexing.
    in_nhoods = np.asarray(membership.sum(1)).ravel().astype(bool)
    return mdata["rna"][in_nhoods, :].copy()
def _get_groups(
    mdata,
    sample_col,
    nhood_group_obs = "nhood_groups",
    confounders_obs = None,
    pseudobulk = True,
    func = "sum",
    layer = "counts"
):
    """Collect the cells of every neighbourhood group, optionally pseudobulked.

    For each non-missing value in ``mdata["milo"].var[nhood_group_obs]`` the cells
    of the group's neighbourhoods are extracted with ``get_cells_in_nhoods`` and
    labelled in ``.obs[nhood_group_obs]``. With ``pseudobulk`` they are aggregated
    via ``sc.get.aggregate``. The per-group objects are concatenated.

    Args:
        mdata: Object exposing ``mdata["milo"].var`` and ``mdata["rna"]``.
        sample_col: ``.obs`` column used (together with the group and the
            confounders) as aggregation key.
        nhood_group_obs: Column of ``mdata["milo"].var`` holding the group labels;
            also the name of the ``.obs`` column written on the extracted cells.
        confounders_obs: Optional list of additional ``.obs`` columns to aggregate by.
        pseudobulk: Whether to aggregate each group with ``sc.get.aggregate``.
        func: Aggregation function name passed to ``sc.get.aggregate``.
        layer: Layer passed to ``sc.get.aggregate``; ``"X"`` is translated to ``None``.

    Returns:
        The concatenation (``sc.concat``) of the per-group objects.

    Raises:
        ValueError: If the group column contains no non-missing label.
    """
    confounders = [] if confounders_obs is None else list(confounders_obs)
    layer = None if layer == "X" else layer

    group_labels = mdata["milo"].var[nhood_group_obs]
    groups = group_labels.dropna().unique()
    if len(groups) == 0:
        raise ValueError(f"No neighbourhood groups found in mdata['milo'].var[{nhood_group_obs!r}].")

    ad_list = []
    for g in groups:
        # Select the neighbourhoods of this group directly from the label column
        # instead of writing a scratch column into the caller's table.
        nhood_ids = np.asarray(group_labels.index[group_labels == g].to_list())
        cur_ad = get_cells_in_nhoods(mdata, nhood_ids)
        cur_ad.obs[nhood_group_obs] = g
        if pseudobulk:
            cur_ad = sc.get.aggregate(
                cur_ad,
                by=[sample_col, nhood_group_obs] + confounders,
                func=func,
                layer=layer,
            )
        ad_list.append(cur_ad)

    return sc.concat(ad_list)
def find_nhood_group_markers(
    mdata,
    sample_col = "patient_id",
    nhood_group_obs = "nhood_groups",
    design = "~0+nhood_groups",
    baseline = "non_enriched",
    group_to_compare = "enriched",
    confounders_obs = None,
    pseudobulk = True,
    func = "mean",
    layer = "X",
    pb_model = Limma,
    return_model = False,
    **kwargs
):
    """Test for marker variables between two neighbourhood groups.

    Cells are gathered per group with ``_get_groups``, a model of type
    ``pb_model`` is fitted with ``design``, and the contrast
    ``group_to_compare`` vs ``baseline`` on ``nhood_group_obs`` is tested.

    Args:
        mdata: Object passed through to ``_get_groups``.
        sample_col: ``.obs`` column identifying samples.
        nhood_group_obs: Column holding the neighbourhood group labels.
        design: Model formula handed to ``pb_model``.
        baseline: Group label used as reference in the contrast.
        group_to_compare: Group label compared against ``baseline``.
        confounders_obs: Optional list of additional ``.obs`` columns to aggregate by.
        pseudobulk: Whether cells are aggregated per sample and group.
        func: Aggregation function; with ``pseudobulk`` the layer of this name is
            used as the expression matrix.
        layer: Layer to read expression from (``"X"`` for ``.X``).
        pb_model: Model class; called as ``pb_model(ad, design=design)``.
        return_model: If True, also return the fitted model.
        **kwargs: Forwarded to ``pb_model.fit``.

    Returns:
        The result table of ``_test_single_contrast``, or ``(table, model)`` when
        ``return_model`` is True.
    """
    # Always hand a list to _get_groups so it can be concatenated with the
    # other aggregation keys.
    confounders = [] if confounders_obs is None else list(confounders_obs)

    ad = _get_groups(
        mdata,
        sample_col=sample_col,
        nhood_group_obs=nhood_group_obs,
        confounders_obs=confounders,
        pseudobulk=pseudobulk,
        func=func,
        layer=layer,
    )

    if pseudobulk:
        # The aggregation result is stored in a layer named after `func`.
        ad.X = ad.layers[func]
    elif layer not in (None, "X"):
        # Without aggregation no `func` layer exists; use the requested layer.
        ad.X = ad.layers[layer]

    mod = pb_model(ad, design=design)
    mod.fit(**kwargs)
    res_df = mod._test_single_contrast(
        mod.contrast(nhood_group_obs, baseline=baseline, group_to_compare=group_to_compare)
    )

    if return_model:
        return res_df, mod
    return res_df
Code to reproduce the example plots:
import os
os.environ["R_HOME"] = r_env_path
import rpy2.rinterface_lib.callbacks
# import anndata2ri
import logging
from rpy2.robjects import pandas2ri
from rpy2.robjects import numpy2ri
from rpy2.robjects import r
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import default_converter
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.packages import STAP
# sc.settings.verbosity = 0
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
%load_ext rpy2.ipython
import numpy as np
import scanpy as sc
import seaborn as sns
# from scipy.stats import median_abs_deviation
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import matplotlib as mpl
import pertpy as pt
# --- Milo differential-abundance pipeline on the pertpy tutorial data ---
# Everything below runs top to bottom as notebook cells; later steps read the
# fields written into `mdata` by earlier ones, so the order matters.
adata = pt.data.stephenson_2021_subsampled()
adata.obs["COVID_severity"] = adata.obs["Status_on_day_collection_summary"].copy() # short name
# Number of patients per severity level (one row per patient after drop_duplicates).
adata.obs[["patient_id", "COVID_severity"]].drop_duplicates().value_counts("COVID_severity")
sc.pl.umap(adata, color=["author_cell_type"], legend_loc="on data")
sc.pl.umap(adata, color=["Site", "Status"], wspace=0.3, size=8)
## Exclude LPS samples
adata = adata[adata.obs["Status"] != "LPS"].copy()
## Initialize object for Milo analysis
milo = pt.tl.Milo()
mdata = milo.load(adata)
# Build the neighbour graph on the "X_scVI" representation with an Annoy-based transformer.
from sklearn_ann.kneighbors.annoy import AnnoyTransformer
transformer = AnnoyTransformer(n_neighbors=150)
sc.pp.neighbors(mdata["rna"], use_rep="X_scVI", transformer = transformer)
milo.make_nhoods(mdata["rna"], prop=0.1)
# Column sums of the cells x neighbourhoods matrix give the size of each neighbourhood.
nhood_size = np.array(mdata["rna"].obsm["nhoods"].sum(0)).ravel()
plt.hist(nhood_size, bins=100)
plt.xlabel("# cells in nhood")
plt.ylabel("# nhoods");
milo.count_nhoods(mdata, sample_col="patient_id")
# Reorder categories
# (by default, the last category is taken as the condition of interest)
mdata["rna"].obs["Status"] = mdata["rna"].obs["Status"].cat.reorder_categories(["Healthy", "Covid"])
milo.da_nhoods(mdata, design="~Status")
# Diagnostic plots: P-value histogram and volcano plot; the figure size is
# changed temporarily and restored afterwards.
old_figsize = plt.rcParams["figure.figsize"]
plt.rcParams["figure.figsize"] = [10, 5]
plt.subplot(1, 2, 1)
plt.hist(mdata["milo"].var.PValue, bins=50)
plt.xlabel("P-Vals")
plt.subplot(1, 2, 2)
plt.plot(mdata["milo"].var.logFC, -np.log10(mdata["milo"].var.SpatialFDR), ".")
plt.xlabel("log-Fold Change")
plt.ylabel("- log10(Spatial FDR)")
plt.tight_layout()
plt.rcParams["figure.figsize"] = old_figsize
milo.build_nhood_graph(mdata)
plt.rcParams["figure.figsize"] = [7, 7]
milo.plot_nhood_graph(
mdata,
alpha=0.1, ## SpatialFDR level (10%)
min_size=1, ## Size of smallest dot
)
milo.annotate_nhoods(mdata, anno_col="author_cell_type")
plt.hist(mdata["milo"].var["nhood_annotation_frac"], bins=30)
plt.xlabel("celltype fraction")
# Neighbourhoods whose annotation fraction is below 0.6 are relabelled "Mixed".
mdata["milo"].var["nhood_annotation"] = mdata["milo"].var["nhood_annotation"].astype(str)
mdata["milo"].var.loc[mdata["milo"].var["nhood_annotation_frac"] < 0.6, "nhood_annotation"] = "Mixed"
milo.plot_da_beeswarm(mdata, alpha=0.1)
# Group the neighbourhoods; writes mdata["milo"].var["nhood_groups"].
# The commented-out argument shows how to restrict the grouping to a subset.
find_nhood_groups(mdata, # subset_nhoods = "nhood_annotation == 'CD83_CD14_mono'"
)
### Nhood groups beeswarm plot
milo.plot_da_beeswarm(
mdata,
anno_col = "nhood_groups",
)
### Python neighbor groups plot
from legendkit import cat_legend, size_legend, vstack

# One row per neighbourhood: 2-D graph layout coordinates, group label and size.
plt_df = pd.DataFrame(
    mdata["milo"].varm["X_milo_graph"],
    columns=["UMAP_" + str(i + 1) for i in range(2)],
)
plt_df["nhood_groups"] = mdata["milo"].var["nhood_groups"].values
plt_df["nhood_groups"] = plt_df["nhood_groups"].astype(str)
plt_df["size"] = mdata["milo"].var["Nhood_size"].values
plt_df = plt_df.sort_values("nhood_groups")

# Median position of each group, used to place the text labels.
label_df = plt_df.groupby("nhood_groups").median()

# Unique group labels in order of appearance (i.e. sorted, see sort above).
groups = list(dict.fromkeys(plt_df["nhood_groups"]))
print(groups)

# One colour per group, shared by the scatter plot and the legend so that
# both always agree regardless of the number of groups.
group_colors = sns.color_palette("Paired", len(groups))

scale_factor = 0.2
sns.scatterplot(
    x=plt_df["UMAP_1"],
    y=plt_df["UMAP_2"],
    size=plt_df["size"].values * scale_factor,
    hue=plt_df["nhood_groups"],
    hue_order=groups,
    legend=False,
    linewidth=0,
    palette=dict(zip(groups, group_colors)),
)
sns.despine()

slegend = size_legend(
    mdata["milo"].var["Nhood_size"].values * scale_factor,
    loc="out right center",
    title="Nhood size",
    # Undo the scaling so the legend shows the real neighbourhood sizes.
    func=lambda x: x * (1 / scale_factor),
)
ax = plt.gca()
ax.set_title("Python Neighborhood groups", fontsize=18)
clegend = cat_legend(colors=group_colors, labels=groups, title="Nhood group", handle="circle")
vstack([clegend, slegend], loc="out right center", spacing=10, frameon=False, ax=ax)

for _group, row in label_df.iterrows():
    plt.text(row["UMAP_1"], row["UMAP_2"], _group, fontsize=15)
plt.show()
###############
# --- Reference run with miloR: push the Milo results to R and call groupNhoods ---
import anndata2ri
mdata = mdata.copy()
# Reload and filter the dataset the same way as at the start of the script.
adata = pt.data.stephenson_2021_subsampled()
adata = adata[adata.obs["Status"] != "LPS"].copy()
# DA results without the Python grouping column, to be handed to R.
da_res = mdata["milo"].var.copy()
da_res = (
da_res.drop("nhood_groups", axis = 1)
)
# NOTE(review): indentation was lost in this paste; the r.assign(...) calls
# below presumably form the body of this with-block - confirm before running.
with localconverter(anndata2ri.converter):
r.assign("sce", adata)
r.assign("nhoods", mdata["rna"].obsm["nhoods"])
r.assign("nhoodCounts", mdata["milo"].X.T)
r.assign("nhood_adjacency", mdata["milo"].varp["nhood_connectivities"])
r.assign("da_results", da_res)
r.assign("nhood_graph", mdata["milo"].varm["X_milo_graph"])
# The next line is an IPython cell magic: everything down to the groupNhoods
# call is R code, and `-o da_results` exports `da_results` back to Python.
%%R -o da_results
suppressMessages(library(SingleCellExperiment))
suppressMessages(library(scran))
suppressMessages(library(miloR))
suppressMessages(library(Matrix))
# Build a Milo object from the SingleCellExperiment and fill its slots with
# the objects assigned from Python above.
milo <- Milo(sce)
nhoodAdjacency(milo) <- nhood_adjacency
nhoodCounts(milo) <- nhoodCounts
nhoods(milo) <- as(nhoods, "CsparseMatrix")
nhoodGraph(milo) <- nhood_graph
# Name the adjacency rows/columns by the `index_cell` column of the DA results.
colnames(nhoodAdjacency(milo)) <- da_results$index_cell
rownames(nhoodAdjacency(milo)) <- da_results$index_cell
da_results <- groupNhoods(milo, da_results)
### MiloR nhood groups plot
from legendkit import cat_legend, size_legend, vstack

# Store the miloR grouping next to the Python one (aligned on the index).
mdata["milo"].var["nhood_groups_miloR"] = da_results["NhoodGroup"]

# One row per neighbourhood: 2-D graph layout coordinates, group label and size.
plt_df = pd.DataFrame(
    mdata["milo"].varm["X_milo_graph"],
    columns=["UMAP_" + str(i + 1) for i in range(2)],
)
plt_df["nhood_groups"] = mdata["milo"].var["nhood_groups_miloR"].values
# Missing groups become the string "nan" and are plotted as their own category.
plt_df["nhood_groups"] = plt_df["nhood_groups"].astype(str)
plt_df["size"] = mdata["milo"].var["Nhood_size"].values
plt_df = plt_df.sort_values("nhood_groups", na_position="first")

# Median position of each group, used to place the text labels.
label_df = plt_df.groupby("nhood_groups").median()

# Unique group labels in order of appearance (i.e. sorted, see sort above).
groups = list(dict.fromkeys(plt_df["nhood_groups"]))
print(groups)

# One colour per group, shared by the scatter plot and the legend so that
# both always agree regardless of the number of groups.
group_colors = sns.color_palette("Paired", len(groups))

scale_factor = 0.2
sns.scatterplot(
    x=plt_df["UMAP_1"],
    y=plt_df["UMAP_2"],
    size=plt_df["size"].values * scale_factor,
    hue=plt_df["nhood_groups"],
    hue_order=groups,
    legend=False,
    linewidth=0,
    palette=dict(zip(groups, group_colors)),
)
sns.despine()

slegend = size_legend(
    mdata["milo"].var["Nhood_size"].values * scale_factor,
    loc="out right center",
    title="Nhood size",
    # Undo the scaling so the legend shows the real neighbourhood sizes.
    func=lambda x: x * (1 / scale_factor),
)
ax = plt.gca()
ax.set_title("MiloR Neighborhood groups", fontsize=18)
clegend = cat_legend(colors=group_colors, labels=groups, title="Nhood group", handle="circle")
vstack([clegend, slegend], loc="out right center", spacing=10, frameon=False, ax=ax)

for _group, row in label_df.iterrows():
    plt.text(row["UMAP_1"], row["UMAP_2"], _group, fontsize=15)
plt.show()
# Keep only groups "4" and "5": every other neighbourhood gets NaN in
# "nhood_groups" (this overwrites the grouping stored in mdata).
mdata["milo"].var.loc[~mdata["milo"].var["nhood_groups"].isin(["4", "5"]), "nhood_groups"] = np.nan
### uses the above Limma class by default, but can be switched out for pt.tl.EdgeR when a counts layer exists.
# Contrast group "5" against baseline "4"; `voom = True` is forwarded through
# **kwargs to the model's fit(). Returns the result table and the fitted model.
res_df, mod = find_nhood_group_markers(mdata,
sample_col = "patient_id",
baseline = "4", group_to_compare = "5",
return_model = True,
voom = True)
### Volcano Plot:
mod.plot_volcano(res_df, log2fc_thresh = 0.1,legend_pos = (1.3, 1))
Description of feature
Dear all,
I would like to ask for a feature in the Milo submodule: Grouping Neighborhoods, as implemented in MiloR (https://github.com/MarioniLab/miloR/blob/master/R/groupNhoods.R).
@emdann suggested that I request the feature for the pertpy implementation (see emdann/milopy#49).
As I am already using the feature to analyse my data, I have code suggestions, should you be interested.
I adjusted the code and example to work with pertpy-Milo using the tutorial data, compared the neighborhood grouping results to the MiloR implementation, and included a suggestion for neighborhood group markers based on the pertpy differential expression module.
I would be very thankful for any help to correct/verify my code.
Best regards,
max
Marker genes from find_nhood_group_markers for neighborhood groups 5 vs 4 in "Python Neighborhood groups":

MiloR neighborhood groups:

Neighborhood group functions:
Nhood group markers:
Code to reproduce the example plots: