Vitek-Lab · tonywu1999 · Jun 16, 2026 · Jun 15, 2026 · Jun 16, 2026 · Jun 16, 2026
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -46,4 +46,4 @@ Encoding: UTF-8
 URL: http://msstats.org, https://vitek-lab.github.io/MSstatsBioNet/
 BugReports: https://groups.google.com/forum/#!forum/msstats
 Config/testthat/edition: 3
-RoxygenNote: 7.3.3
+Config/roxygen2/version: 8.0.0
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,8 +1,11 @@
 # Generated by roxygen2: do not edit by hand
 
 export(annotateProteinInfoFromIndra)
+export(bootstrapTopicModels)
+export(compareTopicModels)
 export(cytoscapeNetwork)
 export(cytoscapeNetworkOutput)
+export(decomposeSubnetworkByTopic)
 export(deleteEdgeFromNetwork)
 export(exportNetworkToHTML)
 export(filterSubnetworkByContext)
@@ -27,6 +30,7 @@ importFrom(r2r,keys)
 importFrom(r2r,query)
 importFrom(rentrez,entrez_fetch)
 importFrom(stats,cor)
+importFrom(stats,runif)
 importFrom(stats,setNames)
 importFrom(stopwords,stopwords)
 importFrom(text2vec,TfIdf)

diff --git a/R/bootstrapTopicModels.R b/R/bootstrapTopicModels.R
@@ -0,0 +1,153 @@
+#' Bootstrap the topic decomposition to find each topic's robust top words
+#'
+#' Refits the NMF topic model on many bootstrap resamples of the papers and
+#' reports, for every topic, how reliably each word stays among the topic's top
+#' terms. This separates words that genuinely characterise a topic from words
+#' that only surface in a single lucky fit, and lets you see how the top-word
+#' lists change with and without the PPI view (run once per \code{include_ppi}).
+#'
+#' Because topic indices are arbitrary across fits (label switching), each
+#' resample's topics are first aligned to a reference fit on the full data by
+#' cosine similarity of their topic-word vectors. The papers are resampled with
+#' replacement; the word vocabulary is held fixed (from the full data) so topics
+#' remain comparable across resamples, and the NMF seed is held fixed so the
+#' variability reported reflects \emph{data} resampling rather than random
+#' initialization.
+#'
+#' @param subnetwork list with \code{nodes} and \code{edges} data.frames, e.g.
+#'   the output of \code{\link{getSubnetworkFromIndra}}.
+#' @param n_boot number of bootstrap resamples. Default 50.
+#' @param n_topics number of topics (rank of the factorization). Default 5.
+#' @param include_ppi logical; factorize the PPI/edge view jointly with the text
+#'   (\code{TRUE}, default) or use paper words only (\code{FALSE}). See
+#'   \code{\link{decomposeSubnetworkByTopic}}.
+#' @param n_top_terms number of top words that define a topic's "top list" in
+#'   each resample (the cutoff for the selection-frequency tally). Default 10.
+#' @param min_term_count minimum corpus frequency for a word to be kept when
+#'   building the text matrix. Default 2.
+#' @param max_iter maximum number of NMF multiplicative-update iterations.
+#'   Default 200.
+#' @param tol relative-change tolerance for NMF early stopping. Default 1e-4.
+#' @param seed random seed for the reference fit, the resampling, and each
+#'   bootstrap NMF. Default 1.
+#'
+#' @return A list with
+#'   \describe{
+#'     \item{include_ppi, n_boot, n_topics}{the settings used.}
+#'     \item{topTerms}{named list \code{topic_1} ... \code{topic_k}. Each is a
+#'       data.frame sorted by \code{selection_freq}, with columns \code{term},
+#'       \code{selection_freq} (fraction of resamples the word was in this
+#'       topic's top \code{n_top_terms}), and \code{mean_weight} (mean
+#'       within-topic word weight across resamples). A word with
+#'       \code{selection_freq} near 1 is a stable signature of the topic.}
+#'     \item{reference}{named list of the top \code{n_top_terms} words per topic
+#'       from the single full-data fit, for comparison.}
+#'   }
+#'
+#' @seealso \code{\link{decomposeSubnetworkByTopic}},
+#'   \code{\link{compareTopicModels}}
+#' @note \strong{Beta feature:} This function is experimental and the API may
+#'   change without notice in future versions.
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' input <- data.table::fread(system.file(
+#'     "extdata/groupComparisonModel.csv",
+#'     package = "MSstatsBioNet"
+#' ))
+#' subnetwork <- getSubnetworkFromIndra(input)
+#'
+#' # Top words with PPIs included vs. words only:
+#' boot_ppi  <- bootstrapTopicModels(subnetwork, include_ppi = TRUE)
+#' boot_text <- bootstrapTopicModels(subnetwork, include_ppi = FALSE)
+#'
+#' head(boot_ppi$topTerms$topic_1)     # robust signature words for topic 1
+#' boot_text$topTerms$topic_1
+#' }
+bootstrapTopicModels <- function(subnetwork,
+                                 n_boot = 50,
+                                 n_topics = 5,
+                                 include_ppi = TRUE,
+                                 n_top_terms = 10,
+                                 min_term_count = 2,
+                                 max_iter = 200,
+                                 tol = 1e-4,
+                                 seed = 1) {
+
+    .validateDecomposeSubnetworkByTopicInput(subnetwork, n_topics, 0.2,
+                                             include_ppi)
+    if (!is.numeric(n_boot) || length(n_boot) != 1L || is.na(n_boot) ||
+        n_boot < 2 || n_boot != as.integer(n_boot)) {
+        stop("`n_boot` must be a single integer >= 2.")
+    }
-bootstrapTopicModels <- function(subnetwork,
-                                 n_boot = 50,
-                                 n_topics = 5,
-                                 include_ppi = TRUE,
-                                 n_top_terms = 10,
-                                 min_term_count = 2,
-                                 max_iter = 200,
-                                 tol = 1e-4,
-                                 seed = 1) {
-
-    .validateDecomposeSubnetworkByTopicInput(subnetwork, n_topics, 0.2,
-                                             include_ppi)
-    if (!is.numeric(n_boot) || length(n_boot) != 1L || is.na(n_boot) ||
-        n_boot < 2 || n_boot != as.integer(n_boot)) {
-        stop("`n_boot` must be a single integer >= 2.")
-    }
+bootstrapTopicModels <- function(subnetwork,
+                                 n_boot = 50,
+                                 n_topics = 5,
+                                 include_ppi = TRUE,
+                                 n_top_terms = 10,
+                                 min_term_count = 2,
+                                 max_iter = 200,
+                                 tol = 1e-4,
+                                 seed = 1) {
+
+    .validateDecomposeSubnetworkByTopicInput(subnetwork, n_topics, 0.2,
+                                             include_ppi)
+    if (!is.numeric(n_boot) || length(n_boot) != 1L || is.na(n_boot) ||
+        n_boot < 2 || n_boot != as.integer(n_boot)) {
+        stop("`n_boot` must be a single integer >= 2.")
+    }
+    n_boot <- as.integer(n_boot)
+    if (!is.numeric(n_top_terms) || length(n_top_terms) != 1L ||
+        is.na(n_top_terms) || n_top_terms < 1 ||
+        n_top_terms != as.integer(n_top_terms)) {
+        stop("`n_top_terms` must be a single positive integer.")
+    }
+    n_top_terms <- as.integer(n_top_terms)
-bootstrapTopicModels <- function(subnetwork,
-                                 n_boot = 50,
-                                 n_topics = 5,
-                                 include_ppi = TRUE,
-                                 n_top_terms = 10,
-                                 min_term_count = 2,
-                                 max_iter = 200,
-                                 tol = 1e-4,
-                                 seed = 1) {
-
-    .validateDecomposeSubnetworkByTopicInput(subnetwork, n_topics, 0.2,
-                                             include_ppi)
-    if (!is.numeric(n_boot) || length(n_boot) != 1L || is.na(n_boot) ||
-        n_boot < 2 || n_boot != as.integer(n_boot)) {
-        stop("`n_boot` must be a single integer >= 2.")
-    }
+        for (r in seq_len(k)) {
+            row <- fit$H_text[map[r], ]
+            s <- sum(row)
+            weightsum[r, ] <- weightsum[r, ] + (if (s > 0) row / s else row)
+            n_sel <- min(n_top_terms, length(row))
+            top <- order(row, decreasing = TRUE)[seq_len(n_sel)]
+            top <- top[row[top] > 0]
+            topcount[r, top] <- topcount[r, top] + 1
+        }
-bootstrapTopicModels <- function(subnetwork,
-                                 n_boot = 50,
-                                 n_topics = 5,
-                                 include_ppi = TRUE,
-                                 n_top_terms = 10,
-                                 min_term_count = 2,
-                                 max_iter = 200,
-                                 tol = 1e-4,
-                                 seed = 1) {
-
-    .validateDecomposeSubnetworkByTopicInput(subnetwork, n_topics, 0.2,
-                                             include_ppi)
-    if (!is.numeric(n_boot) || length(n_boot) != 1L || is.na(n_boot) ||
-        n_boot < 2 || n_boot != as.integer(n_boot)) {
-        stop("`n_boot` must be a single integer >= 2.")
-    }
+bootstrapTopicModels <- function(subnetwork,
+                                 n_boot = 50,
+                                 n_topics = 5,
+                                 include_ppi = TRUE,
+                                 n_top_terms = 10,
+                                 min_term_count = 2,
+                                 max_iter = 200,
+                                 tol = 1e-4,
+                                 seed = 1) {
+
+    .validateDecomposeSubnetworkByTopicInput(subnetwork, n_topics, 0.2,
+                                             include_ppi)
+    if (!is.numeric(n_boot) || length(n_boot) != 1L || is.na(n_boot) ||
+        n_boot < 2 || n_boot != as.integer(n_boot)) {
+        stop("`n_boot` must be a single integer >= 2.")
+    }
+    n_boot <- as.integer(n_boot)
+    if (!is.numeric(n_top_terms) || length(n_top_terms) != 1L ||
+        is.na(n_top_terms) || n_top_terms < 1 ||
+        n_top_terms != as.integer(n_top_terms)) {
+        stop("`n_top_terms` must be a single positive integer.")
+    }
+    n_top_terms <- as.integer(n_top_terms)
-bootstrapTopicModels <- function(subnetwork,
-                                 n_boot = 50,
-                                 n_topics = 5,
-                                 include_ppi = TRUE,
-                                 n_top_terms = 10,
-                                 min_term_count = 2,
-                                 max_iter = 200,
-                                 tol = 1e-4,
-                                 seed = 1) {
-
-    .validateDecomposeSubnetworkByTopicInput(subnetwork, n_topics, 0.2,
-                                             include_ppi)
-    if (!is.numeric(n_boot) || length(n_boot) != 1L || is.na(n_boot) ||
-        n_boot < 2 || n_boot != as.integer(n_boot)) {
-        stop("`n_boot` must be a single integer >= 2.")
-    }
+        for (r in seq_len(k)) {
+            row <- fit$H_text[map[r], ]
+            s <- sum(row)
+            weightsum[r, ] <- weightsum[r, ] + (if (s > 0) row / s else row)
+            n_sel <- min(n_top_terms, length(row))
+            top <- order(row, decreasing = TRUE)[seq_len(n_sel)]
+            top <- top[row[top] > 0]
+            topcount[r, top] <- topcount[r, top] + 1
+        }
+    n_boot <- as.integer(n_boot)
+
+    # Build the shared matrices once; keep the vocabulary fixed across resamples.
+    mats <- .buildTopicMatrices(subnetwork, n_topics, min_term_count)
+    X_text <- mats$X_text
+    X_edges <- mats$X_edges
+    k <- mats$n_topics
+    n_papers <- nrow(X_text)
+    terms <- colnames(X_text)
+
+    # Reference fit on the full data defines the canonical topic ordering that
+    # every resample is aligned back to.
+    ref <- .fitTopicModel(X_text, X_edges, k, include_ppi,
+                          max_iter = max_iter, tol = tol, seed = seed)
+    reference <- lapply(seq_len(k), function(t)
+        .topTermsForTopic(ref$H_text, t, n_top_terms))
+    names(reference) <- paste0("topic_", seq_len(k))
+
+    # Precompute all resample row-index sets up front: the NMF resets the RNG
+    # internally, so drawing the samples inside the loop would repeat them.
+    set.seed(seed)
+    boot_indices <- lapply(seq_len(n_boot),
+                           function(b) sample.int(n_papers, replace = TRUE))
+
+    # Tally, per (reference topic x word): top-list selection count and the
+    # summed within-topic weight, accumulated across resamples.
+    topcount <- matrix(0, k, length(terms), dimnames = list(NULL, terms))
+    weightsum <- matrix(0, k, length(terms), dimnames = list(NULL, terms))
+
+    for (b in seq_len(n_boot)) {
+        idx <- boot_indices[[b]]
+        fit <- .fitTopicModel(X_text[idx, , drop = FALSE],
+                              X_edges[idx, , drop = FALSE],
+                              k, include_ppi,
+                              max_iter = max_iter, tol = tol, seed = seed)
+
+        # Align this resample's topics to the reference, then aggregate.
+        map <- .greedyTopicMatch(.cosineSimMatrix(ref$H_text, fit$H_text))
+        for (r in seq_len(k)) {
+            row <- fit$H_text[map[r], ]
+            s <- sum(row)
+            weightsum[r, ] <- weightsum[r, ] + (if (s > 0) row / s else row)
+            top <- order(row, decreasing = TRUE)[seq_len(n_top_terms)]
+            top <- top[row[top] > 0]
+            topcount[r, top] <- topcount[r, top] + 1
+        }
+    }
+
+    topTerms <- lapply(seq_len(k), function(r) {
+        keep <- which(topcount[r, ] > 0)
+        df <- data.frame(
+            term            = terms[keep],
+            selection_freq  = topcount[r, keep] / n_boot,
+            mean_weight     = weightsum[r, keep] / n_boot,
+            stringsAsFactors = FALSE
+        )
+        df <- df[order(-df$selection_freq, -df$mean_weight), , drop = FALSE]
+        rownames(df) <- NULL
+        df
+    })
+    names(topTerms) <- paste0("topic_", seq_len(k))
+
+    list(
+        include_ppi = include_ppi,
+        n_boot      = n_boot,
+        n_topics    = k,
+        topTerms    = topTerms,
+        reference   = reference
+    )
+}
diff --git a/R/compareTopicModels.R b/R/compareTopicModels.R
@@ -0,0 +1,160 @@
+#' Test whether including PPIs changes topic structure beyond random chance
+#'
+#' Quantifies how much the topic decomposition produced by
+#' \code{\link{decomposeSubnetworkByTopic}} changes when the PPI/edge view is
+#' included (\code{include_ppi = TRUE}) versus excluded
+#' (\code{include_ppi = FALSE}), and separates that change from the run-to-run
+#' variability that NMF produces just from its random initialization.
+#'
+#' NMF converges to a local optimum that depends on the random seed, so a single
+#' joint-vs-text comparison conflates the real effect of the PPI view with
+#' optimization noise. This function instead refits both modes across many seeds
+#' and compares three distributions of partition agreement (Adjusted Rand Index,
+#' ARI):
+#' \describe{
+#'   \item{within_joint}{ARI between pairs of joint runs (different seeds) —
+#'     how much the joint solution wobbles on its own.}
+#'   \item{within_text}{ARI between pairs of text-only runs — the same for the
+#'     text-only solution.}
+#'   \item{between}{ARI between the joint and text-only run at the \emph{same}
+#'     seed. Because both modes draw \code{W} and \code{H_text} from the same
+#'     seeded stream, a matched seed gives both modes an identical
+#'     initialization, so this isolates the effect of adding the PPI view from
+#'     the starting point.}
+#' }
+#' If the between-mode ARI is systematically lower than the within-mode ARIs,
+#' the PPI view changes the topic structure more than chance would — a
+#' one-sided Wilcoxon rank-sum test (\code{between < within}) puts a p-value on
+#' it. If the between distribution sits inside the within distributions, the
+#' apparent difference is just optimization noise.
+#'
+#' The expensive, network-bound steps (evidence extraction, abstract fetching,
+#' matrix construction) run once; only the NMF is repeated per seed.
+#'
+#' @param subnetwork list with \code{nodes} and \code{edges} data.frames, e.g.
+#'   the output of \code{\link{getSubnetworkFromIndra}}.
+#' @param seeds integer vector of NMF seeds to fit (at least 2). Default
+#'   \code{1:20}.
+#' @param n_topics number of topics (rank of the factorization). Default 5.
+#' @param unit either \code{"edges"} (compare edge-to-topic assignments, the
+#'   default, matching the subnetworks the decomposition returns) or
+#'   \code{"papers"} (compare paper-to-topic assignments).
+#' @param min_term_count minimum corpus frequency for a word to be kept when
+#'   building the text matrix. Default 2.
+#' @param max_iter maximum number of NMF multiplicative-update iterations.
+#'   Default 200.
+#' @param tol relative-change tolerance for NMF early stopping. Default 1e-4.
+#'
+#' @return A list with
+#'   \describe{
+#'     \item{unit}{the comparison unit used.}
+#'     \item{seeds}{the seeds fitted.}
+#'     \item{n_topics}{the effective number of topics.}
+#'     \item{ari}{list of numeric vectors \code{within_joint},
+#'       \code{within_text}, and \code{between} (matched seeds).}
+#'     \item{summary}{data.frame of median/mean ARI and count per comparison.}
+#'     \item{test}{the \code{\link[stats]{wilcox.test}} object comparing the
+#'       between distribution against the pooled within distributions
+#'       (\code{alternative = "less"}).}
+#'     \item{consensus}{list of consensus (co-membership) matrices,
+#'       \code{joint} and \code{text}, across seeds.}
+#'     \item{dispersion}{named numeric vector of consensus dispersion
+#'       coefficients (1 = identical clustering across all seeds).}
+#'     \item{partitions}{list of the raw per-seed partitions, \code{joint} and
+#'       \code{text}, for further inspection.}
+#'   }
+#'
+#' @seealso \code{\link{decomposeSubnetworkByTopic}}
+#' @note \strong{Beta feature:} This function is experimental and the API may
+#'   change without notice in future versions.
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' input <- data.table::fread(system.file(
+#'     "extdata/groupComparisonModel.csv",
+#'     package = "MSstatsBioNet"
+#' ))
+#' subnetwork <- getSubnetworkFromIndra(input)
+#' cmp <- compareTopicModels(subnetwork, seeds = 1:20, n_topics = 5)
+#' cmp$summary
+#' cmp$test            # p < 0.05 => PPI changes topics beyond chance
+#' cmp$dispersion      # how stable each mode is across seeds
+#' }
+compareTopicModels <- function(subnetwork,
+                               seeds = seq_len(20),
+                               n_topics = 5,
+                               unit = c("edges", "papers"),
+                               min_term_count = 2,
+                               max_iter = 200,
+                               tol = 1e-4) {
+
+    unit <- match.arg(unit)
+    # Reuse the decompose validator for subnetwork/n_topics structure.
+    .validateDecomposeSubnetworkByTopicInput(subnetwork, n_topics, 0.2)
+    if (!is.numeric(seeds) || length(seeds) < 2L || anyNA(seeds) ||
+        any(seeds != as.integer(seeds))) {
+        stop("`seeds` must be a vector of at least two integer seeds.")
+    }
+    seeds <- as.integer(seeds)
+
+    # Build the shared matrices once; only the NMF is repeated per seed.
+    mats <- .buildTopicMatrices(subnetwork, n_topics, min_term_count)
+    X_text <- mats$X_text
+    X_edges <- mats$X_edges
+    k <- mats$n_topics
+
+    joint_parts <- vector("list", length(seeds))
+    text_parts <- vector("list", length(seeds))
+    between <- numeric(length(seeds))
+
+    for (i in seq_along(seeds)) {
+        s <- seeds[i]
+        joint <- .fitTopicModel(X_text, X_edges, k, include_ppi = TRUE,
+                                max_iter = max_iter, tol = tol, seed = s)
+        text <- .fitTopicModel(X_text, X_edges, k, include_ppi = FALSE,
+                               max_iter = max_iter, tol = tol, seed = s)
+
+        joint_parts[[i]] <- .topicPartition(joint, unit)
+        text_parts[[i]] <- .topicPartition(text, unit)
+        between[i] <- .adjustedRandIndex(joint_parts[[i]], text_parts[[i]])
+    }
+    names(joint_parts) <- names(text_parts) <- paste0("seed_", seeds)
+
+    within_joint <- .pairwiseARI(joint_parts)
+    within_text <- .pairwiseARI(text_parts)
+    within_pooled <- c(within_joint, within_text)
+
+    # Is the between-mode agreement lower than within-mode agreement?
+    test <- stats::wilcox.test(between, within_pooled,
+                               alternative = "less", exact = FALSE)
+
+    consensus_joint <- .consensusMatrix(joint_parts)
+    consensus_text <- .consensusMatrix(text_parts)
+
+    summary <- data.frame(
+        comparison = c("within_joint", "within_text", "between"),
+        n          = c(length(within_joint), length(within_text),
+                       length(between)),
+        median_ari = c(stats::median(within_joint),
+                       stats::median(within_text),
+                       stats::median(between)),
+        mean_ari   = c(mean(within_joint), mean(within_text), mean(between)),
+        stringsAsFactors = FALSE
+    )
+
+    list(
+        unit       = unit,
+        seeds      = seeds,
+        n_topics   = k,
+        ari        = list(within_joint = within_joint,
+                          within_text  = within_text,
+                          between       = between),
+        summary    = summary,
+        test       = test,
+        consensus  = list(joint = consensus_joint, text = consensus_text),
+        dispersion = c(joint = .dispersionCoefficient(consensus_joint),
+                       text  = .dispersionCoefficient(consensus_text)),
+        partitions = list(joint = joint_parts, text = text_parts)
+    )
+}