Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,4 @@ Encoding: UTF-8
URL: http://msstats.org, https://vitek-lab.github.io/MSstatsBioNet/
BugReports: https://groups.google.com/forum/#!forum/msstats
Config/testthat/edition: 3
RoxygenNote: 7.3.3
Config/roxygen2/version: 8.0.0
4 changes: 4 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
# Generated by roxygen2: do not edit by hand

export(annotateProteinInfoFromIndra)
export(bootstrapTopicModels)
export(compareTopicModels)
export(cytoscapeNetwork)
export(cytoscapeNetworkOutput)
export(decomposeSubnetworkByTopic)
export(deleteEdgeFromNetwork)
export(exportNetworkToHTML)
export(filterSubnetworkByContext)
Expand All @@ -27,6 +30,7 @@ importFrom(r2r,keys)
importFrom(r2r,query)
importFrom(rentrez,entrez_fetch)
importFrom(stats,cor)
importFrom(stats,runif)
importFrom(stats,setNames)
importFrom(stopwords,stopwords)
importFrom(text2vec,TfIdf)
Expand Down
153 changes: 153 additions & 0 deletions R/bootstrapTopicModels.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
#' Bootstrap the topic decomposition to find each topic's robust top words
#'
#' Refits the NMF topic model on many bootstrap resamples of the papers and
#' reports, for every topic, how reliably each word stays among the topic's top
#' terms. This separates words that genuinely characterise a topic from words
#' that only surface in a single lucky fit, and lets you see how the top-word
#' lists change with and without the PPI view (run once per \code{include_ppi}).
#'
#' Because topic indices are arbitrary across fits (label switching), each
#' resample's topics are first aligned to a reference fit on the full data by
#' cosine similarity of their topic-word vectors. The papers are resampled with
#' replacement; the word vocabulary is held fixed (from the full data) so topics
#' remain comparable across resamples, and the NMF seed is held fixed so the
#' variability reported reflects \emph{data} resampling rather than random
#' initialization.
#'
#' @param subnetwork list with \code{nodes} and \code{edges} data.frames, e.g.
#' the output of \code{\link{getSubnetworkFromIndra}}.
#' @param n_boot number of bootstrap resamples. Default 50.
#' @param n_topics number of topics (rank of the factorization). Default 5.
#' @param include_ppi logical; factorize the PPI/edge view jointly with the text
#' (\code{TRUE}, default) or use paper words only (\code{FALSE}). See
#' \code{\link{decomposeSubnetworkByTopic}}.
#' @param n_top_terms number of top words that define a topic's "top list" in
#' each resample (the cutoff for the selection-frequency tally). Default 10.
#' @param min_term_count minimum corpus frequency for a word to be kept when
#' building the text matrix. Default 2.
#' @param max_iter maximum number of NMF multiplicative-update iterations.
#' Default 200.
#' @param tol relative-change tolerance for NMF early stopping. Default 1e-4.
#' @param seed random seed for the reference fit, the resampling, and each
#' bootstrap NMF. Default 1.
#'
#' @return A list with
#' \describe{
#' \item{include_ppi, n_boot, n_topics}{the settings used.}
#' \item{topTerms}{named list \code{topic_1} ... \code{topic_k}. Each is a
#' data.frame sorted by \code{selection_freq}, with columns \code{term},
#' \code{selection_freq} (fraction of resamples the word was in this
#' topic's top \code{n_top_terms}), and \code{mean_weight} (mean
#' within-topic word weight across resamples). A word with
#' \code{selection_freq} near 1 is a stable signature of the topic.}
#' \item{reference}{named list of the top \code{n_top_terms} words per topic
#' from the single full-data fit, for comparison.}
#' }
#'
#' @seealso \code{\link{decomposeSubnetworkByTopic}},
#' \code{\link{compareTopicModels}}
#' @note \strong{Beta feature:} This function is experimental and the API may
#' change without notice in future versions.
#' @export
#'
#' @examples
#' \dontrun{
#' input <- data.table::fread(system.file(
#' "extdata/groupComparisonModel.csv",
#' package = "MSstatsBioNet"
#' ))
#' subnetwork <- getSubnetworkFromIndra(input)
#'
#' # Top words with PPIs included vs. words only:
#' boot_ppi <- bootstrapTopicModels(subnetwork, include_ppi = TRUE)
#' boot_text <- bootstrapTopicModels(subnetwork, include_ppi = FALSE)
#'
#' head(boot_ppi$topTerms$topic_1) # robust signature words for topic 1
#' boot_text$topTerms$topic_1
#' }
bootstrapTopicModels <- function(subnetwork,
n_boot = 50,
n_topics = 5,
include_ppi = TRUE,
n_top_terms = 10,
min_term_count = 2,
max_iter = 200,
tol = 1e-4,
seed = 1) {

.validateDecomposeSubnetworkByTopicInput(subnetwork, n_topics, 0.2,
include_ppi)
if (!is.numeric(n_boot) || length(n_boot) != 1L || is.na(n_boot) ||
n_boot < 2 || n_boot != as.integer(n_boot)) {
stop("`n_boot` must be a single integer >= 2.")
}
Comment on lines +68 to +83

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

n_top_terms lacks input validation and runtime bounds checking. The parameter is used without validation at function entry and without clamping against vocabulary size inside the bootstrap loop, which could cause NA propagation if the value exceeds the vocabulary.

  • R/bootstrapTopicModels.R#L67-L82: Add validation that n_top_terms is a single positive integer, similar to the n_boot check.
  • R/bootstrapTopicModels.R#L121-L128: Clamp n_top_terms to min(n_top_terms, length(row)) before indexing to prevent out-of-bounds NAs.
🛡️ Proposed fixes

Add validation after line 83:

     n_boot <- as.integer(n_boot)
+    if (!is.numeric(n_top_terms) || length(n_top_terms) != 1L ||
+        is.na(n_top_terms) || n_top_terms < 1 ||
+        n_top_terms != as.integer(n_top_terms)) {
+        stop("`n_top_terms` must be a single positive integer.")
+    }
+    n_top_terms <- as.integer(n_top_terms)

Clamp at line 125:

-            top <- order(row, decreasing = TRUE)[seq_len(n_top_terms)]
+            n_sel <- min(n_top_terms, length(row))
+            top <- order(row, decreasing = TRUE)[seq_len(n_sel)]
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
bootstrapTopicModels <- function(subnetwork,
n_boot = 50,
n_topics = 5,
include_ppi = TRUE,
n_top_terms = 10,
min_term_count = 2,
max_iter = 200,
tol = 1e-4,
seed = 1) {
.validateDecomposeSubnetworkByTopicInput(subnetwork, n_topics, 0.2,
include_ppi)
if (!is.numeric(n_boot) || length(n_boot) != 1L || is.na(n_boot) ||
n_boot < 2 || n_boot != as.integer(n_boot)) {
stop("`n_boot` must be a single integer >= 2.")
}
bootstrapTopicModels <- function(subnetwork,
n_boot = 50,
n_topics = 5,
include_ppi = TRUE,
n_top_terms = 10,
min_term_count = 2,
max_iter = 200,
tol = 1e-4,
seed = 1) {
.validateDecomposeSubnetworkByTopicInput(subnetwork, n_topics, 0.2,
include_ppi)
if (!is.numeric(n_boot) || length(n_boot) != 1L || is.na(n_boot) ||
n_boot < 2 || n_boot != as.integer(n_boot)) {
stop("`n_boot` must be a single integer >= 2.")
}
n_boot <- as.integer(n_boot)
if (!is.numeric(n_top_terms) || length(n_top_terms) != 1L ||
is.na(n_top_terms) || n_top_terms < 1 ||
n_top_terms != as.integer(n_top_terms)) {
stop("`n_top_terms` must be a single positive integer.")
}
n_top_terms <- as.integer(n_top_terms)
Suggested change
bootstrapTopicModels <- function(subnetwork,
n_boot = 50,
n_topics = 5,
include_ppi = TRUE,
n_top_terms = 10,
min_term_count = 2,
max_iter = 200,
tol = 1e-4,
seed = 1) {
.validateDecomposeSubnetworkByTopicInput(subnetwork, n_topics, 0.2,
include_ppi)
if (!is.numeric(n_boot) || length(n_boot) != 1L || is.na(n_boot) ||
n_boot < 2 || n_boot != as.integer(n_boot)) {
stop("`n_boot` must be a single integer >= 2.")
}
for (r in seq_len(k)) {
row <- fit$H_text[map[r], ]
s <- sum(row)
weightsum[r, ] <- weightsum[r, ] + (if (s > 0) row / s else row)
n_sel <- min(n_top_terms, length(row))
top <- order(row, decreasing = TRUE)[seq_len(n_sel)]
top <- top[row[top] > 0]
topcount[r, top] <- topcount[r, top] + 1
}
📍 Affects 1 file
  • R/bootstrapTopicModels.R#L67-L82 (this comment)
  • R/bootstrapTopicModels.R#L121-L128
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@R/bootstrapTopicModels.R` around lines 67 - 82, The `n_top_terms` parameter
lacks validation at function entry and bounds checking during runtime, which can
cause NA propagation. At R/bootstrapTopicModels.R#L67-L82 (anchor site), add
input validation for `n_top_terms` after the `n_boot` validation block to ensure
it is a single positive integer, following the same pattern as the existing
`n_boot` check. At R/bootstrapTopicModels.R#L121-L128 (sibling site), before
using `n_top_terms` to index into a row vector, clamp it to `min(n_top_terms,
length(row))` to prevent out-of-bounds access and NA values.

n_boot <- as.integer(n_boot)

# Build the shared matrices once; keep the vocabulary fixed across resamples.
mats <- .buildTopicMatrices(subnetwork, n_topics, min_term_count)
X_text <- mats$X_text
X_edges <- mats$X_edges
k <- mats$n_topics
n_papers <- nrow(X_text)
terms <- colnames(X_text)

# Reference fit on the full data defines the canonical topic ordering that
# every resample is aligned back to.
ref <- .fitTopicModel(X_text, X_edges, k, include_ppi,
max_iter = max_iter, tol = tol, seed = seed)
reference <- lapply(seq_len(k), function(t)
.topTermsForTopic(ref$H_text, t, n_top_terms))
names(reference) <- paste0("topic_", seq_len(k))

# Precompute all resample row-index sets up front: the NMF resets the RNG
# internally, so drawing the samples inside the loop would repeat them.
set.seed(seed)
boot_indices <- lapply(seq_len(n_boot),
function(b) sample.int(n_papers, replace = TRUE))

# Tally, per (reference topic x word): top-list selection count and the
# summed within-topic weight, accumulated across resamples.
topcount <- matrix(0, k, length(terms), dimnames = list(NULL, terms))
weightsum <- matrix(0, k, length(terms), dimnames = list(NULL, terms))

for (b in seq_len(n_boot)) {
idx <- boot_indices[[b]]
fit <- .fitTopicModel(X_text[idx, , drop = FALSE],
X_edges[idx, , drop = FALSE],
k, include_ppi,
max_iter = max_iter, tol = tol, seed = seed)

# Align this resample's topics to the reference, then aggregate.
map <- .greedyTopicMatch(.cosineSimMatrix(ref$H_text, fit$H_text))
for (r in seq_len(k)) {
row <- fit$H_text[map[r], ]
s <- sum(row)
weightsum[r, ] <- weightsum[r, ] + (if (s > 0) row / s else row)
top <- order(row, decreasing = TRUE)[seq_len(n_top_terms)]
top <- top[row[top] > 0]
topcount[r, top] <- topcount[r, top] + 1
}
}

topTerms <- lapply(seq_len(k), function(r) {
keep <- which(topcount[r, ] > 0)
df <- data.frame(
term = terms[keep],
selection_freq = topcount[r, keep] / n_boot,
mean_weight = weightsum[r, keep] / n_boot,
stringsAsFactors = FALSE
)
df <- df[order(-df$selection_freq, -df$mean_weight), , drop = FALSE]
rownames(df) <- NULL
df
})
names(topTerms) <- paste0("topic_", seq_len(k))

list(
include_ppi = include_ppi,
n_boot = n_boot,
n_topics = k,
topTerms = topTerms,
reference = reference
)
}
160 changes: 160 additions & 0 deletions R/compareTopicModels.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
#' Test whether including PPIs changes topic structure beyond random chance
#'
#' Quantifies how much the topic decomposition produced by
#' \code{\link{decomposeSubnetworkByTopic}} changes when the PPI/edge view is
#' included (\code{include_ppi = TRUE}) versus excluded
#' (\code{include_ppi = FALSE}), and separates that change from the run-to-run
#' variability that NMF produces just from its random initialization.
#'
#' NMF converges to a local optimum that depends on the random seed, so a single
#' joint-vs-text comparison conflates the real effect of the PPI view with
#' optimization noise. This function instead refits both modes across many seeds
#' and compares three distributions of partition agreement (Adjusted Rand Index,
#' ARI):
#' \describe{
#' \item{within_joint}{ARI between pairs of joint runs (different seeds) —
#' how much the joint solution wobbles on its own.}
#' \item{within_text}{ARI between pairs of text-only runs — the same for the
#' text-only solution.}
#' \item{between}{ARI between the joint and text-only run at the \emph{same}
#' seed. Because both modes draw \code{W} and \code{H_text} from the same
#' seeded stream, a matched seed gives both modes an identical
#' initialization, so this isolates the effect of adding the PPI view from
#' the starting point.}
#' }
#' If the between-mode ARI is systematically lower than the within-mode ARIs,
#' the PPI view changes the topic structure more than chance would — a
#' one-sided Wilcoxon rank-sum test (\code{between < within}) puts a p-value on
#' it. If the between distribution sits inside the within distributions, the
#' apparent difference is just optimization noise.
#'
#' The expensive, network-bound steps (evidence extraction, abstract fetching,
#' matrix construction) run once; only the NMF is repeated per seed.
#'
#' @param subnetwork list with \code{nodes} and \code{edges} data.frames, e.g.
#' the output of \code{\link{getSubnetworkFromIndra}}.
#' @param seeds integer vector of NMF seeds to fit (at least 2). Default
#' \code{1:20}.
#' @param n_topics number of topics (rank of the factorization). Default 5.
#' @param unit either \code{"edges"} (compare edge-to-topic assignments, the
#' default, matching the subnetworks the decomposition returns) or
#' \code{"papers"} (compare paper-to-topic assignments).
#' @param min_term_count minimum corpus frequency for a word to be kept when
#' building the text matrix. Default 2.
#' @param max_iter maximum number of NMF multiplicative-update iterations.
#' Default 200.
#' @param tol relative-change tolerance for NMF early stopping. Default 1e-4.
#'
#' @return A list with
#' \describe{
#' \item{unit}{the comparison unit used.}
#' \item{seeds}{the seeds fitted.}
#' \item{n_topics}{the effective number of topics.}
#' \item{ari}{list of numeric vectors \code{within_joint},
#' \code{within_text}, and \code{between} (matched seeds).}
#' \item{summary}{data.frame of median/mean ARI and count per comparison.}
#' \item{test}{the \code{\link[stats]{wilcox.test}} object comparing the
#' between distribution against the pooled within distributions
#' (\code{alternative = "less"}).}
#' \item{consensus}{list of consensus (co-membership) matrices,
#' \code{joint} and \code{text}, across seeds.}
#' \item{dispersion}{named numeric vector of consensus dispersion
#' coefficients (1 = identical clustering across all seeds).}
#' \item{partitions}{list of the raw per-seed partitions, \code{joint} and
#' \code{text}, for further inspection.}
#' }
#'
#' @seealso \code{\link{decomposeSubnetworkByTopic}}
#' @note \strong{Beta feature:} This function is experimental and the API may
#' change without notice in future versions.
#' @export
#'
#' @examples
#' \dontrun{
#' input <- data.table::fread(system.file(
#' "extdata/groupComparisonModel.csv",
#' package = "MSstatsBioNet"
#' ))
#' subnetwork <- getSubnetworkFromIndra(input)
#' cmp <- compareTopicModels(subnetwork, seeds = 1:20, n_topics = 5)
#' cmp$summary
#' cmp$test # p < 0.05 => PPI changes topics beyond chance
#' cmp$dispersion # how stable each mode is across seeds
#' }
compareTopicModels <- function(subnetwork,
seeds = seq_len(20),
n_topics = 5,
unit = c("edges", "papers"),
min_term_count = 2,
max_iter = 200,
tol = 1e-4) {

unit <- match.arg(unit)
# Reuse the decompose validator for subnetwork/n_topics structure.
.validateDecomposeSubnetworkByTopicInput(subnetwork, n_topics, 0.2)
if (!is.numeric(seeds) || length(seeds) < 2L || anyNA(seeds) ||
any(seeds != as.integer(seeds))) {
stop("`seeds` must be a vector of at least two integer seeds.")
}
seeds <- as.integer(seeds)

# Build the shared matrices once; only the NMF is repeated per seed.
mats <- .buildTopicMatrices(subnetwork, n_topics, min_term_count)
X_text <- mats$X_text
X_edges <- mats$X_edges
k <- mats$n_topics

joint_parts <- vector("list", length(seeds))
text_parts <- vector("list", length(seeds))
between <- numeric(length(seeds))

for (i in seq_along(seeds)) {
s <- seeds[i]
joint <- .fitTopicModel(X_text, X_edges, k, include_ppi = TRUE,
max_iter = max_iter, tol = tol, seed = s)
text <- .fitTopicModel(X_text, X_edges, k, include_ppi = FALSE,
max_iter = max_iter, tol = tol, seed = s)

joint_parts[[i]] <- .topicPartition(joint, unit)
text_parts[[i]] <- .topicPartition(text, unit)
between[i] <- .adjustedRandIndex(joint_parts[[i]], text_parts[[i]])
}
names(joint_parts) <- names(text_parts) <- paste0("seed_", seeds)

within_joint <- .pairwiseARI(joint_parts)
within_text <- .pairwiseARI(text_parts)
within_pooled <- c(within_joint, within_text)

# Is the between-mode agreement lower than within-mode agreement?
test <- stats::wilcox.test(between, within_pooled,
alternative = "less", exact = FALSE)

consensus_joint <- .consensusMatrix(joint_parts)
consensus_text <- .consensusMatrix(text_parts)

summary <- data.frame(
comparison = c("within_joint", "within_text", "between"),
n = c(length(within_joint), length(within_text),
length(between)),
median_ari = c(stats::median(within_joint),
stats::median(within_text),
stats::median(between)),
mean_ari = c(mean(within_joint), mean(within_text), mean(between)),
stringsAsFactors = FALSE
)

list(
unit = unit,
seeds = seeds,
n_topics = k,
ari = list(within_joint = within_joint,
within_text = within_text,
between = between),
summary = summary,
test = test,
consensus = list(joint = consensus_joint, text = consensus_text),
dispersion = c(joint = .dispersionCoefficient(consensus_joint),
text = .dispersionCoefficient(consensus_text)),
partitions = list(joint = joint_parts, text = text_parts)
)
}
Loading
Loading