From 821f6f8205e5e3d34b0703db208ab4c59e2d7d66 Mon Sep 17 00:00:00 2001 From: tonywu1999 Date: Mon, 15 Jun 2026 15:19:40 -0400 Subject: [PATCH 1/4] add unsupervised approach NMF for clustering --- NAMESPACE | 2 + R/decomposeSubnetworkByTopic.R | 160 +++++++++++++++++++ R/utils_decomposeSubnetworkByTopic.R | 225 +++++++++++++++++++++++++++ 3 files changed, 387 insertions(+) create mode 100644 R/decomposeSubnetworkByTopic.R create mode 100644 R/utils_decomposeSubnetworkByTopic.R diff --git a/NAMESPACE b/NAMESPACE index 82a8a32..0dddb38 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,6 +3,7 @@ export(annotateProteinInfoFromIndra) export(cytoscapeNetwork) export(cytoscapeNetworkOutput) +export(decomposeSubnetworkByTopic) export(deleteEdgeFromNetwork) export(exportNetworkToHTML) export(filterSubnetworkByContext) @@ -27,6 +28,7 @@ importFrom(r2r,keys) importFrom(r2r,query) importFrom(rentrez,entrez_fetch) importFrom(stats,cor) +importFrom(stats,runif) importFrom(stats,setNames) importFrom(stopwords,stopwords) importFrom(text2vec,TfIdf) diff --git a/R/decomposeSubnetworkByTopic.R b/R/decomposeSubnetworkByTopic.R new file mode 100644 index 0000000..f446ade --- /dev/null +++ b/R/decomposeSubnetworkByTopic.R @@ -0,0 +1,160 @@ +#' Decompose a subnetwork into topic-specific subnetworks via joint NMF +#' +#' Takes a subnetwork (the output of \code{\link{getSubnetworkFromIndra}}) and +#' splits it into a list of smaller, topic-specific subnetworks discovered with +#' unsupervised joint non-negative matrix factorization (NMF). +#' +#' The procedure is: +#' \enumerate{ +#' \item For every edge, the supporting INDRA evidence is retrieved and the +#' PubMed abstract of each referenced PMID is fetched. Papers (PMIDs) are +#' the shared unit of analysis. 
+#' \item Two matrices are built that share the same rows (papers): +#' \code{X_text} (papers x words, term counts from the abstracts) and +#' \code{X_edges} (papers x unique \code{source_target_interaction} +#' combinations, evidence-sentence counts). +#' \item A joint NMF learns a single shared basis matrix \code{W} +#' (papers x topics) such that \eqn{X_{text} \approx W H_{text}} and +#' \eqn{X_{edges} \approx W H_{edges}}. Sharing \code{W} ties each learned +#' topic to both a set of words and a set of edges. +#' \item Each topic becomes its own subnetwork: an edge is included in a +#' topic when that topic carries at least \code{edge_topic_cutoff} of the +#' edge's loading (soft, overlapping assignment), and nodes are restricted +#' to those touched by the kept edges. +#' } +#' +#' @param subnetwork list with \code{nodes} and \code{edges} data.frames, e.g. +#' the output of \code{\link{getSubnetworkFromIndra}}. +#' @param n_topics number of topics (rank of the factorization). Default 5. +#' @param edge_topic_cutoff numeric in \code{[0, 1]}; an edge is added to a +#' topic's subnetwork when the topic carries at least this share of the +#' edge's total loading. Each edge is always included in at least its +#' highest-loading topic. Default 0.2. +#' @param n_top_terms number of top words to report per topic. Default 10. +#' @param min_term_count minimum corpus frequency for a word to be kept when +#' building \code{X_text}. Default 2. +#' @param max_iter maximum number of NMF multiplicative-update iterations. +#' Default 200. +#' @param tol relative-change tolerance for NMF early stopping. Default 1e-4. +#' @param seed random seed for NMF initialization. Default 1. +#' +#' @return A list of length \code{n_topics}, named \code{topic_1} ... +#' \code{topic_k}. 
Each element is a topic-specific subnetwork: a list with +#' \describe{ +#' \item{nodes}{nodes data.frame restricted to the topic's edges.} +#' \item{edges}{edges data.frame for the topic, with an added +#' \code{topicWeight} column (the edge's topic share).} +#' \item{topic}{the topic index.} +#' \item{topTerms}{character vector of the topic's top words.} +#' \item{pmids}{PMIDs whose strongest topic loading is this topic.} +#' } +#' The full factorization (W, H_text, H_edges, etc.) is attached as the +#' \code{"nmf"} attribute of the returned list. +#' +#' @seealso \code{\link{getSubnetworkFromIndra}}, +#' \code{\link{filterSubnetworkByContext}} +#' +#' @export +#' +#' @examples +#' \dontrun{ +#' input <- data.table::fread(system.file( +#' "extdata/groupComparisonModel.csv", +#' package = "MSstatsBioNet" +#' )) +#' subnetwork <- getSubnetworkFromIndra(input) +#' topics <- decomposeSubnetworkByTopic(subnetwork, n_topics = 5) +#' topics$topic_1$topTerms +#' exportNetworkToHTML(topics$topic_1$nodes, topics$topic_1$edges) +#' } +decomposeSubnetworkByTopic <- function(subnetwork, + n_topics = 5, + edge_topic_cutoff = 0.2, + n_top_terms = 10, + min_term_count = 2, + max_iter = 200, + tol = 1e-4, + seed = 1) { + + .validateDecomposeSubnetworkByTopicInput(subnetwork, n_topics, + edge_topic_cutoff) + nodes <- subnetwork$nodes + edges <- subnetwork$edges + n_topics <- as.integer(n_topics) + + # 1. Evidence (paper <-> edge links) for every edge. 
+ evidence <- .extract_evidence_text(edges) + evidence <- evidence[!is.na(evidence$pmid) & nchar(evidence$pmid) > 0, ] + if (nrow(evidence) == 0) { + stop("No evidence with PMIDs was found for any edge; ", + "cannot decompose into topics.") + } + + pmids <- unique(evidence$pmid) + edge_keys <- unique(.edgeKey(evidence$source, evidence$target, + evidence$interaction)) + + if (length(pmids) < n_topics) { + warning(sprintf( + "Only %d papers available; reducing n_topics from %d to %d.", + length(pmids), n_topics, length(pmids) + )) + n_topics <- length(pmids) + } + + # 2. X_text (papers x words) from PubMed abstracts. + abstract_list <- .fetch_clean_abstracts_xml(pmids) + abstracts <- vapply(pmids, function(p) { + a <- abstract_list[[p]] + if (is.null(a)) "" else a + }, character(1)) + X_text <- .buildTextMatrix(pmids, abstracts, min_term_count) + + # 3. X_edges (papers x source_target_interaction) of evidence counts. + X_edges <- .buildEdgeMatrix(evidence, pmids, edge_keys) + + # 4. Joint NMF with a shared W. + model <- .jointNMF(X_text, X_edges, k = n_topics, + max_iter = max_iter, tol = tol, seed = seed) + + # 5. One subnetwork per topic (soft / overlapping edge assignment). + shares <- .edgeTopicShares(model$H_edges) # topics x edges + edge_argmax <- apply(shares, 2, which.max) # best topic per edge + edges_key_vec <- .edgeKey(edges$source, edges$target, edges$interaction) + paper_argmax <- apply(model$W, 1, which.max) # best topic per paper + + topics <- lapply(seq_len(n_topics), function(t) { + # Edge keys assigned to this topic. 
+ topic_keys <- edge_keys[shares[t, ] >= edge_topic_cutoff | + edge_argmax == t] + in_topic <- edges_key_vec %in% topic_keys + topic_edges <- edges[in_topic, , drop = FALSE] + if (nrow(topic_edges) > 0) { + topic_edges$topicWeight <- + shares[t, match(edges_key_vec[in_topic], edge_keys)] + } + topic_nodes <- nodes[nodes$id %in% + c(topic_edges$source, topic_edges$target), , + drop = FALSE] + list( + nodes = topic_nodes, + edges = topic_edges, + topic = t, + topTerms = .topTermsForTopic(model$H_text, t, n_top_terms), + pmids = pmids[paper_argmax == t] + ) + }) + names(topics) <- paste0("topic_", seq_len(n_topics)) + + attr(topics, "nmf") <- list( + W = model$W, + H_text = model$H_text, + H_edges = model$H_edges, + terms = colnames(X_text), + edge_keys = edge_keys, + pmids = pmids, + objective = model$objective, + n_iter = model$n_iter + ) + return(topics) +} diff --git a/R/utils_decomposeSubnetworkByTopic.R b/R/utils_decomposeSubnetworkByTopic.R new file mode 100644 index 0000000..8d2e3b4 --- /dev/null +++ b/R/utils_decomposeSubnetworkByTopic.R @@ -0,0 +1,225 @@ +#' Validate input for decomposeSubnetworkByTopic +#' @param subnetwork list with `nodes` and `edges` data.frames +#' @param n_topics number of topics (rank of the factorization) +#' @param edge_topic_cutoff topic-share threshold for assigning an edge to a topic +#' @keywords internal +#' @noRd +.validateDecomposeSubnetworkByTopicInput <- function(subnetwork, + n_topics, + edge_topic_cutoff) { + if (!is.list(subnetwork) || + !all(c("nodes", "edges") %in% names(subnetwork))) { + stop("`subnetwork` must be a list containing `nodes` and `edges`, ", + "e.g. 
the output of getSubnetworkFromIndra().") + } + if (!is.data.frame(subnetwork$nodes) || !"id" %in% names(subnetwork$nodes)) { + stop("`subnetwork$nodes` must be a data.frame with an `id` column.") + } + required_edge_cols <- c("source", "target", "interaction", + "site", "evidenceLink", "stmt_hash") + missing_cols <- setdiff(required_edge_cols, names(subnetwork$edges)) + if (!is.data.frame(subnetwork$edges) || length(missing_cols) > 0) { + stop(sprintf( + "`subnetwork$edges` must be a data.frame with columns: %s", + paste(required_edge_cols, collapse = ", ") + )) + } + if (!is.numeric(n_topics) || length(n_topics) != 1L || is.na(n_topics) || + n_topics < 1 || n_topics != as.integer(n_topics)) { + stop("`n_topics` must be a single positive integer.") + } + if (!is.numeric(edge_topic_cutoff) || length(edge_topic_cutoff) != 1L || + is.na(edge_topic_cutoff) || edge_topic_cutoff < 0 || + edge_topic_cutoff > 1) { + stop("`edge_topic_cutoff` must be a single numeric value in [0, 1].") + } +} + + +#' Build a unique source_target_interaction key for each edge +#' @param source character vector of source node ids +#' @param target character vector of target node ids +#' @param interaction character vector of interaction types +#' @return character vector of edge keys +#' @keywords internal +#' @noRd +.edgeKey <- function(source, target, interaction) { + paste(source, target, interaction, sep = "||") +} + + +#' Build the paper-by-word matrix (X_text) from PubMed abstracts +#' +#' Tokenises abstracts with text2vec and returns a dense paper-by-word count +#' matrix whose rows are aligned to `pmids`. 
+#' +#' @param pmids character vector of PubMed IDs (rows of the matrix) +#' @param abstracts character vector of abstract texts, aligned to `pmids` +#' @param min_term_count minimum corpus term frequency to keep a word +#' @return dense numeric matrix (papers x words) with rownames = pmids +#' @keywords internal +#' @noRd +#' @importFrom text2vec itoken word_tokenizer create_vocabulary +#' prune_vocabulary vocab_vectorizer create_dtm +#' @importFrom stopwords stopwords +.buildTextMatrix <- function(pmids, abstracts, min_term_count = 2) { + tokens <- itoken(abstracts, + preprocessor = tolower, + tokenizer = word_tokenizer, + ids = pmids, + progressbar = FALSE) + vocab <- create_vocabulary(tokens, stopwords = stopwords("en")) + pruned <- prune_vocabulary(vocab, term_count_min = min_term_count) + if (nrow(pruned) == 0) { + # Fall back to keeping every term before giving up. + pruned <- prune_vocabulary(vocab, term_count_min = 1) + } + if (nrow(pruned) == 0) { + stop("No usable words found in the fetched abstracts; ", + "cannot build the text matrix.") + } + vectorizer <- vocab_vectorizer(pruned) + dtm <- create_dtm(tokens, vectorizer) + dtm <- as.matrix(dtm) + # Align rows to the requested pmid order (zero rows for empty abstracts). + aligned <- matrix(0, nrow = length(pmids), ncol = ncol(dtm), + dimnames = list(pmids, colnames(dtm))) + common <- intersect(pmids, rownames(dtm)) + if (length(common) > 0) { + aligned[common, ] <- dtm[common, , drop = FALSE] + } + return(aligned) +} + + +#' Build the paper-by-edge matrix (X_edges) of evidence counts +#' +#' @param evidence data.frame from \code{.extract_evidence_text}; one row per +#' evidence sentence with `pmid`, `source`, `target`, `interaction`. 
+#' @param pmids character vector of PubMed IDs (rows of the matrix) +#' @param edge_keys character vector of unique edge keys (columns of the matrix) +#' @return dense numeric matrix (papers x edges) of evidence-sentence counts +#' @keywords internal +#' @noRd +.buildEdgeMatrix <- function(evidence, pmids, edge_keys) { + X <- matrix(0, nrow = length(pmids), ncol = length(edge_keys), + dimnames = list(pmids, edge_keys)) + ev_key <- .edgeKey(evidence$source, evidence$target, evidence$interaction) + counts <- table(factor(evidence$pmid, levels = pmids), + factor(ev_key, levels = edge_keys)) + X[] <- as.numeric(counts) + return(X) +} + + +#' Joint non-negative matrix factorization with a shared basis matrix +#' +#' Factorizes two matrices that share the same rows (papers) so that +#' \eqn{X_{text} \approx W H_{text}} and \eqn{X_{edges} \approx W H_{edges}}, +#' where the basis matrix \eqn{W} (papers x topics) is shared between both +#' views. Uses multiplicative updates that minimise the combined squared +#' Frobenius reconstruction error. 
+#' +#' @param X_text paper-by-word matrix +#' @param X_edges paper-by-edge matrix +#' @param k number of topics (rank) +#' @param max_iter maximum number of multiplicative-update iterations +#' @param tol relative-change tolerance for early stopping +#' @param seed random seed for initialization +#' @param normalize logical; if TRUE each view is scaled to unit Frobenius norm +#' so neither view dominates the shared factorization +#' @return list with elements W, H_text, H_edges, objective (per-iteration +#' objective values), and n_iter +#' @keywords internal +#' @noRd +#' @importFrom stats runif +.jointNMF <- function(X_text, X_edges, k, + max_iter = 200, tol = 1e-4, seed = 1, + normalize = TRUE) { + eps <- 1e-10 + n <- nrow(X_text) + if (nrow(X_edges) != n) { + stop("X_text and X_edges must have the same number of rows (papers).") + } + if (normalize) { + ft <- sqrt(sum(X_text^2)) + fe <- sqrt(sum(X_edges^2)) + if (ft > 0) X_text <- X_text / ft + if (fe > 0) X_edges <- X_edges / fe + } + + p <- ncol(X_text) + m <- ncol(X_edges) + + set.seed(seed) + W <- matrix(runif(n * k), n, k) + H_text <- matrix(runif(k * p), k, p) + H_edges <- matrix(runif(k * m), k, m) + + objective <- numeric(0) + prev_obj <- Inf + n_iter <- 0L + for (iter in seq_len(max_iter)) { + n_iter <- iter + # Update view-specific coefficient matrices. + WtW <- crossprod(W) # k x k + H_text <- H_text * (crossprod(W, X_text)) / (WtW %*% H_text + eps) + H_edges <- H_edges * (crossprod(W, X_edges)) / (WtW %*% H_edges + eps) + + # Update the shared basis from both views jointly. 
+ numer <- tcrossprod(X_text, H_text) + tcrossprod(X_edges, H_edges) + denom <- W %*% (tcrossprod(H_text) + tcrossprod(H_edges)) + eps + W <- W * numer / denom + + obj <- sum((X_text - W %*% H_text)^2) + + sum((X_edges - W %*% H_edges)^2) + objective <- c(objective, obj) + if (is.finite(prev_obj) && + abs(prev_obj - obj) <= tol * (prev_obj + eps)) { + break + } + prev_obj <- obj + } + + rownames(W) <- rownames(X_text) + colnames(W) <- paste0("topic_", seq_len(k)) + rownames(H_text) <- colnames(W) + colnames(H_text) <- colnames(X_text) + rownames(H_edges) <- colnames(W) + colnames(H_edges) <- colnames(X_edges) + + return(list(W = W, H_text = H_text, H_edges = H_edges, + objective = objective, n_iter = n_iter)) +} + + +#' Compute the per-edge topic shares from H_edges +#' +#' Column-normalises H_edges so that, for each edge, the loadings across topics +#' sum to 1, giving each edge a distribution over topics. +#' +#' @param H_edges topics-by-edges coefficient matrix +#' @return topics-by-edges matrix of topic shares (columns sum to 1) +#' @keywords internal +#' @noRd +.edgeTopicShares <- function(H_edges) { + col_sums <- colSums(H_edges) + col_sums[col_sums == 0] <- 1 + sweep(H_edges, 2, col_sums, "/") +} + + +#' Top words for a topic from H_text +#' @param H_text topics-by-words coefficient matrix +#' @param topic integer topic index +#' @param n number of top terms to return +#' @return character vector of the top-weighted terms for the topic +#' @keywords internal +#' @noRd +.topTermsForTopic <- function(H_text, topic, n = 10) { + weights <- H_text[topic, ] + weights <- weights[weights > 0] + if (length(weights) == 0) return(character(0)) + ordered <- sort(weights, decreasing = TRUE) + names(ordered)[seq_len(min(n, length(ordered)))] +} From ff94b8bc8e6edb2eaf52aa554e5bc073e362a326 Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Tue, 16 Jun 2026 06:52:48 -0400 Subject: [PATCH 2/4] add bootstrapping and comparison against topic models --- DESCRIPTION | 2 +- NAMESPACE 
| 2 + R/bootstrapTopicModels.R | 152 ++++++++++++ R/compareTopicModels.R | 159 +++++++++++++ R/decomposeSubnetworkByTopic.R | 97 ++++---- R/utils_decomposeSubnetworkByTopic.R | 344 ++++++++++++++++++++++++++- man/bootstrapTopicModels.Rd | 94 ++++++++ man/compareTopicModels.Rd | 106 +++++++++ man/decomposeSubnetworkByTopic.Rd | 108 +++++++++ 9 files changed, 1012 insertions(+), 52 deletions(-) create mode 100644 R/bootstrapTopicModels.R create mode 100644 R/compareTopicModels.R create mode 100644 man/bootstrapTopicModels.Rd create mode 100644 man/compareTopicModels.Rd create mode 100644 man/decomposeSubnetworkByTopic.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 31856d5..c95ca26 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -46,4 +46,4 @@ Encoding: UTF-8 URL: http://msstats.org, https://vitek-lab.github.io/MSstatsBioNet/ BugReports: https://groups.google.com/forum/#!forum/msstats Config/testthat/edition: 3 -RoxygenNote: 7.3.3 +Config/roxygen2/version: 8.0.0 diff --git a/NAMESPACE b/NAMESPACE index 0dddb38..ab462d3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,8 @@ # Generated by roxygen2: do not edit by hand export(annotateProteinInfoFromIndra) +export(bootstrapTopicModels) +export(compareTopicModels) export(cytoscapeNetwork) export(cytoscapeNetworkOutput) export(decomposeSubnetworkByTopic) diff --git a/R/bootstrapTopicModels.R b/R/bootstrapTopicModels.R new file mode 100644 index 0000000..e113962 --- /dev/null +++ b/R/bootstrapTopicModels.R @@ -0,0 +1,152 @@ +#' Bootstrap the topic decomposition to find each topic's robust top words +#' +#' Refits the NMF topic model on many bootstrap resamples of the papers and +#' reports, for every topic, how reliably each word stays among the topic's top +#' terms. This separates words that genuinely characterise a topic from words +#' that only surface in a single lucky fit, and lets you see how the top-word +#' lists change with and without the PPI view (run once per \code{include_ppi}). 
+#' +#' Because topic indices are arbitrary across fits (label switching), each +#' resample's topics are first aligned to a reference fit on the full data by +#' cosine similarity of their topic-word vectors. The papers are resampled with +#' replacement; the word vocabulary is held fixed (from the full data) so topics +#' remain comparable across resamples, and the NMF seed is held fixed so the +#' variability reported reflects \emph{data} resampling rather than random +#' initialization. +#' +#' @param subnetwork list with \code{nodes} and \code{edges} data.frames, e.g. +#' the output of \code{\link{getSubnetworkFromIndra}}. +#' @param n_boot number of bootstrap resamples. Default 50. +#' @param n_topics number of topics (rank of the factorization). Default 5. +#' @param include_ppi logical; factorize the PPI/edge view jointly with the text +#' (\code{TRUE}, default) or use paper words only (\code{FALSE}). See +#' \code{\link{decomposeSubnetworkByTopic}}. +#' @param n_top_terms number of top words that define a topic's "top list" in +#' each resample (the cutoff for the selection-frequency tally). Default 10. +#' @param min_term_count minimum corpus frequency for a word to be kept when +#' building the text matrix. Default 2. +#' @param max_iter maximum number of NMF multiplicative-update iterations. +#' Default 200. +#' @param tol relative-change tolerance for NMF early stopping. Default 1e-4. +#' @param seed random seed for the reference fit, the resampling, and each +#' bootstrap NMF. Default 1. +#' +#' @return A list with +#' \describe{ +#' \item{include_ppi, n_boot, n_topics}{the settings used.} +#' \item{topTerms}{named list \code{topic_1} ... \code{topic_k}. Each is a +#' data.frame sorted by \code{selection_freq}, with columns \code{term}, +#' \code{selection_freq} (fraction of resamples the word was in this +#' topic's top \code{n_top_terms}), and \code{mean_weight} (mean +#' within-topic word weight across resamples). 
A word with +#' \code{selection_freq} near 1 is a stable signature of the topic.} +#' \item{reference}{named list of the top \code{n_top_terms} words per topic +#' from the single full-data fit, for comparison.} +#' } +#' +#' @seealso \code{\link{decomposeSubnetworkByTopic}}, +#' \code{\link{compareTopicModels}} +#' +#' @export +#' +#' @examples +#' \dontrun{ +#' input <- data.table::fread(system.file( +#' "extdata/groupComparisonModel.csv", +#' package = "MSstatsBioNet" +#' )) +#' subnetwork <- getSubnetworkFromIndra(input) +#' +#' # Top words with PPIs included vs. words only: +#' boot_ppi <- bootstrapTopicModels(subnetwork, include_ppi = TRUE) +#' boot_text <- bootstrapTopicModels(subnetwork, include_ppi = FALSE) +#' +#' head(boot_ppi$topTerms$topic_1) # robust signature words for topic 1 +#' boot_text$topTerms$topic_1 +#' } +bootstrapTopicModels <- function(subnetwork, + n_boot = 50, + n_topics = 5, + include_ppi = TRUE, + n_top_terms = 10, + min_term_count = 2, + max_iter = 200, + tol = 1e-4, + seed = 1) { + + .validateDecomposeSubnetworkByTopicInput(subnetwork, n_topics, 0.2, + include_ppi) + if (!is.numeric(n_boot) || length(n_boot) != 1L || is.na(n_boot) || + n_boot < 2 || n_boot != as.integer(n_boot)) { + stop("`n_boot` must be a single integer >= 2.") + } + n_boot <- as.integer(n_boot) + + # Build the shared matrices once; keep the vocabulary fixed across resamples. + mats <- .buildTopicMatrices(subnetwork, n_topics, min_term_count) + X_text <- mats$X_text + X_edges <- mats$X_edges + k <- mats$n_topics + n_papers <- nrow(X_text) + terms <- colnames(X_text) + + # Reference fit on the full data defines the canonical topic ordering that + # every resample is aligned back to. 
+ ref <- .fitTopicModel(X_text, X_edges, k, include_ppi, + max_iter = max_iter, tol = tol, seed = seed) + reference <- lapply(seq_len(k), function(t) + .topTermsForTopic(ref$H_text, t, n_top_terms)) + names(reference) <- paste0("topic_", seq_len(k)) + + # Precompute all resample row-index sets up front: the NMF resets the RNG + # internally, so drawing the samples inside the loop would repeat them. + set.seed(seed) + boot_indices <- lapply(seq_len(n_boot), + function(b) sample.int(n_papers, replace = TRUE)) + + # Tally, per (reference topic x word): top-list selection count and the + # summed within-topic weight, accumulated across resamples. + topcount <- matrix(0, k, length(terms), dimnames = list(NULL, terms)) + weightsum <- matrix(0, k, length(terms), dimnames = list(NULL, terms)) + + for (b in seq_len(n_boot)) { + idx <- boot_indices[[b]] + fit <- .fitTopicModel(X_text[idx, , drop = FALSE], + X_edges[idx, , drop = FALSE], + k, include_ppi, + max_iter = max_iter, tol = tol, seed = seed) + + # Align this resample's topics to the reference, then aggregate. 
+ map <- .greedyTopicMatch(.cosineSimMatrix(ref$H_text, fit$H_text)) + for (r in seq_len(k)) { + row <- fit$H_text[map[r], ] + s <- sum(row) + weightsum[r, ] <- weightsum[r, ] + (if (s > 0) row / s else row) + top <- order(row, decreasing = TRUE)[seq_len(n_top_terms)] + top <- top[row[top] > 0] + topcount[r, top] <- topcount[r, top] + 1 + } + } + + topTerms <- lapply(seq_len(k), function(r) { + keep <- which(topcount[r, ] > 0) + df <- data.frame( + term = terms[keep], + selection_freq = topcount[r, keep] / n_boot, + mean_weight = weightsum[r, keep] / n_boot, + stringsAsFactors = FALSE + ) + df <- df[order(-df$selection_freq, -df$mean_weight), , drop = FALSE] + rownames(df) <- NULL + df + }) + names(topTerms) <- paste0("topic_", seq_len(k)) + + list( + include_ppi = include_ppi, + n_boot = n_boot, + n_topics = k, + topTerms = topTerms, + reference = reference + ) +} diff --git a/R/compareTopicModels.R b/R/compareTopicModels.R new file mode 100644 index 0000000..2184ce0 --- /dev/null +++ b/R/compareTopicModels.R @@ -0,0 +1,159 @@ +#' Test whether including PPIs changes topic structure beyond random chance +#' +#' Quantifies how much the topic decomposition produced by +#' \code{\link{decomposeSubnetworkByTopic}} changes when the PPI/edge view is +#' included (\code{include_ppi = TRUE}) versus excluded +#' (\code{include_ppi = FALSE}), and separates that change from the run-to-run +#' variability that NMF produces just from its random initialization. +#' +#' NMF converges to a local optimum that depends on the random seed, so a single +#' joint-vs-text comparison conflates the real effect of the PPI view with +#' optimization noise. 
This function instead refits both modes across many seeds +#' and compares three distributions of partition agreement (Adjusted Rand Index, +#' ARI): +#' \describe{ +#' \item{within_joint}{ARI between pairs of joint runs (different seeds) — +#' how much the joint solution wobbles on its own.} +#' \item{within_text}{ARI between pairs of text-only runs — the same for the +#' text-only solution.} +#' \item{between}{ARI between the joint and text-only run at the \emph{same} +#' seed. Because both modes draw \code{W} and \code{H_text} from the same +#' seeded stream, a matched seed gives both modes an identical +#' initialization, so this isolates the effect of adding the PPI view from +#' the starting point.} +#' } +#' If the between-mode ARI is systematically lower than the within-mode ARIs, +#' the PPI view changes the topic structure more than chance would — a +#' one-sided Wilcoxon rank-sum test (\code{between < within}) puts a p-value on +#' it. If the between distribution sits inside the within distributions, the +#' apparent difference is just optimization noise. +#' +#' The expensive, network-bound steps (evidence extraction, abstract fetching, +#' matrix construction) run once; only the NMF is repeated per seed. +#' +#' @param subnetwork list with \code{nodes} and \code{edges} data.frames, e.g. +#' the output of \code{\link{getSubnetworkFromIndra}}. +#' @param seeds integer vector of NMF seeds to fit (at least 2). Default +#' \code{1:20}. +#' @param n_topics number of topics (rank of the factorization). Default 5. +#' @param unit either \code{"edges"} (compare edge-to-topic assignments, the +#' default, matching the subnetworks the decomposition returns) or +#' \code{"papers"} (compare paper-to-topic assignments). +#' @param min_term_count minimum corpus frequency for a word to be kept when +#' building the text matrix. Default 2. +#' @param max_iter maximum number of NMF multiplicative-update iterations. +#' Default 200. 
+#' @param tol relative-change tolerance for NMF early stopping. Default 1e-4. +#' +#' @return A list with +#' \describe{ +#' \item{unit}{the comparison unit used.} +#' \item{seeds}{the seeds fitted.} +#' \item{n_topics}{the effective number of topics.} +#' \item{ari}{list of numeric vectors \code{within_joint}, +#' \code{within_text}, and \code{between} (matched seeds).} +#' \item{summary}{data.frame of median/mean ARI and count per comparison.} +#' \item{test}{the \code{\link[stats]{wilcox.test}} object comparing the +#' between distribution against the pooled within distributions +#' (\code{alternative = "less"}).} +#' \item{consensus}{list of consensus (co-membership) matrices, +#' \code{joint} and \code{text}, across seeds.} +#' \item{dispersion}{named numeric vector of consensus dispersion +#' coefficients (1 = identical clustering across all seeds).} +#' \item{partitions}{list of the raw per-seed partitions, \code{joint} and +#' \code{text}, for further inspection.} +#' } +#' +#' @seealso \code{\link{decomposeSubnetworkByTopic}} +#' +#' @export +#' +#' @examples +#' \dontrun{ +#' input <- data.table::fread(system.file( +#' "extdata/groupComparisonModel.csv", +#' package = "MSstatsBioNet" +#' )) +#' subnetwork <- getSubnetworkFromIndra(input) +#' cmp <- compareTopicModels(subnetwork, seeds = 1:20, n_topics = 5) +#' cmp$summary +#' cmp$test # p < 0.05 => PPI changes topics beyond chance +#' cmp$dispersion # how stable each mode is across seeds +#' } +compareTopicModels <- function(subnetwork, + seeds = seq_len(20), + n_topics = 5, + unit = c("edges", "papers"), + min_term_count = 2, + max_iter = 200, + tol = 1e-4) { + + unit <- match.arg(unit) + # Reuse the decompose validator for subnetwork/n_topics structure. 
+ .validateDecomposeSubnetworkByTopicInput(subnetwork, n_topics, 0.2) + if (!is.numeric(seeds) || length(seeds) < 2L || anyNA(seeds) || + any(seeds != as.integer(seeds))) { + stop("`seeds` must be a vector of at least two integer seeds.") + } + seeds <- as.integer(seeds) + + # Build the shared matrices once; only the NMF is repeated per seed. + mats <- .buildTopicMatrices(subnetwork, n_topics, min_term_count) + X_text <- mats$X_text + X_edges <- mats$X_edges + k <- mats$n_topics + + joint_parts <- vector("list", length(seeds)) + text_parts <- vector("list", length(seeds)) + between <- numeric(length(seeds)) + + for (i in seq_along(seeds)) { + s <- seeds[i] + joint <- .fitTopicModel(X_text, X_edges, k, include_ppi = TRUE, + max_iter = max_iter, tol = tol, seed = s) + text <- .fitTopicModel(X_text, X_edges, k, include_ppi = FALSE, + max_iter = max_iter, tol = tol, seed = s) + + joint_parts[[i]] <- .topicPartition(joint, unit) + text_parts[[i]] <- .topicPartition(text, unit) + between[i] <- .adjustedRandIndex(joint_parts[[i]], text_parts[[i]]) + } + names(joint_parts) <- names(text_parts) <- paste0("seed_", seeds) + + within_joint <- .pairwiseARI(joint_parts) + within_text <- .pairwiseARI(text_parts) + within_pooled <- c(within_joint, within_text) + + # Is the between-mode agreement lower than within-mode agreement? 
+ test <- stats::wilcox.test(between, within_pooled, + alternative = "less", exact = FALSE) + + consensus_joint <- .consensusMatrix(joint_parts) + consensus_text <- .consensusMatrix(text_parts) + + summary <- data.frame( + comparison = c("within_joint", "within_text", "between"), + n = c(length(within_joint), length(within_text), + length(between)), + median_ari = c(stats::median(within_joint), + stats::median(within_text), + stats::median(between)), + mean_ari = c(mean(within_joint), mean(within_text), mean(between)), + stringsAsFactors = FALSE + ) + + list( + unit = unit, + seeds = seeds, + n_topics = k, + ari = list(within_joint = within_joint, + within_text = within_text, + between = between), + summary = summary, + test = test, + consensus = list(joint = consensus_joint, text = consensus_text), + dispersion = c(joint = .dispersionCoefficient(consensus_joint), + text = .dispersionCoefficient(consensus_text)), + partitions = list(joint = joint_parts, text = text_parts) + ) +} diff --git a/R/decomposeSubnetworkByTopic.R b/R/decomposeSubnetworkByTopic.R index f446ade..dc6efaa 100644 --- a/R/decomposeSubnetworkByTopic.R +++ b/R/decomposeSubnetworkByTopic.R @@ -2,7 +2,7 @@ #' #' Takes a subnetwork (the output of \code{\link{getSubnetworkFromIndra}}) and #' splits it into a list of smaller, topic-specific subnetworks discovered with -#' unsupervised joint non-negative matrix factorization (NMF). +#' unsupervised non-negative matrix factorization (NMF). #' #' The procedure is: #' \enumerate{ @@ -13,10 +13,16 @@ #' \code{X_text} (papers x words, term counts from the abstracts) and #' \code{X_edges} (papers x unique \code{source_target_interaction} #' combinations, evidence-sentence counts). -#' \item A joint NMF learns a single shared basis matrix \code{W} -#' (papers x topics) such that \eqn{X_{text} \approx W H_{text}} and -#' \eqn{X_{edges} \approx W H_{edges}}. Sharing \code{W} ties each learned -#' topic to both a set of words and a set of edges. 
+#' \item NMF learns a basis matrix \code{W} (papers x topics). When +#' \code{include_ppi = TRUE} (the default) a \emph{joint} NMF learns a +#' single shared \code{W} such that \eqn{X_{text} \approx W H_{text}} and +#' \eqn{X_{edges} \approx W H_{edges}}, tying each learned topic to both a +#' set of words and a set of edges. When \code{include_ppi = FALSE} the +#' factorization uses only \code{X_text} (\eqn{X_{text} \approx W H_{text}}); +#' the PPI evidence is excluded from the modeling and edge-topic loadings +#' are instead derived afterwards by folding the edge counts onto the +#' text-learned topics (\eqn{H_{edges} = W^\top X_{edges}}). This lets you +#' compare topic structure with and without the PPI view. #' \item Each topic becomes its own subnetwork: an edge is included in a #' topic when that topic carries at least \code{edge_topic_cutoff} of the #' edge's loading (soft, overlapping assignment), and nodes are restricted @@ -37,6 +43,11 @@ #' Default 200. #' @param tol relative-change tolerance for NMF early stopping. Default 1e-4. #' @param seed random seed for NMF initialization. Default 1. +#' @param include_ppi logical; if \code{TRUE} (default) the PPI/edge matrix is +#' factorized jointly with the text matrix via a shared basis. If +#' \code{FALSE}, NMF is run on the paper-word matrix only and edge-topic +#' loadings are derived afterwards by folding edge counts onto the +#' text-learned topics, so the PPIs do not influence the topics themselves. #' #' @return A list of length \code{n_topics}, named \code{topic_1} ... #' \code{topic_k}. 
Each element is a topic-specific subnetwork: a list with @@ -74,49 +85,34 @@ decomposeSubnetworkByTopic <- function(subnetwork, min_term_count = 2, max_iter = 200, tol = 1e-4, - seed = 1) { + seed = 1, + include_ppi = TRUE) { .validateDecomposeSubnetworkByTopicInput(subnetwork, n_topics, - edge_topic_cutoff) - nodes <- subnetwork$nodes - edges <- subnetwork$edges - n_topics <- as.integer(n_topics) + edge_topic_cutoff, include_ppi) - # 1. Evidence (paper <-> edge links) for every edge. - evidence <- .extract_evidence_text(edges) - evidence <- evidence[!is.na(evidence$pmid) & nchar(evidence$pmid) > 0, ] - if (nrow(evidence) == 0) { - stop("No evidence with PMIDs was found for any edge; ", - "cannot decompose into topics.") - } - - pmids <- unique(evidence$pmid) - edge_keys <- unique(.edgeKey(evidence$source, evidence$target, - evidence$interaction)) + # 1-3. Build the shared paper-by-word and paper-by-edge matrices. + mats <- .buildTopicMatrices(subnetwork, n_topics, min_term_count) + nodes <- mats$nodes + edges <- mats$edges + pmids <- mats$pmids + edge_keys <- mats$edge_keys + X_text <- mats$X_text + X_edges <- mats$X_edges + n_topics <- mats$n_topics - if (length(pmids) < n_topics) { - warning(sprintf( - "Only %d papers available; reducing n_topics from %d to %d.", - length(pmids), n_topics, length(pmids) - )) - n_topics <- length(pmids) + # 4. NMF: jointly over text + PPIs, or over text only. + if (include_ppi) { + model <- .jointNMF(X_text, X_edges, k = n_topics, + max_iter = max_iter, tol = tol, seed = seed) + } else { + model <- .textNMF(X_text, k = n_topics, + max_iter = max_iter, tol = tol, seed = seed) + # Edges are excluded from the modeling, but still need topic loadings: + # fold the edge counts onto the text-learned topics. + model$H_edges <- .edgeLoadingsFromTopics(model$W, X_edges) } - # 2. X_text (papers x words) from PubMed abstracts. 
- abstract_list <- .fetch_clean_abstracts_xml(pmids) - abstracts <- vapply(pmids, function(p) { - a <- abstract_list[[p]] - if (is.null(a)) "" else a - }, character(1)) - X_text <- .buildTextMatrix(pmids, abstracts, min_term_count) - - # 3. X_edges (papers x source_target_interaction) of evidence counts. - X_edges <- .buildEdgeMatrix(evidence, pmids, edge_keys) - - # 4. Joint NMF with a shared W. - model <- .jointNMF(X_text, X_edges, k = n_topics, - max_iter = max_iter, tol = tol, seed = seed) - # 5. One subnetwork per topic (soft / overlapping edge assignment). shares <- .edgeTopicShares(model$H_edges) # topics x edges edge_argmax <- apply(shares, 2, which.max) # best topic per edge @@ -147,14 +143,15 @@ decomposeSubnetworkByTopic <- function(subnetwork, names(topics) <- paste0("topic_", seq_len(n_topics)) attr(topics, "nmf") <- list( - W = model$W, - H_text = model$H_text, - H_edges = model$H_edges, - terms = colnames(X_text), - edge_keys = edge_keys, - pmids = pmids, - objective = model$objective, - n_iter = model$n_iter + W = model$W, + H_text = model$H_text, + H_edges = model$H_edges, + terms = colnames(X_text), + edge_keys = edge_keys, + pmids = pmids, + objective = model$objective, + n_iter = model$n_iter, + include_ppi = include_ppi ) return(topics) } diff --git a/R/utils_decomposeSubnetworkByTopic.R b/R/utils_decomposeSubnetworkByTopic.R index 8d2e3b4..39d76ce 100644 --- a/R/utils_decomposeSubnetworkByTopic.R +++ b/R/utils_decomposeSubnetworkByTopic.R @@ -2,11 +2,13 @@ #' @param subnetwork list with `nodes` and `edges` data.frames #' @param n_topics number of topics (rank of the factorization) #' @param edge_topic_cutoff topic-share threshold for assigning an edge to a topic +#' @param include_ppi logical; whether the PPI/edge matrix is included in the NMF #' @keywords internal #' @noRd .validateDecomposeSubnetworkByTopicInput <- function(subnetwork, n_topics, - edge_topic_cutoff) { + edge_topic_cutoff, + include_ppi = TRUE) { if (!is.list(subnetwork) || 
!all(c("nodes", "edges") %in% names(subnetwork))) { stop("`subnetwork` must be a list containing `nodes` and `edges`, ", @@ -33,6 +35,10 @@ edge_topic_cutoff > 1) { stop("`edge_topic_cutoff` must be a single numeric value in [0, 1].") } + if (!is.logical(include_ppi) || length(include_ppi) != 1L || + is.na(include_ppi)) { + stop("`include_ppi` must be a single logical value (TRUE or FALSE).") + } } @@ -48,6 +54,66 @@ } +#' Build the shared paper-by-word and paper-by-edge matrices for topic modeling +#' +#' Performs the data-preparation steps shared by +#' \code{\link{decomposeSubnetworkByTopic}} and +#' \code{\link{compareTopicModels}}: extracts INDRA evidence, fetches and +#' tokenises the PubMed abstracts, and builds the aligned \code{X_text} +#' (papers x words) and \code{X_edges} (papers x edges) matrices. Factored out +#' so the (network-bound) abstract fetch and matrix construction happen once and +#' can be reused across many NMF fits. +#' +#' @param subnetwork list with `nodes` and `edges` data.frames +#' @param n_topics requested number of topics; reduced (with a warning) when +#' fewer papers than topics are available +#' @param min_term_count minimum corpus term frequency to keep a word +#' @return list with `nodes`, `edges`, `evidence`, `pmids`, `edge_keys`, +#' `X_text`, `X_edges`, and the (possibly reduced) `n_topics` +#' @keywords internal +#' @noRd +.buildTopicMatrices <- function(subnetwork, n_topics, min_term_count = 2) { + nodes <- subnetwork$nodes + edges <- subnetwork$edges + n_topics <- as.integer(n_topics) + + # 1. Evidence (paper <-> edge links) for every edge. 
+ evidence <- .extract_evidence_text(edges) + evidence <- evidence[!is.na(evidence$pmid) & nchar(evidence$pmid) > 0, ] + if (nrow(evidence) == 0) { + stop("No evidence with PMIDs was found for any edge; ", + "cannot decompose into topics.") + } + + pmids <- unique(evidence$pmid) + edge_keys <- unique(.edgeKey(evidence$source, evidence$target, + evidence$interaction)) + + if (length(pmids) < n_topics) { + warning(sprintf( + "Only %d papers available; reducing n_topics from %d to %d.", + length(pmids), n_topics, length(pmids) + )) + n_topics <- length(pmids) + } + + # 2. X_text (papers x words) from PubMed abstracts. + abstract_list <- .fetch_clean_abstracts_xml(pmids) + abstracts <- vapply(pmids, function(p) { + a <- abstract_list[[p]] + if (is.null(a)) "" else a + }, character(1)) + X_text <- .buildTextMatrix(pmids, abstracts, min_term_count) + + # 3. X_edges (papers x source_target_interaction) of evidence counts. + X_edges <- .buildEdgeMatrix(evidence, pmids, edge_keys) + + list(nodes = nodes, edges = edges, evidence = evidence, + pmids = pmids, edge_keys = edge_keys, + X_text = X_text, X_edges = X_edges, n_topics = n_topics) +} + + #' Build the paper-by-word matrix (X_text) from PubMed abstracts #' #' Tokenises abstracts with text2vec and returns a dense paper-by-word count @@ -193,6 +259,94 @@ } +#' Single-view non-negative matrix factorization of the paper-by-word matrix +#' +#' Factorizes the text view only, \eqn{X_{text} \approx W H_{text}}, using the +#' same multiplicative updates and normalization as \code{\link{.jointNMF}} but +#' without a PPI/edge view. Used by \code{decomposeSubnetworkByTopic} when +#' \code{include_ppi = FALSE} so the learned topics reflect paper words alone. 
+#' +#' @param X_text paper-by-word matrix +#' @param k number of topics (rank) +#' @param max_iter maximum number of multiplicative-update iterations +#' @param tol relative-change tolerance for early stopping +#' @param seed random seed for initialization +#' @param normalize logical; if TRUE the view is scaled to unit Frobenius norm +#' @return list with elements W, H_text, objective (per-iteration objective +#' values), and n_iter +#' @keywords internal +#' @noRd +#' @importFrom stats runif +.textNMF <- function(X_text, k, + max_iter = 200, tol = 1e-4, seed = 1, + normalize = TRUE) { + eps <- 1e-10 + n <- nrow(X_text) + if (normalize) { + ft <- sqrt(sum(X_text^2)) + if (ft > 0) X_text <- X_text / ft + } + + p <- ncol(X_text) + + set.seed(seed) + W <- matrix(runif(n * k), n, k) + H_text <- matrix(runif(k * p), k, p) + + objective <- numeric(0) + prev_obj <- Inf + n_iter <- 0L + for (iter in seq_len(max_iter)) { + n_iter <- iter + WtW <- crossprod(W) # k x k + H_text <- H_text * (crossprod(W, X_text)) / (WtW %*% H_text + eps) + + numer <- tcrossprod(X_text, H_text) + denom <- W %*% tcrossprod(H_text) + eps + W <- W * numer / denom + + obj <- sum((X_text - W %*% H_text)^2) + objective <- c(objective, obj) + if (is.finite(prev_obj) && + abs(prev_obj - obj) <= tol * (prev_obj + eps)) { + break + } + prev_obj <- obj + } + + rownames(W) <- rownames(X_text) + colnames(W) <- paste0("topic_", seq_len(k)) + rownames(H_text) <- colnames(W) + colnames(H_text) <- colnames(X_text) + + return(list(W = W, H_text = H_text, + objective = objective, n_iter = n_iter)) +} + + +#' Derive topic-by-edge loadings from paper topics (text-only model) +#' +#' When PPIs are excluded from the factorization there is no learned +#' \code{H_edges}. Edges still need topic loadings so they can be assigned to +#' topic-specific subnetworks, so each edge's loading on a topic is obtained by +#' folding its evidence counts onto the text-learned paper topics: +#' \eqn{H_{edges} = W^\top X_{edges}}. 
The resulting matrix matches the shape +#' and dimnames of a jointly-learned \code{H_edges} and is consumed downstream +#' identically (e.g. by \code{\link{.edgeTopicShares}}). +#' +#' @param W paper-by-topic basis matrix from \code{\link{.textNMF}} +#' @param X_edges paper-by-edge matrix of evidence counts +#' @return topics-by-edges matrix of (unnormalized) edge-topic loadings +#' @keywords internal +#' @noRd +.edgeLoadingsFromTopics <- function(W, X_edges) { + H_edges <- crossprod(W, X_edges) # topics x edges + rownames(H_edges) <- colnames(W) + colnames(H_edges) <- colnames(X_edges) + return(H_edges) +} + + #' Compute the per-edge topic shares from H_edges #' #' Column-normalises H_edges so that, for each edge, the loadings across topics @@ -223,3 +377,191 @@ ordered <- sort(weights, decreasing = TRUE) names(ordered)[seq_len(min(n, length(ordered)))] } + + +#' Extract a hard topic assignment (partition) from a fitted NMF model +#' +#' Reduces a factorization to one integer label per unit so that solutions from +#' different runs can be compared with permutation-invariant metrics. For +#' \code{unit = "papers"} each paper is assigned its strongest topic in +#' \code{W}; for \code{unit = "edges"} each edge is assigned its strongest topic +#' share in \code{H_edges} (matching the edge assignment used by +#' \code{decomposeSubnetworkByTopic}). 
+#'
+#' @param model fitted model list with `W` and `H_edges`
+#' @param unit either "edges" or "papers"
+#' @return integer vector of topic labels, named by edge key or pmid
+#' @keywords internal
+#' @noRd
+.topicPartition <- function(model, unit = c("edges", "papers")) {
+    unit <- match.arg(unit)
+    if (unit == "papers") {
+        labels <- apply(model$W, 1, which.max)
+        ids <- rownames(model$W)
+    } else {
+        shares <- .edgeTopicShares(model$H_edges)
+        labels <- apply(shares, 2, which.max)
+        ids <- colnames(model$H_edges)
+    }
+    stats::setNames(as.integer(labels), ids) # as.integer() drops names
+}
+
+
+#' Adjusted Rand Index between two partitions
+#'
+#' Permutation-invariant agreement between two labelings of the same items,
+#' corrected for chance. Equals 1 for identical partitions and ~0 for the
+#' agreement expected at random. Implemented from the contingency table so no
+#' extra package dependency is needed.
+#'
+#' @param a,b integer/factor vectors of equal length (cluster labels)
+#' @return numeric scalar in roughly \code{[-0.5, 1]}; \code{NA} if empty
+#' @keywords internal
+#' @noRd
+.adjustedRandIndex <- function(a, b) {
+    if (length(a) != length(b)) {
+        stop("`a` and `b` must have the same length.")
+    }
+    n <- length(a)
+    if (n == 0) return(NA_real_)
+    choose2 <- function(x) x * (x - 1) / 2
+    tab <- table(a, b)
+    sum_cells <- sum(choose2(tab))
+    sum_a <- sum(choose2(rowSums(tab)))
+    sum_b <- sum(choose2(colSums(tab)))
+    expected <- sum_a * sum_b / choose2(n)
+    max_index <- (sum_a + sum_b) / 2
+    if (n == 1 || max_index == expected) return(1) # no pairs / one cluster
+    (sum_cells - expected) / (max_index - expected)
+}
+
+
+#' Consensus (co-membership) matrix across a set of partitions
+#'
+#' Entry \code{[i, j]} is the fraction of partitions in which items \code{i} and
+#' \code{j} were assigned to the same topic. All partitions must label the same
+#' items in the same order.
+#' +#' @param partitions list of integer label vectors of equal length +#' @return symmetric numeric matrix with entries in \code{[0, 1]} +#' @keywords internal +#' @noRd +.consensusMatrix <- function(partitions) { + n <- length(partitions[[1]]) + consensus <- matrix(0, n, n, + dimnames = list(names(partitions[[1]]), + names(partitions[[1]]))) + for (p in partitions) { + consensus <- consensus + outer(p, p, "==") + } + consensus / length(partitions) +} + + +#' Dispersion coefficient of a consensus matrix +#' +#' Summarises how close a consensus matrix is to perfectly stable (every entry +#' 0 or 1) versus maximally ambiguous (every entry 0.5). Defined as +#' \eqn{\rho = \frac{1}{n^2}\sum_{ij} 4 (C_{ij} - 0.5)^2} (Kim & Park, 2007): +#' 1 means clustering is identical across all runs, 0 means maximally unstable. +#' +#' @param consensus consensus matrix from \code{\link{.consensusMatrix}} +#' @return numeric scalar in \code{[0, 1]} +#' @keywords internal +#' @noRd +.dispersionCoefficient <- function(consensus) { + n <- nrow(consensus) + sum(4 * (consensus - 0.5)^2) / (n * n) +} + + +#' Pairwise ARI over all unordered pairs in a list of partitions +#' @param partitions list of integer label vectors +#' @return numeric vector of ARI values, one per pair +#' @keywords internal +#' @noRd +.pairwiseARI <- function(partitions) { + k <- length(partitions) + if (k < 2) return(numeric(0)) + pairs <- utils::combn(k, 2) + vapply(seq_len(ncol(pairs)), function(i) { + .adjustedRandIndex(partitions[[pairs[1, i]]], + partitions[[pairs[2, i]]]) + }, numeric(1)) +} + + +#' Fit one topic model, with or without the PPI/edge view +#' +#' Thin dispatcher used by both \code{compareTopicModels} and +#' \code{bootstrapTopicModels} so the joint-vs-text-only branch lives in one +#' place. Always returns a model carrying \code{W}, \code{H_text}, and +#' \code{H_edges} (folded from the text topics when the edges are excluded). 
+#' +#' @param X_text paper-by-word matrix +#' @param X_edges paper-by-edge matrix +#' @param k number of topics +#' @param include_ppi logical; jointly factorize the edge view when TRUE +#' @param max_iter,tol,seed passed to the underlying NMF +#' @return fitted model list (W, H_text, H_edges, objective, n_iter) +#' @keywords internal +#' @noRd +.fitTopicModel <- function(X_text, X_edges, k, include_ppi, + max_iter = 200, tol = 1e-4, seed = 1) { + if (include_ppi) { + model <- .jointNMF(X_text, X_edges, k = k, + max_iter = max_iter, tol = tol, seed = seed) + } else { + model <- .textNMF(X_text, k = k, + max_iter = max_iter, tol = tol, seed = seed) + model$H_edges <- .edgeLoadingsFromTopics(model$W, X_edges) + } + model +} + + +#' Row-wise cosine similarity between two topic-by-word matrices +#' +#' @param A,B numeric matrices with the same columns (k x p) +#' @return matrix of cosine similarities, \code{nrow(A)} x \code{nrow(B)} +#' @keywords internal +#' @noRd +.cosineSimMatrix <- function(A, B) { + an <- sqrt(rowSums(A^2)); an[an == 0] <- 1 + bn <- sqrt(rowSums(B^2)); bn[bn == 0] <- 1 + tcrossprod(A, B) / outer(an, bn) +} + + +#' Greedily match a resample's topics to reference topics +#' +#' Topic indices are arbitrary across NMF fits (label switching), so before any +#' per-topic aggregation each resample's topics must be put back into +#' correspondence with a reference fit. Matches on cosine similarity of the +#' topic-word vectors (\code{H_text} rows), greedily taking the highest +#' remaining similarity until every reference topic has a unique partner. With +#' the small topic counts used here this is effectively optimal. 
+#' +#' @param sim reference-by-resample similarity matrix from +#' \code{\link{.cosineSimMatrix}} +#' @return integer vector of length \code{nrow(sim)}: for each reference topic, +#' the index of the resample topic assigned to it +#' @keywords internal +#' @noRd +.greedyTopicMatch <- function(sim) { + k <- nrow(sim) + match <- integer(k) + used <- logical(ncol(sim)) + remaining <- seq_len(k) + for (step in seq_len(k)) { + cand <- which(!used) + sub <- sim[remaining, cand, drop = FALSE] + best <- which(sub == max(sub), arr.ind = TRUE)[1, ] + r <- remaining[best[1]] + c <- cand[best[2]] + match[r] <- c + used[c] <- TRUE + remaining <- setdiff(remaining, r) + } + match +} diff --git a/man/bootstrapTopicModels.Rd b/man/bootstrapTopicModels.Rd new file mode 100644 index 0000000..4ed337e --- /dev/null +++ b/man/bootstrapTopicModels.Rd @@ -0,0 +1,94 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/bootstrapTopicModels.R +\name{bootstrapTopicModels} +\alias{bootstrapTopicModels} +\title{Bootstrap the topic decomposition to find each topic's robust top words} +\usage{ +bootstrapTopicModels( + subnetwork, + n_boot = 50, + n_topics = 5, + include_ppi = TRUE, + n_top_terms = 10, + min_term_count = 2, + max_iter = 200, + tol = 1e-04, + seed = 1 +) +} +\arguments{ +\item{subnetwork}{list with \code{nodes} and \code{edges} data.frames, e.g. +the output of \code{\link{getSubnetworkFromIndra}}.} + +\item{n_boot}{number of bootstrap resamples. Default 50.} + +\item{n_topics}{number of topics (rank of the factorization). Default 5.} + +\item{include_ppi}{logical; factorize the PPI/edge view jointly with the text +(\code{TRUE}, default) or use paper words only (\code{FALSE}). See +\code{\link{decomposeSubnetworkByTopic}}.} + +\item{n_top_terms}{number of top words that define a topic's "top list" in +each resample (the cutoff for the selection-frequency tally). 
Default 10.} + +\item{min_term_count}{minimum corpus frequency for a word to be kept when +building the text matrix. Default 2.} + +\item{max_iter}{maximum number of NMF multiplicative-update iterations. +Default 200.} + +\item{tol}{relative-change tolerance for NMF early stopping. Default 1e-4.} + +\item{seed}{random seed for the reference fit, the resampling, and each +bootstrap NMF. Default 1.} +} +\value{ +A list with + \describe{ + \item{include_ppi, n_boot, n_topics}{the settings used.} + \item{topTerms}{named list \code{topic_1} ... \code{topic_k}. Each is a + data.frame sorted by \code{selection_freq}, with columns \code{term}, + \code{selection_freq} (fraction of resamples the word was in this + topic's top \code{n_top_terms}), and \code{mean_weight} (mean + within-topic word weight across resamples). A word with + \code{selection_freq} near 1 is a stable signature of the topic.} + \item{reference}{named list of the top \code{n_top_terms} words per topic + from the single full-data fit, for comparison.} + } +} +\description{ +Refits the NMF topic model on many bootstrap resamples of the papers and +reports, for every topic, how reliably each word stays among the topic's top +terms. This separates words that genuinely characterise a topic from words +that only surface in a single lucky fit, and lets you see how the top-word +lists change with and without the PPI view (run once per \code{include_ppi}). +} +\details{ +Because topic indices are arbitrary across fits (label switching), each +resample's topics are first aligned to a reference fit on the full data by +cosine similarity of their topic-word vectors. The papers are resampled with +replacement; the word vocabulary is held fixed (from the full data) so topics +remain comparable across resamples, and the NMF seed is held fixed so the +variability reported reflects \emph{data} resampling rather than random +initialization. 
+} +\examples{ +\dontrun{ +input <- data.table::fread(system.file( + "extdata/groupComparisonModel.csv", + package = "MSstatsBioNet" +)) +subnetwork <- getSubnetworkFromIndra(input) + +# Top words with PPIs included vs. words only: +boot_ppi <- bootstrapTopicModels(subnetwork, include_ppi = TRUE) +boot_text <- bootstrapTopicModels(subnetwork, include_ppi = FALSE) + +head(boot_ppi$topTerms$topic_1) # robust signature words for topic 1 +boot_text$topTerms$topic_1 +} +} +\seealso{ +\code{\link{decomposeSubnetworkByTopic}}, + \code{\link{compareTopicModels}} +} diff --git a/man/compareTopicModels.Rd b/man/compareTopicModels.Rd new file mode 100644 index 0000000..3f9f9e1 --- /dev/null +++ b/man/compareTopicModels.Rd @@ -0,0 +1,106 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compareTopicModels.R +\name{compareTopicModels} +\alias{compareTopicModels} +\title{Test whether including PPIs changes topic structure beyond random chance} +\usage{ +compareTopicModels( + subnetwork, + seeds = seq_len(20), + n_topics = 5, + unit = c("edges", "papers"), + min_term_count = 2, + max_iter = 200, + tol = 1e-04 +) +} +\arguments{ +\item{subnetwork}{list with \code{nodes} and \code{edges} data.frames, e.g. +the output of \code{\link{getSubnetworkFromIndra}}.} + +\item{seeds}{integer vector of NMF seeds to fit (at least 2). Default +\code{1:20}.} + +\item{n_topics}{number of topics (rank of the factorization). Default 5.} + +\item{unit}{either \code{"edges"} (compare edge-to-topic assignments, the +default, matching the subnetworks the decomposition returns) or +\code{"papers"} (compare paper-to-topic assignments).} + +\item{min_term_count}{minimum corpus frequency for a word to be kept when +building the text matrix. Default 2.} + +\item{max_iter}{maximum number of NMF multiplicative-update iterations. +Default 200.} + +\item{tol}{relative-change tolerance for NMF early stopping. 
Default 1e-4.} +} +\value{ +A list with + \describe{ + \item{unit}{the comparison unit used.} + \item{seeds}{the seeds fitted.} + \item{n_topics}{the effective number of topics.} + \item{ari}{list of numeric vectors \code{within_joint}, + \code{within_text}, and \code{between} (matched seeds).} + \item{summary}{data.frame of median/mean ARI and count per comparison.} + \item{test}{the \code{\link[stats]{wilcox.test}} object comparing the + between distribution against the pooled within distributions + (\code{alternative = "less"}).} + \item{consensus}{list of consensus (co-membership) matrices, + \code{joint} and \code{text}, across seeds.} + \item{dispersion}{named numeric vector of consensus dispersion + coefficients (1 = identical clustering across all seeds).} + \item{partitions}{list of the raw per-seed partitions, \code{joint} and + \code{text}, for further inspection.} + } +} +\description{ +Quantifies how much the topic decomposition produced by +\code{\link{decomposeSubnetworkByTopic}} changes when the PPI/edge view is +included (\code{include_ppi = TRUE}) versus excluded +(\code{include_ppi = FALSE}), and separates that change from the run-to-run +variability that NMF produces just from its random initialization. +} +\details{ +NMF converges to a local optimum that depends on the random seed, so a single +joint-vs-text comparison conflates the real effect of the PPI view with +optimization noise. This function instead refits both modes across many seeds +and compares three distributions of partition agreement (Adjusted Rand Index, +ARI): +\describe{ + \item{within_joint}{ARI between pairs of joint runs (different seeds) — + how much the joint solution wobbles on its own.} + \item{within_text}{ARI between pairs of text-only runs — the same for the + text-only solution.} + \item{between}{ARI between the joint and text-only run at the \emph{same} + seed. 
Because both modes draw \code{W} and \code{H_text} from the same + seeded stream, a matched seed gives both modes an identical + initialization, so this isolates the effect of adding the PPI view from + the starting point.} +} +If the between-mode ARI is systematically lower than the within-mode ARIs, +the PPI view changes the topic structure more than chance would — a +one-sided Wilcoxon rank-sum test (\code{between < within}) puts a p-value on +it. If the between distribution sits inside the within distributions, the +apparent difference is just optimization noise. + +The expensive, network-bound steps (evidence extraction, abstract fetching, +matrix construction) run once; only the NMF is repeated per seed. +} +\examples{ +\dontrun{ +input <- data.table::fread(system.file( + "extdata/groupComparisonModel.csv", + package = "MSstatsBioNet" +)) +subnetwork <- getSubnetworkFromIndra(input) +cmp <- compareTopicModels(subnetwork, seeds = 1:20, n_topics = 5) +cmp$summary +cmp$test # p < 0.05 => PPI changes topics beyond chance +cmp$dispersion # how stable each mode is across seeds +} +} +\seealso{ +\code{\link{decomposeSubnetworkByTopic}} +} diff --git a/man/decomposeSubnetworkByTopic.Rd b/man/decomposeSubnetworkByTopic.Rd new file mode 100644 index 0000000..e74b1fc --- /dev/null +++ b/man/decomposeSubnetworkByTopic.Rd @@ -0,0 +1,108 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/decomposeSubnetworkByTopic.R +\name{decomposeSubnetworkByTopic} +\alias{decomposeSubnetworkByTopic} +\title{Decompose a subnetwork into topic-specific subnetworks via joint NMF} +\usage{ +decomposeSubnetworkByTopic( + subnetwork, + n_topics = 5, + edge_topic_cutoff = 0.2, + n_top_terms = 10, + min_term_count = 2, + max_iter = 200, + tol = 1e-04, + seed = 1, + include_ppi = TRUE +) +} +\arguments{ +\item{subnetwork}{list with \code{nodes} and \code{edges} data.frames, e.g. 
+the output of \code{\link{getSubnetworkFromIndra}}.} + +\item{n_topics}{number of topics (rank of the factorization). Default 5.} + +\item{edge_topic_cutoff}{numeric in \code{[0, 1]}; an edge is added to a +topic's subnetwork when the topic carries at least this share of the +edge's total loading. Each edge is always included in at least its +highest-loading topic. Default 0.2.} + +\item{n_top_terms}{number of top words to report per topic. Default 10.} + +\item{min_term_count}{minimum corpus frequency for a word to be kept when +building \code{X_text}. Default 2.} + +\item{max_iter}{maximum number of NMF multiplicative-update iterations. +Default 200.} + +\item{tol}{relative-change tolerance for NMF early stopping. Default 1e-4.} + +\item{seed}{random seed for NMF initialization. Default 1.} + +\item{include_ppi}{logical; if \code{TRUE} (default) the PPI/edge matrix is +factorized jointly with the text matrix via a shared basis. If +\code{FALSE}, NMF is run on the paper-word matrix only and edge-topic +loadings are derived afterwards by folding edge counts onto the +text-learned topics, so the PPIs do not influence the topics themselves.} +} +\value{ +A list of length \code{n_topics}, named \code{topic_1} ... + \code{topic_k}. Each element is a topic-specific subnetwork: a list with + \describe{ + \item{nodes}{nodes data.frame restricted to the topic's edges.} + \item{edges}{edges data.frame for the topic, with an added + \code{topicWeight} column (the edge's topic share).} + \item{topic}{the topic index.} + \item{topTerms}{character vector of the topic's top words.} + \item{pmids}{PMIDs whose strongest topic loading is this topic.} + } + The full factorization (W, H_text, H_edges, etc.) is attached as the + \code{"nmf"} attribute of the returned list. 
+} +\description{ +Takes a subnetwork (the output of \code{\link{getSubnetworkFromIndra}}) and +splits it into a list of smaller, topic-specific subnetworks discovered with +unsupervised non-negative matrix factorization (NMF). +} +\details{ +The procedure is: +\enumerate{ + \item For every edge, the supporting INDRA evidence is retrieved and the + PubMed abstract of each referenced PMID is fetched. Papers (PMIDs) are + the shared unit of analysis. + \item Two matrices are built that share the same rows (papers): + \code{X_text} (papers x words, term counts from the abstracts) and + \code{X_edges} (papers x unique \code{source_target_interaction} + combinations, evidence-sentence counts). + \item NMF learns a basis matrix \code{W} (papers x topics). When + \code{include_ppi = TRUE} (the default) a \emph{joint} NMF learns a + single shared \code{W} such that \eqn{X_{text} \approx W H_{text}} and + \eqn{X_{edges} \approx W H_{edges}}, tying each learned topic to both a + set of words and a set of edges. When \code{include_ppi = FALSE} the + factorization uses only \code{X_text} (\eqn{X_{text} \approx W H_{text}}); + the PPI evidence is excluded from the modeling and edge-topic loadings + are instead derived afterwards by folding the edge counts onto the + text-learned topics (\eqn{H_{edges} = W^\top X_{edges}}). This lets you + compare topic structure with and without the PPI view. + \item Each topic becomes its own subnetwork: an edge is included in a + topic when that topic carries at least \code{edge_topic_cutoff} of the + edge's loading (soft, overlapping assignment), and nodes are restricted + to those touched by the kept edges. 
+} +} +\examples{ +\dontrun{ +input <- data.table::fread(system.file( + "extdata/groupComparisonModel.csv", + package = "MSstatsBioNet" +)) +subnetwork <- getSubnetworkFromIndra(input) +topics <- decomposeSubnetworkByTopic(subnetwork, n_topics = 5) +topics$topic_1$topTerms +exportNetworkToHTML(topics$topic_1$nodes, topics$topic_1$edges) +} +} +\seealso{ +\code{\link{getSubnetworkFromIndra}}, + \code{\link{filterSubnetworkByContext}} +} From a2caf70091335d53e074cc4f3dd21ac2277c4947 Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Tue, 16 Jun 2026 10:40:55 -0400 Subject: [PATCH 3/4] add note on beta features --- R/bootstrapTopicModels.R | 3 ++- R/compareTopicModels.R | 3 ++- R/decomposeSubnetworkByTopic.R | 3 +++ R/filterSubnetworkByContext.R | 8 ++++---- R/utils_decomposeSubnetworkByTopic.R | 3 +-- man/bootstrapTopicModels.Rd | 4 ++++ man/compareTopicModels.Rd | 4 ++++ man/decomposeSubnetworkByTopic.Rd | 4 ++++ man/filterSubnetworkByContext.Rd | 4 ++++ 9 files changed, 28 insertions(+), 8 deletions(-) diff --git a/R/bootstrapTopicModels.R b/R/bootstrapTopicModels.R index e113962..3c2f791 100644 --- a/R/bootstrapTopicModels.R +++ b/R/bootstrapTopicModels.R @@ -46,7 +46,8 @@ #' #' @seealso \code{\link{decomposeSubnetworkByTopic}}, #' \code{\link{compareTopicModels}} -#' +#' @note \strong{Beta feature:} This function is experimental and the API may +#' change without notice in future versions. #' @export #' #' @examples diff --git a/R/compareTopicModels.R b/R/compareTopicModels.R index 2184ce0..10114fa 100644 --- a/R/compareTopicModels.R +++ b/R/compareTopicModels.R @@ -65,7 +65,8 @@ #' } #' #' @seealso \code{\link{decomposeSubnetworkByTopic}} -#' +#' @note \strong{Beta feature:} This function is experimental and the API may +#' change without notice in future versions.
#' @export #' #' @examples diff --git a/R/decomposeSubnetworkByTopic.R b/R/decomposeSubnetworkByTopic.R index dc6efaa..a1f7786 100644 --- a/R/decomposeSubnetworkByTopic.R +++ b/R/decomposeSubnetworkByTopic.R @@ -66,6 +66,9 @@ #' \code{\link{filterSubnetworkByContext}} #' #' @export +#' +#' @note \strong{Beta feature:} This function is experimental and the API may +#' change without notice in future versions. #' #' @examples #' \dontrun{ diff --git a/R/filterSubnetworkByContext.R b/R/filterSubnetworkByContext.R index 2bb07b4..a593a78 100644 --- a/R/filterSubnetworkByContext.R +++ b/R/filterSubnetworkByContext.R @@ -48,9 +48,10 @@ #' contains tag counts (integer) or cosine similarities (numeric) depending #' on the method used.} #' -#' @importFrom text2vec itoken word_tokenizer create_vocabulary prune_vocabulary -#' vocab_vectorizer create_dtm TfIdf fit_transform +#' @importFrom text2vec itoken word_tokenizer create_vocabulary prune_vocabulary vocab_vectorizer create_dtm TfIdf fit_transform #' @importFrom stopwords stopwords +#' @note \strong{Beta feature:} This function is experimental and the API may +#' change without notice in future versions. #' @export filterSubnetworkByContext <- function(nodes, edges, @@ -196,8 +197,7 @@ filterSubnetworkByContext <- function(nodes, #' as \code{abstracts}.
#' @keywords internal #' @noRd -#' @importFrom text2vec itoken word_tokenizer create_vocabulary prune_vocabulary -#' vocab_vectorizer create_dtm TfIdf fit_transform +#' @importFrom text2vec itoken word_tokenizer create_vocabulary prune_vocabulary vocab_vectorizer create_dtm TfIdf fit_transform #' @importFrom stopwords stopwords .score_by_cosine <- function(query, abstracts) { all_texts <- c(query, abstracts) diff --git a/R/utils_decomposeSubnetworkByTopic.R b/R/utils_decomposeSubnetworkByTopic.R index 39d76ce..b783faa 100644 --- a/R/utils_decomposeSubnetworkByTopic.R +++ b/R/utils_decomposeSubnetworkByTopic.R @@ -125,8 +125,7 @@ #' @return dense numeric matrix (papers x words) with rownames = pmids #' @keywords internal #' @noRd -#' @importFrom text2vec itoken word_tokenizer create_vocabulary -#' prune_vocabulary vocab_vectorizer create_dtm +#' @importFrom text2vec itoken word_tokenizer create_vocabulary prune_vocabulary vocab_vectorizer create_dtm #' @importFrom stopwords stopwords .buildTextMatrix <- function(pmids, abstracts, min_term_count = 2) { tokens <- itoken(abstracts, diff --git a/man/bootstrapTopicModels.Rd b/man/bootstrapTopicModels.Rd index 4ed337e..f2343fe 100644 --- a/man/bootstrapTopicModels.Rd +++ b/man/bootstrapTopicModels.Rd @@ -72,6 +72,10 @@ remain comparable across resamples, and the NMF seed is held fixed so the variability reported reflects \emph{data} resampling rather than random initialization. } +\note{ +\strong{Beta feature:} This function is experimental and the API may + change without notice in future versions. +} \examples{ \dontrun{ input <- data.table::fread(system.file( diff --git a/man/compareTopicModels.Rd b/man/compareTopicModels.Rd index 3f9f9e1..7da0fe5 100644 --- a/man/compareTopicModels.Rd +++ b/man/compareTopicModels.Rd @@ -88,6 +88,10 @@ apparent difference is just optimization noise. The expensive, network-bound steps (evidence extraction, abstract fetching, matrix construction) run once; only the NMF is repeated per seed.
} +\note{ +\strong{Beta feature:} This function is experimental and the API may + change without notice in future versions. +} \examples{ \dontrun{ input <- data.table::fread(system.file( diff --git a/man/decomposeSubnetworkByTopic.Rd b/man/decomposeSubnetworkByTopic.Rd index e74b1fc..4ea4145 100644 --- a/man/decomposeSubnetworkByTopic.Rd +++ b/man/decomposeSubnetworkByTopic.Rd @@ -90,6 +90,10 @@ The procedure is: to those touched by the kept edges. } } +\note{ +\strong{Beta feature:} This function is experimental and the API may + change without notice in future versions. +} \examples{ \dontrun{ input <- data.table::fread(system.file( diff --git a/man/filterSubnetworkByContext.Rd b/man/filterSubnetworkByContext.Rd index 91d176c..f160bd0 100644 --- a/man/filterSubnetworkByContext.Rd +++ b/man/filterSubnetworkByContext.Rd @@ -68,3 +68,7 @@ Two scoring methods are available, controlled by the \code{method} argument: } } } +\note{ +\strong{Beta feature:} This function is experimental and the API may + change without notice in future versions.
+} From aefb3161154db202fe0b50c494020030e74a5a27 Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Tue, 16 Jun 2026 10:49:34 -0400 Subject: [PATCH 4/4] fix bold heading --- R/bootstrapTopicModels.R | 2 +- R/compareTopicModels.R | 2 +- R/decomposeSubnetworkByTopic.R | 2 +- R/filterSubnetworkByContext.R | 2 +- man/bootstrapTopicModels.Rd | 2 +- man/compareTopicModels.Rd | 2 +- man/decomposeSubnetworkByTopic.Rd | 2 +- man/filterSubnetworkByContext.Rd | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/R/bootstrapTopicModels.R b/R/bootstrapTopicModels.R index 3c2f791..c0c997e 100644 --- a/R/bootstrapTopicModels.R +++ b/R/bootstrapTopicModels.R @@ -46,7 +46,7 @@ #' #' @seealso \code{\link{decomposeSubnetworkByTopic}}, #' \code{\link{compareTopicModels}} -#' @note **Beta feature:** This function is experimental and the API may +#' @note \strong{Beta feature:} This function is experimental and the API may #' change without notice in future versions. #' @export #' diff --git a/R/compareTopicModels.R b/R/compareTopicModels.R index 10114fa..4476209 100644 --- a/R/compareTopicModels.R +++ b/R/compareTopicModels.R @@ -65,7 +65,7 @@ #' } #' #' @seealso \code{\link{decomposeSubnetworkByTopic}} -#' @note **Beta feature:** This function is experimental and the API may +#' @note \strong{Beta feature:} This function is experimental and the API may #' change without notice in future versions. #' @export #' diff --git a/R/decomposeSubnetworkByTopic.R b/R/decomposeSubnetworkByTopic.R index a1f7786..ec24a3d 100644 --- a/R/decomposeSubnetworkByTopic.R +++ b/R/decomposeSubnetworkByTopic.R @@ -67,7 +67,7 @@ #' #' @export #' -#' @note **Beta feature:** This function is experimental and the API may +#' @note \strong{Beta feature:} This function is experimental and the API may #' change without notice in future versions. 
#' #' @examples diff --git a/R/filterSubnetworkByContext.R b/R/filterSubnetworkByContext.R index a593a78..e0618e3 100644 --- a/R/filterSubnetworkByContext.R +++ b/R/filterSubnetworkByContext.R @@ -50,7 +50,7 @@ #' #' @importFrom text2vec itoken word_tokenizer create_vocabulary prune_vocabulary vocab_vectorizer create_dtm TfIdf fit_transform #' @importFrom stopwords stopwords -#' @note **Beta feature:** This function is experimental and the API may +#' @note \strong{Beta feature:} This function is experimental and the API may #' change without notice in future versions. #' @export filterSubnetworkByContext <- function(nodes, diff --git a/man/bootstrapTopicModels.Rd b/man/bootstrapTopicModels.Rd index f2343fe..5e4b640 100644 --- a/man/bootstrapTopicModels.Rd +++ b/man/bootstrapTopicModels.Rd @@ -73,7 +73,7 @@ variability reported reflects \emph{data} resampling rather than random initialization. } \note{ -**Beta feature:** This function is experimental and the API may +\strong{Beta feature:} This function is experimental and the API may change without notice in future versions. } \examples{ diff --git a/man/compareTopicModels.Rd b/man/compareTopicModels.Rd index 7da0fe5..4831fe3 100644 --- a/man/compareTopicModels.Rd +++ b/man/compareTopicModels.Rd @@ -89,7 +89,7 @@ The expensive, network-bound steps (evidence extraction, abstract fetching, matrix construction) run once; only the NMF is repeated per seed. } \note{ -**Beta feature:** This function is experimental and the API may +\strong{Beta feature:} This function is experimental and the API may change without notice in future versions. 
} \examples{ diff --git a/man/decomposeSubnetworkByTopic.Rd b/man/decomposeSubnetworkByTopic.Rd index 4ea4145..e29afd7 100644 --- a/man/decomposeSubnetworkByTopic.Rd +++ b/man/decomposeSubnetworkByTopic.Rd @@ -91,7 +91,7 @@ The procedure is: } } \note{ -**Beta feature:** This function is experimental and the API may +\strong{Beta feature:} This function is experimental and the API may change without notice in future versions. } \examples{ diff --git a/man/filterSubnetworkByContext.Rd b/man/filterSubnetworkByContext.Rd index f160bd0..6700d53 100644 --- a/man/filterSubnetworkByContext.Rd +++ b/man/filterSubnetworkByContext.Rd @@ -69,6 +69,6 @@ Two scoring methods are available, controlled by the \code{method} argument: } } \note{ -**Beta feature:** This function is experimental and the API may +\strong{Beta feature:} This function is experimental and the API may change without notice in future versions. }