From 5693fc90182fc22f24d30f1e26e13420d8114d60 Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Thu, 14 May 2026 09:47:04 -0400 Subject: [PATCH 1/6] Freed heavy fit objects and large dataProcess intermediates early MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Stripped survreg() $y / $linear.predictors and lm() $model after fitting in .fitSurvival and .fitLinearModel; downstream predict / summary / vcov paths are unaffected. * Rewrote .updateUnequalVariances to mutate a single data.frame in place instead of rebuilding it ~5x per iteration via data.frame(...). * Added rm(raw), rm(peptides_dict), and gc(verbose = FALSE) inside dataProcess() so the ~250-400 MB held by function arguments is freed before MSstatsSummarizationOutput runs. * Added rm(survival_fit) and rm(fit) inside the per-protein linear / TMP summarizers so local fit objects drop as soon as survival / result is materialised. * Tests: inst/tinytest/test_memory_optimization.R Issue 1 (tests 1a, 1b) — 14 assertions, all green. See MSstats-ai/todos/active/TODO-MS-20260514_fix-memory-bugs.md Co-Authored-By: Claude --- R/dataProcess.R | 14 ++- R/utils_imputation.R | 4 +- R/utils_summarization.R | 32 ++--- inst/tinytest/test_memory_optimization_fits.R | 111 ++++++++++++++++++ 4 files changed, 142 insertions(+), 19 deletions(-) create mode 100644 inst/tinytest/test_memory_optimization_fits.R diff --git a/R/dataProcess.R b/R/dataProcess.R index e9581c8b..5fa593a8 100755 --- a/R/dataProcess.R +++ b/R/dataProcess.R @@ -146,7 +146,14 @@ dataProcess = function( peptides_dict = makePeptidesDictionary(as.data.table(unclass(raw)), normalization) input = MSstatsPrepareForDataProcess(raw, logTrans, fix_missing) + # 'raw' is never used after this point but stays alive as a function + # parameter (~250-400 MB) through normalization, feature selection, and + # the entire summarization loop. Free it before the heavy processing. + rm(raw) input = MSstatsNormalize(input, normalization, peptides_dict, nameStandards) + # 'peptides_dict' is never used after normalization. Usually NULL + # (non-global-standards), but free it for the global-standards case. + rm(peptides_dict) input = MSstatsMergeFractions(input) input = MSstatsHandleMissing(input, summaryMethod, MBimpute, censoredInt, maxQuantileforCensored) @@ -173,6 +180,7 @@ dataProcess = function( "== Summarization is done.") getOption("MSstatsMsg")("INFO", " == Summarization is done.") + gc(verbose = FALSE) output = MSstatsSummarizationOutput(input, summarized, processed, summaryMethod, MBimpute, censoredInt) output @@ -412,7 +420,7 @@ MSstatsSummarizeSingleLinear = function(single_protein, pred = predict(survival_fit, newdata = .SD, se.fit = TRUE) list(pred$fit, pred$se.fit^2 + sigma2) }] - + rm(survival_fit) if (is_labeled_reference) { single_protein[, predicted := ifelse(censored & is_labeled_ref == FALSE, predicted, NA)] single_protein[, newABUNDANCE := ifelse(censored & is_labeled_ref == FALSE, predicted, newABUNDANCE)] @@ -489,7 +497,7 @@ MSstatsSummarizeSingleLinear = function(single_protein, # Todo: enable SRM linear summarization result[, LABEL := if (is_labeled_reference) "L" else unique(single_protein$LABEL)] } - + rm(fit) return(list(result, survival)) } } @@ -567,7 +575,7 @@ MSstatsSummarizeSingleTMP = function(single_protein, impute, censored_symbol, } else { single_protein[, predicted := NA_real_] } - + rm(survival_fit) if (is_labeled_reference) { single_protein[, predicted := ifelse(censored & is_labeled_ref == FALSE, predicted, NA)] single_protein[, newABUNDANCE := ifelse(censored & is_labeled_ref == FALSE, predicted, newABUNDANCE)] diff --git a/R/utils_imputation.R b/R/utils_imputation.R index 406920ee..99bffcf1 100644 --- a/R/utils_imputation.R +++ b/R/utils_imputation.R @@ -46,8 +46,10 @@ data = input, dist = "gaussian", control = list(maxiter=aft_iterations)) } - } + } } + fit$y = NULL + fit$linear.predictors = NULL fit } diff --git a/R/utils_summarization.R b/R/utils_summarization.R index 3b762ae7..3990cc5d 100644 --- a/R/utils_summarization.R +++ b/R/utils_summarization.R @@ -180,10 +180,11 @@ } } if (!equal_variances) { - linear_model = .updateUnequalVariances(input = input, + linear_model = .updateUnequalVariances(input = input, fit = linear_model, num_iter = 1) } + linear_model$model = NULL linear_model } @@ -198,26 +199,27 @@ #' @keywords internal .updateUnequalVariances = function(input, fit, num_iter) { weight = NULL - + + # Convert to data.frame once upfront. The original code did this + # implicitly via data.frame(input, ...) on every column add, which + # copied ALL existing columns each time (~5 full copies per iteration). + # Now we convert once and add/remove columns in-place. + input = as.data.frame(input) for (i in seq_len(num_iter)) { if (i == 1) { - abs.resids = data.frame(abs.resids = abs(fit$residuals)) - fitted = data.frame(fitted = fit$fitted.values) - input = data.frame(input, - "abs.resids" = abs.resids, - "fitted" = fitted) + input[["abs.resids"]] = abs(fit$residuals) + input[["fitted"]] = fit$fitted.values } fit.loess = loess(abs.resids ~ fitted, data = input) - loess.fitted = data.frame(loess.fitted = fitted(fit.loess)) - input = data.frame(input, "loess.fitted" = loess.fitted) - ## loess fitted valuaes are predicted sd - input$weight = 1 / (input$loess.fitted ^ 2) - input = input[, !(colnames(input) %in% "abs.resids")] + input[["loess.fitted"]] = fitted(fit.loess) + ## loess fitted values are predicted sd + input[["weight"]] = 1 / (input[["loess.fitted"]] ^ 2) + input[["abs.resids"]] = NULL ## re-fit using weight wls.fit = lm(formula(fit), data = input, weights = weight) - abs.resids = data.frame(abs.resids = abs(wls.fit$residuals)) - input = data.frame(input, "abs.resids" = abs.resids) - input = input[, -which(colnames(input) %in% c("loess.fitted", "weight"))] + input[["abs.resids"]] = abs(wls.fit$residuals) + input[["loess.fitted"]] = NULL + input[["weight"]] = NULL } wls.fit } diff --git a/inst/tinytest/test_memory_optimization_fits.R b/inst/tinytest/test_memory_optimization_fits.R new file mode 100644 index 00000000..df2c91e3 --- /dev/null +++ b/inst/tinytest/test_memory_optimization_fits.R @@ -0,0 +1,111 @@ +# Tests for memory optimizations in the dataProcess pipeline. +# These tests verify that model objects are stripped of heavy fields (Issue 1). + + +# --- Issue 1: Model Objects Retain Entire Datasets ---------------------------- +# +# .fitSurvival() and .fitLinearModel() used to return full model objects that +# stored copies of the input data, QR decompositions, residual vectors, etc. +# We now strip the heavy fields before returning. These tests verify the +# stripped fields are NULL and that the objects still work for their intended +# downstream use (predict, summary, vcov). + +# Test 1a: .fitSurvival strips $y and $linear.predictors --- +# +# survreg() stores $y (the Surv response object) and $linear.predictors +# (a vector of length nrow). Neither is needed by predict.survreg(). +# After stripping, object.size should be measurably smaller. + +surv_input = data.table::data.table( + newABUNDANCE = c(10.1, 11.2, 9.5, 10.8, 12.0, 9.0, 11.5, 10.3, + 10.5, 11.0, 9.8, 10.2, 12.2, 9.3, 11.8, 10.6), + cen = c(1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1), + RUN = factor(rep(1:4, each = 4)), + FEATURE = factor(rep(c("feat1", "feat2", "feat3", "feat4"), 4)) +) +# cen = 1 means observed (real measurement), cen = 0 means left-censored +# (below the instrument's detection limit — true value is unknown but at +# most this threshold). survreg uses Surv(..., type='left') to handle this: +# observed rows contribute their exact value to the likelihood, censored +# rows contribute P(true_value <= threshold). We set censored rows to 7.0 +# as the upper bound — survreg knows the real value could be lower. +surv_input[cen == 0, newABUNDANCE := 7.0] + +surv_fit = MSstats:::.fitSurvival(surv_input, aft_iterations = 90) + +# Stripped fields should be NULL +expect_true(is.null(surv_fit$y), + info = "survreg $y should be stripped to save memory") +expect_true(is.null(surv_fit$linear.predictors), + info = "survreg $linear.predictors should be stripped to save memory") + +# Fields needed by predict() should still exist +expect_false(is.null(surv_fit$coefficients), + info = "survreg $coefficients must survive stripping") +expect_false(is.null(surv_fit$scale), + info = "survreg $scale must survive stripping") + +# predict() should still work on the stripped object — this is the actual +# downstream use case in MSstatsSummarizeSingleTMP +predictions = predict(surv_fit, newdata = surv_input) +expect_equal(length(predictions), nrow(surv_input), + info = "predict() must work on the stripped survreg object") +expect_true(all(is.finite(predictions)), + info = "predict() should return finite values") + +# The stripped object should be smaller than an unstripped equivalent. +# Fit the same model without stripping to compare sizes. +unstripped_fit = survival::survreg( + survival::Surv(newABUNDANCE, cen, type = "left") ~ FEATURE + RUN, + data = surv_input, dist = "gaussian") +expect_true(object.size(surv_fit) < object.size(unstripped_fit), + info = paste("Stripped survreg should be smaller.", + "Stripped:", object.size(surv_fit), + "Unstripped:", object.size(unstripped_fit))) + + +# Test 1b: .fitLinearModel strips $model --- +# +# lm() stores $model (a full copy of the data used for fitting). This is +# not needed by summary() or vcov(), which only use $qr, $residuals, and +# $coefficients. After stripping, the object should be smaller and +# summary/vcov should still work. + +lm_input = data.table::data.table( + newABUNDANCE = c(10.1, 11.2, 9.5, 10.8, 12.0, 9.0, 11.5, 10.3, + 10.5, 11.0, 9.8, 10.2, 12.2, 9.3, 11.8, 10.6), + RUN = factor(rep(1:4, each = 4)), + FEATURE = factor(rep(c("feat1", "feat2", "feat3", "feat4"), 4)), + weights = NA +) + +lm_fit = MSstats:::.fitLinearModel(lm_input, is_single_feature = FALSE, + is_labeled = FALSE, equal_variances = TRUE) + +# $model should be stripped +expect_true(is.null(lm_fit$model), + info = "lm $model should be stripped to save memory") + +# Fields needed by summary() and vcov() should still exist +expect_false(is.null(lm_fit$coefficients), + info = "lm $coefficients must survive stripping") +expect_false(is.null(lm_fit$qr), + info = "lm $qr must survive stripping (needed by summary and vcov)") +expect_false(is.null(lm_fit$residuals), + info = "lm $residuals must survive stripping (needed by summary)") + +# summary() and vcov() should still work — these are the actual downstream +# use cases in MSstatsSummarizeSingleLinear +lm_summary = summary(lm_fit) +expect_false(is.null(lm_summary$coefficients), + info = "summary() must work on the stripped lm object") +lm_vcov = vcov(lm_fit) +expect_true(is.matrix(lm_vcov), + info = "vcov() must work on the stripped lm object") + +# The stripped object should be smaller than an unstripped equivalent. +unstripped_lm = lm(newABUNDANCE ~ FEATURE + RUN, data = lm_input) +expect_true(object.size(lm_fit) < object.size(unstripped_lm), + info = paste("Stripped lm should be smaller.", + "Stripped:", object.size(lm_fit), + "Unstripped:", object.size(unstripped_lm))) From 9edb7187d5862eedce7c568907a2027e3df6f2bf Mon Sep 17 00:00:00 2001 From: Rudhik1904 Date: Thu, 14 May 2026 23:36:57 -0500 Subject: [PATCH 2/6] Removing the unnecessary comments --- R/dataProcess.R | 5 -- R/utils_summarization.R | 5 -- inst/tinytest/test_memory_optimization_fits.R | 49 ++++--------------- 3 files changed, 10 insertions(+), 49 deletions(-) diff --git a/R/dataProcess.R b/R/dataProcess.R index 5fa593a8..4691398b 100755 --- a/R/dataProcess.R +++ b/R/dataProcess.R @@ -146,13 +146,8 @@ dataProcess = function( peptides_dict = makePeptidesDictionary(as.data.table(unclass(raw)), normalization) input = MSstatsPrepareForDataProcess(raw, logTrans, fix_missing) - # 'raw' is never used after this point but stays alive as a function - # parameter (~250-400 MB) through normalization, feature selection, and - # the entire summarization loop. Free it before the heavy processing. rm(raw) input = MSstatsNormalize(input, normalization, peptides_dict, nameStandards) - # 'peptides_dict' is never used after normalization. Usually NULL - # (non-global-standards), but free it for the global-standards case. rm(peptides_dict) input = MSstatsMergeFractions(input) input = MSstatsHandleMissing(input, summaryMethod, MBimpute, diff --git a/R/utils_summarization.R b/R/utils_summarization.R index 3990cc5d..1302e86b 100644 --- a/R/utils_summarization.R +++ b/R/utils_summarization.R @@ -199,11 +199,6 @@ #' @keywords internal .updateUnequalVariances = function(input, fit, num_iter) { weight = NULL - - # Convert to data.frame once upfront. The original code did this - # implicitly via data.frame(input, ...) on every column add, which - # copied ALL existing columns each time (~5 full copies per iteration). - # Now we convert once and add/remove columns in-place. input = as.data.frame(input) for (i in seq_len(num_iter)) { if (i == 1) { diff --git a/inst/tinytest/test_memory_optimization_fits.R b/inst/tinytest/test_memory_optimization_fits.R index df2c91e3..21da53fe 100644 --- a/inst/tinytest/test_memory_optimization_fits.R +++ b/inst/tinytest/test_memory_optimization_fits.R @@ -1,20 +1,10 @@ -# Tests for memory optimizations in the dataProcess pipeline. -# These tests verify that model objects are stripped of heavy fields (Issue 1). +# Tests that .fitSurvival() and .fitLinearModel() return model objects with +# heavy fields stripped, and that downstream predict/summary/vcov still work. -# --- Issue 1: Model Objects Retain Entire Datasets ---------------------------- +# --- .fitSurvival: $y and $linear.predictors are stripped --------------------- # -# .fitSurvival() and .fitLinearModel() used to return full model objects that -# stored copies of the input data, QR decompositions, residual vectors, etc. -# We now strip the heavy fields before returning. These tests verify the -# stripped fields are NULL and that the objects still work for their intended -# downstream use (predict, summary, vcov). - -# Test 1a: .fitSurvival strips $y and $linear.predictors --- -# -# survreg() stores $y (the Surv response object) and $linear.predictors -# (a vector of length nrow). Neither is needed by predict.survreg(). -# After stripping, object.size should be measurably smaller. +# Neither field is needed by predict.survreg(). surv_input = data.table::data.table( newABUNDANCE = c(10.1, 11.2, 9.5, 10.8, 12.0, 9.0, 11.5, 10.3, @@ -23,38 +13,27 @@ surv_input = data.table::data.table( RUN = factor(rep(1:4, each = 4)), FEATURE = factor(rep(c("feat1", "feat2", "feat3", "feat4"), 4)) ) -# cen = 1 means observed (real measurement), cen = 0 means left-censored -# (below the instrument's detection limit — true value is unknown but at -# most this threshold). survreg uses Surv(..., type='left') to handle this: -# observed rows contribute their exact value to the likelihood, censored -# rows contribute P(true_value <= threshold). We set censored rows to 7.0 -# as the upper bound — survreg knows the real value could be lower. +# cen = 0 marks left-censored rows; set them to the upper-bound threshold. surv_input[cen == 0, newABUNDANCE := 7.0] surv_fit = MSstats:::.fitSurvival(surv_input, aft_iterations = 90) -# Stripped fields should be NULL expect_true(is.null(surv_fit$y), - info = "survreg $y should be stripped to save memory") + info = "survreg $y should be stripped") expect_true(is.null(surv_fit$linear.predictors), - info = "survreg $linear.predictors should be stripped to save memory") + info = "survreg $linear.predictors should be stripped") -# Fields needed by predict() should still exist expect_false(is.null(surv_fit$coefficients), info = "survreg $coefficients must survive stripping") expect_false(is.null(surv_fit$scale), info = "survreg $scale must survive stripping") -# predict() should still work on the stripped object — this is the actual -# downstream use case in MSstatsSummarizeSingleTMP predictions = predict(surv_fit, newdata = surv_input) expect_equal(length(predictions), nrow(surv_input), info = "predict() must work on the stripped survreg object") expect_true(all(is.finite(predictions)), info = "predict() should return finite values") -# The stripped object should be smaller than an unstripped equivalent. -# Fit the same model without stripping to compare sizes. unstripped_fit = survival::survreg( survival::Surv(newABUNDANCE, cen, type = "left") ~ FEATURE + RUN, data = surv_input, dist = "gaussian") @@ -64,12 +43,9 @@ expect_true(object.size(surv_fit) < object.size(unstripped_fit), "Unstripped:", object.size(unstripped_fit))) -# Test 1b: .fitLinearModel strips $model --- +# --- .fitLinearModel: $model is stripped -------------------------------------- # -# lm() stores $model (a full copy of the data used for fitting). This is -# not needed by summary() or vcov(), which only use $qr, $residuals, and -# $coefficients. After stripping, the object should be smaller and -# summary/vcov should still work. +# $model is not needed by summary() or vcov(). lm_input = data.table::data.table( newABUNDANCE = c(10.1, 11.2, 9.5, 10.8, 12.0, 9.0, 11.5, 10.3, @@ -82,11 +58,9 @@ lm_input = data.table::data.table( lm_fit = MSstats:::.fitLinearModel(lm_input, is_single_feature = FALSE, is_labeled = FALSE, equal_variances = TRUE) -# $model should be stripped expect_true(is.null(lm_fit$model), - info = "lm $model should be stripped to save memory") + info = "lm $model should be stripped") -# Fields needed by summary() and vcov() should still exist expect_false(is.null(lm_fit$coefficients), info = "lm $coefficients must survive stripping") expect_false(is.null(lm_fit$qr), @@ -94,8 +68,6 @@ expect_false(is.null(lm_fit$qr), expect_false(is.null(lm_fit$residuals), info = "lm $residuals must survive stripping (needed by summary)") -# summary() and vcov() should still work — these are the actual downstream -# use cases in MSstatsSummarizeSingleLinear lm_summary = summary(lm_fit) expect_false(is.null(lm_summary$coefficients), info = "summary() must work on the stripped lm object") @@ -103,7 +75,6 @@ lm_vcov = vcov(lm_fit) expect_true(is.matrix(lm_vcov), info = "vcov() must work on the stripped lm object") -# The stripped object should be smaller than an unstripped equivalent. unstripped_lm = lm(newABUNDANCE ~ FEATURE + RUN, data = lm_input) expect_true(object.size(lm_fit) < object.size(unstripped_lm), info = paste("Stripped lm should be smaller.", From d07f4d17809138b0a56246b3367548f039415d74 Mon Sep 17 00:00:00 2001 From: Rudhik1904 Date: Fri, 22 May 2026 15:44:29 -0500 Subject: [PATCH 3/6] Adding the peak Script --- benchmark/profile_dataprocess_peak.R | 175 +++++++++++++++++++++++++++ 1 file changed, 175 insertions(+) create mode 100644 benchmark/profile_dataprocess_peak.R diff --git a/benchmark/profile_dataprocess_peak.R b/benchmark/profile_dataprocess_peak.R new file mode 100644 index 00000000..bb2e98ca --- /dev/null +++ b/benchmark/profile_dataprocess_peak.R @@ -0,0 +1,175 @@ +#!/usr/bin/env Rscript +# +# profile_dataprocess_peak.R +# +# Peak-memory + wall-clock instrumentation for MSstats::dataProcess. +# +# Replays dataProcess's internals manually, printing the gc() high-water +# mark and elapsed time at every stage boundary. Useful for locating +# which pipeline stage drives overall peak memory and runtime. +# +# Usage +# ----- +# +# # From the package root: +# Rscript benchmark/profile_dataprocess_peak.R +# +# # With an alternate MSstats-format CSV (10 cols: ProteinName, +# # PeptideSequence, PrecursorCharge, FragmentIon, ProductCharge, +# # IsotopeLabelType, Run, BioReplicate, Condition, Intensity): +# Rscript benchmark/profile_dataprocess_peak.R path/to/input.csv +# +# How to read the output +# ---------------------- +# +# used : Vcells currently in use at the checkpoint. +# max : running max of `used` since gc(reset = TRUE) at "start". +# dt : seconds elapsed since the previous checkpoint. +# +# The *delta in the max column between consecutive rows* is the peak +# memory contribution of that stage. The dt column is per-stage wall time. + +# --- CLI arg: path to MSstats-format CSV, or integer for DDARawData ---- +# +# - Rscript profile_dataprocess_peak.R -> default CSV +# - Rscript profile_dataprocess_peak.R path/to/x.csv -> use that CSV +# - Rscript profile_dataprocess_peak.R 100 -> replicate +# DDARawData 100x + +default_input <- "data/reduce_output_spectronaut_input_qc_test.csv" +args <- commandArgs(trailingOnly = TRUE) +arg <- if (length(args) >= 1) args[1] else default_input +n_replicates <- suppressWarnings(as.integer(arg)) +use_ddaraw <- !is.na(n_replicates) && n_replicates >= 1 +if (!use_ddaraw && !file.exists(arg)) { + stop(sprintf("Input file not found: %s", arg)) +} + +# --- Load the package from source --------------------------------------- + +if (!requireNamespace("pkgload", quietly = TRUE)) { + stop("pkgload is required: install.packages('pkgload')") +} +pkgload::load_all(".", quiet = TRUE) +MSstatsConvert::MSstatsLogsSettings(FALSE) + +# --- Load the fixture --------------------------------------------------- + +required_cols <- c("ProteinName", "PeptideSequence", "PrecursorCharge", + "FragmentIon", "ProductCharge", "IsotopeLabelType", + "Run", "BioReplicate", "Condition", "Intensity") +if (use_ddaraw) { + set.seed(42) + base <- data.table::as.data.table(DDARawData) + raw_input <- data.table::rbindlist(lapply(seq_len(n_replicates), function(i) { + d <- data.table::copy(base) + d$ProteinName <- paste0(d$ProteinName, "_rep", i) + d + })) + raw_input <- as.data.frame(raw_input) + fixture_label <- sprintf("DDARawData x %d", n_replicates) +} else { + raw_input <- data.table::fread(arg, data.table = FALSE) + fixture_label <- arg + # If this looks like a raw Spectronaut export (has R.FileName / + # PG.ProteinAccessions, etc.), convert to MSstats 10-col format. + if ("R.FileName" %in% colnames(raw_input) && + !all(required_cols %in% colnames(raw_input))) { + cat("Detected raw Spectronaut export — converting via", + "SpectronauttoMSstatsFormat()...\n") + raw_input <- MSstatsConvert::SpectronauttoMSstatsFormat( + raw_input, use_log_file = FALSE) + fixture_label <- paste0(arg, " (Spectronaut-converted)") + } +} +missing_cols <- setdiff(required_cols, colnames(raw_input)) +if (length(missing_cols) > 0) { + stop(sprintf("Input is missing MSstats columns: %s", + paste(missing_cols, collapse = ", "))) +} +input_mb <- as.numeric(object.size(raw_input)) / 1e6 +cat(sprintf("Fixture: %s\n %d rows, %.1f MB in memory\n\n", + fixture_label, nrow(raw_input), input_mb)) + +# --- Checkpoint helper -------------------------------------------------- + +bench_rows <- list() +.last_time <- NULL +checkpoint <- function(label, reset = FALSE) { + if (reset) { gc(); gc(reset = TRUE); .last_time <<- proc.time() } + now <- proc.time() + dt <- if (is.null(.last_time)) 0 else (now - .last_time)[["elapsed"]] + .last_time <<- now + g <- gc() + cols <- colnames(g) + used_mb <- g["Vcells", which(cols == "used")[1] + 1] + max_mb <- g["Vcells", which(cols == "max used")[1] + 1] + cat(sprintf("%-42s used=%7.1f max=%7.1f dt=%6.2fs\n", + label, used_mb, max_mb, dt), + file = stderr()) + bench_rows[[length(bench_rows) + 1L]] <<- data.frame( + stage = label, + used_mb = used_mb, + max_mb = max_mb, + dt_s = dt, + stringsAsFactors = FALSE + ) + invisible(dt) +} + +# --- Replay dataProcess stages manually --------------------------------- +# +# Mirrors the sequence in R/dataProcess.R::dataProcess. Keep these in +# sync if dataProcess reorders its pipeline. + +t_start <- proc.time() +checkpoint("start", reset = TRUE) + +raw <- raw_input +peptides_dict <- MSstats:::makePeptidesDictionary( + data.table::as.data.table(unclass(raw)), "equalizeMedians") +input <- MSstats:::MSstatsPrepareForDataProcess(raw, 2, NULL) +rm(raw); checkpoint("after PrepareForDataProcess + rm(raw)") + +input <- MSstats:::MSstatsNormalize(input, "equalizeMedians", peptides_dict, NULL) +rm(peptides_dict); checkpoint("after Normalize") + +input <- MSstats:::MSstatsMergeFractions(input) +checkpoint("after MergeFractions") + +input <- MSstats:::MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999) +checkpoint("after HandleMissing") + +input <- MSstats:::MSstatsSelectFeatures(input, "all", 3, 2) +checkpoint("after SelectFeatures") + +processed <- MSstats:::getProcessed(input) +input <- MSstats:::MSstatsPrepareForSummarization(input, "TMP", TRUE, "NA", TRUE) +checkpoint("after PrepareForSummarization") + +summarized <- MSstats:::MSstatsSummarizeWithMultipleCores( + input, "TMP", TRUE, "NA", FALSE, TRUE, 1, 90) +checkpoint("after Summarize") + +gc(verbose = FALSE) +checkpoint("after gc()") + +output <- MSstats:::MSstatsSummarizationOutput( + input, summarized, processed, "TMP", TRUE, "NA") +checkpoint("after SummarizationOutput") + +total_elapsed <- (proc.time() - t_start)[["elapsed"]] + +# --- Summary ------------------------------------------------------------ + +bench_df <- do.call(rbind, bench_rows) +peak_mb <- max(bench_df$max_mb) +baseline_mb <- bench_df$max_mb[bench_df$stage == "start"] +peak_delta_mb <- peak_mb - baseline_mb + +cat("\n========================================\n") +cat(sprintf("Total elapsed: %.2f s\n", total_elapsed)) +cat(sprintf("Peak memory (max): %.1f MB\n", peak_mb)) +cat(sprintf("Baseline memory: %.1f MB\n", baseline_mb)) +cat(sprintf("Peak above baseline: %.1f MB\n", peak_delta_mb)) +cat("========================================\n") From 4d2d41ab322a07f3304f4abd9629ae626916ba93 Mon Sep 17 00:00:00 2001 From: Rudhik1904 Date: Mon, 25 May 2026 19:53:24 -0500 Subject: [PATCH 4/6] Fixing the PR comments --- R/utils_summarization.R | 24 +++++---- benchmark/profile_dataprocess_peak.R | 75 +++++++++++++--------------- 2 files changed, 48 insertions(+), 51 deletions(-) diff --git a/R/utils_summarization.R b/R/utils_summarization.R index 1302e86b..b4f32238 100644 --- a/R/utils_summarization.R +++ b/R/utils_summarization.R @@ -198,23 +198,27 @@ #' @return merMod #' @keywords internal .updateUnequalVariances = function(input, fit, num_iter) { - weight = NULL - input = as.data.frame(input) + weight = abs.resids = loess.fitted = NULL + if (data.table::is.data.table(input)) { + input = data.table::copy(input) + } else { + input = data.table::as.data.table(input) + } for (i in seq_len(num_iter)) { if (i == 1) { - input[["abs.resids"]] = abs(fit$residuals) - input[["fitted"]] = fit$fitted.values + input[, abs.resids := abs(fit$residuals)] + input[, fitted := fit$fitted.values] } fit.loess = loess(abs.resids ~ fitted, data = input) - input[["loess.fitted"]] = fitted(fit.loess) + input[, loess.fitted := fitted(fit.loess)] ## loess fitted values are predicted sd - input[["weight"]] = 1 / (input[["loess.fitted"]] ^ 2) - input[["abs.resids"]] = NULL + input[, weight := 1 / (loess.fitted ^ 2)] + input[, abs.resids := NULL] ## re-fit using weight wls.fit = lm(formula(fit), data = input, weights = weight) - input[["abs.resids"]] = abs(wls.fit$residuals) - input[["loess.fitted"]] = NULL - input[["weight"]] = NULL + input[, abs.resids := abs(wls.fit$residuals)] + input[, loess.fitted := NULL] + input[, weight := NULL] } wls.fit } diff --git a/benchmark/profile_dataprocess_peak.R b/benchmark/profile_dataprocess_peak.R index bb2e98ca..c0fad09d 100644 --- a/benchmark/profile_dataprocess_peak.R +++ b/benchmark/profile_dataprocess_peak.R @@ -5,7 +5,7 @@ # Peak-memory + wall-clock instrumentation for MSstats::dataProcess. # # Replays dataProcess's internals manually, printing the gc() high-water -# mark and elapsed time at every stage boundary. Useful for locating +# mark and elapsed time at every stage boundary. Useful forunning max of `used` since gc(reset = TRUE)r locating # which pipeline stage drives overall peak memory and runtime. # # Usage @@ -14,43 +14,46 @@ # # From the package root: # Rscript benchmark/profile_dataprocess_peak.R # -# # With an alternate MSstats-format CSV (10 cols: ProteinName, -# # PeptideSequence, PrecursorCharge, FragmentIon, ProductCharge, -# # IsotopeLabelType, Run, BioReplicate, Condition, Intensity): +# # With an alternate MSstats-format CSV: # Rscript benchmark/profile_dataprocess_peak.R path/to/input.csv # +# Input is expected to already be in MSstats format. Store MSstats-format +# fixtures alongside this script (or on HPC) — converters live elsewhere. +# # How to read the output # ---------------------- # -# used : Vcells currently in use at the checkpoint. -# max : running max of `used` since gc(reset = TRUE) at "start". -# dt : seconds elapsed since the previous checkpoint. +# used : Vcells currently in use at the checkpoint. +# max : running max of `used` since gc(reset = TRUE) at "start". +# Example: if "after Normalize" reports max=420 and +# "after MergeFractions" reports max=510, the MergeFractions +# stage drove peak memory up by ~90 MB. +# elapsed_s : seconds elapsed since the previous checkpoint. # # The *delta in the max column between consecutive rows* is the peak -# memory contribution of that stage. The dt column is per-stage wall time. +# memory contribution of that stage. elapsed_s is per-stage wall time. -# --- CLI arg: path to MSstats-format CSV, or integer for DDARawData ---- +# --- CLI arg: path to MSstats-format CSV, or integer to scale DDARawData --- # # - Rscript profile_dataprocess_peak.R -> default CSV # - Rscript profile_dataprocess_peak.R path/to/x.csv -> use that CSV -# - Rscript profile_dataprocess_peak.R 100 -> replicate -# DDARawData 100x +# - Rscript profile_dataprocess_peak.R 100 -> stack 100 copies +# of DDARawData with +# renamed proteins to +# scale up the input default_input <- "data/reduce_output_spectronaut_input_qc_test.csv" args <- commandArgs(trailingOnly = TRUE) arg <- if (length(args) >= 1) args[1] else default_input -n_replicates <- suppressWarnings(as.integer(arg)) -use_ddaraw <- !is.na(n_replicates) && n_replicates >= 1 +n_protein_copies <- suppressWarnings(as.integer(arg)) +use_ddaraw <- !is.na(n_protein_copies) && n_protein_copies >= 1 if (!use_ddaraw && !file.exists(arg)) { stop(sprintf("Input file not found: %s", arg)) } -# --- Load the package from source --------------------------------------- +# --- Load the package --------------------------------------------------- -if (!requireNamespace("pkgload", quietly = TRUE)) { - stop("pkgload is required: install.packages('pkgload')") -} -pkgload::load_all(".", quiet = TRUE) +library(MSstats) MSstatsConvert::MSstatsLogsSettings(FALSE) # --- Load the fixture --------------------------------------------------- @@ -61,26 +64,16 @@ required_cols <- c("ProteinName", "PeptideSequence", "PrecursorCharge", if (use_ddaraw) { set.seed(42) base <- data.table::as.data.table(DDARawData) - raw_input <- data.table::rbindlist(lapply(seq_len(n_replicates), function(i) { + raw_input <- data.table::rbindlist(lapply(seq_len(n_protein_copies), function(i) { d <- data.table::copy(base) - d$ProteinName <- paste0(d$ProteinName, "_rep", i) + d$ProteinName <- paste0(d$ProteinName, "_copy", i) d })) raw_input <- as.data.frame(raw_input) - fixture_label <- sprintf("DDARawData x %d", n_replicates) + fixture_label <- sprintf("DDARawData x %d protein copies", n_protein_copies) } else { raw_input <- data.table::fread(arg, data.table = FALSE) fixture_label <- arg - # If this looks like a raw Spectronaut export (has R.FileName / - # PG.ProteinAccessions, etc.), convert to MSstats 10-col format. - if ("R.FileName" %in% colnames(raw_input) && - !all(required_cols %in% colnames(raw_input))) { - cat("Detected raw Spectronaut export — converting via", - "SpectronauttoMSstatsFormat()...\n") - raw_input <- MSstatsConvert::SpectronauttoMSstatsFormat( - raw_input, use_log_file = FALSE) - fixture_label <- paste0(arg, " (Spectronaut-converted)") - } } missing_cols <- setdiff(required_cols, colnames(raw_input)) if (length(missing_cols) > 0) { @@ -98,23 +91,23 @@ bench_rows <- list() checkpoint <- function(label, reset = FALSE) { if (reset) { gc(); gc(reset = TRUE); .last_time <<- proc.time() } now <- proc.time() - dt <- if (is.null(.last_time)) 0 else (now - .last_time)[["elapsed"]] + elapsed_s <- if (is.null(.last_time)) 0 else (now - .last_time)[["elapsed"]] .last_time <<- now g <- gc() cols <- colnames(g) used_mb <- g["Vcells", which(cols == "used")[1] + 1] max_mb <- g["Vcells", which(cols == "max used")[1] + 1] - cat(sprintf("%-42s used=%7.1f max=%7.1f dt=%6.2fs\n", - label, used_mb, max_mb, dt), + cat(sprintf("%-42s used=%7.1f max=%7.1f elapsed_s=%6.2f\n", + label, used_mb, max_mb, elapsed_s), file = stderr()) bench_rows[[length(bench_rows) + 1L]] <<- data.frame( - stage = label, - used_mb = used_mb, - max_mb = max_mb, - dt_s = dt, + stage = label, + used_mb = used_mb, + max_mb = max_mb, + elapsed_s = elapsed_s, stringsAsFactors = FALSE ) - invisible(dt) + invisible(elapsed_s) } # --- Replay dataProcess stages manually --------------------------------- @@ -140,11 +133,11 @@ checkpoint("after MergeFractions") input <- MSstats:::MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999) checkpoint("after HandleMissing") -input <- MSstats:::MSstatsSelectFeatures(input, "all", 3, 2) +input <- MSstats:::MSstatsSelectFeatures(input, "topN", 100, 2) checkpoint("after SelectFeatures") processed <- MSstats:::getProcessed(input) -input <- MSstats:::MSstatsPrepareForSummarization(input, "TMP", TRUE, "NA", TRUE) +input <- MSstats:::MSstatsPrepareForSummarization(input, "TMP", TRUE, "NA", FALSE) checkpoint("after PrepareForSummarization") summarized <- MSstats:::MSstatsSummarizeWithMultipleCores( From 8a85c119196fa146ad41f95fdc5ad2356984a158 Mon Sep 17 00:00:00 2001 From: Rudhik1904 Date: Tue, 26 May 2026 18:18:35 -0500 Subject: [PATCH 5/6] moving the script to MSstats Scalblity --- benchmark/profile_dataprocess_peak.R | 168 --------------------------- 1 file changed, 168 deletions(-) delete mode 100644 benchmark/profile_dataprocess_peak.R diff --git a/benchmark/profile_dataprocess_peak.R b/benchmark/profile_dataprocess_peak.R deleted file mode 100644 index c0fad09d..00000000 --- a/benchmark/profile_dataprocess_peak.R +++ /dev/null @@ -1,168 +0,0 @@ -#!/usr/bin/env Rscript -# -# profile_dataprocess_peak.R -# -# Peak-memory + wall-clock instrumentation for MSstats::dataProcess. -# -# Replays dataProcess's internals manually, printing the gc() high-water -# mark and elapsed time at every stage boundary. Useful forunning max of `used` since gc(reset = TRUE)r locating -# which pipeline stage drives overall peak memory and runtime. -# -# Usage -# ----- -# -# # From the package root: -# Rscript benchmark/profile_dataprocess_peak.R -# -# # With an alternate MSstats-format CSV: -# Rscript benchmark/profile_dataprocess_peak.R path/to/input.csv -# -# Input is expected to already be in MSstats format. Store MSstats-format -# fixtures alongside this script (or on HPC) — converters live elsewhere. -# -# How to read the output -# ---------------------- -# -# used : Vcells currently in use at the checkpoint. -# max : running max of `used` since gc(reset = TRUE) at "start". -# Example: if "after Normalize" reports max=420 and -# "after MergeFractions" reports max=510, the MergeFractions -# stage drove peak memory up by ~90 MB. -# elapsed_s : seconds elapsed since the previous checkpoint. -# -# The *delta in the max column between consecutive rows* is the peak -# memory contribution of that stage. elapsed_s is per-stage wall time. - -# --- CLI arg: path to MSstats-format CSV, or integer to scale DDARawData --- -# -# - Rscript profile_dataprocess_peak.R -> default CSV -# - Rscript profile_dataprocess_peak.R path/to/x.csv -> use that CSV -# - Rscript profile_dataprocess_peak.R 100 -> stack 100 copies -# of DDARawData with -# renamed proteins to -# scale up the input - -default_input <- "data/reduce_output_spectronaut_input_qc_test.csv" -args <- commandArgs(trailingOnly = TRUE) -arg <- if (length(args) >= 1) args[1] else default_input -n_protein_copies <- suppressWarnings(as.integer(arg)) -use_ddaraw <- !is.na(n_protein_copies) && n_protein_copies >= 1 -if (!use_ddaraw && !file.exists(arg)) { - stop(sprintf("Input file not found: %s", arg)) -} - -# --- Load the package --------------------------------------------------- - -library(MSstats) -MSstatsConvert::MSstatsLogsSettings(FALSE) - -# --- Load the fixture --------------------------------------------------- - -required_cols <- c("ProteinName", "PeptideSequence", "PrecursorCharge", - "FragmentIon", "ProductCharge", "IsotopeLabelType", - "Run", "BioReplicate", "Condition", "Intensity") -if (use_ddaraw) { - set.seed(42) - base <- data.table::as.data.table(DDARawData) - raw_input <- data.table::rbindlist(lapply(seq_len(n_protein_copies), function(i) { - d <- data.table::copy(base) - d$ProteinName <- paste0(d$ProteinName, "_copy", i) - d - })) - raw_input <- as.data.frame(raw_input) - fixture_label <- sprintf("DDARawData x %d protein copies", n_protein_copies) -} else { - raw_input <- data.table::fread(arg, data.table = FALSE) - fixture_label <- arg -} -missing_cols <- setdiff(required_cols, colnames(raw_input)) -if (length(missing_cols) > 0) { - stop(sprintf("Input is missing MSstats columns: %s", - paste(missing_cols, collapse = ", "))) -} -input_mb <- as.numeric(object.size(raw_input)) / 1e6 -cat(sprintf("Fixture: %s\n %d rows, %.1f MB in memory\n\n", - fixture_label, nrow(raw_input), input_mb)) - -# --- Checkpoint helper -------------------------------------------------- - -bench_rows <- list() -.last_time <- NULL -checkpoint <- function(label, reset = FALSE) { - if (reset) { gc(); gc(reset = TRUE); .last_time <<- proc.time() } - now <- proc.time() - elapsed_s <- if (is.null(.last_time)) 0 else (now - .last_time)[["elapsed"]] - .last_time <<- now - g <- gc() - cols <- colnames(g) - used_mb <- g["Vcells", which(cols == "used")[1] + 1] - max_mb <- g["Vcells", which(cols == "max used")[1] + 1] - cat(sprintf("%-42s used=%7.1f max=%7.1f elapsed_s=%6.2f\n", - label, used_mb, max_mb, elapsed_s), - file = stderr()) - bench_rows[[length(bench_rows) + 1L]] <<- data.frame( - stage = label, - used_mb = used_mb, - max_mb = max_mb, - elapsed_s = elapsed_s, - stringsAsFactors = FALSE - ) - invisible(elapsed_s) -} - -# --- Replay dataProcess stages manually --------------------------------- -# -# Mirrors the sequence in R/dataProcess.R::dataProcess. Keep these in -# sync if dataProcess reorders its pipeline. - -t_start <- proc.time() -checkpoint("start", reset = TRUE) - -raw <- raw_input -peptides_dict <- MSstats:::makePeptidesDictionary( - data.table::as.data.table(unclass(raw)), "equalizeMedians") -input <- MSstats:::MSstatsPrepareForDataProcess(raw, 2, NULL) -rm(raw); checkpoint("after PrepareForDataProcess + rm(raw)") - -input <- MSstats:::MSstatsNormalize(input, "equalizeMedians", peptides_dict, NULL) -rm(peptides_dict); checkpoint("after Normalize") - -input <- MSstats:::MSstatsMergeFractions(input) -checkpoint("after MergeFractions") - -input <- MSstats:::MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999) -checkpoint("after HandleMissing") - -input <- MSstats:::MSstatsSelectFeatures(input, "topN", 100, 2) -checkpoint("after SelectFeatures") - -processed <- MSstats:::getProcessed(input) -input <- MSstats:::MSstatsPrepareForSummarization(input, "TMP", TRUE, "NA", FALSE) -checkpoint("after PrepareForSummarization") - -summarized <- MSstats:::MSstatsSummarizeWithMultipleCores( - input, "TMP", TRUE, "NA", FALSE, TRUE, 1, 90) -checkpoint("after Summarize") - -gc(verbose = FALSE) -checkpoint("after gc()") - -output <- MSstats:::MSstatsSummarizationOutput( - input, summarized, processed, "TMP", TRUE, "NA") -checkpoint("after SummarizationOutput") - -total_elapsed <- (proc.time() - t_start)[["elapsed"]] - -# --- Summary ------------------------------------------------------------ - -bench_df <- do.call(rbind, bench_rows) -peak_mb <- max(bench_df$max_mb) -baseline_mb <- bench_df$max_mb[bench_df$stage == "start"] -peak_delta_mb <- peak_mb - baseline_mb - -cat("\n========================================\n") -cat(sprintf("Total elapsed: %.2f s\n", total_elapsed)) -cat(sprintf("Peak memory (max): %.1f MB\n", peak_mb)) -cat(sprintf("Baseline memory: %.1f MB\n", baseline_mb)) -cat(sprintf("Peak above baseline: %.1f MB\n", peak_delta_mb)) -cat("========================================\n") From 2e6cf178bf739ffe76fedb799730cc93f8e7f812 Mon Sep 17 00:00:00 2001 From: Rudhik1904 Date: Tue, 26 May 2026 18:20:04 -0500 Subject: [PATCH 6/6] fixing the comment --- R/utils_summarization.R | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/R/utils_summarization.R b/R/utils_summarization.R index b4f32238..47bbef21 100644 --- a/R/utils_summarization.R +++ b/R/utils_summarization.R @@ -199,9 +199,7 @@ #' @keywords internal .updateUnequalVariances = function(input, fit, num_iter) { weight = abs.resids = loess.fitted = NULL - if (data.table::is.data.table(input)) { - input = data.table::copy(input) - } else { + if (!data.table::is.data.table(input)) { input = data.table::as.data.table(input) } for (i in seq_len(num_iter)) {