From 5693fc90182fc22f24d30f1e26e13420d8114d60 Mon Sep 17 00:00:00 2001
From: Tony Wu <wu.anthon@northeastern.edu>
Date: Thu, 14 May 2026 09:47:04 -0400
Subject: [PATCH 1/6] Freed heavy fit objects and large dataProcess
 intermediates early
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Stripped survreg() $y / $linear.predictors and lm() $model after
  fitting in .fitSurvival and .fitLinearModel; downstream predict /
  summary / vcov paths are unaffected.
* Rewrote .updateUnequalVariances to mutate a single data.frame in
  place instead of rebuilding it ~5x per iteration via data.frame(...).
* Added rm(raw), rm(peptides_dict), and gc(verbose = FALSE) inside
  dataProcess() so the ~250-400 MB held by function arguments is freed
  before MSstatsSummarizationOutput runs.
* Added rm(survival_fit) and rm(fit) inside the per-protein linear /
  TMP summarizers so local fit objects drop as soon as survival /
  result is materialised.
* Tests: inst/tinytest/test_memory_optimization.R Issue 1 (tests 1a,
  1b) — 14 assertions, all green.

See MSstats-ai/todos/active/TODO-MS-20260514_fix-memory-bugs.md

Co-Authored-By: Claude <noreply@anthropic.com>
---
 R/dataProcess.R                               |  14 ++-
 R/utils_imputation.R                          |   4 +-
 R/utils_summarization.R                       |  32 ++---
 inst/tinytest/test_memory_optimization_fits.R | 111 ++++++++++++++++++
 4 files changed, 142 insertions(+), 19 deletions(-)
 create mode 100644 inst/tinytest/test_memory_optimization_fits.R

diff --git a/R/dataProcess.R b/R/dataProcess.R
index e9581c8b..5fa593a8 100755
--- a/R/dataProcess.R
+++ b/R/dataProcess.R
@@ -146,7 +146,14 @@ dataProcess = function(
     
     peptides_dict = makePeptidesDictionary(as.data.table(unclass(raw)), normalization)
     input = MSstatsPrepareForDataProcess(raw, logTrans, fix_missing)
+    # 'raw' is never used after this point but stays alive as a function
+    # parameter (~250-400 MB) through normalization, feature selection, and
+    # the entire summarization loop. Free it before the heavy processing.
+    rm(raw)
     input = MSstatsNormalize(input, normalization, peptides_dict, nameStandards)
+    # 'peptides_dict' is never used after normalization. Usually NULL
+    # (non-global-standards), but free it for the global-standards case.
+    rm(peptides_dict)
     input = MSstatsMergeFractions(input)
     input = MSstatsHandleMissing(input, summaryMethod, MBimpute,
                                  censoredInt, maxQuantileforCensored)
@@ -173,6 +180,7 @@ dataProcess = function(
                             "== Summarization is done.")
     getOption("MSstatsMsg")("INFO",
                             " == Summarization is done.")
+    gc(verbose = FALSE)
     output = MSstatsSummarizationOutput(input, summarized, processed,
                                         summaryMethod, MBimpute, censoredInt)
     output
@@ -412,7 +420,7 @@ MSstatsSummarizeSingleLinear = function(single_protein,
             pred = predict(survival_fit, newdata = .SD, se.fit = TRUE)
             list(pred$fit, pred$se.fit^2 + sigma2)
         }]
-
+        rm(survival_fit)
         if (is_labeled_reference) {
             single_protein[, predicted := ifelse(censored & is_labeled_ref == FALSE, predicted, NA)]
             single_protein[, newABUNDANCE := ifelse(censored & is_labeled_ref == FALSE, predicted, newABUNDANCE)]
@@ -489,7 +497,7 @@ MSstatsSummarizeSingleLinear = function(single_protein,
             # Todo: enable SRM linear summarization
             result[, LABEL := if (is_labeled_reference) "L" else unique(single_protein$LABEL)]
         }
-        
+        rm(fit)
         return(list(result, survival))
     }
 }
@@ -567,7 +575,7 @@ MSstatsSummarizeSingleTMP = function(single_protein, impute, censored_symbol,
         } else {
             single_protein[, predicted := NA_real_]
         }
-
+        rm(survival_fit)
         if (is_labeled_reference) {
             single_protein[, predicted := ifelse(censored & is_labeled_ref == FALSE, predicted, NA)]
             single_protein[, newABUNDANCE := ifelse(censored & is_labeled_ref == FALSE, predicted, newABUNDANCE)]
diff --git a/R/utils_imputation.R b/R/utils_imputation.R
index 406920ee..99bffcf1 100644
--- a/R/utils_imputation.R
+++ b/R/utils_imputation.R
@@ -46,8 +46,10 @@
                                         data = input, dist = "gaussian", 
                               control = list(maxiter=aft_iterations))
             }
-        }  
+        }
     }
+    fit$y = NULL
+    fit$linear.predictors = NULL
     fit
 }
 
diff --git a/R/utils_summarization.R b/R/utils_summarization.R
index 3b762ae7..3990cc5d 100644
--- a/R/utils_summarization.R
+++ b/R/utils_summarization.R
@@ -180,10 +180,11 @@
         }
     }
     if (!equal_variances) {
-        linear_model = .updateUnequalVariances(input = input, 
+        linear_model = .updateUnequalVariances(input = input,
                                                fit = linear_model,
                                                num_iter = 1)
     }
+    linear_model$model = NULL
     linear_model
 }
 
@@ -198,26 +199,27 @@
 #' @keywords internal
 .updateUnequalVariances = function(input, fit, num_iter) {
     weight = NULL
-    
+
+    # Convert to data.frame once upfront. The original code did this
+    # implicitly via data.frame(input, ...) on every column add, which
+    # copied ALL existing columns each time (~5 full copies per iteration).
+    # Now we convert once and add/remove columns in-place.
+    input = as.data.frame(input)
     for (i in seq_len(num_iter)) {
         if (i == 1) {
-            abs.resids = data.frame(abs.resids = abs(fit$residuals))
-            fitted = data.frame(fitted = fit$fitted.values)
-            input = data.frame(input, 
-                               "abs.resids" = abs.resids, 
-                               "fitted" = fitted)
+            input[["abs.resids"]] = abs(fit$residuals)
+            input[["fitted"]] = fit$fitted.values
         }
         fit.loess = loess(abs.resids ~ fitted, data = input)
-        loess.fitted = data.frame(loess.fitted = fitted(fit.loess))
-        input = data.frame(input, "loess.fitted" = loess.fitted)
-        ## loess fitted valuaes are predicted sd
-        input$weight = 1 / (input$loess.fitted ^ 2)
-        input = input[, !(colnames(input) %in% "abs.resids")]
+        input[["loess.fitted"]] = fitted(fit.loess)
+        ## loess fitted values are predicted sd
+        input[["weight"]] = 1 / (input[["loess.fitted"]] ^ 2)
+        input[["abs.resids"]] = NULL
         ## re-fit using weight
         wls.fit = lm(formula(fit), data = input, weights = weight)
-        abs.resids = data.frame(abs.resids = abs(wls.fit$residuals))
-        input = data.frame(input, "abs.resids" = abs.resids)
-        input = input[, -which(colnames(input) %in% c("loess.fitted", "weight"))]
+        input[["abs.resids"]] = abs(wls.fit$residuals)
+        input[["loess.fitted"]] = NULL
+        input[["weight"]] = NULL
     }
     wls.fit
 }
diff --git a/inst/tinytest/test_memory_optimization_fits.R b/inst/tinytest/test_memory_optimization_fits.R
new file mode 100644
index 00000000..df2c91e3
--- /dev/null
+++ b/inst/tinytest/test_memory_optimization_fits.R
@@ -0,0 +1,111 @@
+# Tests for memory optimizations in the dataProcess pipeline.
+# These tests verify that model objects are stripped of heavy fields (Issue 1).
+
+
+# --- Issue 1: Model Objects Retain Entire Datasets ----------------------------
+#
+# .fitSurvival() and .fitLinearModel() used to return full model objects that
+# stored copies of the input data, QR decompositions, residual vectors, etc.
+# We now strip the heavy fields before returning. These tests verify the
+# stripped fields are NULL and that the objects still work for their intended
+# downstream use (predict, summary, vcov).
+
+# Test 1a: .fitSurvival strips $y and $linear.predictors ---
+#
+# survreg() stores $y (the Surv response object) and $linear.predictors
+# (a vector of length nrow). Neither is needed by predict.survreg().
+# After stripping, object.size should be measurably smaller.
+
+surv_input = data.table::data.table(
+    newABUNDANCE = c(10.1, 11.2, 9.5, 10.8, 12.0, 9.0, 11.5, 10.3,
+                     10.5, 11.0, 9.8, 10.2, 12.2, 9.3, 11.8, 10.6),
+    cen = c(1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1),
+    RUN = factor(rep(1:4, each = 4)),
+    FEATURE = factor(rep(c("feat1", "feat2", "feat3", "feat4"), 4))
+)
+# cen = 1 means observed (real measurement), cen = 0 means left-censored
+# (below the instrument's detection limit — true value is unknown but at
+# most this threshold). survreg uses Surv(..., type='left') to handle this:
+# observed rows contribute their exact value to the likelihood, censored
+# rows contribute P(true_value <= threshold). We set censored rows to 7.0
+# as the upper bound — survreg knows the real value could be lower.
+surv_input[cen == 0, newABUNDANCE := 7.0]
+
+surv_fit = MSstats:::.fitSurvival(surv_input, aft_iterations = 90)
+
+# Stripped fields should be NULL
+expect_true(is.null(surv_fit$y),
+            info = "survreg $y should be stripped to save memory")
+expect_true(is.null(surv_fit$linear.predictors),
+            info = "survreg $linear.predictors should be stripped to save memory")
+
+# Fields needed by predict() should still exist
+expect_false(is.null(surv_fit$coefficients),
+             info = "survreg $coefficients must survive stripping")
+expect_false(is.null(surv_fit$scale),
+             info = "survreg $scale must survive stripping")
+
+# predict() should still work on the stripped object — this is the actual
+# downstream use case in MSstatsSummarizeSingleTMP
+predictions = predict(surv_fit, newdata = surv_input)
+expect_equal(length(predictions), nrow(surv_input),
+             info = "predict() must work on the stripped survreg object")
+expect_true(all(is.finite(predictions)),
+            info = "predict() should return finite values")
+
+# The stripped object should be smaller than an unstripped equivalent.
+# Fit the same model without stripping to compare sizes.
+unstripped_fit = survival::survreg(
+    survival::Surv(newABUNDANCE, cen, type = "left") ~ FEATURE + RUN,
+    data = surv_input, dist = "gaussian")
+expect_true(object.size(surv_fit) < object.size(unstripped_fit),
+            info = paste("Stripped survreg should be smaller.",
+                         "Stripped:", object.size(surv_fit),
+                         "Unstripped:", object.size(unstripped_fit)))
+
+
+# Test 1b: .fitLinearModel strips $model ---
+#
+# lm() stores $model (a full copy of the data used for fitting). This is
+# not needed by summary() or vcov(), which only use $qr, $residuals, and
+# $coefficients. After stripping, the object should be smaller and
+# summary/vcov should still work.
+
+lm_input = data.table::data.table(
+    newABUNDANCE = c(10.1, 11.2, 9.5, 10.8, 12.0, 9.0, 11.5, 10.3,
+                     10.5, 11.0, 9.8, 10.2, 12.2, 9.3, 11.8, 10.6),
+    RUN = factor(rep(1:4, each = 4)),
+    FEATURE = factor(rep(c("feat1", "feat2", "feat3", "feat4"), 4)),
+    weights = NA
+)
+
+lm_fit = MSstats:::.fitLinearModel(lm_input, is_single_feature = FALSE,
+                                    is_labeled = FALSE, equal_variances = TRUE)
+
+# $model should be stripped
+expect_true(is.null(lm_fit$model),
+            info = "lm $model should be stripped to save memory")
+
+# Fields needed by summary() and vcov() should still exist
+expect_false(is.null(lm_fit$coefficients),
+             info = "lm $coefficients must survive stripping")
+expect_false(is.null(lm_fit$qr),
+             info = "lm $qr must survive stripping (needed by summary and vcov)")
+expect_false(is.null(lm_fit$residuals),
+             info = "lm $residuals must survive stripping (needed by summary)")
+
+# summary() and vcov() should still work — these are the actual downstream
+# use cases in MSstatsSummarizeSingleLinear
+lm_summary = summary(lm_fit)
+expect_false(is.null(lm_summary$coefficients),
+             info = "summary() must work on the stripped lm object")
+lm_vcov = vcov(lm_fit)
+expect_true(is.matrix(lm_vcov),
+            info = "vcov() must work on the stripped lm object")
+
+# The stripped object should be smaller than an unstripped equivalent.
+unstripped_lm = lm(newABUNDANCE ~ FEATURE + RUN, data = lm_input)
+expect_true(object.size(lm_fit) < object.size(unstripped_lm),
+            info = paste("Stripped lm should be smaller.",
+                         "Stripped:", object.size(lm_fit),
+                         "Unstripped:", object.size(unstripped_lm)))

From 9edb7187d5862eedce7c568907a2027e3df6f2bf Mon Sep 17 00:00:00 2001
From: Rudhik1904 <rudhikshah50@gmail.com>
Date: Thu, 14 May 2026 23:36:57 -0500
Subject: [PATCH 2/6] Removing the unnecessary comments

---
 R/dataProcess.R                               |  5 --
 R/utils_summarization.R                       |  5 --
 inst/tinytest/test_memory_optimization_fits.R | 49 ++++---------------
 3 files changed, 10 insertions(+), 49 deletions(-)

diff --git a/R/dataProcess.R b/R/dataProcess.R
index 5fa593a8..4691398b 100755
--- a/R/dataProcess.R
+++ b/R/dataProcess.R
@@ -146,13 +146,8 @@ dataProcess = function(
     
     peptides_dict = makePeptidesDictionary(as.data.table(unclass(raw)), normalization)
     input = MSstatsPrepareForDataProcess(raw, logTrans, fix_missing)
-    # 'raw' is never used after this point but stays alive as a function
-    # parameter (~250-400 MB) through normalization, feature selection, and
-    # the entire summarization loop. Free it before the heavy processing.
     rm(raw)
     input = MSstatsNormalize(input, normalization, peptides_dict, nameStandards)
-    # 'peptides_dict' is never used after normalization. Usually NULL
-    # (non-global-standards), but free it for the global-standards case.
     rm(peptides_dict)
     input = MSstatsMergeFractions(input)
     input = MSstatsHandleMissing(input, summaryMethod, MBimpute,
diff --git a/R/utils_summarization.R b/R/utils_summarization.R
index 3990cc5d..1302e86b 100644
--- a/R/utils_summarization.R
+++ b/R/utils_summarization.R
@@ -199,11 +199,6 @@
 #' @keywords internal
 .updateUnequalVariances = function(input, fit, num_iter) {
     weight = NULL
-
-    # Convert to data.frame once upfront. The original code did this
-    # implicitly via data.frame(input, ...) on every column add, which
-    # copied ALL existing columns each time (~5 full copies per iteration).
-    # Now we convert once and add/remove columns in-place.
     input = as.data.frame(input)
     for (i in seq_len(num_iter)) {
         if (i == 1) {
diff --git a/inst/tinytest/test_memory_optimization_fits.R b/inst/tinytest/test_memory_optimization_fits.R
index df2c91e3..21da53fe 100644
--- a/inst/tinytest/test_memory_optimization_fits.R
+++ b/inst/tinytest/test_memory_optimization_fits.R
@@ -1,20 +1,10 @@
-# Tests for memory optimizations in the dataProcess pipeline.
-# These tests verify that model objects are stripped of heavy fields (Issue 1).
+# Tests that .fitSurvival() and .fitLinearModel() return model objects with
+# heavy fields stripped, and that downstream predict/summary/vcov still work.
 
 
-# --- Issue 1: Model Objects Retain Entire Datasets ----------------------------
+# --- .fitSurvival: $y and $linear.predictors are stripped ---------------------
 #
-# .fitSurvival() and .fitLinearModel() used to return full model objects that
-# stored copies of the input data, QR decompositions, residual vectors, etc.
-# We now strip the heavy fields before returning. These tests verify the
-# stripped fields are NULL and that the objects still work for their intended
-# downstream use (predict, summary, vcov).
-
-# Test 1a: .fitSurvival strips $y and $linear.predictors ---
-#
-# survreg() stores $y (the Surv response object) and $linear.predictors
-# (a vector of length nrow). Neither is needed by predict.survreg().
-# After stripping, object.size should be measurably smaller.
+# Neither field is needed by predict.survreg().
 
 surv_input = data.table::data.table(
     newABUNDANCE = c(10.1, 11.2, 9.5, 10.8, 12.0, 9.0, 11.5, 10.3,
@@ -23,38 +13,27 @@ surv_input = data.table::data.table(
     RUN = factor(rep(1:4, each = 4)),
     FEATURE = factor(rep(c("feat1", "feat2", "feat3", "feat4"), 4))
 )
-# cen = 1 means observed (real measurement), cen = 0 means left-censored
-# (below the instrument's detection limit — true value is unknown but at
-# most this threshold). survreg uses Surv(..., type='left') to handle this:
-# observed rows contribute their exact value to the likelihood, censored
-# rows contribute P(true_value <= threshold). We set censored rows to 7.0
-# as the upper bound — survreg knows the real value could be lower.
+# cen = 0 marks left-censored rows; set them to the upper-bound threshold.
 surv_input[cen == 0, newABUNDANCE := 7.0]
 
 surv_fit = MSstats:::.fitSurvival(surv_input, aft_iterations = 90)
 
-# Stripped fields should be NULL
 expect_true(is.null(surv_fit$y),
-            info = "survreg $y should be stripped to save memory")
+            info = "survreg $y should be stripped")
 expect_true(is.null(surv_fit$linear.predictors),
-            info = "survreg $linear.predictors should be stripped to save memory")
+            info = "survreg $linear.predictors should be stripped")
 
-# Fields needed by predict() should still exist
 expect_false(is.null(surv_fit$coefficients),
              info = "survreg $coefficients must survive stripping")
 expect_false(is.null(surv_fit$scale),
              info = "survreg $scale must survive stripping")
 
-# predict() should still work on the stripped object — this is the actual
-# downstream use case in MSstatsSummarizeSingleTMP
 predictions = predict(surv_fit, newdata = surv_input)
 expect_equal(length(predictions), nrow(surv_input),
              info = "predict() must work on the stripped survreg object")
 expect_true(all(is.finite(predictions)),
             info = "predict() should return finite values")
 
-# The stripped object should be smaller than an unstripped equivalent.
-# Fit the same model without stripping to compare sizes.
 unstripped_fit = survival::survreg(
     survival::Surv(newABUNDANCE, cen, type = "left") ~ FEATURE + RUN,
     data = surv_input, dist = "gaussian")
@@ -64,12 +43,9 @@ expect_true(object.size(surv_fit) < object.size(unstripped_fit),
                          "Unstripped:", object.size(unstripped_fit)))
 
 
-# Test 1b: .fitLinearModel strips $model ---
+# --- .fitLinearModel: $model is stripped --------------------------------------
 #
-# lm() stores $model (a full copy of the data used for fitting). This is
-# not needed by summary() or vcov(), which only use $qr, $residuals, and
-# $coefficients. After stripping, the object should be smaller and
-# summary/vcov should still work.
+# $model is not needed by summary() or vcov().
 
 lm_input = data.table::data.table(
     newABUNDANCE = c(10.1, 11.2, 9.5, 10.8, 12.0, 9.0, 11.5, 10.3,
@@ -82,11 +58,9 @@ lm_input = data.table::data.table(
 lm_fit = MSstats:::.fitLinearModel(lm_input, is_single_feature = FALSE,
                                     is_labeled = FALSE, equal_variances = TRUE)
 
-# $model should be stripped
 expect_true(is.null(lm_fit$model),
-            info = "lm $model should be stripped to save memory")
+            info = "lm $model should be stripped")
 
-# Fields needed by summary() and vcov() should still exist
 expect_false(is.null(lm_fit$coefficients),
              info = "lm $coefficients must survive stripping")
 expect_false(is.null(lm_fit$qr),
@@ -94,8 +68,6 @@ expect_false(is.null(lm_fit$qr),
 expect_false(is.null(lm_fit$residuals),
              info = "lm $residuals must survive stripping (needed by summary)")
 
-# summary() and vcov() should still work — these are the actual downstream
-# use cases in MSstatsSummarizeSingleLinear
 lm_summary = summary(lm_fit)
 expect_false(is.null(lm_summary$coefficients),
              info = "summary() must work on the stripped lm object")
@@ -103,7 +75,6 @@ lm_vcov = vcov(lm_fit)
 expect_true(is.matrix(lm_vcov),
             info = "vcov() must work on the stripped lm object")
 
-# The stripped object should be smaller than an unstripped equivalent.
 unstripped_lm = lm(newABUNDANCE ~ FEATURE + RUN, data = lm_input)
 expect_true(object.size(lm_fit) < object.size(unstripped_lm),
             info = paste("Stripped lm should be smaller.",

From d07f4d17809138b0a56246b3367548f039415d74 Mon Sep 17 00:00:00 2001
From: Rudhik1904 <rudhikshah50@gmail.com>
Date: Fri, 22 May 2026 15:44:29 -0500
Subject: [PATCH 3/6] Adding the peak Script

---
 benchmark/profile_dataprocess_peak.R | 175 +++++++++++++++++++++++++++
 1 file changed, 175 insertions(+)
 create mode 100644 benchmark/profile_dataprocess_peak.R

diff --git a/benchmark/profile_dataprocess_peak.R b/benchmark/profile_dataprocess_peak.R
new file mode 100644
index 00000000..bb2e98ca
--- /dev/null
+++ b/benchmark/profile_dataprocess_peak.R
@@ -0,0 +1,175 @@
+#!/usr/bin/env Rscript
+#
+# profile_dataprocess_peak.R
+#
+# Peak-memory + wall-clock instrumentation for MSstats::dataProcess.
+#
+# Replays dataProcess's internals manually, printing the gc() high-water
+# mark and elapsed time at every stage boundary. Useful for locating
+# which pipeline stage drives overall peak memory and runtime.
+#
+# Usage
+# -----
+#
+#   # From the package root:
+#   Rscript benchmark/profile_dataprocess_peak.R
+#
+#   # With an alternate MSstats-format CSV (10 cols: ProteinName,
+#   # PeptideSequence, PrecursorCharge, FragmentIon, ProductCharge,
+#   # IsotopeLabelType, Run, BioReplicate, Condition, Intensity):
+#   Rscript benchmark/profile_dataprocess_peak.R path/to/input.csv
+#
+# How to read the output
+# ----------------------
+#
+#   used : Vcells currently in use at the checkpoint.
+#   max  : running max of `used` since gc(reset = TRUE) at "start".
+#   dt   : seconds elapsed since the previous checkpoint.
+#
+# The *delta in the max column between consecutive rows* is the peak
+# memory contribution of that stage. The dt column is per-stage wall time.
+
+# --- CLI arg: path to MSstats-format CSV, or integer for DDARawData ----
+#
+# - Rscript profile_dataprocess_peak.R                 -> default CSV
+# - Rscript profile_dataprocess_peak.R path/to/x.csv   -> use that CSV
+# - Rscript profile_dataprocess_peak.R 100             -> replicate
+#                                                         DDARawData 100x
+
+default_input <- "data/reduce_output_spectronaut_input_qc_test.csv"
+args <- commandArgs(trailingOnly = TRUE)
+arg <- if (length(args) >= 1) args[1] else default_input
+n_replicates <- suppressWarnings(as.integer(arg))
+use_ddaraw <- !is.na(n_replicates) && n_replicates >= 1
+if (!use_ddaraw && !file.exists(arg)) {
+    stop(sprintf("Input file not found: %s", arg))
+}
+
+# --- Load the package from source ---------------------------------------
+
+if (!requireNamespace("pkgload", quietly = TRUE)) {
+    stop("pkgload is required: install.packages('pkgload')")
+}
+pkgload::load_all(".", quiet = TRUE)
+MSstatsConvert::MSstatsLogsSettings(FALSE)
+
+# --- Load the fixture ---------------------------------------------------
+
+required_cols <- c("ProteinName", "PeptideSequence", "PrecursorCharge",
+                   "FragmentIon", "ProductCharge", "IsotopeLabelType",
+                   "Run", "BioReplicate", "Condition", "Intensity")
+if (use_ddaraw) {
+    set.seed(42)
+    base <- data.table::as.data.table(DDARawData)
+    raw_input <- data.table::rbindlist(lapply(seq_len(n_replicates), function(i) {
+        d <- data.table::copy(base)
+        d$ProteinName <- paste0(d$ProteinName, "_rep", i)
+        d
+    }))
+    raw_input <- as.data.frame(raw_input)
+    fixture_label <- sprintf("DDARawData x %d", n_replicates)
+} else {
+    raw_input <- data.table::fread(arg, data.table = FALSE)
+    fixture_label <- arg
+    # If this looks like a raw Spectronaut export (has R.FileName /
+    # PG.ProteinAccessions, etc.), convert to MSstats 10-col format.
+    if ("R.FileName" %in% colnames(raw_input) &&
+        !all(required_cols %in% colnames(raw_input))) {
+        cat("Detected raw Spectronaut export — converting via",
+            "SpectronauttoMSstatsFormat()...\n")
+        raw_input <- MSstatsConvert::SpectronauttoMSstatsFormat(
+            raw_input, use_log_file = FALSE)
+        fixture_label <- paste0(arg, " (Spectronaut-converted)")
+    }
+}
+missing_cols <- setdiff(required_cols, colnames(raw_input))
+if (length(missing_cols) > 0) {
+    stop(sprintf("Input is missing MSstats columns: %s",
+                 paste(missing_cols, collapse = ", ")))
+}
+input_mb <- as.numeric(object.size(raw_input)) / 1e6
+cat(sprintf("Fixture: %s\n         %d rows, %.1f MB in memory\n\n",
+            fixture_label, nrow(raw_input), input_mb))
+
+# --- Checkpoint helper --------------------------------------------------
+
+bench_rows <- list()
+.last_time <- NULL
+checkpoint <- function(label, reset = FALSE) {
+    if (reset) { gc(); gc(reset = TRUE); .last_time <<- proc.time() }
+    now <- proc.time()
+    dt  <- if (is.null(.last_time)) 0 else (now - .last_time)[["elapsed"]]
+    .last_time <<- now
+    g <- gc()
+    cols <- colnames(g)
+    used_mb <- g["Vcells", which(cols == "used")[1] + 1]
+    max_mb  <- g["Vcells", which(cols == "max used")[1] + 1]
+    cat(sprintf("%-42s  used=%7.1f  max=%7.1f  dt=%6.2fs\n",
+                label, used_mb, max_mb, dt),
+        file = stderr())
+    bench_rows[[length(bench_rows) + 1L]] <<- data.frame(
+        stage   = label,
+        used_mb = used_mb,
+        max_mb  = max_mb,
+        dt_s    = dt,
+        stringsAsFactors = FALSE
+    )
+    invisible(dt)
+}
+
+# --- Replay dataProcess stages manually ---------------------------------
+#
+# Mirrors the sequence in R/dataProcess.R::dataProcess. Keep these in
+# sync if dataProcess reorders its pipeline.
+
+t_start <- proc.time()
+checkpoint("start", reset = TRUE)
+
+raw <- raw_input
+peptides_dict <- MSstats:::makePeptidesDictionary(
+    data.table::as.data.table(unclass(raw)), "equalizeMedians")
+input <- MSstats:::MSstatsPrepareForDataProcess(raw, 2, NULL)
+rm(raw); checkpoint("after PrepareForDataProcess + rm(raw)")
+
+input <- MSstats:::MSstatsNormalize(input, "equalizeMedians", peptides_dict, NULL)
+rm(peptides_dict); checkpoint("after Normalize")
+
+input <- MSstats:::MSstatsMergeFractions(input)
+checkpoint("after MergeFractions")
+
+input <- MSstats:::MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999)
+checkpoint("after HandleMissing")
+
+input <- MSstats:::MSstatsSelectFeatures(input, "all", 3, 2)
+checkpoint("after SelectFeatures")
+
+processed <- MSstats:::getProcessed(input)
+input <- MSstats:::MSstatsPrepareForSummarization(input, "TMP", TRUE, "NA", TRUE)
+checkpoint("after PrepareForSummarization")
+
+summarized <- MSstats:::MSstatsSummarizeWithMultipleCores(
+    input, "TMP", TRUE, "NA", FALSE, TRUE, 1, 90)
+checkpoint("after Summarize")
+
+gc(verbose = FALSE)
+checkpoint("after gc()")
+
+output <- MSstats:::MSstatsSummarizationOutput(
+    input, summarized, processed, "TMP", TRUE, "NA")
+checkpoint("after SummarizationOutput")
+
+total_elapsed <- (proc.time() - t_start)[["elapsed"]]
+
+# --- Summary ------------------------------------------------------------
+
+bench_df      <- do.call(rbind, bench_rows)
+peak_mb       <- max(bench_df$max_mb)
+baseline_mb   <- bench_df$max_mb[bench_df$stage == "start"]
+peak_delta_mb <- peak_mb - baseline_mb
+
+cat("\n========================================\n")
+cat(sprintf("Total elapsed:        %.2f s\n",  total_elapsed))
+cat(sprintf("Peak memory (max):    %.1f MB\n", peak_mb))
+cat(sprintf("Baseline memory:      %.1f MB\n", baseline_mb))
+cat(sprintf("Peak above baseline:  %.1f MB\n", peak_delta_mb))
+cat("========================================\n")

From 4d2d41ab322a07f3304f4abd9629ae626916ba93 Mon Sep 17 00:00:00 2001
From: Rudhik1904 <rudhikshah50@gmail.com>
Date: Mon, 25 May 2026 19:53:24 -0500
Subject: [PATCH 4/6] Fixing the PR comments

---
 R/utils_summarization.R              | 24 +++++----
 benchmark/profile_dataprocess_peak.R | 75 +++++++++++++---------------
 2 files changed, 48 insertions(+), 51 deletions(-)

diff --git a/R/utils_summarization.R b/R/utils_summarization.R
index 1302e86b..b4f32238 100644
--- a/R/utils_summarization.R
+++ b/R/utils_summarization.R
@@ -198,23 +198,27 @@
 #' @return merMod
 #' @keywords internal
 .updateUnequalVariances = function(input, fit, num_iter) {
-    weight = NULL
-    input = as.data.frame(input)
+    weight = abs.resids = loess.fitted = NULL
+    if (data.table::is.data.table(input)) {
+        input = data.table::copy(input)
+    } else {
+        input = data.table::as.data.table(input)
+    }
     for (i in seq_len(num_iter)) {
         if (i == 1) {
-            input[["abs.resids"]] = abs(fit$residuals)
-            input[["fitted"]] = fit$fitted.values
+            input[, abs.resids := abs(fit$residuals)]
+            input[, fitted := fit$fitted.values]
         }
         fit.loess = loess(abs.resids ~ fitted, data = input)
-        input[["loess.fitted"]] = fitted(fit.loess)
+        input[, loess.fitted := fitted(fit.loess)]
         ## loess fitted values are predicted sd
-        input[["weight"]] = 1 / (input[["loess.fitted"]] ^ 2)
-        input[["abs.resids"]] = NULL
+        input[, weight := 1 / (loess.fitted ^ 2)]
+        input[, abs.resids := NULL]
         ## re-fit using weight
         wls.fit = lm(formula(fit), data = input, weights = weight)
-        input[["abs.resids"]] = abs(wls.fit$residuals)
-        input[["loess.fitted"]] = NULL
-        input[["weight"]] = NULL
+        input[, abs.resids := abs(wls.fit$residuals)]
+        input[, loess.fitted := NULL]
+        input[, weight := NULL]
     }
     wls.fit
 }
diff --git a/benchmark/profile_dataprocess_peak.R b/benchmark/profile_dataprocess_peak.R
index bb2e98ca..c0fad09d 100644
--- a/benchmark/profile_dataprocess_peak.R
+++ b/benchmark/profile_dataprocess_peak.R
@@ -5,7 +5,7 @@
 # Peak-memory + wall-clock instrumentation for MSstats::dataProcess.
 #
 # Replays dataProcess's internals manually, printing the gc() high-water
-# mark and elapsed time at every stage boundary. Useful for locating
+# mark and elapsed time at every stage boundary. Useful forunning max of `used` since gc(reset = TRUE)r locating
 # which pipeline stage drives overall peak memory and runtime.
 #
 # Usage
@@ -14,43 +14,46 @@
 #   # From the package root:
 #   Rscript benchmark/profile_dataprocess_peak.R
 #
-#   # With an alternate MSstats-format CSV (10 cols: ProteinName,
-#   # PeptideSequence, PrecursorCharge, FragmentIon, ProductCharge,
-#   # IsotopeLabelType, Run, BioReplicate, Condition, Intensity):
+#   # With an alternate MSstats-format CSV:
 #   Rscript benchmark/profile_dataprocess_peak.R path/to/input.csv
 #
+# Input is expected to already be in MSstats format. Store MSstats-format
+# fixtures alongside this script (or on HPC) — converters live elsewhere.
+#
 # How to read the output
 # ----------------------
 #
-#   used : Vcells currently in use at the checkpoint.
-#   max  : running max of `used` since gc(reset = TRUE) at "start".
-#   dt   : seconds elapsed since the previous checkpoint.
+#   used      : Vcells currently in use at the checkpoint.
+#   max       : running max of `used` since gc(reset = TRUE) at "start".
+#               Example: if "after Normalize" reports max=420 and
+#               "after MergeFractions" reports max=510, the MergeFractions
+#               stage drove peak memory up by ~90 MB.
+#   elapsed_s : seconds elapsed since the previous checkpoint.
 #
 # The *delta in the max column between consecutive rows* is the peak
-# memory contribution of that stage. The dt column is per-stage wall time.
+# memory contribution of that stage. elapsed_s is per-stage wall time.
 
-# --- CLI arg: path to MSstats-format CSV, or integer for DDARawData ----
+# --- CLI arg: path to MSstats-format CSV, or integer to scale DDARawData ---
 #
 # - Rscript profile_dataprocess_peak.R                 -> default CSV
 # - Rscript profile_dataprocess_peak.R path/to/x.csv   -> use that CSV
-# - Rscript profile_dataprocess_peak.R 100             -> replicate
-#                                                         DDARawData 100x
+# - Rscript profile_dataprocess_peak.R 100             -> stack 100 copies
+#                                                         of DDARawData with
+#                                                         renamed proteins to
+#                                                         scale up the input
 
 default_input <- "data/reduce_output_spectronaut_input_qc_test.csv"
 args <- commandArgs(trailingOnly = TRUE)
 arg <- if (length(args) >= 1) args[1] else default_input
-n_replicates <- suppressWarnings(as.integer(arg))
-use_ddaraw <- !is.na(n_replicates) && n_replicates >= 1
+n_protein_copies <- suppressWarnings(as.integer(arg))
+use_ddaraw <- !is.na(n_protein_copies) && n_protein_copies >= 1
 if (!use_ddaraw && !file.exists(arg)) {
     stop(sprintf("Input file not found: %s", arg))
 }
 
-# --- Load the package from source ---------------------------------------
+# --- Load the package ---------------------------------------------------
 
-if (!requireNamespace("pkgload", quietly = TRUE)) {
-    stop("pkgload is required: install.packages('pkgload')")
-}
-pkgload::load_all(".", quiet = TRUE)
+library(MSstats)
 MSstatsConvert::MSstatsLogsSettings(FALSE)
 
 # --- Load the fixture ---------------------------------------------------
@@ -61,26 +64,16 @@ required_cols <- c("ProteinName", "PeptideSequence", "PrecursorCharge",
 if (use_ddaraw) {
     set.seed(42)
     base <- data.table::as.data.table(DDARawData)
-    raw_input <- data.table::rbindlist(lapply(seq_len(n_replicates), function(i) {
+    raw_input <- data.table::rbindlist(lapply(seq_len(n_protein_copies), function(i) {
         d <- data.table::copy(base)
-        d$ProteinName <- paste0(d$ProteinName, "_rep", i)
+        d$ProteinName <- paste0(d$ProteinName, "_copy", i)
         d
     }))
     raw_input <- as.data.frame(raw_input)
-    fixture_label <- sprintf("DDARawData x %d", n_replicates)
+    fixture_label <- sprintf("DDARawData x %d protein copies", n_protein_copies)
 } else {
     raw_input <- data.table::fread(arg, data.table = FALSE)
     fixture_label <- arg
-    # If this looks like a raw Spectronaut export (has R.FileName /
-    # PG.ProteinAccessions, etc.), convert to MSstats 10-col format.
-    if ("R.FileName" %in% colnames(raw_input) &&
-        !all(required_cols %in% colnames(raw_input))) {
-        cat("Detected raw Spectronaut export — converting via",
-            "SpectronauttoMSstatsFormat()...\n")
-        raw_input <- MSstatsConvert::SpectronauttoMSstatsFormat(
-            raw_input, use_log_file = FALSE)
-        fixture_label <- paste0(arg, " (Spectronaut-converted)")
-    }
 }
 missing_cols <- setdiff(required_cols, colnames(raw_input))
 if (length(missing_cols) > 0) {
@@ -98,23 +91,23 @@ bench_rows <- list()
 checkpoint <- function(label, reset = FALSE) {
     if (reset) { gc(); gc(reset = TRUE); .last_time <<- proc.time() }
     now <- proc.time()
-    dt  <- if (is.null(.last_time)) 0 else (now - .last_time)[["elapsed"]]
+    elapsed_s <- if (is.null(.last_time)) 0 else (now - .last_time)[["elapsed"]]
     .last_time <<- now
     g <- gc()
     cols <- colnames(g)
     used_mb <- g["Vcells", which(cols == "used")[1] + 1]
     max_mb  <- g["Vcells", which(cols == "max used")[1] + 1]
-    cat(sprintf("%-42s  used=%7.1f  max=%7.1f  dt=%6.2fs\n",
-                label, used_mb, max_mb, dt),
+    cat(sprintf("%-42s  used=%7.1f  max=%7.1f  elapsed_s=%6.2f\n",
+                label, used_mb, max_mb, elapsed_s),
         file = stderr())
     bench_rows[[length(bench_rows) + 1L]] <<- data.frame(
-        stage   = label,
-        used_mb = used_mb,
-        max_mb  = max_mb,
-        dt_s    = dt,
+        stage     = label,
+        used_mb   = used_mb,
+        max_mb    = max_mb,
+        elapsed_s = elapsed_s,
         stringsAsFactors = FALSE
     )
-    invisible(dt)
+    invisible(elapsed_s)
 }
 
 # --- Replay dataProcess stages manually ---------------------------------
@@ -140,11 +133,11 @@ checkpoint("after MergeFractions")
 input <- MSstats:::MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999)
 checkpoint("after HandleMissing")
 
-input <- MSstats:::MSstatsSelectFeatures(input, "all", 3, 2)
+input <- MSstats:::MSstatsSelectFeatures(input, "topN", 100, 2)
 checkpoint("after SelectFeatures")
 
 processed <- MSstats:::getProcessed(input)
-input <- MSstats:::MSstatsPrepareForSummarization(input, "TMP", TRUE, "NA", TRUE)
+input <- MSstats:::MSstatsPrepareForSummarization(input, "TMP", TRUE, "NA", FALSE)
 checkpoint("after PrepareForSummarization")
 
 summarized <- MSstats:::MSstatsSummarizeWithMultipleCores(

From 8a85c119196fa146ad41f95fdc5ad2356984a158 Mon Sep 17 00:00:00 2001
From: Rudhik1904 <rudhikshah50@gmail.com>
Date: Tue, 26 May 2026 18:18:35 -0500
Subject: [PATCH 5/6] moving the script to MSstats Scalblity

---
 benchmark/profile_dataprocess_peak.R | 168 ---------------------------
 1 file changed, 168 deletions(-)
 delete mode 100644 benchmark/profile_dataprocess_peak.R

diff --git a/benchmark/profile_dataprocess_peak.R b/benchmark/profile_dataprocess_peak.R
deleted file mode 100644
index c0fad09d..00000000
--- a/benchmark/profile_dataprocess_peak.R
+++ /dev/null
@@ -1,168 +0,0 @@
-#!/usr/bin/env Rscript
-#
-# profile_dataprocess_peak.R
-#
-# Peak-memory + wall-clock instrumentation for MSstats::dataProcess.
-#
-# Replays dataProcess's internals manually, printing the gc() high-water
-# mark and elapsed time at every stage boundary. Useful forunning max of `used` since gc(reset = TRUE)r locating
-# which pipeline stage drives overall peak memory and runtime.
-#
-# Usage
-# -----
-#
-#   # From the package root:
-#   Rscript benchmark/profile_dataprocess_peak.R
-#
-#   # With an alternate MSstats-format CSV:
-#   Rscript benchmark/profile_dataprocess_peak.R path/to/input.csv
-#
-# Input is expected to already be in MSstats format. Store MSstats-format
-# fixtures alongside this script (or on HPC) — converters live elsewhere.
-#
-# How to read the output
-# ----------------------
-#
-#   used      : Vcells currently in use at the checkpoint.
-#   max       : running max of `used` since gc(reset = TRUE) at "start".
-#               Example: if "after Normalize" reports max=420 and
-#               "after MergeFractions" reports max=510, the MergeFractions
-#               stage drove peak memory up by ~90 MB.
-#   elapsed_s : seconds elapsed since the previous checkpoint.
-#
-# The *delta in the max column between consecutive rows* is the peak
-# memory contribution of that stage. elapsed_s is per-stage wall time.
-
-# --- CLI arg: path to MSstats-format CSV, or integer to scale DDARawData ---
-#
-# - Rscript profile_dataprocess_peak.R                 -> default CSV
-# - Rscript profile_dataprocess_peak.R path/to/x.csv   -> use that CSV
-# - Rscript profile_dataprocess_peak.R 100             -> stack 100 copies
-#                                                         of DDARawData with
-#                                                         renamed proteins to
-#                                                         scale up the input
-
-default_input <- "data/reduce_output_spectronaut_input_qc_test.csv"
-args <- commandArgs(trailingOnly = TRUE)
-arg <- if (length(args) >= 1) args[1] else default_input
-n_protein_copies <- suppressWarnings(as.integer(arg))
-use_ddaraw <- !is.na(n_protein_copies) && n_protein_copies >= 1
-if (!use_ddaraw && !file.exists(arg)) {
-    stop(sprintf("Input file not found: %s", arg))
-}
-
-# --- Load the package ---------------------------------------------------
-
-library(MSstats)
-MSstatsConvert::MSstatsLogsSettings(FALSE)
-
-# --- Load the fixture ---------------------------------------------------
-
-required_cols <- c("ProteinName", "PeptideSequence", "PrecursorCharge",
-                   "FragmentIon", "ProductCharge", "IsotopeLabelType",
-                   "Run", "BioReplicate", "Condition", "Intensity")
-if (use_ddaraw) {
-    set.seed(42)
-    base <- data.table::as.data.table(DDARawData)
-    raw_input <- data.table::rbindlist(lapply(seq_len(n_protein_copies), function(i) {
-        d <- data.table::copy(base)
-        d$ProteinName <- paste0(d$ProteinName, "_copy", i)
-        d
-    }))
-    raw_input <- as.data.frame(raw_input)
-    fixture_label <- sprintf("DDARawData x %d protein copies", n_protein_copies)
-} else {
-    raw_input <- data.table::fread(arg, data.table = FALSE)
-    fixture_label <- arg
-}
-missing_cols <- setdiff(required_cols, colnames(raw_input))
-if (length(missing_cols) > 0) {
-    stop(sprintf("Input is missing MSstats columns: %s",
-                 paste(missing_cols, collapse = ", ")))
-}
-input_mb <- as.numeric(object.size(raw_input)) / 1e6
-cat(sprintf("Fixture: %s\n         %d rows, %.1f MB in memory\n\n",
-            fixture_label, nrow(raw_input), input_mb))
-
-# --- Checkpoint helper --------------------------------------------------
-
-bench_rows <- list()
-.last_time <- NULL
-checkpoint <- function(label, reset = FALSE) {
-    if (reset) { gc(); gc(reset = TRUE); .last_time <<- proc.time() }
-    now <- proc.time()
-    elapsed_s <- if (is.null(.last_time)) 0 else (now - .last_time)[["elapsed"]]
-    .last_time <<- now
-    g <- gc()
-    cols <- colnames(g)
-    used_mb <- g["Vcells", which(cols == "used")[1] + 1]
-    max_mb  <- g["Vcells", which(cols == "max used")[1] + 1]
-    cat(sprintf("%-42s  used=%7.1f  max=%7.1f  elapsed_s=%6.2f\n",
-                label, used_mb, max_mb, elapsed_s),
-        file = stderr())
-    bench_rows[[length(bench_rows) + 1L]] <<- data.frame(
-        stage     = label,
-        used_mb   = used_mb,
-        max_mb    = max_mb,
-        elapsed_s = elapsed_s,
-        stringsAsFactors = FALSE
-    )
-    invisible(elapsed_s)
-}
-
-# --- Replay dataProcess stages manually ---------------------------------
-#
-# Mirrors the sequence in R/dataProcess.R::dataProcess. Keep these in
-# sync if dataProcess reorders its pipeline.
-
-t_start <- proc.time()
-checkpoint("start", reset = TRUE)
-
-raw <- raw_input
-peptides_dict <- MSstats:::makePeptidesDictionary(
-    data.table::as.data.table(unclass(raw)), "equalizeMedians")
-input <- MSstats:::MSstatsPrepareForDataProcess(raw, 2, NULL)
-rm(raw); checkpoint("after PrepareForDataProcess + rm(raw)")
-
-input <- MSstats:::MSstatsNormalize(input, "equalizeMedians", peptides_dict, NULL)
-rm(peptides_dict); checkpoint("after Normalize")
-
-input <- MSstats:::MSstatsMergeFractions(input)
-checkpoint("after MergeFractions")
-
-input <- MSstats:::MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999)
-checkpoint("after HandleMissing")
-
-input <- MSstats:::MSstatsSelectFeatures(input, "topN", 100, 2)
-checkpoint("after SelectFeatures")
-
-processed <- MSstats:::getProcessed(input)
-input <- MSstats:::MSstatsPrepareForSummarization(input, "TMP", TRUE, "NA", FALSE)
-checkpoint("after PrepareForSummarization")
-
-summarized <- MSstats:::MSstatsSummarizeWithMultipleCores(
-    input, "TMP", TRUE, "NA", FALSE, TRUE, 1, 90)
-checkpoint("after Summarize")
-
-gc(verbose = FALSE)
-checkpoint("after gc()")
-
-output <- MSstats:::MSstatsSummarizationOutput(
-    input, summarized, processed, "TMP", TRUE, "NA")
-checkpoint("after SummarizationOutput")
-
-total_elapsed <- (proc.time() - t_start)[["elapsed"]]
-
-# --- Summary ------------------------------------------------------------
-
-bench_df      <- do.call(rbind, bench_rows)
-peak_mb       <- max(bench_df$max_mb)
-baseline_mb   <- bench_df$max_mb[bench_df$stage == "start"]
-peak_delta_mb <- peak_mb - baseline_mb
-
-cat("\n========================================\n")
-cat(sprintf("Total elapsed:        %.2f s\n",  total_elapsed))
-cat(sprintf("Peak memory (max):    %.1f MB\n", peak_mb))
-cat(sprintf("Baseline memory:      %.1f MB\n", baseline_mb))
-cat(sprintf("Peak above baseline:  %.1f MB\n", peak_delta_mb))
-cat("========================================\n")

From 2e6cf178bf739ffe76fedb799730cc93f8e7f812 Mon Sep 17 00:00:00 2001
From: Rudhik1904 <rudhikshah50@gmail.com>
Date: Tue, 26 May 2026 18:20:04 -0500
Subject: [PATCH 6/6] fixing the comment

---
 R/utils_summarization.R | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/R/utils_summarization.R b/R/utils_summarization.R
index b4f32238..47bbef21 100644
--- a/R/utils_summarization.R
+++ b/R/utils_summarization.R
@@ -199,9 +199,7 @@
 #' @keywords internal
 .updateUnequalVariances = function(input, fit, num_iter) {
     weight = abs.resids = loess.fitted = NULL
-    if (data.table::is.data.table(input)) {
-        input = data.table::copy(input)
-    } else {
+    if (!data.table::is.data.table(input)) {
         input = data.table::as.data.table(input)
     }
     for (i in seq_len(num_iter)) {