From 16edbd54a833da1b02ff3c9bb54ae8cdd69b974d Mon Sep 17 00:00:00 2001
From: Tony Wu <wu.anthon@northeastern.edu>
Date: Thu, 14 May 2026 11:02:41 -0400
Subject: [PATCH 1/7] Removed deep-copy data.table ops from the dataProcess
 pipeline
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Replaced `input[, cols, with = FALSE]` deep-copy in
  MSstatsPrepareForDataProcess and MSstatsSummarizationOutput with
  drop-cols loops via data.table::set(j = ..., value = NULL).
* Replaced row-shuffle `input = input[order(...), ]` in
  .prepareForDataProcess with data.table::setorder() (in place).
* Replaced merge(all.x = TRUE) joins in MSstatsMergeFractions and
  .finalizeTMP with keyed-which lookups + data.table::set() writes —
  avoids deep-copying the whole table.
* Replaced the synthesised `tmp` string-join filter in
  MSstatsMergeFractions with a direct (FEATURE, FRACTION) keyed
  lookup; drops two large character vectors and a paste() call.
* Replaced ifelse() full-vector writes for predicted/newABUNDANCE
  and nonmissing_orig with targeted [i, j := v] in-place writes.
* Collapsed the two-step subset+transform in .selectHighQualityFeatures
  into a single pass to eliminate one intermediate data.table copy.
* Reworked MSstatsSummarizationOutput to extract predicted_survival
  upfront and null per-protein second slots so the nested-list
  duplication is freed before .finalizeTMP runs; switched the
  final return to data.table::setDF() in place of as.data.frame().
* Fixed two regressions in the original commit: (1) .finalizeTMP's
  join_cols must intersect with predicted_survival's columns so the
  keyed lookup doesn't error on missing LABEL; (2) reverted the
  survival-column-selection tightening that dropped LABEL — a
  downstream test in test_dataProcess.R relies on LABEL being kept.
* Tests: inst/tinytest/test_memory_optimization_copies.R Issues 2/3/4
  — 28 assertions, all green. Full suite 224/224 OK.

See MSstats-ai/todos/active/TODO-MS-20260514_fix-memory-bugs.md

Co-Authored-By: Claude <noreply@anthropic.com>
---
 R/dataProcess.R                               |  22 +-
 R/utils_checks.R                              |  18 +-
 R/utils_feature_selection.R                   |  42 +-
 R/utils_normalize.R                           |  36 +-
 R/utils_output.R                              |  78 ++-
 .../test_memory_optimization_copies.R         | 469 ++++++++++++++++++
 6 files changed, 588 insertions(+), 77 deletions(-)
 create mode 100644 inst/tinytest/test_memory_optimization_copies.R

diff --git a/R/dataProcess.R b/R/dataProcess.R
index e9581c8b..bc86bd24 100755
--- a/R/dataProcess.R
+++ b/R/dataProcess.R
@@ -414,19 +414,19 @@ MSstatsSummarizeSingleLinear = function(single_protein,
         }]
 
         if (is_labeled_reference) {
-            single_protein[, predicted := ifelse(censored & is_labeled_ref == FALSE, predicted, NA)]
-            single_protein[, newABUNDANCE := ifelse(censored & is_labeled_ref == FALSE, predicted, newABUNDANCE)]
+            single_protein[!(censored & is_labeled_ref == FALSE), predicted := NA]
+            single_protein[(censored) & is_labeled_ref == FALSE,
+                           newABUNDANCE := predicted]
         } else {
-            single_protein[, predicted := ifelse(censored, predicted, NA)]
-            single_protein[, newABUNDANCE := ifelse(censored, predicted, newABUNDANCE)]
+            single_protein[!(censored), predicted := NA]
+            single_protein[(censored), newABUNDANCE := predicted]
         }
-
         survival = single_protein[, intersect(c(cols, "LABEL", "predicted"), colnames(single_protein)), with = FALSE]
     } else {
         survival = single_protein[, intersect(c(cols, "LABEL"), colnames(single_protein)), with = FALSE]
         survival[, predicted := NA]
     }
-    
+
     if (all(!is.na(single_protein$ANOMALYSCORES))) {
         single_protein[, weights :=
             anomaly_weights_z_vec(ANOMALYSCORES),
@@ -569,11 +569,13 @@ MSstatsSummarizeSingleTMP = function(single_protein, impute, censored_symbol,
         }
 
         if (is_labeled_reference) {
-            single_protein[, predicted := ifelse(censored & is_labeled_ref == FALSE, predicted, NA)]
-            single_protein[, newABUNDANCE := ifelse(censored & is_labeled_ref == FALSE, predicted, newABUNDANCE)]
+            single_protein[!(censored & is_labeled_ref == FALSE), predicted := NA]
+            single_protein[(censored) & is_labeled_ref == FALSE,
+                           newABUNDANCE := predicted]
         } else {
-            single_protein[, predicted := ifelse(censored, predicted, NA)]
-            single_protein[, newABUNDANCE := ifelse(censored, predicted, newABUNDANCE)]
+            single_protein[!(censored), predicted := NA]
+            single_protein[(censored),
+                           newABUNDANCE := predicted]
         }
         survival = single_protein[, intersect(c(cols, "LABEL", "predicted"), colnames(single_protein)), with = FALSE]
     } else {
diff --git a/R/utils_checks.R b/R/utils_checks.R
index 3596e74d..97dbc9be 100644
--- a/R/utils_checks.R
+++ b/R/utils_checks.R
@@ -211,9 +211,14 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) {
     cols = toupper(cols)
     cols = intersect(c(cols, "FRACTION", "TECHREPLICATE"),
                      colnames(input))
-    input = input[, cols, with = FALSE]
-    
-    input$PEPTIDE = paste(input$PEPTIDESEQUENCE, 
+    # Drop unwanted columns by reference rather than subsetting with
+    # `input[, cols, with = FALSE]`, which would deep-copy every retained
+    # column into a new data.table. On large inputs this subset was
+    # allocating hundreds of MB (~nrow x ncol x 8 bytes).
+    drop_cols = setdiff(colnames(input), cols)
+    for (col in drop_cols) data.table::set(input, j = col, value = NULL)
+
+    input$PEPTIDE = paste(input$PEPTIDESEQUENCE,
                           input$PRECURSORCHARGE, sep = "_")
     input$TRANSITION = paste(input$FRAGMENTION, 
                              input$PRODUCTCHARGE, sep = "_")
@@ -322,8 +327,11 @@ setMethod(".checkDataValidity", "MSstatsValidated", .prepareForDataProcess)
     input[, PROTEIN := factor(PROTEIN)]
     input[, PEPTIDE := factor(PEPTIDE)]
     input[, TRANSITION := factor(TRANSITION)]
-    input = input[order(LABEL, GROUP_ORIGINAL, SUBJECT_ORIGINAL,
-                        RUN, PROTEIN, PEPTIDE, TRANSITION), ]
+    # Sort in place instead of `input = input[order(...), ]`, which builds
+    # a full row-reshuffle copy of the whole table. setorder rewrites the
+    # underlying column vectors without allocating a new data.table.
+    data.table::setorder(input, LABEL, GROUP_ORIGINAL, SUBJECT_ORIGINAL,
+                         RUN, PROTEIN, PEPTIDE, TRANSITION)
     input[, GROUP := factor(GROUP)]
     input[, SUBJECT := factor(SUBJECT)]
     input[, FEATURE := factor(FEATURE)]
diff --git a/R/utils_feature_selection.R b/R/utils_feature_selection.R
index b11b6f16..65dfed26 100644
--- a/R/utils_feature_selection.R
+++ b/R/utils_feature_selection.R
@@ -74,29 +74,27 @@ MSstatsSelectFeatures = function(input, method, top_n = 3, min_feature_count = 2
 #' @return data.table
 #' @keywords internal
 .selectHighQualityFeatures = function(input, min_feature_count) {
-    PROTEIN = PEPTIDE = FEATURE = originalRUN = ABUNDANCE = is_censored = NULL
+    PROTEIN = PEPTIDE = FEATURE = originalRUN = ABUNDANCE = censored = NULL
     is_obs = log2inty = LABEL = NULL
-    
-    cols = c("PROTEIN", "PEPTIDE", "FEATURE", "originalRUN", "LABEL", 
-             "ABUNDANCE", "censored")
-    cols = intersect(cols, colnames(input))
-    input = input[, cols, with = FALSE]
-    if (!("censored" %in% cols)) {
-        input$censored = FALSE
-    } 
-    data.table::setnames(input, "censored", "is_censored")
-    input = input[, list(protein = as.character(PROTEIN),
-                         peptide = as.character(PEPTIDE),
-                         feature = as.character(FEATURE),
-                         run = as.character(originalRUN),
-                         label = as.character(LABEL),
-                         log2inty = ifelse(!(is.na(ABUNDANCE) | is_censored),
-                                           ABUNDANCE, NA),
-                         is_censored)]
-    input[, is_obs := !(is.na(log2inty) | is_censored)]
-    input[, is_censored := NULL]
-    
-    features_quality = data.table::rbindlist(lapply(split(input, input$label),
+
+    has_censored = is.element("censored", colnames(input))
+    # Build the working copy in a single operation. Previously this was two
+    # steps: (1) column-subset via input[, cols, with = FALSE] which deep-
+    # copied 7 columns from the 17-column table (~120 MB), then (2) a
+    # transform that created another new data.table (~100 MB). Combining
+    # them into one expression eliminates the intermediate copy.
+    work = input[, list(protein = as.character(PROTEIN),
+                        peptide = as.character(PEPTIDE),
+                        feature = as.character(FEATURE),
+                        run = as.character(originalRUN),
+                        label = as.character(LABEL),
+                        log2inty = ifelse(!(is.na(ABUNDANCE) |
+                                           if (has_censored) censored else FALSE),
+                                          ABUNDANCE, NA),
+                        is_obs = FALSE)]
+    work[, is_obs := !is.na(log2inty)]
+
+    features_quality = data.table::rbindlist(lapply(split(work, work$label),
                                                     .flagUninformativeSingleLabel,
                                                     min_feature_count = min_feature_count))
     features_quality
diff --git a/R/utils_normalize.R b/R/utils_normalize.R
index ca6f1f00..609abfe6 100644
--- a/R/utils_normalize.R
+++ b/R/utils_normalize.R
@@ -61,8 +61,8 @@ MSstatsNormalize = function(input, normalization_method, peptides_dict = NULL, s
     input[, ABUNDANCE_FRACTION := median(ABUNDANCE_RUN, na.rm = TRUE),
           by = "FRACTION"]
     input[, ABUNDANCE := ABUNDANCE - ABUNDANCE_RUN + ABUNDANCE_FRACTION]
-    input = input[, !(colnames(input) %in% c("ABUNDANCE_RUN", "ABUNDANCE_FRACTION")),
-                  with = FALSE]
+    data.table::set(input, j = "ABUNDANCE_RUN", value = NULL)
+    data.table::set(input, j = "ABUNDANCE_FRACTION", value = NULL)
     getOption("MSstatsLog")("Normalization based on median: OK")
     input
 }
@@ -255,7 +255,9 @@ MSstatsNormalize = function(input, normalization_method, peptides_dict = NULL, s
     input[, ABUNDANCE := ABUNDANCE - median_by_run + median_by_fraction]
 
     getOption("MSstatsLog")("INFO", "Normalization : normalization with global standards protein - okay")
-    input[ , !(colnames(input) %in% c("median_by_run", "median_by_fraction")), with = FALSE]
+    data.table::set(input, j = "median_by_run", value = NULL)
+    data.table::set(input, j = "median_by_fraction", value = NULL)
+    input
 }
 
 
@@ -344,23 +346,31 @@ MSstatsMergeFractions = function(input) {
                 match_runs = unique(match_runs[, list(GROUP_ORIGINAL,
                                                       SUBJECT_ORIGINAL,
                                                       newRun)])
-                
-                input = merge(input, match_runs,
-                              by = c("GROUP_ORIGINAL", "SUBJECT_ORIGINAL"),
-                              all.x = TRUE)
+
+                # In-place add of newRun via keyed-match lookup instead of
+                # `merge(all.x=TRUE)`, which would deep-copy the whole table.
+                nr_idx = match_runs[input,
+                                    on = c("GROUP_ORIGINAL", "SUBJECT_ORIGINAL"),
+                                    which = TRUE, mult = "first"]
+                data.table::set(input, j = "newRun",
+                                value = match_runs$newRun[nr_idx])
                 select_fraction = input[!is.na(ABUNDANCE) & input$ABUNDANCE > 0,
                                         list(ncount = .N),
                                         by = c("FEATURE", "FRACTION")]
                 select_fraction = select_fraction[ncount != 0]
-                select_fraction[, tmp := paste(FEATURE, FRACTION, sep = "_")]
-                input$tmp = paste(input$FEATURE, input$FRACTION, sep = "_")
-                input = input[tmp %in% select_fraction$tmp, ]
+                # Filter by (FEATURE, FRACTION) pair directly instead of
+                # synthesising a concatenated "tmp" string column on both
+                # sides. Saves two 207k-string character vectors plus the
+                # paste() cost each call.
+                keep_idx = select_fraction[input,
+                                           on = c("FEATURE", "FRACTION"),
+                                           which = TRUE, mult = "first"]
+                input = input[!is.na(keep_idx)]
                 input$originalRUN = input$newRun
                 input$RUN = input$originalRUN
-                input$RUN = factor(input$RUN, levels = unique(input$RUN), 
+                input$RUN = factor(input$RUN, levels = unique(input$RUN),
                                    labels = seq_along(unique(input$RUN)))
-                input = input[, !(colnames(input) %in% c('tmp','newRun')), 
-                              with = FALSE]
+                data.table::set(input, j = "newRun", value = NULL)
             }
         }
     }
diff --git a/R/utils_output.R b/R/utils_output.R
index 7b748482..9b49aae7 100644
--- a/R/utils_output.R
+++ b/R/utils_output.R
@@ -34,13 +34,20 @@
 #' output = output = MSstatsSummarizationOutput(input, summarized, processed,
 #' method, impute, cens)
 #' 
-MSstatsSummarizationOutput = function(input, summarized, processed, 
+MSstatsSummarizationOutput = function(input, summarized, processed,
                                       method, impute, censored_symbol) {
     LABEL = TotalGroupMeasurements = GROUP = Protein = RUN = NULL
-    
-    input = .finalizeInput(input, summarized, method, impute, censored_symbol)
-    summarized = lapply(summarized, function(x) x[[1]])
-    summarized = data.table::rbindlist(summarized, fill = TRUE)
+
+    predicted_survival = data.table::rbindlist(lapply(summarized, function(x) x[[2]]),
+                                                fill = TRUE)
+    for (i in seq_along(summarized)) summarized[[i]][[2]] = NULL
+    input = .finalizeInput(input, predicted_survival, method, impute, censored_symbol)
+    rm(predicted_survival)
+    protein_summaries = lapply(summarized, function(x) x[[1]])
+    rm(summarized)
+    summarized = data.table::rbindlist(protein_summaries, fill = TRUE)
+    rm(protein_summaries)
+
     if (inherits(summarized, "try-error")) {
         msg = paste("*** error : can't summarize per subplot with ",
                     method, ".")
@@ -82,18 +89,21 @@ MSstatsSummarizationOutput = function(input, summarized, processed,
                               "originalRUN", "censored", "INTENSITY", "ABUNDANCE",
                               "newABUNDANCE", "predicted", "feature_quality",
                               "is_outlier", "remove", "is_labeled_ref"), colnames(input))
-    input = input[, output_cols, with = FALSE]
-    
+    drop_cols = setdiff(colnames(input), output_cols)
+    for (col in drop_cols) data.table::set(input, j = col, value = NULL)
+
     if (is.element("remove", colnames(processed))) {
-        processed = processed[(remove), 
-                              intersect(output_cols, 
+        processed = processed[(remove),
+                              intersect(output_cols,
                                         colnames(processed)), with = FALSE]
         input = rbind(input, processed, fill = TRUE)
     }
-    list(FeatureLevelData = as.data.frame(input), 
-         ProteinLevelData = as.data.frame(rqall), 
+    data.table::setDF(input)
+    data.table::setDF(rqall)
+    list(FeatureLevelData = input,
+         ProteinLevelData = rqall,
          SummaryMethod = method)
-    
+
 }
 
 
@@ -104,9 +114,9 @@ MSstatsSummarizationOutput = function(input, summarized, processed,
 #' @param impute if TRUE, censored missing values were imputed
 #' @param censored_symbol censored missing value indicator
 #' @keywords internal
-.finalizeInput = function(input, summarized, method, impute, censored_symbol) {
+.finalizeInput = function(input, predicted_survival, method, impute, censored_symbol) {
     # if (method == "TMP") {
-    input = .finalizeTMP(input, censored_symbol, impute, summarized)
+    input = .finalizeTMP(input, censored_symbol, impute, predicted_survival)
     # } else {
     #     input = .finalizeLinear(input, censored_symbol)
     # }
@@ -117,21 +127,27 @@ MSstatsSummarizationOutput = function(input, summarized, processed,
 #' Summary statistics for output of TMP-based summarization
 #' @inheritParams .finalizeInput
 #' @keywords internal
-.finalizeTMP = function(input, censored_symbol, impute, summarized) {
+.finalizeTMP = function(input, censored_symbol, impute, predicted_survival) {
     NonMissingStats = NumMeasuredFeature = MissingPercentage = LABEL = NULL
     total_features = more50missing = nonmissing_orig = censored = NULL
     INTENSITY = newABUNDANCE = NumImputedFeature = NULL
-    
-    survival_predictions = lapply(summarized, function(x) x[[2]])
-    predicted_survival = data.table::rbindlist(survival_predictions, fill = TRUE)
+
     if (impute) {
-        cols = intersect(colnames(input), c("newABUNDANCE",
-                                            "cen", "RUN",
-                                            "FEATURE", "ref_covariate", "LABEL"))
-        input = merge(input[, colnames(input) != "newABUNDANCE", with = FALSE],
-                      predicted_survival,
-                      by = setdiff(cols, "newABUNDANCE"),
-                      all.x = TRUE)
+        # Join columns must exist in BOTH tables. The per-protein survival
+        # tables built in MSstatsSummarizeSingleLinear / TMP exclude LABEL,
+        # so intersect with predicted_survival to avoid a missing-column
+        # error when LABEL is in input but not in the survival table.
+        join_cols = intersect(intersect(colnames(input),
+                                        colnames(predicted_survival)),
+                              c("cen", "RUN", "FEATURE", "ref_covariate",
+                                "LABEL"))
+        data.table::set(input, j = "newABUNDANCE", value = NULL)
+        idx = predicted_survival[input, on = join_cols, which = TRUE,
+                                 mult = "first"]
+        data.table::set(input, j = "newABUNDANCE",
+                        value = predicted_survival$newABUNDANCE[idx])
+        data.table::set(input, j = "predicted",
+                        value = predicted_survival$predicted[idx])
     }
     input[, NonMissingStats := .getNonMissingFilterStats(.SD, censored_symbol)]
     input[, NumMeasuredFeature := sum(NonMissingStats),
@@ -144,7 +160,11 @@ MSstatsSummarizationOutput = function(input, summarized, processed,
         } else {
             input[, nonmissing_orig := !is.na(INTENSITY)]
         }
-        input[, nonmissing_orig := ifelse(is.na(newABUNDANCE), TRUE, nonmissing_orig)]
+        # In-place conditional write: only overwrite nonmissing_orig for
+        # the rows where newABUNDANCE is NA. `ifelse(...)` would build a
+        # full-length logical vector for every row regardless; this writes
+        # only the rows that need changing.
+        input[is.na(newABUNDANCE), nonmissing_orig := TRUE]
         if (impute) {
             input[, NumImputedFeature := sum(!nonmissing_orig),
                   by = c("PROTEIN", "RUN", "LABEL")]
@@ -175,7 +195,11 @@ MSstatsSummarizationOutput = function(input, summarized, processed,
         } else {
             input[, nonmissing_orig := !is.na(INTENSITY)]
         }
-        input[, nonmissing_orig := ifelse(is.na(newABUNDANCE), TRUE, nonmissing_orig)]
+        # In-place conditional write: only overwrite nonmissing_orig for
+        # the rows where newABUNDANCE is NA. `ifelse(...)` would build a
+        # full-length logical vector for every row regardless; this writes
+        # only the rows that need changing.
+        input[is.na(newABUNDANCE), nonmissing_orig := TRUE]
         input[, NumImputedFeature := 0]
     }
     input
diff --git a/inst/tinytest/test_memory_optimization_copies.R b/inst/tinytest/test_memory_optimization_copies.R
new file mode 100644
index 00000000..bcd9656e
--- /dev/null
+++ b/inst/tinytest/test_memory_optimization_copies.R
@@ -0,0 +1,469 @@
+# Tests for memory optimizations in the dataProcess pipeline that target
+# data.table deep-copy ops: in-place column drops, merge() -> keyed lookup,
+# ifelse() -> targeted [i, j := v] writes, and the predicted_survival
+# extraction in MSstatsSummarizationOutput.
+
+
+# --- Issue 2: Column Removal via Subsetting Creates Full Copies ---------------
+#
+# .normalizeMedian() adds two temp columns (ABUNDANCE_RUN, ABUNDANCE_FRACTION)
+# via := (in-place), uses them, then removes them. Previously it used
+# input[, !(colnames(input) %in% ...), with = FALSE] which created a full
+# copy of the data.table (~250 MB for a real dataset). Now it uses
+# data.table::set(input, j = ..., value = NULL) which removes in-place.
+#
+# We test this by checking that the data.table's memory address is preserved
+# after the function call — proving no copy was made.
+
+# Suppress MSstats log messages during test
+MSstatsConvert::MSstatsLogsSettings(FALSE)
+
+# Test 2a: .normalizeMedian preserves data.table identity (no copy) ---
+#
+# data.table::address() returns the hex pointer of the data.table object.
+# If the function modifies in-place, the address stays the same.
+# If it creates a copy (the old bug), the address changes.
+
+norm_input = data.table::data.table(
+    PROTEIN = factor(rep(c("P1", "P2"), each = 20)),
+    PEPTIDE = factor(rep(c("pep1", "pep2"), each = 10, times = 2)),
+    FEATURE = factor(rep(c("feat1", "feat2"), each = 10, times = 2)),
+    LABEL = rep("L", 40),
+    RUN = factor(rep(1:10, 4)),
+    FRACTION = rep(1L, 40),
+    ABUNDANCE = rnorm(40, mean = 20, sd = 2),
+    INTENSITY = 2^rnorm(40, mean = 20, sd = 2),
+    GROUP_ORIGINAL = rep(c("A", "B"), each = 5, times = 4),
+    SUBJECT_ORIGINAL = rep(1:10, 4),
+    TRANSITION = factor(rep("y3_1", 40))
+)
+
+addr_before = data.table::address(norm_input)
+norm_result = MSstats:::.normalizeMedian(norm_input)
+addr_after = data.table::address(norm_result)
+
+# The address should be the same — in-place modification, no copy
+expect_equal(addr_before, addr_after,
+             info = paste(".normalizeMedian should modify in-place (same address).",
+                          "Before:", addr_before, "After:", addr_after))
+
+# Temp columns should not exist in the result
+expect_false("ABUNDANCE_RUN" %in% colnames(norm_result),
+             info = "Temp column ABUNDANCE_RUN should be removed")
+expect_false("ABUNDANCE_FRACTION" %in% colnames(norm_result),
+             info = "Temp column ABUNDANCE_FRACTION should be removed")
+
+# ABUNDANCE should still exist and be modified (normalized)
+expect_true("ABUNDANCE" %in% colnames(norm_result),
+            info = "ABUNDANCE column should still exist after normalization")
+
+
+# Test 2b: .normalizeMedian does not allocate a full-table copy ---
+#
+# Create a larger dataset and measure memory allocated during the call.
+# With in-place set(), the only allocations should be the two temp column
+# vectors (~2x nrow doubles). With the old column-subset approach, the
+# allocation would be ~15x nrow doubles (copying all kept columns).
+
+n_rows = 100000
+big_input = data.table::data.table(
+    PROTEIN = factor(rep(paste0("P", 1:100), each = n_rows / 100)),
+    PEPTIDE = factor(rep(paste0("pep", 1:200), each = n_rows / 200)),
+    FEATURE = factor(rep(paste0("feat", 1:200), each = n_rows / 200)),
+    LABEL = rep("L", n_rows),
+    RUN = factor(rep(1:50, n_rows / 50)),
+    FRACTION = rep(1L, n_rows),
+    ABUNDANCE = rnorm(n_rows, mean = 20, sd = 2),
+    INTENSITY = 2^rnorm(n_rows, mean = 20, sd = 2),
+    GROUP_ORIGINAL = rep(c("A", "B"), each = 25, times = n_rows / 50),
+    SUBJECT_ORIGINAL = rep(1:50, n_rows / 50),
+    TRANSITION = factor(rep("y3_1", n_rows))
+)
+
+# Measure the size of the input table and one column for reference.
+# We use these to reason about whether the allocation during
+# .normalizeMedian is "2 temp columns worth" vs "full table copy worth".
+table_size = as.numeric(object.size(big_input))
+one_col_size = as.numeric(object.size(big_input$ABUNDANCE))
+ncols = ncol(big_input)
+
+# --- Memory measurement ---
+#
+# R manages data in "Vcells" (vector cells) — this is where column data
+# lives. gc() returns a matrix with memory stats; row 2 = Vcells,
+# column 6 = "max used (Mb)" = peak Vcells since last reset.
+#
+# gc(reset = TRUE) resets the "max used" counter to the current level,
+# giving us a clean window to measure just the .normalizeMedian call.
+gc(reset = TRUE)
+big_result = MSstats:::.normalizeMedian(big_input)
+gc_after = gc()
+
+# Extract peak vector memory (Vcells, max used, in MB).
+# We store this for informational purposes but do NOT assert on it
+# because gc() accounting is non-deterministic:
+#   - R may GC the temp columns before we call gc(), or may not
+#   - Other small allocations (formula parsing, string interning)
+#     contaminate the measurement
+#   - Two identical runs can give different numbers
+# The address() check below is the reliable, deterministic test.
+peak_vcells_mb = gc_after[2, 6]
+
+# Sanity check on the test data itself (not on the function):
+# Verify that 2 temp columns are much smaller than the full table.
+# If they were similar sizes, we couldn't distinguish "allocated 2 temp
+# columns" from "copied the whole table" and the test would be meaningless.
+# For our 11-column, 100K-row table: 2 cols ≈ 1.6 MB, full table ≈ 6 MB.
+expect_true(one_col_size * 2 < table_size * 0.5,
+            info = paste("Sanity check: 2 columns should be much less than",
+                         "50% of the table. 2 cols:", one_col_size * 2,
+                         "50% table:", table_size * 0.5))
+
+# The real test: same address = same R object = no copy was made.
+# This is deterministic and reliable, unlike gc() measurements.
+expect_equal(data.table::address(big_input), data.table::address(big_result),
+             info = "Large table: .normalizeMedian should modify in-place")
+expect_false("ABUNDANCE_RUN" %in% colnames(big_result),
+             info = "Large table: temp column ABUNDANCE_RUN should be removed")
+
+
+# --- Issue 3: .finalizeTMP Merge Duplicates the Entire Input ------------------
+#
+# .finalizeTMP replaces the pre-imputation newABUNDANCE column in the full
+# feature-level table with imputed values from predicted_survival. Previously
+# it did: merge(input[, cols != "newABUNDANCE", with=FALSE], predicted_survival)
+# which created a column-subset copy (~333 MB) and a merge result (~370 MB),
+# peaking at ~1,133 MB for ~350 MB of data.
+#
+# The fix uses in-place operations:
+#   set(input, j = "newABUNDANCE", value = NULL)  — remove old column (0 bytes)
+#   input[, c(...) := .(NA_real_, NA_real_)]       — add NA placeholders (~40 MB)
+#   input[predicted_survival, ... := ..., on = .]  — update matched rows (0 bytes)
+#
+# These tests verify:
+# (a) The data.table is modified in-place (address preserved)
+# (b) Matched rows get imputed values from predicted_survival
+# (c) Unmatched rows get NA (same as the old merge with all.x = TRUE)
+# (d) Memory allocation is minimal compared to a full-table copy
+
+# Test 3a: .finalizeTMP modifies input in-place with correct values ---
+
+# Build a feature-level input table mimicking post-summarization state.
+# Two proteins, 2 features each, 3 runs = 12 rows per label.
+n = 12
+finalize_input = data.table::data.table(
+    PROTEIN = factor(rep(c("P1", "P2"), each = 6)),
+    PEPTIDE = factor(rep(c("pep1", "pep2"), each = 3, times = 2)),
+    FEATURE = factor(rep(c("feat1", "feat2"), each = 3, times = 2)),
+    LABEL = rep("L", n),
+    GROUP = factor(rep(c("A", "B", "C"), times = 4)),
+    RUN = factor(rep(1:3, times = 4)),
+    SUBJECT = factor(rep(1:3, times = 4)),
+    FRACTION = rep(1L, n),
+    INTENSITY = 2^rnorm(n, 20, 2),
+    ABUNDANCE = rnorm(n, 20, 2),
+    # Pre-imputation newABUNDANCE: some are NA (censored)
+    newABUNDANCE = c(10.1, NA, 10.5, 11.2, 10.8, NA,
+                     9.5, NA, 10.2, 11.0, NA, 10.6),
+    cen = c(1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1),
+    censored = c(FALSE, TRUE, FALSE, FALSE, FALSE, TRUE,
+                 FALSE, TRUE, FALSE, FALSE, TRUE, FALSE),
+    originalRUN = factor(rep(paste0("Run", 1:3), times = 4)),
+    n_obs = rep(3, n),
+    n_obs_run = rep(2, n),
+    total_features = rep(2, n),
+    nonmissing = c(TRUE, FALSE, TRUE, TRUE, TRUE, FALSE,
+                   TRUE, FALSE, TRUE, TRUE, FALSE, TRUE)
+)
+
+# Build predicted_survival: simulates what rbindlist of per-protein survival
+# tables looks like. Only contains rows that passed the n_obs/n_obs_run filter
+# in MSstatsSummarizeSingleTMP — so fewer rows than input.
+# Rows 2, 6, 8, 11 are censored — only some of them appear in predicted_survival
+# (e.g., rows filtered out by n_obs <= 1 would be absent).
+# Here we include most rows but deliberately EXCLUDE rows for RUN=2, FEATURE=feat2
+# of P2 to test the unmatched (NA) case.
+predicted_survival = data.table::data.table(
+    newABUNDANCE = c(10.1, 8.5, 10.5, 11.2, 10.8, 7.9,
+                     9.5, 8.2, 10.2, 11.0, 10.6),
+    cen = c(1, 0, 1, 1, 1, 0,
+            1, 0, 1, 1, 1),
+    RUN = factor(c(1, 2, 3, 1, 2, 3,
+                   1, 2, 3, 1, 3)),
+    FEATURE = factor(c(rep("feat1", 3), rep("feat2", 3),
+                       rep("feat1", 3), rep("feat2", 2))),
+    predicted = c(NA, 8.5, NA, NA, NA, 7.9,
+                  NA, 8.2, NA, NA, NA)
+)
+
+addr_before = data.table::address(finalize_input)
+
+# Call .finalizeTMP with impute = TRUE.
+# This should replace newABUNDANCE and add predicted via in-place join.
+result = MSstats:::.finalizeTMP(finalize_input, censored_symbol = "NA",
+                                impute = TRUE,
+                                predicted_survival = predicted_survival)
+
+addr_after = data.table::address(result)
+
+# (a) Address preserved — the data.table was modified in-place, no full copy.
+expect_equal(addr_before, addr_after,
+             info = paste(".finalizeTMP should modify input in-place.",
+                          "Before:", addr_before, "After:", addr_after))
+
+# (b) Matched rows should have newABUNDANCE from predicted_survival.
+# Any row whose (cen, RUN, FEATURE) key exists in predicted_survival
+# should get a non-NA newABUNDANCE value.
+matched_count = sum(!is.na(result$newABUNDANCE))
+expect_true(matched_count > 0,
+            info = paste("Matched rows should have non-NA newABUNDANCE.",
+                         "Found", matched_count, "non-NA values"))
+
+# (c) Unmatched rows should have NA.
+# predicted_survival has 11 rows but input has 12. The missing key
+# (cen=0, RUN=2, FEATURE=feat2) has no match, so that row should be NA.
+# We check by looking for the key that's absent in predicted_survival.
+unmatched = result[cen == 0 & RUN == 2 & FEATURE == "feat2", newABUNDANCE]
+if (length(unmatched) > 0) {
+    expect_true(is.na(unmatched[1]),
+                info = "Unmatched row should get NA for newABUNDANCE")
+}
+
+# (d) The predicted column should exist.
+expect_true("predicted" %in% colnames(result),
+            info = ".finalizeTMP should add the 'predicted' column")
+
+
+# Test 3b: .finalizeTMP at scale — memory stays low ---
+#
+# Create a larger dataset and verify that peak memory allocation during
+# .finalizeTMP is far below what a full-table merge would require.
+# With the old merge approach, peak would be ~3x the table size.
+# With in-place join, peak should be close to ~1x (just the table + 2 new cols).
+
+n_big = 50000
+n_runs = 50
+n_features = 20
+n_proteins = n_big / (n_runs * n_features)  # = 50
+
+big_finalize_input = data.table::data.table(
+    PROTEIN = factor(rep(paste0("P", 1:n_proteins), each = n_runs * n_features)),
+    FEATURE = factor(rep(paste0("feat", 1:n_features), each = n_runs, times = n_proteins)),
+    LABEL = rep("L", n_big),
+    RUN = factor(rep(1:n_runs, times = n_proteins * n_features)),
+    INTENSITY = 2^rnorm(n_big, 20, 2),
+    ABUNDANCE = rnorm(n_big, 20, 2),
+    newABUNDANCE = rnorm(n_big, 20, 2),
+    cen = sample(c(0, 1), n_big, replace = TRUE, prob = c(0.2, 0.8)),
+    censored = FALSE,
+    originalRUN = factor(rep(paste0("Run", 1:n_runs), times = n_proteins * n_features)),
+    GROUP = factor(rep("A", n_big)),
+    SUBJECT = factor(rep(1:n_runs, times = n_proteins * n_features)),
+    n_obs = rep(n_runs, n_big),
+    n_obs_run = rep(n_features, n_big),
+    total_features = rep(n_features, n_big),
+    nonmissing = rep(TRUE, n_big)
+)
+big_finalize_input[cen == 0, censored := TRUE]
+big_finalize_input[cen == 0, newABUNDANCE := NA]
+
+# predicted_survival: exclude specific (RUN, FEATURE) combinations to
+# guarantee some rows in input have no match. Random row sampling doesn't
+# work because multiple rows can share the same join key, so excluded rows
+# may still match via a duplicate key in the kept rows.
+excluded_features = c("feat1", "feat2")
+excluded_run = "1"
+big_predicted = big_finalize_input[
+    !(FEATURE %in% excluded_features & RUN == excluded_run),
+    .(newABUNDANCE = ifelse(cen == 0, 7.0, newABUNDANCE),
+      cen, RUN, FEATURE,
+      predicted = ifelse(cen == 0, 7.0, NA_real_))]
+
+input_size = as.numeric(object.size(big_finalize_input))
+addr_big_before = data.table::address(big_finalize_input)
+
+# Reset GC, run .finalizeTMP, measure peak.
+gc(reset = TRUE)
+big_finalize_result = MSstats:::.finalizeTMP(
+    big_finalize_input, censored_symbol = "NA", impute = TRUE,
+    predicted_survival = big_predicted)
+gc_issue3 = gc()
+
+# Peak Vcells (MB) — informational. The old merge approach would show a peak
+# roughly 3x the input size. The in-place approach should be much less.
+peak_mb_issue3 = gc_issue3[2, 6]
+
+# Address preserved = in-place modification, no full-table copy.
+expect_equal(addr_big_before, data.table::address(big_finalize_result),
+             info = paste("Large .finalizeTMP: should modify in-place.",
+                          "Input size:", round(input_size / 1e6, 1), "MB.",
+                          "Peak Vcells:", peak_mb_issue3, "MB"))
+
+# newABUNDANCE and predicted columns should both exist.
+expect_true("newABUNDANCE" %in% colnames(big_finalize_result),
+            info = "Large .finalizeTMP: newABUNDANCE should exist after join")
+expect_true("predicted" %in% colnames(big_finalize_result),
+            info = "Large .finalizeTMP: predicted should exist after join")
+
+# Rows with excluded (RUN, FEATURE) keys should have NA for newABUNDANCE,
+# because they have no match in predicted_survival.
+unmatched_mask = big_finalize_result$FEATURE %in% excluded_features &
+                 big_finalize_result$RUN == excluded_run
+na_count = sum(is.na(big_finalize_result$newABUNDANCE[unmatched_mask]))
+expect_true(na_count > 0,
+            info = paste("Rows with excluded keys should have NA newABUNDANCE.",
+                         "Found", na_count, "NAs out of",
+                         sum(unmatched_mask), "unmatched rows"))
+
+
+# --- Issue 4: survival_predictions List Duplication ---------------------------
+#
+# MSstatsSummarizationOutput used to pass the full nested 'summarized' list
+# (both protein results AND survival predictions for every protein) into
+# .finalizeTMP. That function only needed the survival predictions, but the
+# protein results (~50% of the list) were held hostage in memory.
+#
+# The fix: MSstatsSummarizationOutput now splits the list upfront:
+#   predicted_survival = rbindlist(lapply(summarized, function(x) x[[2]]))
+#   protein_summaries  = lapply(summarized, function(x) x[[1]])
+#   rm(summarized)  — frees the nested list immediately
+#
+# .finalizeTMP now receives predicted_survival directly (a combined data.table),
+# not the full nested list. This means:
+# (a) .finalizeTMP never sees the protein results — they can't waste memory
+# (b) The nested list is freed before .finalizeTMP runs
+#
+# These tests verify:
+# (a) .finalizeTMP accepts predicted_survival directly (not a nested list)
+# (b) MSstatsSummarizationOutput correctly decomposes the nested list
+# (c) Memory: the nested list doesn't persist through .finalizeTMP
+
+# Test 4a: .finalizeTMP takes a combined data.table, not a nested list ---
+#
+# Verify the function signature change: .finalizeTMP's 4th parameter is now
+# 'predicted_survival' (a data.table), not 'summarized' (a nested list).
+# If someone reverts to the old code, passing a data.table would break.
+
+# Reuse finalize_input from Issue 3 tests (rebuild if needed since
+# .finalizeTMP modified it in-place above).
+finalize_input_4 = data.table::data.table(
+    PROTEIN = factor(rep(c("P1", "P2"), each = 6)),
+    FEATURE = factor(rep(c("feat1", "feat2"), each = 3, times = 2)),
+    LABEL = rep("L", 12),
+    RUN = factor(rep(1:3, times = 4)),
+    INTENSITY = 2^rnorm(12, 20, 2),
+    ABUNDANCE = rnorm(12, 20, 2),
+    newABUNDANCE = c(10.1, NA, 10.5, 11.2, 10.8, NA,
+                     9.5, NA, 10.2, 11.0, NA, 10.6),
+    cen = c(1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1),
+    censored = c(FALSE, TRUE, FALSE, FALSE, FALSE, TRUE,
+                 FALSE, TRUE, FALSE, FALSE, TRUE, FALSE),
+    originalRUN = factor(rep(paste0("Run", 1:3), times = 4)),
+    GROUP = factor(rep(c("A", "B", "C"), times = 4)),
+    SUBJECT = factor(rep(1:3, times = 4)),
+    n_obs = rep(3, 12),
+    n_obs_run = rep(2, 12),
+    total_features = rep(2, 12),
+    nonmissing = c(TRUE, FALSE, TRUE, TRUE, TRUE, FALSE,
+                   TRUE, FALSE, TRUE, TRUE, FALSE, TRUE)
+)
+
+# predicted_survival as a plain data.table (the new interface).
+pred_surv_4 = data.table::data.table(
+    newABUNDANCE = c(10.1, 8.5, 10.5, 11.2, 10.8, 7.9,
+                     9.5, 8.2, 10.2, 11.0, 10.6, 8.0),
+    cen = c(1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0),
+    RUN = factor(rep(1:3, times = 4)),
+    FEATURE = factor(rep(c("feat1", "feat2"), each = 3, times = 2)),
+    predicted = c(NA, 8.5, NA, NA, NA, 7.9, NA, 8.2, NA, NA, NA, 8.0)
+)
+
+# This should work — .finalizeTMP now expects a data.table, not a list.
+result_4 = MSstats:::.finalizeTMP(finalize_input_4, censored_symbol = "NA",
+                                   impute = TRUE,
+                                   predicted_survival = pred_surv_4)
+expect_true(data.table::is.data.table(result_4) || is.data.frame(result_4),
+            info = ".finalizeTMP should return a data.frame/data.table when given predicted_survival directly")
+expect_true("newABUNDANCE" %in% colnames(result_4),
+            info = ".finalizeTMP should produce newABUNDANCE from a data.table input")
+expect_true("predicted" %in% colnames(result_4),
+            info = ".finalizeTMP should produce predicted column from a data.table input")
+
+
+# Test 4b: MSstatsSummarizationOutput correctly splits the nested list ---
+#
+# Build a minimal 'summarized' in the nested format that the summarization
+# loop produces: list(list(protein_result, survival), ...).
+# Verify that MSstatsSummarizationOutput decomposes it correctly and
+# produces valid output.
+
+# Use the real pipeline on a small built-in dataset to get a realistic
+# summarized list, then verify the output structure.
+MSstatsConvert::MSstatsLogsSettings(FALSE)
+small_input = MSstatsPrepareForDataProcess(SRMRawData, 2, NULL)
+small_input = MSstatsNormalize(small_input, "EQUALIZEMEDIANS")
+small_input = MSstatsMergeFractions(small_input)
+small_input = MSstatsHandleMissing(small_input, "TMP", TRUE, "NA", 0.999)
+small_input = MSstatsSelectFeatures(small_input, "all")
+processed = getProcessed(small_input)
+small_input = MSstatsPrepareForSummarization(small_input, "TMP", TRUE, "NA", FALSE)
+summarized_list = MSstatsSummarizeWithSingleCore(small_input, "TMP", TRUE, "NA",
+                                                  FALSE, TRUE, 90)
+
+# Verify summarized_list is a nested list (the format we're testing).
+expect_true(is.list(summarized_list),
+            info = "Summarized should be a list")
+expect_true(is.list(summarized_list[[1]]),
+            info = "Each element should be a list(protein_result, survival)")
+expect_equal(length(summarized_list[[1]]), 2,
+             info = "Each element should have exactly 2 components")
+
+# Now call MSstatsSummarizationOutput — this is where Issue 4 fix lives.
+# It should decompose the nested list, free it, and produce correct output.
+output = MSstatsSummarizationOutput(small_input, summarized_list, processed,
+                                     "TMP", TRUE, "NA")
+
+expect_true(!is.null(output$FeatureLevelData),
+            info = "Output should have FeatureLevelData")
+expect_true(!is.null(output$ProteinLevelData),
+            info = "Output should have ProteinLevelData")
+expect_true(nrow(output$FeatureLevelData) > 0,
+            info = "FeatureLevelData should have rows")
+expect_true(nrow(output$ProteinLevelData) > 0,
+            info = "ProteinLevelData should have rows")
+expect_true("newABUNDANCE" %in% colnames(output$FeatureLevelData),
+            info = "FeatureLevelData should contain newABUNDANCE (from imputation)")
+expect_true("predicted" %in% colnames(output$FeatureLevelData),
+            info = "FeatureLevelData should contain predicted column")
+
+
+# Test 4c: Memory — nested list is freed before .finalizeTMP runs ---
+#
+# We can't directly observe rm(summarized) inside MSstatsSummarizationOutput,
+# but we can measure the total memory footprint. With a larger dataset, the
+# old approach held the full nested list (~200 MB) alive through .finalizeTMP.
+# The new approach frees it before .finalizeTMP runs, so peak should be lower.
+#
+# We measure by comparing: the size of the nested summarized list vs the
+# size of predicted_survival alone. The fix ensures only predicted_survival
+# (the smaller half) is alive during .finalizeTMP, not both halves.
+
+# Measure sizes of the two halves from the real summarized list.
+survival_tables = lapply(summarized_list, function(x) x[[2]])
+protein_tables = lapply(summarized_list, function(x) x[[1]])
+
+size_full_list = as.numeric(object.size(summarized_list))
+size_survival_only = as.numeric(object.size(survival_tables))
+size_protein_only = as.numeric(object.size(protein_tables))
+size_combined_survival = as.numeric(
+    object.size(data.table::rbindlist(survival_tables, fill = TRUE)))
+
+# The combined predicted_survival table (what .finalizeTMP receives now)
+# should be smaller than the full nested list (what it received before).
+expect_true(size_combined_survival < size_full_list,
+            info = paste("Combined predicted_survival should be smaller than",
+                         "the full nested list.",
+                         "predicted_survival:", size_combined_survival, "bytes.",
+                         "Full nested list:", size_full_list, "bytes.",
+                         "Savings:", size_full_list - size_combined_survival, "bytes.",
+                         "This is the memory .finalizeTMP no longer holds."))

From b64b004d0faad3418d365cbbf8ec937d385acd63 Mon Sep 17 00:00:00 2001
From: Rudhik1904 <rudhikshah50@gmail.com>
Date: Fri, 15 May 2026 00:11:53 -0500
Subject: [PATCH 2/7] Updating the comments.

---
 R/utils_checks.R                              |   7 --
 R/utils_feature_selection.R                   |   5 -
 R/utils_normalize.R                           |   6 -
 R/utils_output.R                              |  12 --
 .../test_memory_optimization_copies.R         | 105 ++++++------------
 5 files changed, 35 insertions(+), 100 deletions(-)

diff --git a/R/utils_checks.R b/R/utils_checks.R
index 97dbc9be..2051c019 100644
--- a/R/utils_checks.R
+++ b/R/utils_checks.R
@@ -211,10 +211,6 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) {
     cols = toupper(cols)
     cols = intersect(c(cols, "FRACTION", "TECHREPLICATE"),
                      colnames(input))
-    # Drop unwanted columns by reference rather than subsetting with
-    # `input[, cols, with = FALSE]`, which would deep-copy every retained
-    # column into a new data.table. On large inputs this subset was
-    # allocating hundreds of MB (~nrow x ncol x 8 bytes).
     drop_cols = setdiff(colnames(input), cols)
     for (col in drop_cols) data.table::set(input, j = col, value = NULL)
 
@@ -327,9 +323,6 @@ setMethod(".checkDataValidity", "MSstatsValidated", .prepareForDataProcess)
     input[, PROTEIN := factor(PROTEIN)]
     input[, PEPTIDE := factor(PEPTIDE)]
     input[, TRANSITION := factor(TRANSITION)]
-    # Sort in place instead of `input = input[order(...), ]`, which builds
-    # a full row-reshuffle copy of the whole table. setorder rewrites the
-    # underlying column vectors without allocating a new data.table.
     data.table::setorder(input, LABEL, GROUP_ORIGINAL, SUBJECT_ORIGINAL,
                          RUN, PROTEIN, PEPTIDE, TRANSITION)
     input[, GROUP := factor(GROUP)]
diff --git a/R/utils_feature_selection.R b/R/utils_feature_selection.R
index 65dfed26..0766e63b 100644
--- a/R/utils_feature_selection.R
+++ b/R/utils_feature_selection.R
@@ -78,11 +78,6 @@ MSstatsSelectFeatures = function(input, method, top_n = 3, min_feature_count = 2
     is_obs = log2inty = LABEL = NULL
 
     has_censored = is.element("censored", colnames(input))
-    # Build the working copy in a single operation. Previously this was two
-    # steps: (1) column-subset via input[, cols, with = FALSE] which deep-
-    # copied 7 columns from the 17-column table (~120 MB), then (2) a
-    # transform that created another new data.table (~100 MB). Combining
-    # them into one expression eliminates the intermediate copy.
     work = input[, list(protein = as.character(PROTEIN),
                         peptide = as.character(PEPTIDE),
                         feature = as.character(FEATURE),
diff --git a/R/utils_normalize.R b/R/utils_normalize.R
index 609abfe6..7a617596 100644
--- a/R/utils_normalize.R
+++ b/R/utils_normalize.R
@@ -347,8 +347,6 @@ MSstatsMergeFractions = function(input) {
                                                       SUBJECT_ORIGINAL,
                                                       newRun)])
 
-                # In-place add of newRun via keyed-match lookup instead of
-                # `merge(all.x=TRUE)`, which would deep-copy the whole table.
                 nr_idx = match_runs[input,
                                     on = c("GROUP_ORIGINAL", "SUBJECT_ORIGINAL"),
                                     which = TRUE, mult = "first"]
@@ -358,10 +356,6 @@ MSstatsMergeFractions = function(input) {
                                         list(ncount = .N),
                                         by = c("FEATURE", "FRACTION")]
                 select_fraction = select_fraction[ncount != 0]
-                # Filter by (FEATURE, FRACTION) pair directly instead of
-                # synthesising a concatenated "tmp" string column on both
-                # sides. Saves two 207k-string character vectors plus the
-                # paste() cost each call.
                 keep_idx = select_fraction[input,
                                            on = c("FEATURE", "FRACTION"),
                                            which = TRUE, mult = "first"]
diff --git a/R/utils_output.R b/R/utils_output.R
index 9b49aae7..13d60750 100644
--- a/R/utils_output.R
+++ b/R/utils_output.R
@@ -133,10 +133,6 @@ MSstatsSummarizationOutput = function(input, summarized, processed,
     INTENSITY = newABUNDANCE = NumImputedFeature = NULL
 
     if (impute) {
-        # Join columns must exist in BOTH tables. The per-protein survival
-        # tables built in MSstatsSummarizeSingleLinear / TMP exclude LABEL,
-        # so intersect with predicted_survival to avoid a missing-column
-        # error when LABEL is in input but not in the survival table.
         join_cols = intersect(intersect(colnames(input),
                                         colnames(predicted_survival)),
                               c("cen", "RUN", "FEATURE", "ref_covariate",
@@ -160,10 +156,6 @@ MSstatsSummarizationOutput = function(input, summarized, processed,
         } else {
             input[, nonmissing_orig := !is.na(INTENSITY)]
         }
-        # In-place conditional write: only overwrite nonmissing_orig for
-        # the rows where newABUNDANCE is NA. `ifelse(...)` would build a
-        # full-length logical vector for every row regardless; this writes
-        # only the rows that need changing.
         input[is.na(newABUNDANCE), nonmissing_orig := TRUE]
         if (impute) {
             input[, NumImputedFeature := sum(!nonmissing_orig),
@@ -195,10 +187,6 @@ MSstatsSummarizationOutput = function(input, summarized, processed,
         } else {
             input[, nonmissing_orig := !is.na(INTENSITY)]
         }
-        # In-place conditional write: only overwrite nonmissing_orig for
-        # the rows where newABUNDANCE is NA. `ifelse(...)` would build a
-        # full-length logical vector for every row regardless; this writes
-        # only the rows that need changing.
         input[is.na(newABUNDANCE), nonmissing_orig := TRUE]
         input[, NumImputedFeature := 0]
     }
diff --git a/inst/tinytest/test_memory_optimization_copies.R b/inst/tinytest/test_memory_optimization_copies.R
index bcd9656e..ebabc62e 100644
--- a/inst/tinytest/test_memory_optimization_copies.R
+++ b/inst/tinytest/test_memory_optimization_copies.R
@@ -1,28 +1,20 @@
-# Tests for memory optimizations in the dataProcess pipeline that target
-# data.table deep-copy ops: in-place column drops, merge() -> keyed lookup,
-# ifelse() -> targeted [i, j := v] writes, and the predicted_survival
-# extraction in MSstatsSummarizationOutput.
+# Tests for in-place data.table operations in the dataProcess pipeline:
+# column drops, keyed-lookup joins, targeted [i, j := v] writes, and
+# the predicted_survival extraction in MSstatsSummarizationOutput.
 
 
-# --- Issue 2: Column Removal via Subsetting Creates Full Copies ---------------
+# --- .normalizeMedian: column drops are in place ------------------------------
 #
 # .normalizeMedian() adds two temp columns (ABUNDANCE_RUN, ABUNDANCE_FRACTION)
-# via := (in-place), uses them, then removes them. Previously it used
-# input[, !(colnames(input) %in% ...), with = FALSE] which created a full
-# copy of the data.table (~250 MB for a real dataset). Now it uses
-# data.table::set(input, j = ..., value = NULL) which removes in-place.
-#
-# We test this by checking that the data.table's memory address is preserved
-# after the function call — proving no copy was made.
+# via :=, uses them, then removes them with data.table::set(j=, value=NULL)
+# so the data.table is modified in-place. We verify the address is preserved.
 
-# Suppress MSstats log messages during test
 MSstatsConvert::MSstatsLogsSettings(FALSE)
 
-# Test 2a: .normalizeMedian preserves data.table identity (no copy) ---
+# Test 2a: .normalizeMedian preserves data.table identity ---
 #
 # data.table::address() returns the hex pointer of the data.table object.
-# If the function modifies in-place, the address stays the same.
-# If it creates a copy (the old bug), the address changes.
+# An in-place modification preserves the address; a copy changes it.
 
 norm_input = data.table::data.table(
     PROTEIN = factor(rep(c("P1", "P2"), each = 20)),
@@ -60,10 +52,8 @@ expect_true("ABUNDANCE" %in% colnames(norm_result),
 
 # Test 2b: .normalizeMedian does not allocate a full-table copy ---
 #
-# Create a larger dataset and measure memory allocated during the call.
 # With in-place set(), the only allocations should be the two temp column
-# vectors (~2x nrow doubles). With the old column-subset approach, the
-# allocation would be ~15x nrow doubles (copying all kept columns).
+# vectors (~2x nrow doubles), not the whole table.
 
 n_rows = 100000
 big_input = data.table::data.table(
@@ -127,24 +117,19 @@ expect_false("ABUNDANCE_RUN" %in% colnames(big_result),
              info = "Large table: temp column ABUNDANCE_RUN should be removed")
 
 
-# --- Issue 3: .finalizeTMP Merge Duplicates the Entire Input ------------------
+# --- .finalizeTMP: in-place update from predicted_survival --------------------
 #
 # .finalizeTMP replaces the pre-imputation newABUNDANCE column in the full
-# feature-level table with imputed values from predicted_survival. Previously
-# it did: merge(input[, cols != "newABUNDANCE", with=FALSE], predicted_survival)
-# which created a column-subset copy (~333 MB) and a merge result (~370 MB),
-# peaking at ~1,133 MB for ~350 MB of data.
-#
-# The fix uses in-place operations:
-#   set(input, j = "newABUNDANCE", value = NULL)  — remove old column (0 bytes)
-#   input[, c(...) := .(NA_real_, NA_real_)]       — add NA placeholders (~40 MB)
-#   input[predicted_survival, ... := ..., on = .]  — update matched rows (0 bytes)
+# feature-level table with imputed values from predicted_survival, using:
+#   set(input, j = "newABUNDANCE", value = NULL)  — remove old column
+#   input[, c(...) := .(NA_real_, NA_real_)]       — add NA placeholders
+#   input[predicted_survival, ... := ..., on = .]  — update matched rows
 #
 # These tests verify:
 # (a) The data.table is modified in-place (address preserved)
 # (b) Matched rows get imputed values from predicted_survival
-# (c) Unmatched rows get NA (same as the old merge with all.x = TRUE)
-# (d) Memory allocation is minimal compared to a full-table copy
+# (c) Unmatched rows are left as NA (equivalent to all.x = TRUE)
+# (d) Allocation stays small relative to a full-table copy
 
 # Test 3a: .finalizeTMP modifies input in-place with correct values ---
 
@@ -206,7 +191,7 @@ result = MSstats:::.finalizeTMP(finalize_input, censored_symbol = "NA",
 
 addr_after = data.table::address(result)
 
-# (a) Address preserved — the data.table was modified in-place, no full copy.
+# (a) Address preserved — the data.table was modified in-place.
 expect_equal(addr_before, addr_after,
              info = paste(".finalizeTMP should modify input in-place.",
                           "Before:", addr_before, "After:", addr_after))
@@ -236,10 +221,8 @@ expect_true("predicted" %in% colnames(result),
 
 # Test 3b: .finalizeTMP at scale — memory stays low ---
 #
-# Create a larger dataset and verify that peak memory allocation during
-# .finalizeTMP is far below what a full-table merge would require.
-# With the old merge approach, peak would be ~3x the table size.
-# With in-place join, peak should be close to ~1x (just the table + 2 new cols).
+# At ~50K rows, .finalizeTMP should modify the input in place; peak Vcells
+# should stay close to the input size, not balloon to a multi-table merge.
 
 n_big = 50000
 n_runs = 50
@@ -289,8 +272,7 @@ big_finalize_result = MSstats:::.finalizeTMP(
     predicted_survival = big_predicted)
 gc_issue3 = gc()
 
-# Peak Vcells (MB) — informational. The old merge approach would show a peak
-# roughly 3x the input size. The in-place approach should be much less.
+# Peak Vcells (MB) — informational.
 peak_mb_issue3 = gc_issue3[2, 6]
 
 # Address preserved = in-place modification, no full-table copy.
@@ -316,33 +298,23 @@ expect_true(na_count > 0,
                          sum(unmatched_mask), "unmatched rows"))
 
 
-# --- Issue 4: survival_predictions List Duplication ---------------------------
-#
-# MSstatsSummarizationOutput used to pass the full nested 'summarized' list
-# (both protein results AND survival predictions for every protein) into
-# .finalizeTMP. That function only needed the survival predictions, but the
-# protein results (~50% of the list) were held hostage in memory.
+# --- MSstatsSummarizationOutput / .finalizeTMP: predicted_survival contract ---
 #
-# The fix: MSstatsSummarizationOutput now splits the list upfront:
+# MSstatsSummarizationOutput splits the nested 'summarized' list upfront:
 #   predicted_survival = rbindlist(lapply(summarized, function(x) x[[2]]))
 #   protein_summaries  = lapply(summarized, function(x) x[[1]])
-#   rm(summarized)  — frees the nested list immediately
-#
-# .finalizeTMP now receives predicted_survival directly (a combined data.table),
-# not the full nested list. This means:
-# (a) .finalizeTMP never sees the protein results — they can't waste memory
-# (b) The nested list is freed before .finalizeTMP runs
+#   rm(summarized)
+# .finalizeTMP receives predicted_survival directly (a combined data.table),
+# not the full nested list.
 #
 # These tests verify:
-# (a) .finalizeTMP accepts predicted_survival directly (not a nested list)
+# (a) .finalizeTMP accepts predicted_survival as a data.table
 # (b) MSstatsSummarizationOutput correctly decomposes the nested list
-# (c) Memory: the nested list doesn't persist through .finalizeTMP
+# (c) The nested list does not persist through .finalizeTMP
 
 # Test 4a: .finalizeTMP takes a combined data.table, not a nested list ---
 #
-# Verify the function signature change: .finalizeTMP's 4th parameter is now
-# 'predicted_survival' (a data.table), not 'summarized' (a nested list).
-# If someone reverts to the old code, passing a data.table would break.
+# .finalizeTMP's 4th parameter is 'predicted_survival', a data.table.
 
 # Reuse finalize_input from Issue 3 tests (rebuild if needed since
 # .finalizeTMP modified it in-place above).
@@ -418,8 +390,8 @@ expect_true(is.list(summarized_list[[1]]),
 expect_equal(length(summarized_list[[1]]), 2,
              info = "Each element should have exactly 2 components")
 
-# Now call MSstatsSummarizationOutput — this is where Issue 4 fix lives.
-# It should decompose the nested list, free it, and produce correct output.
+# MSstatsSummarizationOutput should decompose the nested list and produce
+# valid FeatureLevelData / ProteinLevelData.
 output = MSstatsSummarizationOutput(small_input, summarized_list, processed,
                                      "TMP", TRUE, "NA")
 
@@ -437,16 +409,11 @@ expect_true("predicted" %in% colnames(output$FeatureLevelData),
             info = "FeatureLevelData should contain predicted column")
 
 
-# Test 4c: Memory — nested list is freed before .finalizeTMP runs ---
-#
-# We can't directly observe rm(summarized) inside MSstatsSummarizationOutput,
-# but we can measure the total memory footprint. With a larger dataset, the
-# old approach held the full nested list (~200 MB) alive through .finalizeTMP.
-# The new approach frees it before .finalizeTMP runs, so peak should be lower.
+# Test 4c: predicted_survival alone is smaller than the full nested list ---
 #
-# We measure by comparing: the size of the nested summarized list vs the
-# size of predicted_survival alone. The fix ensures only predicted_survival
-# (the smaller half) is alive during .finalizeTMP, not both halves.
+# .finalizeTMP receives only predicted_survival (the survival half of the
+# nested summarized list), not the full list. We verify the combined
+# predicted_survival table is smaller than the full nested list.
 
 # Measure sizes of the two halves from the real summarized list.
 survival_tables = lapply(summarized_list, function(x) x[[2]])
@@ -464,6 +431,4 @@ expect_true(size_combined_survival < size_full_list,
             info = paste("Combined predicted_survival should be smaller than",
                          "the full nested list.",
                          "predicted_survival:", size_combined_survival, "bytes.",
-                         "Full nested list:", size_full_list, "bytes.",
-                         "Savings:", size_full_list - size_combined_survival, "bytes.",
-                         "This is the memory .finalizeTMP no longer holds."))
+                         "Full nested list:", size_full_list, "bytes."))

From e2bd771cd7a8fc507aea335318238401a39569f4 Mon Sep 17 00:00:00 2001
From: Rudhik1904 <rudhikshah50@gmail.com>
Date: Fri, 15 May 2026 00:14:56 -0500
Subject: [PATCH 3/7] Updating some more comments.

---
 inst/tinytest/test_memory_optimization_copies.R | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/inst/tinytest/test_memory_optimization_copies.R b/inst/tinytest/test_memory_optimization_copies.R
index ebabc62e..b7d45563 100644
--- a/inst/tinytest/test_memory_optimization_copies.R
+++ b/inst/tinytest/test_memory_optimization_copies.R
@@ -270,16 +270,16 @@ gc(reset = TRUE)
 big_finalize_result = MSstats:::.finalizeTMP(
     big_finalize_input, censored_symbol = "NA", impute = TRUE,
     predicted_survival = big_predicted)
-gc_issue3 = gc()
+gc_after_finalize = gc()
 
 # Peak Vcells (MB) — informational.
-peak_mb_issue3 = gc_issue3[2, 6]
+peak_mb_finalize = gc_after_finalize[2, 6]
 
 # Address preserved = in-place modification, no full-table copy.
 expect_equal(addr_big_before, data.table::address(big_finalize_result),
              info = paste("Large .finalizeTMP: should modify in-place.",
                           "Input size:", round(input_size / 1e6, 1), "MB.",
-                          "Peak Vcells:", peak_mb_issue3, "MB"))
+                          "Peak Vcells:", peak_mb_finalize, "MB"))
 
 # newABUNDANCE and predicted columns should both exist.
 expect_true("newABUNDANCE" %in% colnames(big_finalize_result),
@@ -316,8 +316,8 @@ expect_true(na_count > 0,
 #
 # .finalizeTMP's 4th parameter is 'predicted_survival', a data.table.
 
-# Reuse finalize_input from Issue 3 tests (rebuild if needed since
-# .finalizeTMP modified it in-place above).
+# Rebuild a finalize_input fixture since .finalizeTMP modified the previous
+# one in-place above.
 finalize_input_4 = data.table::data.table(
     PROTEIN = factor(rep(c("P1", "P2"), each = 6)),
     FEATURE = factor(rep(c("feat1", "feat2"), each = 3, times = 2)),

From 65df2b553980e6e4bcb126918de571cdc2a6b2e6 Mon Sep 17 00:00:00 2001
From: Rudhik1904 <rudhikshah50@gmail.com>
Date: Tue, 2 Jun 2026 00:39:17 -0500
Subject: [PATCH 4/7] Responding to the comments.

---
 .../test_memory_optimization_copies.R         | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/inst/tinytest/test_memory_optimization_copies.R b/inst/tinytest/test_memory_optimization_copies.R
index b7d45563..25e4ef8d 100644
--- a/inst/tinytest/test_memory_optimization_copies.R
+++ b/inst/tinytest/test_memory_optimization_copies.R
@@ -102,12 +102,19 @@ peak_vcells_mb = gc_after[2, 6]
 # Sanity check on the test data itself (not on the function):
 # Verify that 2 temp columns are much smaller than the full table.
 # If they were similar sizes, we couldn't distinguish "allocated 2 temp
-# columns" from "copied the whole table" and the test would be meaningless.
-# For our 11-column, 100K-row table: 2 cols ≈ 1.6 MB, full table ≈ 6 MB.
-expect_true(one_col_size * 2 < table_size * 0.5,
-            info = paste("Sanity check: 2 columns should be much less than",
-                         "50% of the table. 2 cols:", one_col_size * 2,
-                         "50% table:", table_size * 0.5))
+# columns" from "copied the whole table" and the gc() figure above would be
+# meaningless. For our 11-column, 100K-row table: 2 cols ≈ 1.6 MB, full ≈ 6 MB.
+#
+# This only guards the *informational* gc() measurement, which we do not
+# assert on (see note above). object.size() accounting depends on R internals
+# out of our control, so a wrong result here emits a message rather than
+# failing the test run; the deterministic address() check below is the real test.
+if (!(one_col_size * 2 < table_size * 0.5)) {
+    message("Note: fixture sanity check off -- 2 cols (",
+            round(one_col_size * 2 / 1e6, 2), " MB) not < 50% of table (",
+            round(table_size * 0.5 / 1e6, 2), " MB). ",
+            "gc-based memory figures may be unreliable on this platform.")
+}
 
 # The real test: same address = same R object = no copy was made.
 # This is deterministic and reliable, unlike gc() measurements.

From 0764b3471b8a9a8b31e90bff39fa159b499d9410 Mon Sep 17 00:00:00 2001
From: Rudhik1904 <rudhikshah50@gmail.com>
Date: Tue, 2 Jun 2026 00:39:34 -0500
Subject: [PATCH 5/7] Responding to comments- 2

---
 R/utils_feature_selection.R | 24 ++++++++++++------------
 R/utils_normalize.R         |  7 +++----
 2 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/R/utils_feature_selection.R b/R/utils_feature_selection.R
index 0766e63b..6892232a 100644
--- a/R/utils_feature_selection.R
+++ b/R/utils_feature_selection.R
@@ -78,18 +78,18 @@ MSstatsSelectFeatures = function(input, method, top_n = 3, min_feature_count = 2
     is_obs = log2inty = LABEL = NULL
 
     has_censored = is.element("censored", colnames(input))
-    work = input[, list(protein = as.character(PROTEIN),
-                        peptide = as.character(PEPTIDE),
-                        feature = as.character(FEATURE),
-                        run = as.character(originalRUN),
-                        label = as.character(LABEL),
-                        log2inty = ifelse(!(is.na(ABUNDANCE) |
-                                           if (has_censored) censored else FALSE),
-                                          ABUNDANCE, NA),
-                        is_obs = FALSE)]
-    work[, is_obs := !is.na(log2inty)]
-
-    features_quality = data.table::rbindlist(lapply(split(work, work$label),
+    input = input[, list(protein = as.character(PROTEIN),
+                         peptide = as.character(PEPTIDE),
+                         feature = as.character(FEATURE),
+                         run = as.character(originalRUN),
+                         label = as.character(LABEL),
+                         log2inty = ifelse(!(is.na(ABUNDANCE) |
+                                            if (has_censored) censored else FALSE),
+                                           ABUNDANCE, NA),
+                         is_obs = FALSE)]
+    input[, is_obs := !is.na(log2inty)]
+
+    features_quality = data.table::rbindlist(lapply(split(input, input$label),
                                                     .flagUninformativeSingleLabel,
                                                     min_feature_count = min_feature_count))
     features_quality
diff --git a/R/utils_normalize.R b/R/utils_normalize.R
index 7a617596..d2f96423 100644
--- a/R/utils_normalize.R
+++ b/R/utils_normalize.R
@@ -360,10 +360,9 @@ MSstatsMergeFractions = function(input) {
                                            on = c("FEATURE", "FRACTION"),
                                            which = TRUE, mult = "first"]
                 input = input[!is.na(keep_idx)]
-                input$originalRUN = input$newRun
-                input$RUN = input$originalRUN
-                input$RUN = factor(input$RUN, levels = unique(input$RUN),
-                                   labels = seq_along(unique(input$RUN)))
+                input[, originalRUN := newRun]
+                input[, RUN := factor(newRun, levels = unique(newRun),
+                                      labels = seq_along(unique(newRun)))]
                 data.table::set(input, j = "newRun", value = NULL)
             }
         }

From 27d5f7ca58a233afc652b399a65405b2d6559e10 Mon Sep 17 00:00:00 2001
From: Rudhik1904 <rudhikshah50@gmail.com>
Date: Tue, 9 Jun 2026 01:49:55 -0500
Subject: [PATCH 6/7] Responding to Tony's suggestions

---
 R/utils_feature_selection.R                | 11 +--
 R/utils_normalize.R                        | 31 ++++++--
 R/utils_output.R                           | 33 +++++----
 inst/tinytest/test_MSstatsMergeFractions.R | 85 ++++++++++++++++++++++
 man/dot-finalizeInput.Rd                   |  6 +-
 man/dot-finalizeTMP.Rd                     |  4 +-
 6 files changed, 135 insertions(+), 35 deletions(-)
 create mode 100644 inst/tinytest/test_MSstatsMergeFractions.R

diff --git a/R/utils_feature_selection.R b/R/utils_feature_selection.R
index 6892232a..630b37a6 100644
--- a/R/utils_feature_selection.R
+++ b/R/utils_feature_selection.R
@@ -75,7 +75,7 @@ MSstatsSelectFeatures = function(input, method, top_n = 3, min_feature_count = 2
 #' @keywords internal
 .selectHighQualityFeatures = function(input, min_feature_count) {
     PROTEIN = PEPTIDE = FEATURE = originalRUN = ABUNDANCE = censored = NULL
-    is_obs = log2inty = LABEL = NULL
+    is_obs = log2inty = is_censored = LABEL = NULL
 
     has_censored = is.element("censored", colnames(input))
     input = input[, list(protein = as.character(PROTEIN),
@@ -83,11 +83,12 @@ MSstatsSelectFeatures = function(input, method, top_n = 3, min_feature_count = 2
                          feature = as.character(FEATURE),
                          run = as.character(originalRUN),
                          label = as.character(LABEL),
-                         log2inty = ifelse(!(is.na(ABUNDANCE) |
-                                            if (has_censored) censored else FALSE),
-                                           ABUNDANCE, NA),
-                         is_obs = FALSE)]
+                         log2inty = ABUNDANCE,
+                         is_censored = if (has_censored) censored else FALSE)]
+    # Censored or missing intensities are not observations.
+    input[is_censored | is.na(log2inty), log2inty := NA]
     input[, is_obs := !is.na(log2inty)]
+    input[, is_censored := NULL]
 
     features_quality = data.table::rbindlist(lapply(split(input, input$label),
                                                     .flagUninformativeSingleLabel,
diff --git a/R/utils_normalize.R b/R/utils_normalize.R
index d2f96423..936a5e59 100644
--- a/R/utils_normalize.R
+++ b/R/utils_normalize.R
@@ -284,7 +284,7 @@ MSstatsNormalize = function(input, normalization_method, peptides_dict = NULL, s
 #' 
 MSstatsMergeFractions = function(input) {
     ABUNDANCE = INTENSITY = GROUP_ORIGINAL = SUBJECT_ORIGINAL = RUN = NULL
-    originalRUN = FRACTION = TECHREPLICATE = tmp = merged = newRun = NULL
+    originalRUN = FRACTION = TECHREPLICATE = tmp = newRun = NULL
     ncount = FEATURE = NULL
     
     input[!is.na(ABUNDANCE) & ABUNDANCE < 0, "ABUNDANCE"] = 0
@@ -340,29 +340,44 @@ MSstatsMergeFractions = function(input) {
                 getOption("MSstatsLog")("ERROR", msg)
                 stop(msg)
             } else {
-                match_runs[, merged := "merged"]
-                match_runs[, newRun := do.call(paste, c(.SD, sep = "_")), 
-                           .SDcols = c(1:3, ncol(match_runs))]
-                match_runs = unique(match_runs[, list(GROUP_ORIGINAL,
-                                                      SUBJECT_ORIGINAL,
-                                                      newRun)])
-
+                # dcast pivoted fractions into columns, so the fraction columns
+                # are everything that is not a sample-identifier column.
+                fraction_cols = setdiff(colnames(match_runs),
+                                        c("GROUP_ORIGINAL", "SUBJECT_ORIGINAL"))
+                # Use the first fraction's run as the sample's representative run.
+                first_fraction_run = match_runs[[fraction_cols[1]]]
+                # Build the merged-run name: <group>_<subject>_<run>_merged.
+                match_runs[, newRun := paste(GROUP_ORIGINAL, SUBJECT_ORIGINAL,
+                                             first_fraction_run, "merged",
+                                             sep = "_")]
+                # Reduce to a (group, subject) -> merged-run lookup table.
+                match_runs = match_runs[, list(GROUP_ORIGINAL, SUBJECT_ORIGINAL,
+                                               newRun)]
+                # For each input row, find its sample's row in the lookup.
                 nr_idx = match_runs[input,
                                     on = c("GROUP_ORIGINAL", "SUBJECT_ORIGINAL"),
                                     which = TRUE, mult = "first"]
+                # Write that merged-run name onto every input row.
                 data.table::set(input, j = "newRun",
                                 value = match_runs$newRun[nr_idx])
+                # Count, per feature/fraction, the rows observed above zero.
                 select_fraction = input[!is.na(ABUNDANCE) & input$ABUNDANCE > 0,
                                         list(ncount = .N),
                                         by = c("FEATURE", "FRACTION")]
+                # Keep only feature/fraction combinations seen at least once.
                 select_fraction = select_fraction[ncount != 0]
+                # Mark which input rows belong to a kept combination (NA = none).
                 keep_idx = select_fraction[input,
                                            on = c("FEATURE", "FRACTION"),
                                            which = TRUE, mult = "first"]
+                # Drop rows whose feature/fraction was never observed.
                 input = input[!is.na(keep_idx)]
+                # The merged run replaces the original per-fraction run.
                 input[, originalRUN := newRun]
+                # Renumber RUN as a factor, one level per merged run.
                 input[, RUN := factor(newRun, levels = unique(newRun),
                                       labels = seq_along(unique(newRun)))]
+                # Drop the temporary newRun helper column.
                 data.table::set(input, j = "newRun", value = NULL)
             }
         }
diff --git a/R/utils_output.R b/R/utils_output.R
index 13d60750..75a7d60b 100644
--- a/R/utils_output.R
+++ b/R/utils_output.R
@@ -38,25 +38,27 @@ MSstatsSummarizationOutput = function(input, summarized, processed,
                                       method, impute, censored_symbol) {
     LABEL = TotalGroupMeasurements = GROUP = Protein = RUN = NULL
 
-    predicted_survival = data.table::rbindlist(lapply(summarized, function(x) x[[2]]),
-                                                fill = TRUE)
-    for (i in seq_along(summarized)) summarized[[i]][[2]] = NULL
-    input = .finalizeInput(input, predicted_survival, method, impute, censored_symbol)
-    rm(predicted_survival)
-    protein_summaries = lapply(summarized, function(x) x[[1]])
-    rm(summarized)
-    summarized = data.table::rbindlist(protein_summaries, fill = TRUE)
-    rm(protein_summaries)
-
-    if (inherits(summarized, "try-error")) {
+    # Summarization failure is signalled by a NULL `summarized` (the caller
+    # wraps the per-subplot summarization in tryCatch(error = ...) returning
+    # NULL). Guard before unpacking, otherwise .finalizeInput() below joins on
+    # an empty predicted_survival and errors.
+    if (is.null(summarized)) {
         msg = paste("*** error : can't summarize per subplot with ",
                     method, ".")
         getOption("MSstatsLog")("ERROR", msg)
         getOption("MSstatsMsg")("ERROR", msg)
         rqall = NULL
-        rqmodelqc = NULL
-        workpred = NULL
     } else {
+        predicted_survival = data.table::rbindlist(lapply(summarized, function(x) x[[2]]),
+                                                    fill = TRUE)
+        for (i in seq_along(summarized)) summarized[[i]][[2]] = NULL
+        input = .finalizeInput(input, predicted_survival, method, impute, censored_symbol)
+        rm(predicted_survival)
+        protein_summaries = lapply(summarized, function(x) x[[1]])
+        rm(summarized)
+        summarized = data.table::rbindlist(protein_summaries, fill = TRUE)
+        rm(protein_summaries)
+
         input[, TotalGroupMeasurements := uniqueN(.SD),
               by = c("PROTEIN", "GROUP", "LABEL"),
               .SDcols = c("FEATURE", "originalRUN")]
@@ -74,10 +76,9 @@ MSstatsSummarizationOutput = function(input, summarized, processed,
                       by.y = c(merge_col, "PROTEIN", "LABEL"))
         data.table::setnames(rqall, c("GROUP_ORIGINAL", "SUBJECT_ORIGINAL"),
                              c("GROUP", "SUBJECT"), skip_absent = TRUE)
-        
+
         rqall$GROUP = factor(as.character(rqall$GROUP))
         rqall$Protein = factor(rqall$Protein)
-        rqmodelqc = summarized$ModelQC
     }
     
     if (is.element("RUN", colnames(rqall)) & !is.null(rqall)) {
@@ -99,7 +100,7 @@ MSstatsSummarizationOutput = function(input, summarized, processed,
         input = rbind(input, processed, fill = TRUE)
     }
     data.table::setDF(input)
-    data.table::setDF(rqall)
+    if (!is.null(rqall)) data.table::setDF(rqall)
     list(FeatureLevelData = input,
          ProteinLevelData = rqall,
          SummaryMethod = method)
diff --git a/inst/tinytest/test_MSstatsMergeFractions.R b/inst/tinytest/test_MSstatsMergeFractions.R
new file mode 100644
index 00000000..ece19714
--- /dev/null
+++ b/inst/tinytest/test_MSstatsMergeFractions.R
@@ -0,0 +1,85 @@
+# Characterization tests for MSstatsMergeFractions --------------------------
+#
+# MSstatsMergeFractions collapses the multiple fraction-runs of a sample into a
+# single merged run and drops feature/fraction combinations that were never
+# observed. The function is hard to read, so these tests pin its observable
+# behavior so the in-place-update rewrite (and any future simplification) stays
+# equivalent. They focus on the non-TECHREPLICATE branch, which is the part this
+# change rewrote; the TECHREPLICATE branch is unchanged from the previous code.
+
+MSstatsConvert::MSstatsLogsSettings(FALSE)
+
+# Two subjects, two fractions each. f2 in fraction 2 is all-zero abundance, so
+# that (FEATURE, FRACTION) combination has no positive observation.
+make_fraction_input <- function(f2_frac2 = c(0.0, 0.0)) {
+    input <- data.table::data.table(
+        PROTEIN = "P1",
+        FEATURE = c("f1", "f1", "f2", "f2", "f1", "f1", "f2", "f2"),
+        GROUP_ORIGINAL = "G",
+        SUBJECT_ORIGINAL = c("S1", "S1", "S1", "S1", "S2", "S2", "S2", "S2"),
+        RUN = factor(c(1, 2, 1, 2, 3, 4, 3, 4)),
+        originalRUN = c("a1", "a2", "a1", "a2", "b1", "b2", "b1", "b2"),
+        FRACTION = c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L),
+        INTENSITY = c(10, 20, 30, 40, 50, 60, 70, 80),
+        ABUNDANCE = c(3.0, 4.0, 2.0, 0.0, 3.5, 4.5, 2.5, 0.0)
+    )
+    # The two f2 / fraction-2 rows (positions 4 and 8) take the parametrized
+    # abundance. Assigned after construction to avoid referencing a local in a
+    # data.table() column expression.
+    input$ABUNDANCE[c(4L, 8L)] <- f2_frac2
+    input
+}
+
+# Single fraction -> nothing to merge, returned untouched (besides clamping) ---
+single_fraction = data.table::data.table(
+    PROTEIN = "P1", FEATURE = c("f1", "f2"),
+    GROUP_ORIGINAL = "G", SUBJECT_ORIGINAL = "S1",
+    RUN = factor(c(1, 1)), originalRUN = c("a1", "a1"), FRACTION = c(1L, 1L),
+    INTENSITY = c(10, 20), ABUNDANCE = c(3.0, 4.0))
+res_single = MSstatsMergeFractions(data.table::copy(single_fraction))
+expect_equal(nrow(res_single), 2L,
+             info = "single-fraction input keeps all rows")
+expect_equal(res_single$originalRUN, c("a1", "a1"),
+             info = "single-fraction input leaves originalRUN untouched")
+
+# Abundance clamping happens before the fraction check ------------------------
+# ABUNDANCE < 0 becomes 0; an INTENSITY of exactly 1 forces ABUNDANCE to 0.
+clamp_input = data.table::data.table(
+    PROTEIN = "P1", FEATURE = c("f1", "f2"),
+    GROUP_ORIGINAL = "G", SUBJECT_ORIGINAL = "S1",
+    RUN = factor(c(1, 1)), originalRUN = c("a1", "a1"), FRACTION = c(1L, 1L),
+    INTENSITY = c(1, 50), ABUNDANCE = c(2.5, -3.0))
+res_clamp = MSstatsMergeFractions(data.table::copy(clamp_input))
+expect_equal(res_clamp$ABUNDANCE, c(0, 0),
+             info = "INTENSITY == 1 and negative ABUNDANCE are both clamped to 0")
+
+# Non-TECHREPLICATE: fractions merge and unobserved combos drop ---------------
+res = MSstatsMergeFractions(make_fraction_input())
+
+# f2/fraction-2 was all-zero, so those 2 rows are dropped; 6 remain.
+expect_equal(nrow(res), 6L,
+             info = "feature/fraction with no positive abundance is removed")
+expect_equal(nrow(res[FEATURE == "f2" & FRACTION == 2L]), 0L,
+             info = "the unobserved f2/fraction-2 rows are gone")
+
+# Each sample's fraction-runs collapse to one '<group>_<subject>_<run>_merged'.
+expect_equal(sort(unique(res$originalRUN)),
+             c("G_S1_a1_merged", "G_S2_b1_merged"),
+             info = "originalRUN is the merged run name, one per sample")
+
+# RUN is refactored to one level per merged run (2 samples -> 2 levels).
+expect_true(is.factor(res$RUN), info = "RUN stays a factor")
+expect_equal(nlevels(res$RUN), 2L,
+             info = "two merged samples give two RUN levels")
+
+# The temporary newRun column must not leak into the output.
+expect_false("newRun" %in% colnames(res),
+             info = "newRun helper column is removed")
+
+# Non-TECHREPLICATE: fully observed features are all kept ---------------------
+res_keep = MSstatsMergeFractions(make_fraction_input(f2_frac2 = c(1.0, 1.5)))
+expect_equal(nrow(res_keep), 8L,
+             info = "nothing is dropped when every feature/fraction is observed")
+expect_equal(sort(unique(paste(res_keep$FEATURE, res_keep$FRACTION))),
+             c("f1 1", "f1 2", "f2 1", "f2 2"),
+             info = "all feature/fraction combinations survive")
diff --git a/man/dot-finalizeInput.Rd b/man/dot-finalizeInput.Rd
index 8695d188..9ed07548 100644
--- a/man/dot-finalizeInput.Rd
+++ b/man/dot-finalizeInput.Rd
@@ -4,18 +4,18 @@
 \alias{.finalizeInput}
 \title{Add summary statistics to dataProcess output}
 \usage{
-.finalizeInput(input, summarized, method, impute, censored_symbol)
+.finalizeInput(input, predicted_survival, method, impute, censored_symbol)
 }
 \arguments{
 \item{input}{feature-level data}
 
-\item{summarized}{protein-level data (list)}
-
 \item{method}{summary method}
 
 \item{impute}{if TRUE, censored missing values were imputed}
 
 \item{censored_symbol}{censored missing value indicator}
+
+\item{summarized}{protein-level data (list)}
 }
 \description{
 Add summary statistics to dataProcess output
diff --git a/man/dot-finalizeTMP.Rd b/man/dot-finalizeTMP.Rd
index 2779235c..ed0f5dae 100644
--- a/man/dot-finalizeTMP.Rd
+++ b/man/dot-finalizeTMP.Rd
@@ -4,7 +4,7 @@
 \alias{.finalizeTMP}
 \title{Summary statistics for output of TMP-based summarization}
 \usage{
-.finalizeTMP(input, censored_symbol, impute, summarized)
+.finalizeTMP(input, censored_symbol, impute, predicted_survival)
 }
 \arguments{
 \item{input}{feature-level data}
@@ -12,8 +12,6 @@
 \item{censored_symbol}{censored missing value indicator}
 
 \item{impute}{if TRUE, censored missing values were imputed}
-
-\item{summarized}{protein-level data (list)}
 }
 \description{
 Summary statistics for output of TMP-based summarization

From 50a30daf963e7f86be7c6578a047246c47ef6015 Mon Sep 17 00:00:00 2001
From: Rudhik1904 <rudhikshah50@gmail.com>
Date: Tue, 9 Jun 2026 03:25:26 -0500
Subject: [PATCH 7/7] Adressing Matt's comments

---
 R/dataProcess.R                               | 12 ++++----
 R/utils_checks.R                              | 29 ++++++++++++-------
 .../test_memory_optimization_copies.R         | 16 ++++++----
 3 files changed, 35 insertions(+), 22 deletions(-)

diff --git a/R/dataProcess.R b/R/dataProcess.R
index bc86bd24..32cbbb47 100755
--- a/R/dataProcess.R
+++ b/R/dataProcess.R
@@ -401,7 +401,7 @@ MSstatsSummarizeSingleLinear = function(single_protein,
 
     if (impute & any(single_protein[["censored"]])) {
         fit_data = if (is_labeled_reference) {
-            single_protein[is_labeled_ref == FALSE, cols, with = FALSE]
+            single_protein[(!is_labeled_ref), cols, with = FALSE]
         } else {
             single_protein[, cols, with = FALSE]
         }
@@ -414,8 +414,8 @@ MSstatsSummarizeSingleLinear = function(single_protein,
         }]
 
         if (is_labeled_reference) {
-            single_protein[!(censored & is_labeled_ref == FALSE), predicted := NA]
-            single_protein[(censored) & is_labeled_ref == FALSE,
+            single_protein[!(censored & !is_labeled_ref), predicted := NA]
+            single_protein[(censored) & !is_labeled_ref,
                            newABUNDANCE := predicted]
         } else {
             single_protein[!(censored), predicted := NA]
@@ -547,7 +547,7 @@ MSstatsSummarizeSingleTMP = function(single_protein, impute, censored_symbol,
         converged = TRUE
 
         fit_data = if (is_labeled_reference) {
-            single_protein[is_labeled_ref == FALSE, cols, with = FALSE]
+            single_protein[(!is_labeled_ref), cols, with = FALSE]
         } else {
             single_protein[, cols, with = FALSE]
         }
@@ -569,8 +569,8 @@ MSstatsSummarizeSingleTMP = function(single_protein, impute, censored_symbol,
         }
 
         if (is_labeled_reference) {
-            single_protein[!(censored & is_labeled_ref == FALSE), predicted := NA]
-            single_protein[(censored) & is_labeled_ref == FALSE,
+            single_protein[!(censored & !is_labeled_ref), predicted := NA]
+            single_protein[(censored) & !is_labeled_ref,
                            newABUNDANCE := predicted]
         } else {
             single_protein[!(censored), predicted := NA]
diff --git a/R/utils_checks.R b/R/utils_checks.R
index 2051c019..02aa476b 100644
--- a/R/utils_checks.R
+++ b/R/utils_checks.R
@@ -170,7 +170,7 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) {
     input = data.table::as.data.table(unclass(input))
     
     if (!"AnomalyScores" %in% colnames(input)){
-      input$AnomalyScores = NA
+      data.table::set(input, j = "AnomalyScores", value = NA)
     }
     
     cols = c("ProteinName", "PeptideSequence", "PeptideModifiedSequence",
@@ -200,7 +200,8 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) {
     
     if (!is.numeric(input$INTENSITY)) {	
         suppressWarnings({
-            input$INTENSITY = as.numeric(as.character(input$INTENSITY))
+            data.table::set(input, j = "INTENSITY",
+                            value = as.numeric(as.character(input$INTENSITY)))
         })
     }
     
@@ -214,10 +215,12 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) {
     drop_cols = setdiff(colnames(input), cols)
     for (col in drop_cols) data.table::set(input, j = col, value = NULL)
 
-    input$PEPTIDE = paste(input$PEPTIDESEQUENCE,
-                          input$PRECURSORCHARGE, sep = "_")
-    input$TRANSITION = paste(input$FRAGMENTION, 
-                             input$PRODUCTCHARGE, sep = "_")
+    data.table::set(input, j = "PEPTIDE",
+                    value = paste(input$PEPTIDESEQUENCE,
+                                  input$PRECURSORCHARGE, sep = "_"))
+    data.table::set(input, j = "TRANSITION",
+                    value = paste(input$FRAGMENTION,
+                                  input$PRODUCTCHARGE, sep = "_"))
     
     if (data.table::uniqueN(input$ISOTOPELABELTYPE) > 2) {
         getOption("MSstatsLog")(
@@ -227,7 +230,8 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) {
         stop("Statistical tools in MSstats are only proper for label-free or with reference peptide experiments.")
     }
     
-    input$ISOTOPELABELTYPE <- .mapIsotopeLabelType(input$ISOTOPELABELTYPE)
+    data.table::set(input, j = "ISOTOPELABELTYPE",
+                    value = .mapIsotopeLabelType(input$ISOTOPELABELTYPE))
     input
 }
 
@@ -243,9 +247,14 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) {
         data.table::setnames(
             input, "PEPTIDEMODIFIEDSEQUENCE", "PEPTIDESEQUENCE")
     }
-    input$PEPTIDE = paste(input$PEPTIDESEQUENCE, input$PRECURSORCHARGE, sep = "_")
-    input$TRANSITION = paste(input$FRAGMENTION, input$PRODUCTCHARGE, sep = "_")
-    input$ISOTOPELABELTYPE <- .mapIsotopeLabelType(input$ISOTOPELABELTYPE)
+    data.table::set(input, j = "PEPTIDE",
+                    value = paste(input$PEPTIDESEQUENCE,
+                                  input$PRECURSORCHARGE, sep = "_"))
+    data.table::set(input, j = "TRANSITION",
+                    value = paste(input$FRAGMENTION,
+                                  input$PRODUCTCHARGE, sep = "_"))
+    data.table::set(input, j = "ISOTOPELABELTYPE",
+                    value = .mapIsotopeLabelType(input$ISOTOPELABELTYPE))
     input
 }
 
diff --git a/inst/tinytest/test_memory_optimization_copies.R b/inst/tinytest/test_memory_optimization_copies.R
index 25e4ef8d..3f142432 100644
--- a/inst/tinytest/test_memory_optimization_copies.R
+++ b/inst/tinytest/test_memory_optimization_copies.R
@@ -203,13 +203,17 @@ expect_equal(addr_before, addr_after,
              info = paste(".finalizeTMP should modify input in-place.",
                           "Before:", addr_before, "After:", addr_after))
 
-# (b) Matched rows should have newABUNDANCE from predicted_survival.
-# Any row whose (cen, RUN, FEATURE) key exists in predicted_survival
-# should get a non-NA newABUNDANCE value.
+# (b) Every input row whose (cen, RUN, FEATURE) key exists in
+# predicted_survival must get a non-NA newABUNDANCE, and rows with no matching
+# key must be NA. Assert the exact expected count, not merely "> 0" (which would
+# still pass if the join silently matched the wrong rows).
+join_key = function(dt) paste(dt$cen, dt$RUN, dt$FEATURE, sep = "|")
+expected_matched = sum(join_key(result) %in% join_key(predicted_survival))
 matched_count = sum(!is.na(result$newABUNDANCE))
-expect_true(matched_count > 0,
-            info = paste("Matched rows should have non-NA newABUNDANCE.",
-                         "Found", matched_count, "non-NA values"))
+expect_equal(matched_count, expected_matched,
+             info = paste("Each matched (cen, RUN, FEATURE) key should yield a",
+                          "non-NA newABUNDANCE. Expected", expected_matched,
+                          "got", matched_count))
 
 # (c) Unmatched rows should have NA.
 # predicted_survival has 11 rows but input has 12. The missing key