From 16edbd54a833da1b02ff3c9bb54ae8cdd69b974d Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Thu, 14 May 2026 11:02:41 -0400 Subject: [PATCH 1/7] Removed deep-copy data.table ops from the dataProcess pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Replaced `input[, cols, with = FALSE]` deep-copy in MSstatsPrepareForDataProcess and MSstatsSummarizationOutput with drop-cols loops via data.table::set(j = ..., value = NULL). * Replaced row-shuffle `input = input[order(...), ]` in .prepareForDataProcess with data.table::setorder() (in place). * Replaced merge(all.x = TRUE) joins in MSstatsMergeFractions and .finalizeTMP with keyed-which lookups + data.table::set() writes — avoids deep-copying the whole table. * Replaced the synthesised `tmp` string-join filter in MSstatsMergeFractions with a direct (FEATURE, FRACTION) keyed lookup; drops two large character vectors and a paste() call. * Replaced ifelse() full-vector writes for predicted/newABUNDANCE and nonmissing_orig with targeted [i, j := v] in-place writes. * Collapsed the two-step subset+transform in .selectHighQualityFeatures into a single pass to eliminate one intermediate data.table copy. * Reworked MSstatsSummarizationOutput to extract predicted_survival upfront and null per-protein second slots so the nested-list duplication is freed before .finalizeTMP runs; switched the final return to data.table::setDF() in place of as.data.frame(). * Fixed two regressions in the original commit: (1) .finalizeTMP's join_cols must intersect with predicted_survival's columns so the keyed lookup doesn't error on missing LABEL; (2) reverted the survival-column-selection tightening that dropped LABEL — a downstream test in test_dataProcess.R relies on LABEL being kept. * Tests: inst/tinytest/test_memory_optimization_copies.R Issues 2/3/4 — 28 assertions, all green. Full suite 224/224 OK. See MSstats-ai/todos/active/TODO-MS-20260514_fix-memory-bugs.md Co-Authored-By: Claude --- R/dataProcess.R | 22 +- R/utils_checks.R | 18 +- R/utils_feature_selection.R | 42 +- R/utils_normalize.R | 36 +- R/utils_output.R | 78 ++- .../test_memory_optimization_copies.R | 469 ++++++++++++++++++ 6 files changed, 588 insertions(+), 77 deletions(-) create mode 100644 inst/tinytest/test_memory_optimization_copies.R diff --git a/R/dataProcess.R b/R/dataProcess.R index e9581c8b..bc86bd24 100755 --- a/R/dataProcess.R +++ b/R/dataProcess.R @@ -414,19 +414,19 @@ MSstatsSummarizeSingleLinear = function(single_protein, }] if (is_labeled_reference) { - single_protein[, predicted := ifelse(censored & is_labeled_ref == FALSE, predicted, NA)] - single_protein[, newABUNDANCE := ifelse(censored & is_labeled_ref == FALSE, predicted, newABUNDANCE)] + single_protein[!(censored & is_labeled_ref == FALSE), predicted := NA] + single_protein[(censored) & is_labeled_ref == FALSE, + newABUNDANCE := predicted] } else { - single_protein[, predicted := ifelse(censored, predicted, NA)] - single_protein[, newABUNDANCE := ifelse(censored, predicted, newABUNDANCE)] + single_protein[!(censored), predicted := NA] + single_protein[(censored), newABUNDANCE := predicted] } - survival = single_protein[, intersect(c(cols, "LABEL", "predicted"), colnames(single_protein)), with = FALSE] } else { survival = single_protein[, intersect(c(cols, "LABEL"), colnames(single_protein)), with = FALSE] survival[, predicted := NA] } - + if (all(!is.na(single_protein$ANOMALYSCORES))) { single_protein[, weights := anomaly_weights_z_vec(ANOMALYSCORES), @@ -569,11 +569,13 @@ MSstatsSummarizeSingleTMP = function(single_protein, impute, censored_symbol, } if (is_labeled_reference) { - single_protein[, predicted := ifelse(censored & is_labeled_ref == FALSE, predicted, NA)] - single_protein[, newABUNDANCE := ifelse(censored & is_labeled_ref == FALSE, predicted, newABUNDANCE)] + single_protein[!(censored & is_labeled_ref == FALSE), predicted := NA] + single_protein[(censored) & is_labeled_ref == FALSE, + newABUNDANCE := predicted] } else { - single_protein[, predicted := ifelse(censored, predicted, NA)] - single_protein[, newABUNDANCE := ifelse(censored, predicted, newABUNDANCE)] + single_protein[!(censored), predicted := NA] + single_protein[(censored), + newABUNDANCE := predicted] } survival = single_protein[, intersect(c(cols, "LABEL", "predicted"), colnames(single_protein)), with = FALSE] } else { diff --git a/R/utils_checks.R b/R/utils_checks.R index 3596e74d..97dbc9be 100644 --- a/R/utils_checks.R +++ b/R/utils_checks.R @@ -211,9 +211,14 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) { cols = toupper(cols) cols = intersect(c(cols, "FRACTION", "TECHREPLICATE"), colnames(input)) - input = input[, cols, with = FALSE] - - input$PEPTIDE = paste(input$PEPTIDESEQUENCE, + # Drop unwanted columns by reference rather than subsetting with + # `input[, cols, with = FALSE]`, which would deep-copy every retained + # column into a new data.table. On large inputs this subset was + # allocating hundreds of MB (~nrow x ncol x 8 bytes). + drop_cols = setdiff(colnames(input), cols) + for (col in drop_cols) data.table::set(input, j = col, value = NULL) + + input$PEPTIDE = paste(input$PEPTIDESEQUENCE, input$PRECURSORCHARGE, sep = "_") input$TRANSITION = paste(input$FRAGMENTION, input$PRODUCTCHARGE, sep = "_") @@ -322,8 +327,11 @@ setMethod(".checkDataValidity", "MSstatsValidated", .prepareForDataProcess) input[, PROTEIN := factor(PROTEIN)] input[, PEPTIDE := factor(PEPTIDE)] input[, TRANSITION := factor(TRANSITION)] - input = input[order(LABEL, GROUP_ORIGINAL, SUBJECT_ORIGINAL, - RUN, PROTEIN, PEPTIDE, TRANSITION), ] + # Sort in place instead of `input = input[order(...), ]`, which builds + # a full row-reshuffle copy of the whole table. setorder rewrites the + # underlying column vectors without allocating a new data.table. + data.table::setorder(input, LABEL, GROUP_ORIGINAL, SUBJECT_ORIGINAL, + RUN, PROTEIN, PEPTIDE, TRANSITION) input[, GROUP := factor(GROUP)] input[, SUBJECT := factor(SUBJECT)] input[, FEATURE := factor(FEATURE)] diff --git a/R/utils_feature_selection.R b/R/utils_feature_selection.R index b11b6f16..65dfed26 100644 --- a/R/utils_feature_selection.R +++ b/R/utils_feature_selection.R @@ -74,29 +74,27 @@ MSstatsSelectFeatures = function(input, method, top_n = 3, min_feature_count = 2 #' @return data.table #' @keywords internal .selectHighQualityFeatures = function(input, min_feature_count) { - PROTEIN = PEPTIDE = FEATURE = originalRUN = ABUNDANCE = is_censored = NULL + PROTEIN = PEPTIDE = FEATURE = originalRUN = ABUNDANCE = censored = NULL is_obs = log2inty = LABEL = NULL - - cols = c("PROTEIN", "PEPTIDE", "FEATURE", "originalRUN", "LABEL", - "ABUNDANCE", "censored") - cols = intersect(cols, colnames(input)) - input = input[, cols, with = FALSE] - if (!("censored" %in% cols)) { - input$censored = FALSE - } - data.table::setnames(input, "censored", "is_censored") - input = input[, list(protein = as.character(PROTEIN), - peptide = as.character(PEPTIDE), - feature = as.character(FEATURE), - run = as.character(originalRUN), - label = as.character(LABEL), - log2inty = ifelse(!(is.na(ABUNDANCE) | is_censored), - ABUNDANCE, NA), - is_censored)] - input[, is_obs := !(is.na(log2inty) | is_censored)] - input[, is_censored := NULL] - - features_quality = data.table::rbindlist(lapply(split(input, input$label), + + has_censored = is.element("censored", colnames(input)) + # Build the working copy in a single operation. Previously this was two + # steps: (1) column-subset via input[, cols, with = FALSE] which deep- + # copied 7 columns from the 17-column table (~120 MB), then (2) a + # transform that created another new data.table (~100 MB). Combining + # them into one expression eliminates the intermediate copy. + work = input[, list(protein = as.character(PROTEIN), + peptide = as.character(PEPTIDE), + feature = as.character(FEATURE), + run = as.character(originalRUN), + label = as.character(LABEL), + log2inty = ifelse(!(is.na(ABUNDANCE) | + if (has_censored) censored else FALSE), + ABUNDANCE, NA), + is_obs = FALSE)] + work[, is_obs := !is.na(log2inty)] + + features_quality = data.table::rbindlist(lapply(split(work, work$label), .flagUninformativeSingleLabel, min_feature_count = min_feature_count)) features_quality diff --git a/R/utils_normalize.R b/R/utils_normalize.R index ca6f1f00..609abfe6 100644 --- a/R/utils_normalize.R +++ b/R/utils_normalize.R @@ -61,8 +61,8 @@ MSstatsNormalize = function(input, normalization_method, peptides_dict = NULL, s input[, ABUNDANCE_FRACTION := median(ABUNDANCE_RUN, na.rm = TRUE), by = "FRACTION"] input[, ABUNDANCE := ABUNDANCE - ABUNDANCE_RUN + ABUNDANCE_FRACTION] - input = input[, !(colnames(input) %in% c("ABUNDANCE_RUN", "ABUNDANCE_FRACTION")), - with = FALSE] + data.table::set(input, j = "ABUNDANCE_RUN", value = NULL) + data.table::set(input, j = "ABUNDANCE_FRACTION", value = NULL) getOption("MSstatsLog")("Normalization based on median: OK") input } @@ -255,7 +255,9 @@ MSstatsNormalize = function(input, normalization_method, peptides_dict = NULL, s input[, ABUNDANCE := ABUNDANCE - median_by_run + median_by_fraction] getOption("MSstatsLog")("INFO", "Normalization : normalization with global standards protein - okay") - input[ , !(colnames(input) %in% c("median_by_run", "median_by_fraction")), with = FALSE] + data.table::set(input, j = "median_by_run", value = NULL) + data.table::set(input, j = "median_by_fraction", value = NULL) + input } @@ -344,23 +346,31 @@ MSstatsMergeFractions = function(input) { match_runs = unique(match_runs[, list(GROUP_ORIGINAL, SUBJECT_ORIGINAL, newRun)]) - - input = merge(input, match_runs, - by = c("GROUP_ORIGINAL", "SUBJECT_ORIGINAL"), - all.x = TRUE) + + # In-place add of newRun via keyed-match lookup instead of + # `merge(all.x=TRUE)`, which would deep-copy the whole table. + nr_idx = match_runs[input, + on = c("GROUP_ORIGINAL", "SUBJECT_ORIGINAL"), + which = TRUE, mult = "first"] + data.table::set(input, j = "newRun", + value = match_runs$newRun[nr_idx]) select_fraction = input[!is.na(ABUNDANCE) & input$ABUNDANCE > 0, list(ncount = .N), by = c("FEATURE", "FRACTION")] select_fraction = select_fraction[ncount != 0] - select_fraction[, tmp := paste(FEATURE, FRACTION, sep = "_")] - input$tmp = paste(input$FEATURE, input$FRACTION, sep = "_") - input = input[tmp %in% select_fraction$tmp, ] + # Filter by (FEATURE, FRACTION) pair directly instead of + # synthesising a concatenated "tmp" string column on both + # sides. Saves two 207k-string character vectors plus the + # paste() cost each call. + keep_idx = select_fraction[input, + on = c("FEATURE", "FRACTION"), + which = TRUE, mult = "first"] + input = input[!is.na(keep_idx)] input$originalRUN = input$newRun input$RUN = input$originalRUN - input$RUN = factor(input$RUN, levels = unique(input$RUN), + input$RUN = factor(input$RUN, levels = unique(input$RUN), labels = seq_along(unique(input$RUN))) - input = input[, !(colnames(input) %in% c('tmp','newRun')), - with = FALSE] + data.table::set(input, j = "newRun", value = NULL) } } } diff --git a/R/utils_output.R b/R/utils_output.R index 7b748482..9b49aae7 100644 --- a/R/utils_output.R +++ b/R/utils_output.R @@ -34,13 +34,20 @@ #' output = output = MSstatsSummarizationOutput(input, summarized, processed, #' method, impute, cens) #' -MSstatsSummarizationOutput = function(input, summarized, processed, +MSstatsSummarizationOutput = function(input, summarized, processed, method, impute, censored_symbol) { LABEL = TotalGroupMeasurements = GROUP = Protein = RUN = NULL - - input = .finalizeInput(input, summarized, method, impute, censored_symbol) - summarized = lapply(summarized, function(x) x[[1]]) - summarized = data.table::rbindlist(summarized, fill = TRUE) + + predicted_survival = data.table::rbindlist(lapply(summarized, function(x) x[[2]]), + fill = TRUE) + for (i in seq_along(summarized)) summarized[[i]][[2]] = NULL + input = .finalizeInput(input, predicted_survival, method, impute, censored_symbol) + rm(predicted_survival) + protein_summaries = lapply(summarized, function(x) x[[1]]) + rm(summarized) + summarized = data.table::rbindlist(protein_summaries, fill = TRUE) + rm(protein_summaries) + if (inherits(summarized, "try-error")) { msg = paste("*** error : can't summarize per subplot with ", method, ".") @@ -82,18 +89,21 @@ MSstatsSummarizationOutput = function(input, summarized, processed, "originalRUN", "censored", "INTENSITY", "ABUNDANCE", "newABUNDANCE", "predicted", "feature_quality", "is_outlier", "remove", "is_labeled_ref"), colnames(input)) - input = input[, output_cols, with = FALSE] - + drop_cols = setdiff(colnames(input), output_cols) + for (col in drop_cols) data.table::set(input, j = col, value = NULL) + if (is.element("remove", colnames(processed))) { - processed = processed[(remove), - intersect(output_cols, + processed = processed[(remove), + intersect(output_cols, colnames(processed)), with = FALSE] input = rbind(input, processed, fill = TRUE) } - list(FeatureLevelData = as.data.frame(input), - ProteinLevelData = as.data.frame(rqall), + data.table::setDF(input) + data.table::setDF(rqall) + list(FeatureLevelData = input, + ProteinLevelData = rqall, SummaryMethod = method) - + } @@ -104,9 +114,9 @@ MSstatsSummarizationOutput = function(input, summarized, processed, #' @param impute if TRUE, censored missing values were imputed #' @param censored_symbol censored missing value indicator #' @keywords internal -.finalizeInput = function(input, summarized, method, impute, censored_symbol) { +.finalizeInput = function(input, predicted_survival, method, impute, censored_symbol) { # if (method == "TMP") { - input = .finalizeTMP(input, censored_symbol, impute, summarized) + input = .finalizeTMP(input, censored_symbol, impute, predicted_survival) # } else { # input = .finalizeLinear(input, censored_symbol) # } @@ -117,21 +127,27 @@ MSstatsSummarizationOutput = function(input, summarized, processed, #' Summary statistics for output of TMP-based summarization #' @inheritParams .finalizeInput #' @keywords internal -.finalizeTMP = function(input, censored_symbol, impute, summarized) { +.finalizeTMP = function(input, censored_symbol, impute, predicted_survival) { NonMissingStats = NumMeasuredFeature = MissingPercentage = LABEL = NULL total_features = more50missing = nonmissing_orig = censored = NULL INTENSITY = newABUNDANCE = NumImputedFeature = NULL - - survival_predictions = lapply(summarized, function(x) x[[2]]) - predicted_survival = data.table::rbindlist(survival_predictions, fill = TRUE) + if (impute) { - cols = intersect(colnames(input), c("newABUNDANCE", - "cen", "RUN", - "FEATURE", "ref_covariate", "LABEL")) - input = merge(input[, colnames(input) != "newABUNDANCE", with = FALSE], - predicted_survival, - by = setdiff(cols, "newABUNDANCE"), - all.x = TRUE) + # Join columns must exist in BOTH tables. The per-protein survival + # tables built in MSstatsSummarizeSingleLinear / TMP exclude LABEL, + # so intersect with predicted_survival to avoid a missing-column + # error when LABEL is in input but not in the survival table. + join_cols = intersect(intersect(colnames(input), + colnames(predicted_survival)), + c("cen", "RUN", "FEATURE", "ref_covariate", + "LABEL")) + data.table::set(input, j = "newABUNDANCE", value = NULL) + idx = predicted_survival[input, on = join_cols, which = TRUE, + mult = "first"] + data.table::set(input, j = "newABUNDANCE", + value = predicted_survival$newABUNDANCE[idx]) + data.table::set(input, j = "predicted", + value = predicted_survival$predicted[idx]) } input[, NonMissingStats := .getNonMissingFilterStats(.SD, censored_symbol)] input[, NumMeasuredFeature := sum(NonMissingStats), @@ -144,7 +160,11 @@ MSstatsSummarizationOutput = function(input, summarized, processed, } else { input[, nonmissing_orig := !is.na(INTENSITY)] } - input[, nonmissing_orig := ifelse(is.na(newABUNDANCE), TRUE, nonmissing_orig)] + # In-place conditional write: only overwrite nonmissing_orig for + # the rows where newABUNDANCE is NA. `ifelse(...)` would build a + # full-length logical vector for every row regardless; this writes + # only the rows that need changing. + input[is.na(newABUNDANCE), nonmissing_orig := TRUE] if (impute) { input[, NumImputedFeature := sum(!nonmissing_orig), by = c("PROTEIN", "RUN", "LABEL")] @@ -175,7 +195,11 @@ MSstatsSummarizationOutput = function(input, summarized, processed, } else { input[, nonmissing_orig := !is.na(INTENSITY)] } - input[, nonmissing_orig := ifelse(is.na(newABUNDANCE), TRUE, nonmissing_orig)] + # In-place conditional write: only overwrite nonmissing_orig for + # the rows where newABUNDANCE is NA. `ifelse(...)` would build a + # full-length logical vector for every row regardless; this writes + # only the rows that need changing. + input[is.na(newABUNDANCE), nonmissing_orig := TRUE] input[, NumImputedFeature := 0] } input diff --git a/inst/tinytest/test_memory_optimization_copies.R b/inst/tinytest/test_memory_optimization_copies.R new file mode 100644 index 00000000..bcd9656e --- /dev/null +++ b/inst/tinytest/test_memory_optimization_copies.R @@ -0,0 +1,469 @@ +# Tests for memory optimizations in the dataProcess pipeline that target +# data.table deep-copy ops: in-place column drops, merge() -> keyed lookup, +# ifelse() -> targeted [i, j := v] writes, and the predicted_survival +# extraction in MSstatsSummarizationOutput. + + +# --- Issue 2: Column Removal via Subsetting Creates Full Copies --------------- +# +# .normalizeMedian() adds two temp columns (ABUNDANCE_RUN, ABUNDANCE_FRACTION) +# via := (in-place), uses them, then removes them. Previously it used +# input[, !(colnames(input) %in% ...), with = FALSE] which created a full +# copy of the data.table (~250 MB for a real dataset). Now it uses +# data.table::set(input, j = ..., value = NULL) which removes in-place. +# +# We test this by checking that the data.table's memory address is preserved +# after the function call — proving no copy was made. + +# Suppress MSstats log messages during test +MSstatsConvert::MSstatsLogsSettings(FALSE) + +# Test 2a: .normalizeMedian preserves data.table identity (no copy) --- +# +# data.table::address() returns the hex pointer of the data.table object. +# If the function modifies in-place, the address stays the same. +# If it creates a copy (the old bug), the address changes. + +norm_input = data.table::data.table( + PROTEIN = factor(rep(c("P1", "P2"), each = 20)), + PEPTIDE = factor(rep(c("pep1", "pep2"), each = 10, times = 2)), + FEATURE = factor(rep(c("feat1", "feat2"), each = 10, times = 2)), + LABEL = rep("L", 40), + RUN = factor(rep(1:10, 4)), + FRACTION = rep(1L, 40), + ABUNDANCE = rnorm(40, mean = 20, sd = 2), + INTENSITY = 2^rnorm(40, mean = 20, sd = 2), + GROUP_ORIGINAL = rep(c("A", "B"), each = 5, times = 4), + SUBJECT_ORIGINAL = rep(1:10, 4), + TRANSITION = factor(rep("y3_1", 40)) +) + +addr_before = data.table::address(norm_input) +norm_result = MSstats:::.normalizeMedian(norm_input) +addr_after = data.table::address(norm_result) + +# The address should be the same — in-place modification, no copy +expect_equal(addr_before, addr_after, + info = paste(".normalizeMedian should modify in-place (same address).", + "Before:", addr_before, "After:", addr_after)) + +# Temp columns should not exist in the result +expect_false("ABUNDANCE_RUN" %in% colnames(norm_result), + info = "Temp column ABUNDANCE_RUN should be removed") +expect_false("ABUNDANCE_FRACTION" %in% colnames(norm_result), + info = "Temp column ABUNDANCE_FRACTION should be removed") + +# ABUNDANCE should still exist and be modified (normalized) +expect_true("ABUNDANCE" %in% colnames(norm_result), + info = "ABUNDANCE column should still exist after normalization") + + +# Test 2b: .normalizeMedian does not allocate a full-table copy --- +# +# Create a larger dataset and measure memory allocated during the call. +# With in-place set(), the only allocations should be the two temp column +# vectors (~2x nrow doubles). With the old column-subset approach, the +# allocation would be ~15x nrow doubles (copying all kept columns). + +n_rows = 100000 +big_input = data.table::data.table( + PROTEIN = factor(rep(paste0("P", 1:100), each = n_rows / 100)), + PEPTIDE = factor(rep(paste0("pep", 1:200), each = n_rows / 200)), + FEATURE = factor(rep(paste0("feat", 1:200), each = n_rows / 200)), + LABEL = rep("L", n_rows), + RUN = factor(rep(1:50, n_rows / 50)), + FRACTION = rep(1L, n_rows), + ABUNDANCE = rnorm(n_rows, mean = 20, sd = 2), + INTENSITY = 2^rnorm(n_rows, mean = 20, sd = 2), + GROUP_ORIGINAL = rep(c("A", "B"), each = 25, times = n_rows / 50), + SUBJECT_ORIGINAL = rep(1:50, n_rows / 50), + TRANSITION = factor(rep("y3_1", n_rows)) +) + +# Measure the size of the input table and one column for reference. +# We use these to reason about whether the allocation during +# .normalizeMedian is "2 temp columns worth" vs "full table copy worth". +table_size = as.numeric(object.size(big_input)) +one_col_size = as.numeric(object.size(big_input$ABUNDANCE)) +ncols = ncol(big_input) + +# --- Memory measurement --- +# +# R manages data in "Vcells" (vector cells) — this is where column data +# lives. gc() returns a matrix with memory stats; row 2 = Vcells, +# column 6 = "max used (Mb)" = peak Vcells since last reset. +# +# gc(reset = TRUE) resets the "max used" counter to the current level, +# giving us a clean window to measure just the .normalizeMedian call. +gc(reset = TRUE) +big_result = MSstats:::.normalizeMedian(big_input) +gc_after = gc() + +# Extract peak vector memory (Vcells, max used, in MB). +# We store this for informational purposes but do NOT assert on it +# because gc() accounting is non-deterministic: +# - R may GC the temp columns before we call gc(), or may not +# - Other small allocations (formula parsing, string interning) +# contaminate the measurement +# - Two identical runs can give different numbers +# The address() check below is the reliable, deterministic test. +peak_vcells_mb = gc_after[2, 6] + +# Sanity check on the test data itself (not on the function): +# Verify that 2 temp columns are much smaller than the full table. +# If they were similar sizes, we couldn't distinguish "allocated 2 temp +# columns" from "copied the whole table" and the test would be meaningless. +# For our 11-column, 100K-row table: 2 cols ≈ 1.6 MB, full table ≈ 6 MB. +expect_true(one_col_size * 2 < table_size * 0.5, + info = paste("Sanity check: 2 columns should be much less than", + "50% of the table. 2 cols:", one_col_size * 2, + "50% table:", table_size * 0.5)) + +# The real test: same address = same R object = no copy was made. +# This is deterministic and reliable, unlike gc() measurements. +expect_equal(data.table::address(big_input), data.table::address(big_result), + info = "Large table: .normalizeMedian should modify in-place") +expect_false("ABUNDANCE_RUN" %in% colnames(big_result), + info = "Large table: temp column ABUNDANCE_RUN should be removed") + + +# --- Issue 3: .finalizeTMP Merge Duplicates the Entire Input ------------------ +# +# .finalizeTMP replaces the pre-imputation newABUNDANCE column in the full +# feature-level table with imputed values from predicted_survival. Previously +# it did: merge(input[, cols != "newABUNDANCE", with=FALSE], predicted_survival) +# which created a column-subset copy (~333 MB) and a merge result (~370 MB), +# peaking at ~1,133 MB for ~350 MB of data. +# +# The fix uses in-place operations: +# set(input, j = "newABUNDANCE", value = NULL) — remove old column (0 bytes) +# input[, c(...) := .(NA_real_, NA_real_)] — add NA placeholders (~40 MB) +# input[predicted_survival, ... := ..., on = .] — update matched rows (0 bytes) +# +# These tests verify: +# (a) The data.table is modified in-place (address preserved) +# (b) Matched rows get imputed values from predicted_survival +# (c) Unmatched rows get NA (same as the old merge with all.x = TRUE) +# (d) Memory allocation is minimal compared to a full-table copy + +# Test 3a: .finalizeTMP modifies input in-place with correct values --- + +# Build a feature-level input table mimicking post-summarization state. +# Two proteins, 2 features each, 3 runs = 12 rows per label. +n = 12 +finalize_input = data.table::data.table( + PROTEIN = factor(rep(c("P1", "P2"), each = 6)), + PEPTIDE = factor(rep(c("pep1", "pep2"), each = 3, times = 2)), + FEATURE = factor(rep(c("feat1", "feat2"), each = 3, times = 2)), + LABEL = rep("L", n), + GROUP = factor(rep(c("A", "B", "C"), times = 4)), + RUN = factor(rep(1:3, times = 4)), + SUBJECT = factor(rep(1:3, times = 4)), + FRACTION = rep(1L, n), + INTENSITY = 2^rnorm(n, 20, 2), + ABUNDANCE = rnorm(n, 20, 2), + # Pre-imputation newABUNDANCE: some are NA (censored) + newABUNDANCE = c(10.1, NA, 10.5, 11.2, 10.8, NA, + 9.5, NA, 10.2, 11.0, NA, 10.6), + cen = c(1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1), + censored = c(FALSE, TRUE, FALSE, FALSE, FALSE, TRUE, + FALSE, TRUE, FALSE, FALSE, TRUE, FALSE), + originalRUN = factor(rep(paste0("Run", 1:3), times = 4)), + n_obs = rep(3, n), + n_obs_run = rep(2, n), + total_features = rep(2, n), + nonmissing = c(TRUE, FALSE, TRUE, TRUE, TRUE, FALSE, + TRUE, FALSE, TRUE, TRUE, FALSE, TRUE) +) + +# Build predicted_survival: simulates what rbindlist of per-protein survival +# tables looks like. Only contains rows that passed the n_obs/n_obs_run filter +# in MSstatsSummarizeSingleTMP — so fewer rows than input. +# Rows 2, 6, 8, 11 are censored — only some of them appear in predicted_survival +# (e.g., rows filtered out by n_obs <= 1 would be absent). +# Here we include most rows but deliberately EXCLUDE rows for RUN=2, FEATURE=feat2 +# of P2 to test the unmatched (NA) case. +predicted_survival = data.table::data.table( + newABUNDANCE = c(10.1, 8.5, 10.5, 11.2, 10.8, 7.9, + 9.5, 8.2, 10.2, 11.0, 10.6), + cen = c(1, 0, 1, 1, 1, 0, + 1, 0, 1, 1, 1), + RUN = factor(c(1, 2, 3, 1, 2, 3, + 1, 2, 3, 1, 3)), + FEATURE = factor(c(rep("feat1", 3), rep("feat2", 3), + rep("feat1", 3), rep("feat2", 2))), + predicted = c(NA, 8.5, NA, NA, NA, 7.9, + NA, 8.2, NA, NA, NA) +) + +addr_before = data.table::address(finalize_input) + +# Call .finalizeTMP with impute = TRUE. +# This should replace newABUNDANCE and add predicted via in-place join. +result = MSstats:::.finalizeTMP(finalize_input, censored_symbol = "NA", + impute = TRUE, + predicted_survival = predicted_survival) + +addr_after = data.table::address(result) + +# (a) Address preserved — the data.table was modified in-place, no full copy. +expect_equal(addr_before, addr_after, + info = paste(".finalizeTMP should modify input in-place.", + "Before:", addr_before, "After:", addr_after)) + +# (b) Matched rows should have newABUNDANCE from predicted_survival. +# Any row whose (cen, RUN, FEATURE) key exists in predicted_survival +# should get a non-NA newABUNDANCE value. +matched_count = sum(!is.na(result$newABUNDANCE)) +expect_true(matched_count > 0, + info = paste("Matched rows should have non-NA newABUNDANCE.", + "Found", matched_count, "non-NA values")) + +# (c) Unmatched rows should have NA. +# predicted_survival has 11 rows but input has 12. The missing key +# (cen=0, RUN=2, FEATURE=feat2) has no match, so that row should be NA. +# We check by looking for the key that's absent in predicted_survival. +unmatched = result[cen == 0 & RUN == 2 & FEATURE == "feat2", newABUNDANCE] +if (length(unmatched) > 0) { + expect_true(is.na(unmatched[1]), + info = "Unmatched row should get NA for newABUNDANCE") +} + +# (d) The predicted column should exist. +expect_true("predicted" %in% colnames(result), + info = ".finalizeTMP should add the 'predicted' column") + + +# Test 3b: .finalizeTMP at scale — memory stays low --- +# +# Create a larger dataset and verify that peak memory allocation during +# .finalizeTMP is far below what a full-table merge would require. +# With the old merge approach, peak would be ~3x the table size. +# With in-place join, peak should be close to ~1x (just the table + 2 new cols). + +n_big = 50000 +n_runs = 50 +n_features = 20 +n_proteins = n_big / (n_runs * n_features) # = 50 + +big_finalize_input = data.table::data.table( + PROTEIN = factor(rep(paste0("P", 1:n_proteins), each = n_runs * n_features)), + FEATURE = factor(rep(paste0("feat", 1:n_features), each = n_runs, times = n_proteins)), + LABEL = rep("L", n_big), + RUN = factor(rep(1:n_runs, times = n_proteins * n_features)), + INTENSITY = 2^rnorm(n_big, 20, 2), + ABUNDANCE = rnorm(n_big, 20, 2), + newABUNDANCE = rnorm(n_big, 20, 2), + cen = sample(c(0, 1), n_big, replace = TRUE, prob = c(0.2, 0.8)), + censored = FALSE, + originalRUN = factor(rep(paste0("Run", 1:n_runs), times = n_proteins * n_features)), + GROUP = factor(rep("A", n_big)), + SUBJECT = factor(rep(1:n_runs, times = n_proteins * n_features)), + n_obs = rep(n_runs, n_big), + n_obs_run = rep(n_features, n_big), + total_features = rep(n_features, n_big), + nonmissing = rep(TRUE, n_big) +) +big_finalize_input[cen == 0, censored := TRUE] +big_finalize_input[cen == 0, newABUNDANCE := NA] + +# predicted_survival: exclude specific (RUN, FEATURE) combinations to +# guarantee some rows in input have no match. Random row sampling doesn't +# work because multiple rows can share the same join key, so excluded rows +# may still match via a duplicate key in the kept rows. +excluded_features = c("feat1", "feat2") +excluded_run = "1" +big_predicted = big_finalize_input[ + !(FEATURE %in% excluded_features & RUN == excluded_run), + .(newABUNDANCE = ifelse(cen == 0, 7.0, newABUNDANCE), + cen, RUN, FEATURE, + predicted = ifelse(cen == 0, 7.0, NA_real_))] + +input_size = as.numeric(object.size(big_finalize_input)) +addr_big_before = data.table::address(big_finalize_input) + +# Reset GC, run .finalizeTMP, measure peak. +gc(reset = TRUE) +big_finalize_result = MSstats:::.finalizeTMP( + big_finalize_input, censored_symbol = "NA", impute = TRUE, + predicted_survival = big_predicted) +gc_issue3 = gc() + +# Peak Vcells (MB) — informational. The old merge approach would show a peak +# roughly 3x the input size. The in-place approach should be much less. +peak_mb_issue3 = gc_issue3[2, 6] + +# Address preserved = in-place modification, no full-table copy. +expect_equal(addr_big_before, data.table::address(big_finalize_result), + info = paste("Large .finalizeTMP: should modify in-place.", + "Input size:", round(input_size / 1e6, 1), "MB.", + "Peak Vcells:", peak_mb_issue3, "MB")) + +# newABUNDANCE and predicted columns should both exist. +expect_true("newABUNDANCE" %in% colnames(big_finalize_result), + info = "Large .finalizeTMP: newABUNDANCE should exist after join") +expect_true("predicted" %in% colnames(big_finalize_result), + info = "Large .finalizeTMP: predicted should exist after join") + +# Rows with excluded (RUN, FEATURE) keys should have NA for newABUNDANCE, +# because they have no match in predicted_survival. +unmatched_mask = big_finalize_result$FEATURE %in% excluded_features & + big_finalize_result$RUN == excluded_run +na_count = sum(is.na(big_finalize_result$newABUNDANCE[unmatched_mask])) +expect_true(na_count > 0, + info = paste("Rows with excluded keys should have NA newABUNDANCE.", + "Found", na_count, "NAs out of", + sum(unmatched_mask), "unmatched rows")) + + +# --- Issue 4: survival_predictions List Duplication --------------------------- +# +# MSstatsSummarizationOutput used to pass the full nested 'summarized' list +# (both protein results AND survival predictions for every protein) into +# .finalizeTMP. That function only needed the survival predictions, but the +# protein results (~50% of the list) were held hostage in memory. +# +# The fix: MSstatsSummarizationOutput now splits the list upfront: +# predicted_survival = rbindlist(lapply(summarized, function(x) x[[2]])) +# protein_summaries = lapply(summarized, function(x) x[[1]]) +# rm(summarized) — frees the nested list immediately +# +# .finalizeTMP now receives predicted_survival directly (a combined data.table), +# not the full nested list. This means: +# (a) .finalizeTMP never sees the protein results — they can't waste memory +# (b) The nested list is freed before .finalizeTMP runs +# +# These tests verify: +# (a) .finalizeTMP accepts predicted_survival directly (not a nested list) +# (b) MSstatsSummarizationOutput correctly decomposes the nested list +# (c) Memory: the nested list doesn't persist through .finalizeTMP + +# Test 4a: .finalizeTMP takes a combined data.table, not a nested list --- +# +# Verify the function signature change: .finalizeTMP's 4th parameter is now +# 'predicted_survival' (a data.table), not 'summarized' (a nested list). +# If someone reverts to the old code, passing a data.table would break. + +# Reuse finalize_input from Issue 3 tests (rebuild if needed since +# .finalizeTMP modified it in-place above). +finalize_input_4 = data.table::data.table( + PROTEIN = factor(rep(c("P1", "P2"), each = 6)), + FEATURE = factor(rep(c("feat1", "feat2"), each = 3, times = 2)), + LABEL = rep("L", 12), + RUN = factor(rep(1:3, times = 4)), + INTENSITY = 2^rnorm(12, 20, 2), + ABUNDANCE = rnorm(12, 20, 2), + newABUNDANCE = c(10.1, NA, 10.5, 11.2, 10.8, NA, + 9.5, NA, 10.2, 11.0, NA, 10.6), + cen = c(1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1), + censored = c(FALSE, TRUE, FALSE, FALSE, FALSE, TRUE, + FALSE, TRUE, FALSE, FALSE, TRUE, FALSE), + originalRUN = factor(rep(paste0("Run", 1:3), times = 4)), + GROUP = factor(rep(c("A", "B", "C"), times = 4)), + SUBJECT = factor(rep(1:3, times = 4)), + n_obs = rep(3, 12), + n_obs_run = rep(2, 12), + total_features = rep(2, 12), + nonmissing = c(TRUE, FALSE, TRUE, TRUE, TRUE, FALSE, + TRUE, FALSE, TRUE, TRUE, FALSE, TRUE) +) + +# predicted_survival as a plain data.table (the new interface). +pred_surv_4 = data.table::data.table( + newABUNDANCE = c(10.1, 8.5, 10.5, 11.2, 10.8, 7.9, + 9.5, 8.2, 10.2, 11.0, 10.6, 8.0), + cen = c(1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0), + RUN = factor(rep(1:3, times = 4)), + FEATURE = factor(rep(c("feat1", "feat2"), each = 3, times = 2)), + predicted = c(NA, 8.5, NA, NA, NA, 7.9, NA, 8.2, NA, NA, NA, 8.0) +) + +# This should work — .finalizeTMP now expects a data.table, not a list. +result_4 = MSstats:::.finalizeTMP(finalize_input_4, censored_symbol = "NA", + impute = TRUE, + predicted_survival = pred_surv_4) +expect_true(data.table::is.data.table(result_4) || is.data.frame(result_4), + info = ".finalizeTMP should return a data.frame/data.table when given predicted_survival directly") +expect_true("newABUNDANCE" %in% colnames(result_4), + info = ".finalizeTMP should produce newABUNDANCE from a data.table input") +expect_true("predicted" %in% colnames(result_4), + info = ".finalizeTMP should produce predicted column from a data.table input") + + +# Test 4b: MSstatsSummarizationOutput correctly splits the nested list --- +# +# Build a minimal 'summarized' in the nested format that the summarization +# loop produces: list(list(protein_result, survival), ...). +# Verify that MSstatsSummarizationOutput decomposes it correctly and +# produces valid output. + +# Use the real pipeline on a small built-in dataset to get a realistic +# summarized list, then verify the output structure. +MSstatsConvert::MSstatsLogsSettings(FALSE) +small_input = MSstatsPrepareForDataProcess(SRMRawData, 2, NULL) +small_input = MSstatsNormalize(small_input, "EQUALIZEMEDIANS") +small_input = MSstatsMergeFractions(small_input) +small_input = MSstatsHandleMissing(small_input, "TMP", TRUE, "NA", 0.999) +small_input = MSstatsSelectFeatures(small_input, "all") +processed = getProcessed(small_input) +small_input = MSstatsPrepareForSummarization(small_input, "TMP", TRUE, "NA", FALSE) +summarized_list = MSstatsSummarizeWithSingleCore(small_input, "TMP", TRUE, "NA", + FALSE, TRUE, 90) + +# Verify summarized_list is a nested list (the format we're testing). +expect_true(is.list(summarized_list), + info = "Summarized should be a list") +expect_true(is.list(summarized_list[[1]]), + info = "Each element should be a list(protein_result, survival)") +expect_equal(length(summarized_list[[1]]), 2, + info = "Each element should have exactly 2 components") + +# Now call MSstatsSummarizationOutput — this is where Issue 4 fix lives. +# It should decompose the nested list, free it, and produce correct output. +output = MSstatsSummarizationOutput(small_input, summarized_list, processed, + "TMP", TRUE, "NA") + +expect_true(!is.null(output$FeatureLevelData), + info = "Output should have FeatureLevelData") +expect_true(!is.null(output$ProteinLevelData), + info = "Output should have ProteinLevelData") +expect_true(nrow(output$FeatureLevelData) > 0, + info = "FeatureLevelData should have rows") +expect_true(nrow(output$ProteinLevelData) > 0, + info = "ProteinLevelData should have rows") +expect_true("newABUNDANCE" %in% colnames(output$FeatureLevelData), + info = "FeatureLevelData should contain newABUNDANCE (from imputation)") +expect_true("predicted" %in% colnames(output$FeatureLevelData), + info = "FeatureLevelData should contain predicted column") + + +# Test 4c: Memory — nested list is freed before .finalizeTMP runs --- +# +# We can't directly observe rm(summarized) inside MSstatsSummarizationOutput, +# but we can measure the total memory footprint. With a larger dataset, the +# old approach held the full nested list (~200 MB) alive through .finalizeTMP. +# The new approach frees it before .finalizeTMP runs, so peak should be lower. +# +# We measure by comparing: the size of the nested summarized list vs the +# size of predicted_survival alone. The fix ensures only predicted_survival +# (the smaller half) is alive during .finalizeTMP, not both halves. + +# Measure sizes of the two halves from the real summarized list. +survival_tables = lapply(summarized_list, function(x) x[[2]]) +protein_tables = lapply(summarized_list, function(x) x[[1]]) + +size_full_list = as.numeric(object.size(summarized_list)) +size_survival_only = as.numeric(object.size(survival_tables)) +size_protein_only = as.numeric(object.size(protein_tables)) +size_combined_survival = as.numeric( + object.size(data.table::rbindlist(survival_tables, fill = TRUE))) + +# The combined predicted_survival table (what .finalizeTMP receives now) +# should be smaller than the full nested list (what it received before). +expect_true(size_combined_survival < size_full_list, + info = paste("Combined predicted_survival should be smaller than", + "the full nested list.", + "predicted_survival:", size_combined_survival, "bytes.", + "Full nested list:", size_full_list, "bytes.", + "Savings:", size_full_list - size_combined_survival, "bytes.", + "This is the memory .finalizeTMP no longer holds.")) From b64b004d0faad3418d365cbbf8ec937d385acd63 Mon Sep 17 00:00:00 2001 From: Rudhik1904 Date: Fri, 15 May 2026 00:11:53 -0500 Subject: [PATCH 2/7] Updating the comments. --- R/utils_checks.R | 7 -- R/utils_feature_selection.R | 5 - R/utils_normalize.R | 6 - R/utils_output.R | 12 -- .../test_memory_optimization_copies.R | 105 ++++++------------ 5 files changed, 35 insertions(+), 100 deletions(-) diff --git a/R/utils_checks.R b/R/utils_checks.R index 97dbc9be..2051c019 100644 --- a/R/utils_checks.R +++ b/R/utils_checks.R @@ -211,10 +211,6 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) { cols = toupper(cols) cols = intersect(c(cols, "FRACTION", "TECHREPLICATE"), colnames(input)) - # Drop unwanted columns by reference rather than subsetting with - # `input[, cols, with = FALSE]`, which would deep-copy every retained - # column into a new data.table. On large inputs this subset was - # allocating hundreds of MB (~nrow x ncol x 8 bytes). drop_cols = setdiff(colnames(input), cols) for (col in drop_cols) data.table::set(input, j = col, value = NULL) @@ -327,9 +323,6 @@ setMethod(".checkDataValidity", "MSstatsValidated", .prepareForDataProcess) input[, PROTEIN := factor(PROTEIN)] input[, PEPTIDE := factor(PEPTIDE)] input[, TRANSITION := factor(TRANSITION)] - # Sort in place instead of `input = input[order(...), ]`, which builds - # a full row-reshuffle copy of the whole table. setorder rewrites the - # underlying column vectors without allocating a new data.table. data.table::setorder(input, LABEL, GROUP_ORIGINAL, SUBJECT_ORIGINAL, RUN, PROTEIN, PEPTIDE, TRANSITION) input[, GROUP := factor(GROUP)] diff --git a/R/utils_feature_selection.R b/R/utils_feature_selection.R index 65dfed26..0766e63b 100644 --- a/R/utils_feature_selection.R +++ b/R/utils_feature_selection.R @@ -78,11 +78,6 @@ MSstatsSelectFeatures = function(input, method, top_n = 3, min_feature_count = 2 is_obs = log2inty = LABEL = NULL has_censored = is.element("censored", colnames(input)) - # Build the working copy in a single operation. Previously this was two - # steps: (1) column-subset via input[, cols, with = FALSE] which deep- - # copied 7 columns from the 17-column table (~120 MB), then (2) a - # transform that created another new data.table (~100 MB). Combining - # them into one expression eliminates the intermediate copy. work = input[, list(protein = as.character(PROTEIN), peptide = as.character(PEPTIDE), feature = as.character(FEATURE), diff --git a/R/utils_normalize.R b/R/utils_normalize.R index 609abfe6..7a617596 100644 --- a/R/utils_normalize.R +++ b/R/utils_normalize.R @@ -347,8 +347,6 @@ MSstatsMergeFractions = function(input) { SUBJECT_ORIGINAL, newRun)]) - # In-place add of newRun via keyed-match lookup instead of - # `merge(all.x=TRUE)`, which would deep-copy the whole table. nr_idx = match_runs[input, on = c("GROUP_ORIGINAL", "SUBJECT_ORIGINAL"), which = TRUE, mult = "first"] @@ -358,10 +356,6 @@ MSstatsMergeFractions = function(input) { list(ncount = .N), by = c("FEATURE", "FRACTION")] select_fraction = select_fraction[ncount != 0] - # Filter by (FEATURE, FRACTION) pair directly instead of - # synthesising a concatenated "tmp" string column on both - # sides. Saves two 207k-string character vectors plus the - # paste() cost each call. keep_idx = select_fraction[input, on = c("FEATURE", "FRACTION"), which = TRUE, mult = "first"] diff --git a/R/utils_output.R b/R/utils_output.R index 9b49aae7..13d60750 100644 --- a/R/utils_output.R +++ b/R/utils_output.R @@ -133,10 +133,6 @@ MSstatsSummarizationOutput = function(input, summarized, processed, INTENSITY = newABUNDANCE = NumImputedFeature = NULL if (impute) { - # Join columns must exist in BOTH tables. The per-protein survival - # tables built in MSstatsSummarizeSingleLinear / TMP exclude LABEL, - # so intersect with predicted_survival to avoid a missing-column - # error when LABEL is in input but not in the survival table. join_cols = intersect(intersect(colnames(input), colnames(predicted_survival)), c("cen", "RUN", "FEATURE", "ref_covariate", @@ -160,10 +156,6 @@ MSstatsSummarizationOutput = function(input, summarized, processed, } else { input[, nonmissing_orig := !is.na(INTENSITY)] } - # In-place conditional write: only overwrite nonmissing_orig for - # the rows where newABUNDANCE is NA. `ifelse(...)` would build a - # full-length logical vector for every row regardless; this writes - # only the rows that need changing. input[is.na(newABUNDANCE), nonmissing_orig := TRUE] if (impute) { input[, NumImputedFeature := sum(!nonmissing_orig), @@ -195,10 +187,6 @@ MSstatsSummarizationOutput = function(input, summarized, processed, } else { input[, nonmissing_orig := !is.na(INTENSITY)] } - # In-place conditional write: only overwrite nonmissing_orig for - # the rows where newABUNDANCE is NA. `ifelse(...)` would build a - # full-length logical vector for every row regardless; this writes - # only the rows that need changing. input[is.na(newABUNDANCE), nonmissing_orig := TRUE] input[, NumImputedFeature := 0] } diff --git a/inst/tinytest/test_memory_optimization_copies.R b/inst/tinytest/test_memory_optimization_copies.R index bcd9656e..ebabc62e 100644 --- a/inst/tinytest/test_memory_optimization_copies.R +++ b/inst/tinytest/test_memory_optimization_copies.R @@ -1,28 +1,20 @@ -# Tests for memory optimizations in the dataProcess pipeline that target -# data.table deep-copy ops: in-place column drops, merge() -> keyed lookup, -# ifelse() -> targeted [i, j := v] writes, and the predicted_survival -# extraction in MSstatsSummarizationOutput. +# Tests for in-place data.table operations in the dataProcess pipeline: +# column drops, keyed-lookup joins, targeted [i, j := v] writes, and +# the predicted_survival extraction in MSstatsSummarizationOutput. -# --- Issue 2: Column Removal via Subsetting Creates Full Copies --------------- +# --- .normalizeMedian: column drops are in place ------------------------------ # # .normalizeMedian() adds two temp columns (ABUNDANCE_RUN, ABUNDANCE_FRACTION) -# via := (in-place), uses them, then removes them. Previously it used -# input[, !(colnames(input) %in% ...), with = FALSE] which created a full -# copy of the data.table (~250 MB for a real dataset). Now it uses -# data.table::set(input, j = ..., value = NULL) which removes in-place. -# -# We test this by checking that the data.table's memory address is preserved -# after the function call — proving no copy was made. +# via :=, uses them, then removes them with data.table::set(j=, value=NULL) +# so the data.table is modified in-place. We verify the address is preserved. -# Suppress MSstats log messages during test MSstatsConvert::MSstatsLogsSettings(FALSE) -# Test 2a: .normalizeMedian preserves data.table identity (no copy) --- +# Test 2a: .normalizeMedian preserves data.table identity --- # # data.table::address() returns the hex pointer of the data.table object. -# If the function modifies in-place, the address stays the same. -# If it creates a copy (the old bug), the address changes. +# An in-place modification preserves the address; a copy changes it. norm_input = data.table::data.table( PROTEIN = factor(rep(c("P1", "P2"), each = 20)), @@ -60,10 +52,8 @@ expect_true("ABUNDANCE" %in% colnames(norm_result), # Test 2b: .normalizeMedian does not allocate a full-table copy --- # -# Create a larger dataset and measure memory allocated during the call. # With in-place set(), the only allocations should be the two temp column -# vectors (~2x nrow doubles). With the old column-subset approach, the -# allocation would be ~15x nrow doubles (copying all kept columns). +# vectors (~2x nrow doubles), not the whole table. n_rows = 100000 big_input = data.table::data.table( @@ -127,24 +117,19 @@ expect_false("ABUNDANCE_RUN" %in% colnames(big_result), info = "Large table: temp column ABUNDANCE_RUN should be removed") -# --- Issue 3: .finalizeTMP Merge Duplicates the Entire Input ------------------ +# --- .finalizeTMP: in-place update from predicted_survival -------------------- # # .finalizeTMP replaces the pre-imputation newABUNDANCE column in the full -# feature-level table with imputed values from predicted_survival. Previously -# it did: merge(input[, cols != "newABUNDANCE", with=FALSE], predicted_survival) -# which created a column-subset copy (~333 MB) and a merge result (~370 MB), -# peaking at ~1,133 MB for ~350 MB of data. -# -# The fix uses in-place operations: -# set(input, j = "newABUNDANCE", value = NULL) — remove old column (0 bytes) -# input[, c(...) := .(NA_real_, NA_real_)] — add NA placeholders (~40 MB) -# input[predicted_survival, ... := ..., on = .] — update matched rows (0 bytes) +# feature-level table with imputed values from predicted_survival, using: +# set(input, j = "newABUNDANCE", value = NULL) — remove old column +# input[, c(...) := .(NA_real_, NA_real_)] — add NA placeholders +# input[predicted_survival, ... := ..., on = .] — update matched rows # # These tests verify: # (a) The data.table is modified in-place (address preserved) # (b) Matched rows get imputed values from predicted_survival -# (c) Unmatched rows get NA (same as the old merge with all.x = TRUE) -# (d) Memory allocation is minimal compared to a full-table copy +# (c) Unmatched rows are left as NA (equivalent to all.x = TRUE) +# (d) Allocation stays small relative to a full-table copy # Test 3a: .finalizeTMP modifies input in-place with correct values --- @@ -206,7 +191,7 @@ result = MSstats:::.finalizeTMP(finalize_input, censored_symbol = "NA", addr_after = data.table::address(result) -# (a) Address preserved — the data.table was modified in-place, no full copy. +# (a) Address preserved — the data.table was modified in-place. expect_equal(addr_before, addr_after, info = paste(".finalizeTMP should modify input in-place.", "Before:", addr_before, "After:", addr_after)) @@ -236,10 +221,8 @@ expect_true("predicted" %in% colnames(result), # Test 3b: .finalizeTMP at scale — memory stays low --- # -# Create a larger dataset and verify that peak memory allocation during -# .finalizeTMP is far below what a full-table merge would require. -# With the old merge approach, peak would be ~3x the table size. -# With in-place join, peak should be close to ~1x (just the table + 2 new cols). +# At ~50K rows, .finalizeTMP should modify the input in place; peak Vcells +# should stay close to the input size, not balloon to a multi-table merge. n_big = 50000 n_runs = 50 @@ -289,8 +272,7 @@ big_finalize_result = MSstats:::.finalizeTMP( predicted_survival = big_predicted) gc_issue3 = gc() -# Peak Vcells (MB) — informational. The old merge approach would show a peak -# roughly 3x the input size. The in-place approach should be much less. +# Peak Vcells (MB) — informational. peak_mb_issue3 = gc_issue3[2, 6] # Address preserved = in-place modification, no full-table copy. @@ -316,33 +298,23 @@ expect_true(na_count > 0, sum(unmatched_mask), "unmatched rows")) -# --- Issue 4: survival_predictions List Duplication --------------------------- -# -# MSstatsSummarizationOutput used to pass the full nested 'summarized' list -# (both protein results AND survival predictions for every protein) into -# .finalizeTMP. That function only needed the survival predictions, but the -# protein results (~50% of the list) were held hostage in memory. +# --- MSstatsSummarizationOutput / .finalizeTMP: predicted_survival contract --- # -# The fix: MSstatsSummarizationOutput now splits the list upfront: +# MSstatsSummarizationOutput splits the nested 'summarized' list upfront: # predicted_survival = rbindlist(lapply(summarized, function(x) x[[2]])) # protein_summaries = lapply(summarized, function(x) x[[1]]) -# rm(summarized) — frees the nested list immediately -# -# .finalizeTMP now receives predicted_survival directly (a combined data.table), -# not the full nested list. This means: -# (a) .finalizeTMP never sees the protein results — they can't waste memory -# (b) The nested list is freed before .finalizeTMP runs +# rm(summarized) +# .finalizeTMP receives predicted_survival directly (a combined data.table), +# not the full nested list. # # These tests verify: -# (a) .finalizeTMP accepts predicted_survival directly (not a nested list) +# (a) .finalizeTMP accepts predicted_survival as a data.table # (b) MSstatsSummarizationOutput correctly decomposes the nested list -# (c) Memory: the nested list doesn't persist through .finalizeTMP +# (c) The nested list does not persist through .finalizeTMP # Test 4a: .finalizeTMP takes a combined data.table, not a nested list --- # -# Verify the function signature change: .finalizeTMP's 4th parameter is now -# 'predicted_survival' (a data.table), not 'summarized' (a nested list). -# If someone reverts to the old code, passing a data.table would break. +# .finalizeTMP's 4th parameter is 'predicted_survival', a data.table. # Reuse finalize_input from Issue 3 tests (rebuild if needed since # .finalizeTMP modified it in-place above). @@ -418,8 +390,8 @@ expect_true(is.list(summarized_list[[1]]), expect_equal(length(summarized_list[[1]]), 2, info = "Each element should have exactly 2 components") -# Now call MSstatsSummarizationOutput — this is where Issue 4 fix lives. -# It should decompose the nested list, free it, and produce correct output. +# MSstatsSummarizationOutput should decompose the nested list and produce +# valid FeatureLevelData / ProteinLevelData. output = MSstatsSummarizationOutput(small_input, summarized_list, processed, "TMP", TRUE, "NA") @@ -437,16 +409,11 @@ expect_true("predicted" %in% colnames(output$FeatureLevelData), info = "FeatureLevelData should contain predicted column") -# Test 4c: Memory — nested list is freed before .finalizeTMP runs --- -# -# We can't directly observe rm(summarized) inside MSstatsSummarizationOutput, -# but we can measure the total memory footprint. With a larger dataset, the -# old approach held the full nested list (~200 MB) alive through .finalizeTMP. -# The new approach frees it before .finalizeTMP runs, so peak should be lower. +# Test 4c: predicted_survival alone is smaller than the full nested list --- # -# We measure by comparing: the size of the nested summarized list vs the -# size of predicted_survival alone. The fix ensures only predicted_survival -# (the smaller half) is alive during .finalizeTMP, not both halves. +# .finalizeTMP receives only predicted_survival (the survival half of the +# nested summarized list), not the full list. We verify the combined +# predicted_survival table is smaller than the full nested list. # Measure sizes of the two halves from the real summarized list. survival_tables = lapply(summarized_list, function(x) x[[2]]) @@ -464,6 +431,4 @@ expect_true(size_combined_survival < size_full_list, info = paste("Combined predicted_survival should be smaller than", "the full nested list.", "predicted_survival:", size_combined_survival, "bytes.", - "Full nested list:", size_full_list, "bytes.", - "Savings:", size_full_list - size_combined_survival, "bytes.", - "This is the memory .finalizeTMP no longer holds.")) + "Full nested list:", size_full_list, "bytes.")) From e2bd771cd7a8fc507aea335318238401a39569f4 Mon Sep 17 00:00:00 2001 From: Rudhik1904 Date: Fri, 15 May 2026 00:14:56 -0500 Subject: [PATCH 3/7] Updating some more comments. --- inst/tinytest/test_memory_optimization_copies.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/inst/tinytest/test_memory_optimization_copies.R b/inst/tinytest/test_memory_optimization_copies.R index ebabc62e..b7d45563 100644 --- a/inst/tinytest/test_memory_optimization_copies.R +++ b/inst/tinytest/test_memory_optimization_copies.R @@ -270,16 +270,16 @@ gc(reset = TRUE) big_finalize_result = MSstats:::.finalizeTMP( big_finalize_input, censored_symbol = "NA", impute = TRUE, predicted_survival = big_predicted) -gc_issue3 = gc() +gc_after_finalize = gc() # Peak Vcells (MB) — informational. -peak_mb_issue3 = gc_issue3[2, 6] +peak_mb_finalize = gc_after_finalize[2, 6] # Address preserved = in-place modification, no full-table copy. expect_equal(addr_big_before, data.table::address(big_finalize_result), info = paste("Large .finalizeTMP: should modify in-place.", "Input size:", round(input_size / 1e6, 1), "MB.", - "Peak Vcells:", peak_mb_issue3, "MB")) + "Peak Vcells:", peak_mb_finalize, "MB")) # newABUNDANCE and predicted columns should both exist. expect_true("newABUNDANCE" %in% colnames(big_finalize_result), @@ -316,8 +316,8 @@ expect_true(na_count > 0, # # .finalizeTMP's 4th parameter is 'predicted_survival', a data.table. -# Reuse finalize_input from Issue 3 tests (rebuild if needed since -# .finalizeTMP modified it in-place above). +# Rebuild a finalize_input fixture since .finalizeTMP modified the previous +# one in-place above. finalize_input_4 = data.table::data.table( PROTEIN = factor(rep(c("P1", "P2"), each = 6)), FEATURE = factor(rep(c("feat1", "feat2"), each = 3, times = 2)), From 65df2b553980e6e4bcb126918de571cdc2a6b2e6 Mon Sep 17 00:00:00 2001 From: Rudhik1904 Date: Tue, 2 Jun 2026 00:39:17 -0500 Subject: [PATCH 4/7] Responding to the comments. --- .../test_memory_optimization_copies.R | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/inst/tinytest/test_memory_optimization_copies.R b/inst/tinytest/test_memory_optimization_copies.R index b7d45563..25e4ef8d 100644 --- a/inst/tinytest/test_memory_optimization_copies.R +++ b/inst/tinytest/test_memory_optimization_copies.R @@ -102,12 +102,19 @@ peak_vcells_mb = gc_after[2, 6] # Sanity check on the test data itself (not on the function): # Verify that 2 temp columns are much smaller than the full table. # If they were similar sizes, we couldn't distinguish "allocated 2 temp -# columns" from "copied the whole table" and the test would be meaningless. -# For our 11-column, 100K-row table: 2 cols ≈ 1.6 MB, full table ≈ 6 MB. -expect_true(one_col_size * 2 < table_size * 0.5, - info = paste("Sanity check: 2 columns should be much less than", - "50% of the table. 2 cols:", one_col_size * 2, - "50% table:", table_size * 0.5)) +# columns" from "copied the whole table" and the gc() figure above would be +# meaningless. For our 11-column, 100K-row table: 2 cols ≈ 1.6 MB, full ≈ 6 MB. +# +# This only guards the *informational* gc() measurement, which we do not +# assert on (see note above). object.size() accounting depends on R internals +# out of our control, so a wrong result here emits a message rather than +# failing the test run; the deterministic address() check below is the real test. +if (!(one_col_size * 2 < table_size * 0.5)) { + message("Note: fixture sanity check off -- 2 cols (", + round(one_col_size * 2 / 1e6, 2), " MB) not < 50% of table (", + round(table_size * 0.5 / 1e6, 2), " MB). ", + "gc-based memory figures may be unreliable on this platform.") +} # The real test: same address = same R object = no copy was made. # This is deterministic and reliable, unlike gc() measurements. From 0764b3471b8a9a8b31e90bff39fa159b499d9410 Mon Sep 17 00:00:00 2001 From: Rudhik1904 Date: Tue, 2 Jun 2026 00:39:34 -0500 Subject: [PATCH 5/7] Responding to comments- 2 --- R/utils_feature_selection.R | 24 ++++++++++++------------ R/utils_normalize.R | 7 +++---- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/R/utils_feature_selection.R b/R/utils_feature_selection.R index 0766e63b..6892232a 100644 --- a/R/utils_feature_selection.R +++ b/R/utils_feature_selection.R @@ -78,18 +78,18 @@ MSstatsSelectFeatures = function(input, method, top_n = 3, min_feature_count = 2 is_obs = log2inty = LABEL = NULL has_censored = is.element("censored", colnames(input)) - work = input[, list(protein = as.character(PROTEIN), - peptide = as.character(PEPTIDE), - feature = as.character(FEATURE), - run = as.character(originalRUN), - label = as.character(LABEL), - log2inty = ifelse(!(is.na(ABUNDANCE) | - if (has_censored) censored else FALSE), - ABUNDANCE, NA), - is_obs = FALSE)] - work[, is_obs := !is.na(log2inty)] - - features_quality = data.table::rbindlist(lapply(split(work, work$label), + input = input[, list(protein = as.character(PROTEIN), + peptide = as.character(PEPTIDE), + feature = as.character(FEATURE), + run = as.character(originalRUN), + label = as.character(LABEL), + log2inty = ifelse(!(is.na(ABUNDANCE) | + if (has_censored) censored else FALSE), + ABUNDANCE, NA), + is_obs = FALSE)] + input[, is_obs := !is.na(log2inty)] + + features_quality = data.table::rbindlist(lapply(split(input, input$label), .flagUninformativeSingleLabel, min_feature_count = min_feature_count)) features_quality diff --git a/R/utils_normalize.R b/R/utils_normalize.R index 7a617596..d2f96423 100644 --- a/R/utils_normalize.R +++ b/R/utils_normalize.R @@ -360,10 +360,9 @@ MSstatsMergeFractions = function(input) { on = c("FEATURE", "FRACTION"), which = TRUE, mult = "first"] input = input[!is.na(keep_idx)] - input$originalRUN = input$newRun - input$RUN = input$originalRUN - input$RUN = factor(input$RUN, levels = unique(input$RUN), - labels = seq_along(unique(input$RUN))) + input[, originalRUN := newRun] + input[, RUN := factor(newRun, levels = unique(newRun), + labels = seq_along(unique(newRun)))] data.table::set(input, j = "newRun", value = NULL) } } From 27d5f7ca58a233afc652b399a65405b2d6559e10 Mon Sep 17 00:00:00 2001 From: Rudhik1904 Date: Tue, 9 Jun 2026 01:49:55 -0500 Subject: [PATCH 6/7] Responding to Tony's suggestions --- R/utils_feature_selection.R | 11 +-- R/utils_normalize.R | 31 ++++++-- R/utils_output.R | 33 +++++---- inst/tinytest/test_MSstatsMergeFractions.R | 85 ++++++++++++++++++++++ man/dot-finalizeInput.Rd | 6 +- man/dot-finalizeTMP.Rd | 4 +- 6 files changed, 135 insertions(+), 35 deletions(-) create mode 100644 inst/tinytest/test_MSstatsMergeFractions.R diff --git a/R/utils_feature_selection.R b/R/utils_feature_selection.R index 6892232a..630b37a6 100644 --- a/R/utils_feature_selection.R +++ b/R/utils_feature_selection.R @@ -75,7 +75,7 @@ MSstatsSelectFeatures = function(input, method, top_n = 3, min_feature_count = 2 #' @keywords internal .selectHighQualityFeatures = function(input, min_feature_count) { PROTEIN = PEPTIDE = FEATURE = originalRUN = ABUNDANCE = censored = NULL - is_obs = log2inty = LABEL = NULL + is_obs = log2inty = is_censored = LABEL = NULL has_censored = is.element("censored", colnames(input)) input = input[, list(protein = as.character(PROTEIN), @@ -83,11 +83,12 @@ MSstatsSelectFeatures = function(input, method, top_n = 3, min_feature_count = 2 feature = as.character(FEATURE), run = as.character(originalRUN), label = as.character(LABEL), - log2inty = ifelse(!(is.na(ABUNDANCE) | - if (has_censored) censored else FALSE), - ABUNDANCE, NA), - is_obs = FALSE)] + log2inty = ABUNDANCE, + is_censored = if (has_censored) censored else FALSE)] + # Censored or missing intensities are not observations. + input[is_censored | is.na(log2inty), log2inty := NA] input[, is_obs := !is.na(log2inty)] + input[, is_censored := NULL] features_quality = data.table::rbindlist(lapply(split(input, input$label), .flagUninformativeSingleLabel, diff --git a/R/utils_normalize.R b/R/utils_normalize.R index d2f96423..936a5e59 100644 --- a/R/utils_normalize.R +++ b/R/utils_normalize.R @@ -284,7 +284,7 @@ MSstatsNormalize = function(input, normalization_method, peptides_dict = NULL, s #' MSstatsMergeFractions = function(input) { ABUNDANCE = INTENSITY = GROUP_ORIGINAL = SUBJECT_ORIGINAL = RUN = NULL - originalRUN = FRACTION = TECHREPLICATE = tmp = merged = newRun = NULL + originalRUN = FRACTION = TECHREPLICATE = tmp = newRun = NULL ncount = FEATURE = NULL input[!is.na(ABUNDANCE) & ABUNDANCE < 0, "ABUNDANCE"] = 0 @@ -340,29 +340,44 @@ MSstatsMergeFractions = function(input) { getOption("MSstatsLog")("ERROR", msg) stop(msg) } else { - match_runs[, merged := "merged"] - match_runs[, newRun := do.call(paste, c(.SD, sep = "_")), - .SDcols = c(1:3, ncol(match_runs))] - match_runs = unique(match_runs[, list(GROUP_ORIGINAL, - SUBJECT_ORIGINAL, - newRun)]) - + # dcast pivoted fractions into columns, so the fraction columns + # are everything that is not a sample-identifier column. + fraction_cols = setdiff(colnames(match_runs), + c("GROUP_ORIGINAL", "SUBJECT_ORIGINAL")) + # Use the first fraction's run as the sample's representative run. + first_fraction_run = match_runs[[fraction_cols[1]]] + # Build the merged-run name: ___merged. + match_runs[, newRun := paste(GROUP_ORIGINAL, SUBJECT_ORIGINAL, + first_fraction_run, "merged", + sep = "_")] + # Reduce to a (group, subject) -> merged-run lookup table. + match_runs = match_runs[, list(GROUP_ORIGINAL, SUBJECT_ORIGINAL, + newRun)] + # For each input row, find its sample's row in the lookup. nr_idx = match_runs[input, on = c("GROUP_ORIGINAL", "SUBJECT_ORIGINAL"), which = TRUE, mult = "first"] + # Write that merged-run name onto every input row. data.table::set(input, j = "newRun", value = match_runs$newRun[nr_idx]) + # Count, per feature/fraction, the rows observed above zero. select_fraction = input[!is.na(ABUNDANCE) & input$ABUNDANCE > 0, list(ncount = .N), by = c("FEATURE", "FRACTION")] + # Keep only feature/fraction combinations seen at least once. select_fraction = select_fraction[ncount != 0] + # Mark which input rows belong to a kept combination (NA = none). keep_idx = select_fraction[input, on = c("FEATURE", "FRACTION"), which = TRUE, mult = "first"] + # Drop rows whose feature/fraction was never observed. input = input[!is.na(keep_idx)] + # The merged run replaces the original per-fraction run. input[, originalRUN := newRun] + # Renumber RUN as a factor, one level per merged run. input[, RUN := factor(newRun, levels = unique(newRun), labels = seq_along(unique(newRun)))] + # Drop the temporary newRun helper column. data.table::set(input, j = "newRun", value = NULL) } } diff --git a/R/utils_output.R b/R/utils_output.R index 13d60750..75a7d60b 100644 --- a/R/utils_output.R +++ b/R/utils_output.R @@ -38,25 +38,27 @@ MSstatsSummarizationOutput = function(input, summarized, processed, method, impute, censored_symbol) { LABEL = TotalGroupMeasurements = GROUP = Protein = RUN = NULL - predicted_survival = data.table::rbindlist(lapply(summarized, function(x) x[[2]]), - fill = TRUE) - for (i in seq_along(summarized)) summarized[[i]][[2]] = NULL - input = .finalizeInput(input, predicted_survival, method, impute, censored_symbol) - rm(predicted_survival) - protein_summaries = lapply(summarized, function(x) x[[1]]) - rm(summarized) - summarized = data.table::rbindlist(protein_summaries, fill = TRUE) - rm(protein_summaries) - - if (inherits(summarized, "try-error")) { + # Summarization failure is signalled by a NULL `summarized` (the caller + # wraps the per-subplot summarization in tryCatch(error = ...) returning + # NULL). Guard before unpacking, otherwise .finalizeInput() below joins on + # an empty predicted_survival and errors. + if (is.null(summarized)) { msg = paste("*** error : can't summarize per subplot with ", method, ".") getOption("MSstatsLog")("ERROR", msg) getOption("MSstatsMsg")("ERROR", msg) rqall = NULL - rqmodelqc = NULL - workpred = NULL } else { + predicted_survival = data.table::rbindlist(lapply(summarized, function(x) x[[2]]), + fill = TRUE) + for (i in seq_along(summarized)) summarized[[i]][[2]] = NULL + input = .finalizeInput(input, predicted_survival, method, impute, censored_symbol) + rm(predicted_survival) + protein_summaries = lapply(summarized, function(x) x[[1]]) + rm(summarized) + summarized = data.table::rbindlist(protein_summaries, fill = TRUE) + rm(protein_summaries) + input[, TotalGroupMeasurements := uniqueN(.SD), by = c("PROTEIN", "GROUP", "LABEL"), .SDcols = c("FEATURE", "originalRUN")] @@ -74,10 +76,9 @@ MSstatsSummarizationOutput = function(input, summarized, processed, by.y = c(merge_col, "PROTEIN", "LABEL")) data.table::setnames(rqall, c("GROUP_ORIGINAL", "SUBJECT_ORIGINAL"), c("GROUP", "SUBJECT"), skip_absent = TRUE) - + rqall$GROUP = factor(as.character(rqall$GROUP)) rqall$Protein = factor(rqall$Protein) - rqmodelqc = summarized$ModelQC } if (is.element("RUN", colnames(rqall)) & !is.null(rqall)) { @@ -99,7 +100,7 @@ MSstatsSummarizationOutput = function(input, summarized, processed, input = rbind(input, processed, fill = TRUE) } data.table::setDF(input) - data.table::setDF(rqall) + if (!is.null(rqall)) data.table::setDF(rqall) list(FeatureLevelData = input, ProteinLevelData = rqall, SummaryMethod = method) diff --git a/inst/tinytest/test_MSstatsMergeFractions.R b/inst/tinytest/test_MSstatsMergeFractions.R new file mode 100644 index 00000000..ece19714 --- /dev/null +++ b/inst/tinytest/test_MSstatsMergeFractions.R @@ -0,0 +1,85 @@ +# Characterization tests for MSstatsMergeFractions -------------------------- +# +# MSstatsMergeFractions collapses the multiple fraction-runs of a sample into a +# single merged run and drops feature/fraction combinations that were never +# observed. The function is hard to read, so these tests pin its observable +# behavior so the in-place-update rewrite (and any future simplification) stays +# equivalent. They focus on the non-TECHREPLICATE branch, which is the part this +# change rewrote; the TECHREPLICATE branch is unchanged from the previous code. + +MSstatsConvert::MSstatsLogsSettings(FALSE) + +# Two subjects, two fractions each. f2 in fraction 2 is all-zero abundance, so +# that (FEATURE, FRACTION) combination has no positive observation. +make_fraction_input <- function(f2_frac2 = c(0.0, 0.0)) { + input <- data.table::data.table( + PROTEIN = "P1", + FEATURE = c("f1", "f1", "f2", "f2", "f1", "f1", "f2", "f2"), + GROUP_ORIGINAL = "G", + SUBJECT_ORIGINAL = c("S1", "S1", "S1", "S1", "S2", "S2", "S2", "S2"), + RUN = factor(c(1, 2, 1, 2, 3, 4, 3, 4)), + originalRUN = c("a1", "a2", "a1", "a2", "b1", "b2", "b1", "b2"), + FRACTION = c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), + INTENSITY = c(10, 20, 30, 40, 50, 60, 70, 80), + ABUNDANCE = c(3.0, 4.0, 2.0, 0.0, 3.5, 4.5, 2.5, 0.0) + ) + # The two f2 / fraction-2 rows (positions 4 and 8) take the parametrized + # abundance. Assigned after construction to avoid referencing a local in a + # data.table() column expression. + input$ABUNDANCE[c(4L, 8L)] <- f2_frac2 + input +} + +# Single fraction -> nothing to merge, returned untouched (besides clamping) --- +single_fraction = data.table::data.table( + PROTEIN = "P1", FEATURE = c("f1", "f2"), + GROUP_ORIGINAL = "G", SUBJECT_ORIGINAL = "S1", + RUN = factor(c(1, 1)), originalRUN = c("a1", "a1"), FRACTION = c(1L, 1L), + INTENSITY = c(10, 20), ABUNDANCE = c(3.0, 4.0)) +res_single = MSstatsMergeFractions(data.table::copy(single_fraction)) +expect_equal(nrow(res_single), 2L, + info = "single-fraction input keeps all rows") +expect_equal(res_single$originalRUN, c("a1", "a1"), + info = "single-fraction input leaves originalRUN untouched") + +# Abundance clamping happens before the fraction check ------------------------ +# ABUNDANCE < 0 becomes 0; an INTENSITY of exactly 1 forces ABUNDANCE to 0. +clamp_input = data.table::data.table( + PROTEIN = "P1", FEATURE = c("f1", "f2"), + GROUP_ORIGINAL = "G", SUBJECT_ORIGINAL = "S1", + RUN = factor(c(1, 1)), originalRUN = c("a1", "a1"), FRACTION = c(1L, 1L), + INTENSITY = c(1, 50), ABUNDANCE = c(2.5, -3.0)) +res_clamp = MSstatsMergeFractions(data.table::copy(clamp_input)) +expect_equal(res_clamp$ABUNDANCE, c(0, 0), + info = "INTENSITY == 1 and negative ABUNDANCE are both clamped to 0") + +# Non-TECHREPLICATE: fractions merge and unobserved combos drop --------------- +res = MSstatsMergeFractions(make_fraction_input()) + +# f2/fraction-2 was all-zero, so those 2 rows are dropped; 6 remain. +expect_equal(nrow(res), 6L, + info = "feature/fraction with no positive abundance is removed") +expect_equal(nrow(res[FEATURE == "f2" & FRACTION == 2L]), 0L, + info = "the unobserved f2/fraction-2 rows are gone") + +# Each sample's fraction-runs collapse to one '___merged'. +expect_equal(sort(unique(res$originalRUN)), + c("G_S1_a1_merged", "G_S2_b1_merged"), + info = "originalRUN is the merged run name, one per sample") + +# RUN is refactored to one level per merged run (2 samples -> 2 levels). +expect_true(is.factor(res$RUN), info = "RUN stays a factor") +expect_equal(nlevels(res$RUN), 2L, + info = "two merged samples give two RUN levels") + +# The temporary newRun column must not leak into the output. +expect_false("newRun" %in% colnames(res), + info = "newRun helper column is removed") + +# Non-TECHREPLICATE: fully observed features are all kept --------------------- +res_keep = MSstatsMergeFractions(make_fraction_input(f2_frac2 = c(1.0, 1.5))) +expect_equal(nrow(res_keep), 8L, + info = "nothing is dropped when every feature/fraction is observed") +expect_equal(sort(unique(paste(res_keep$FEATURE, res_keep$FRACTION))), + c("f1 1", "f1 2", "f2 1", "f2 2"), + info = "all feature/fraction combinations survive") diff --git a/man/dot-finalizeInput.Rd b/man/dot-finalizeInput.Rd index 8695d188..9ed07548 100644 --- a/man/dot-finalizeInput.Rd +++ b/man/dot-finalizeInput.Rd @@ -4,18 +4,18 @@ \alias{.finalizeInput} \title{Add summary statistics to dataProcess output} \usage{ -.finalizeInput(input, summarized, method, impute, censored_symbol) +.finalizeInput(input, predicted_survival, method, impute, censored_symbol) } \arguments{ \item{input}{feature-level data} -\item{summarized}{protein-level data (list)} - \item{method}{summary method} \item{impute}{if TRUE, censored missing values were imputed} \item{censored_symbol}{censored missing value indicator} + +\item{summarized}{protein-level data (list)} } \description{ Add summary statistics to dataProcess output diff --git a/man/dot-finalizeTMP.Rd b/man/dot-finalizeTMP.Rd index 2779235c..ed0f5dae 100644 --- a/man/dot-finalizeTMP.Rd +++ b/man/dot-finalizeTMP.Rd @@ -4,7 +4,7 @@ \alias{.finalizeTMP} \title{Summary statistics for output of TMP-based summarization} \usage{ -.finalizeTMP(input, censored_symbol, impute, summarized) +.finalizeTMP(input, censored_symbol, impute, predicted_survival) } \arguments{ \item{input}{feature-level data} @@ -12,8 +12,6 @@ \item{censored_symbol}{censored missing value indicator} \item{impute}{if TRUE, censored missing values were imputed} - -\item{summarized}{protein-level data (list)} } \description{ Summary statistics for output of TMP-based summarization From 50a30daf963e7f86be7c6578a047246c47ef6015 Mon Sep 17 00:00:00 2001 From: Rudhik1904 Date: Tue, 9 Jun 2026 03:25:26 -0500 Subject: [PATCH 7/7] Adressing Matt's comments --- R/dataProcess.R | 12 ++++---- R/utils_checks.R | 29 ++++++++++++------- .../test_memory_optimization_copies.R | 16 ++++++---- 3 files changed, 35 insertions(+), 22 deletions(-) diff --git a/R/dataProcess.R b/R/dataProcess.R index bc86bd24..32cbbb47 100755 --- a/R/dataProcess.R +++ b/R/dataProcess.R @@ -401,7 +401,7 @@ MSstatsSummarizeSingleLinear = function(single_protein, if (impute & any(single_protein[["censored"]])) { fit_data = if (is_labeled_reference) { - single_protein[is_labeled_ref == FALSE, cols, with = FALSE] + single_protein[(!is_labeled_ref), cols, with = FALSE] } else { single_protein[, cols, with = FALSE] } @@ -414,8 +414,8 @@ MSstatsSummarizeSingleLinear = function(single_protein, }] if (is_labeled_reference) { - single_protein[!(censored & is_labeled_ref == FALSE), predicted := NA] - single_protein[(censored) & is_labeled_ref == FALSE, + single_protein[!(censored & !is_labeled_ref), predicted := NA] + single_protein[(censored) & !is_labeled_ref, newABUNDANCE := predicted] } else { single_protein[!(censored), predicted := NA] @@ -547,7 +547,7 @@ MSstatsSummarizeSingleTMP = function(single_protein, impute, censored_symbol, converged = TRUE fit_data = if (is_labeled_reference) { - single_protein[is_labeled_ref == FALSE, cols, with = FALSE] + single_protein[(!is_labeled_ref), cols, with = FALSE] } else { single_protein[, cols, with = FALSE] } @@ -569,8 +569,8 @@ MSstatsSummarizeSingleTMP = function(single_protein, impute, censored_symbol, } if (is_labeled_reference) { - single_protein[!(censored & is_labeled_ref == FALSE), predicted := NA] - single_protein[(censored) & is_labeled_ref == FALSE, + single_protein[!(censored & !is_labeled_ref), predicted := NA] + single_protein[(censored) & !is_labeled_ref, newABUNDANCE := predicted] } else { single_protein[!(censored), predicted := NA] diff --git a/R/utils_checks.R b/R/utils_checks.R index 2051c019..02aa476b 100644 --- a/R/utils_checks.R +++ b/R/utils_checks.R @@ -170,7 +170,7 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) { input = data.table::as.data.table(unclass(input)) if (!"AnomalyScores" %in% colnames(input)){ - input$AnomalyScores = NA + data.table::set(input, j = "AnomalyScores", value = NA) } cols = c("ProteinName", "PeptideSequence", "PeptideModifiedSequence", @@ -200,7 +200,8 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) { if (!is.numeric(input$INTENSITY)) { suppressWarnings({ - input$INTENSITY = as.numeric(as.character(input$INTENSITY)) + data.table::set(input, j = "INTENSITY", + value = as.numeric(as.character(input$INTENSITY))) }) } @@ -214,10 +215,12 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) { drop_cols = setdiff(colnames(input), cols) for (col in drop_cols) data.table::set(input, j = col, value = NULL) - input$PEPTIDE = paste(input$PEPTIDESEQUENCE, - input$PRECURSORCHARGE, sep = "_") - input$TRANSITION = paste(input$FRAGMENTION, - input$PRODUCTCHARGE, sep = "_") + data.table::set(input, j = "PEPTIDE", + value = paste(input$PEPTIDESEQUENCE, + input$PRECURSORCHARGE, sep = "_")) + data.table::set(input, j = "TRANSITION", + value = paste(input$FRAGMENTION, + input$PRODUCTCHARGE, sep = "_")) if (data.table::uniqueN(input$ISOTOPELABELTYPE) > 2) { getOption("MSstatsLog")( @@ -227,7 +230,8 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) { stop("Statistical tools in MSstats are only proper for label-free or with reference peptide experiments.") } - input$ISOTOPELABELTYPE <- .mapIsotopeLabelType(input$ISOTOPELABELTYPE) + data.table::set(input, j = "ISOTOPELABELTYPE", + value = .mapIsotopeLabelType(input$ISOTOPELABELTYPE)) input } @@ -243,9 +247,14 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) { data.table::setnames( input, "PEPTIDEMODIFIEDSEQUENCE", "PEPTIDESEQUENCE") } - input$PEPTIDE = paste(input$PEPTIDESEQUENCE, input$PRECURSORCHARGE, sep = "_") - input$TRANSITION = paste(input$FRAGMENTION, input$PRODUCTCHARGE, sep = "_") - input$ISOTOPELABELTYPE <- .mapIsotopeLabelType(input$ISOTOPELABELTYPE) + data.table::set(input, j = "PEPTIDE", + value = paste(input$PEPTIDESEQUENCE, + input$PRECURSORCHARGE, sep = "_")) + data.table::set(input, j = "TRANSITION", + value = paste(input$FRAGMENTION, + input$PRODUCTCHARGE, sep = "_")) + data.table::set(input, j = "ISOTOPELABELTYPE", + value = .mapIsotopeLabelType(input$ISOTOPELABELTYPE)) input } diff --git a/inst/tinytest/test_memory_optimization_copies.R b/inst/tinytest/test_memory_optimization_copies.R index 25e4ef8d..3f142432 100644 --- a/inst/tinytest/test_memory_optimization_copies.R +++ b/inst/tinytest/test_memory_optimization_copies.R @@ -203,13 +203,17 @@ expect_equal(addr_before, addr_after, info = paste(".finalizeTMP should modify input in-place.", "Before:", addr_before, "After:", addr_after)) -# (b) Matched rows should have newABUNDANCE from predicted_survival. -# Any row whose (cen, RUN, FEATURE) key exists in predicted_survival -# should get a non-NA newABUNDANCE value. +# (b) Every input row whose (cen, RUN, FEATURE) key exists in +# predicted_survival must get a non-NA newABUNDANCE, and rows with no matching +# key must be NA. Assert the exact expected count, not merely "> 0" (which would +# still pass if the join silently matched the wrong rows). +join_key = function(dt) paste(dt$cen, dt$RUN, dt$FEATURE, sep = "|") +expected_matched = sum(join_key(result) %in% join_key(predicted_survival)) matched_count = sum(!is.na(result$newABUNDANCE)) -expect_true(matched_count > 0, - info = paste("Matched rows should have non-NA newABUNDANCE.", - "Found", matched_count, "non-NA values")) +expect_equal(matched_count, expected_matched, + info = paste("Each matched (cen, RUN, FEATURE) key should yield a", + "non-NA newABUNDANCE. Expected", expected_matched, + "got", matched_count)) # (c) Unmatched rows should have NA. # predicted_survival has 11 rows but input has 12. The missing key