diff --git a/R/dataProcess.R b/R/dataProcess.R index e9581c8b..32cbbb47 100755 --- a/R/dataProcess.R +++ b/R/dataProcess.R @@ -401,7 +401,7 @@ MSstatsSummarizeSingleLinear = function(single_protein, if (impute & any(single_protein[["censored"]])) { fit_data = if (is_labeled_reference) { - single_protein[is_labeled_ref == FALSE, cols, with = FALSE] + single_protein[(!is_labeled_ref), cols, with = FALSE] } else { single_protein[, cols, with = FALSE] } @@ -414,19 +414,19 @@ MSstatsSummarizeSingleLinear = function(single_protein, }] if (is_labeled_reference) { - single_protein[, predicted := ifelse(censored & is_labeled_ref == FALSE, predicted, NA)] - single_protein[, newABUNDANCE := ifelse(censored & is_labeled_ref == FALSE, predicted, newABUNDANCE)] + single_protein[!(censored & !is_labeled_ref), predicted := NA] + single_protein[(censored) & !is_labeled_ref, + newABUNDANCE := predicted] } else { - single_protein[, predicted := ifelse(censored, predicted, NA)] - single_protein[, newABUNDANCE := ifelse(censored, predicted, newABUNDANCE)] + single_protein[!(censored), predicted := NA] + single_protein[(censored), newABUNDANCE := predicted] } - survival = single_protein[, intersect(c(cols, "LABEL", "predicted"), colnames(single_protein)), with = FALSE] } else { survival = single_protein[, intersect(c(cols, "LABEL"), colnames(single_protein)), with = FALSE] survival[, predicted := NA] } - + if (all(!is.na(single_protein$ANOMALYSCORES))) { single_protein[, weights := anomaly_weights_z_vec(ANOMALYSCORES), @@ -547,7 +547,7 @@ MSstatsSummarizeSingleTMP = function(single_protein, impute, censored_symbol, converged = TRUE fit_data = if (is_labeled_reference) { - single_protein[is_labeled_ref == FALSE, cols, with = FALSE] + single_protein[(!is_labeled_ref), cols, with = FALSE] } else { single_protein[, cols, with = FALSE] } @@ -569,11 +569,13 @@ MSstatsSummarizeSingleTMP = function(single_protein, impute, censored_symbol, } if (is_labeled_reference) { - single_protein[, predicted := ifelse(censored & is_labeled_ref == FALSE, predicted, NA)] - single_protein[, newABUNDANCE := ifelse(censored & is_labeled_ref == FALSE, predicted, newABUNDANCE)] + single_protein[!(censored & !is_labeled_ref), predicted := NA] + single_protein[(censored) & !is_labeled_ref, + newABUNDANCE := predicted] } else { - single_protein[, predicted := ifelse(censored, predicted, NA)] - single_protein[, newABUNDANCE := ifelse(censored, predicted, newABUNDANCE)] + single_protein[!(censored), predicted := NA] + single_protein[(censored), + newABUNDANCE := predicted] } survival = single_protein[, intersect(c(cols, "LABEL", "predicted"), colnames(single_protein)), with = FALSE] } else { diff --git a/R/utils_checks.R b/R/utils_checks.R index 3596e74d..02aa476b 100644 --- a/R/utils_checks.R +++ b/R/utils_checks.R @@ -170,7 +170,7 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) { input = data.table::as.data.table(unclass(input)) if (!"AnomalyScores" %in% colnames(input)){ - input$AnomalyScores = NA + data.table::set(input, j = "AnomalyScores", value = NA) } cols = c("ProteinName", "PeptideSequence", "PeptideModifiedSequence", @@ -200,7 +200,8 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) { if (!is.numeric(input$INTENSITY)) { suppressWarnings({ - input$INTENSITY = as.numeric(as.character(input$INTENSITY)) + data.table::set(input, j = "INTENSITY", + value = as.numeric(as.character(input$INTENSITY))) }) } @@ -211,12 +212,15 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) { cols = toupper(cols) cols = intersect(c(cols, "FRACTION", "TECHREPLICATE"), colnames(input)) - input = input[, cols, with = FALSE] - - input$PEPTIDE = paste(input$PEPTIDESEQUENCE, - input$PRECURSORCHARGE, sep = "_") - input$TRANSITION = paste(input$FRAGMENTION, - input$PRODUCTCHARGE, sep = "_") + drop_cols = setdiff(colnames(input), cols) + for (col in drop_cols) data.table::set(input, j = col, value = NULL) + + data.table::set(input, j = "PEPTIDE", + value = paste(input$PEPTIDESEQUENCE, + input$PRECURSORCHARGE, sep = "_")) + data.table::set(input, j = "TRANSITION", + value = paste(input$FRAGMENTION, + input$PRODUCTCHARGE, sep = "_")) if (data.table::uniqueN(input$ISOTOPELABELTYPE) > 2) { getOption("MSstatsLog")( @@ -226,7 +230,8 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) { stop("Statistical tools in MSstats are only proper for label-free or with reference peptide experiments.") } - input$ISOTOPELABELTYPE <- .mapIsotopeLabelType(input$ISOTOPELABELTYPE) + data.table::set(input, j = "ISOTOPELABELTYPE", + value = .mapIsotopeLabelType(input$ISOTOPELABELTYPE)) input } @@ -242,9 +247,14 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) { data.table::setnames( input, "PEPTIDEMODIFIEDSEQUENCE", "PEPTIDESEQUENCE") } - input$PEPTIDE = paste(input$PEPTIDESEQUENCE, input$PRECURSORCHARGE, sep = "_") - input$TRANSITION = paste(input$FRAGMENTION, input$PRODUCTCHARGE, sep = "_") - input$ISOTOPELABELTYPE <- .mapIsotopeLabelType(input$ISOTOPELABELTYPE) + data.table::set(input, j = "PEPTIDE", + value = paste(input$PEPTIDESEQUENCE, + input$PRECURSORCHARGE, sep = "_")) + data.table::set(input, j = "TRANSITION", + value = paste(input$FRAGMENTION, + input$PRODUCTCHARGE, sep = "_")) + data.table::set(input, j = "ISOTOPELABELTYPE", + value = .mapIsotopeLabelType(input$ISOTOPELABELTYPE)) input } @@ -322,8 +332,8 @@ setMethod(".checkDataValidity", "MSstatsValidated", .prepareForDataProcess) input[, PROTEIN := factor(PROTEIN)] input[, PEPTIDE := factor(PEPTIDE)] input[, TRANSITION := factor(TRANSITION)] - input = input[order(LABEL, GROUP_ORIGINAL, SUBJECT_ORIGINAL, - RUN, PROTEIN, PEPTIDE, TRANSITION), ] + data.table::setorder(input, LABEL, GROUP_ORIGINAL, SUBJECT_ORIGINAL, + RUN, PROTEIN, PEPTIDE, TRANSITION) input[, GROUP := factor(GROUP)] input[, SUBJECT := factor(SUBJECT)] input[, FEATURE := factor(FEATURE)] diff --git a/R/utils_feature_selection.R b/R/utils_feature_selection.R index b11b6f16..630b37a6 100644 --- a/R/utils_feature_selection.R +++ b/R/utils_feature_selection.R @@ -74,28 +74,22 @@ MSstatsSelectFeatures = function(input, method, top_n = 3, min_feature_count = 2 #' @return data.table #' @keywords internal .selectHighQualityFeatures = function(input, min_feature_count) { - PROTEIN = PEPTIDE = FEATURE = originalRUN = ABUNDANCE = is_censored = NULL - is_obs = log2inty = LABEL = NULL - - cols = c("PROTEIN", "PEPTIDE", "FEATURE", "originalRUN", "LABEL", - "ABUNDANCE", "censored") - cols = intersect(cols, colnames(input)) - input = input[, cols, with = FALSE] - if (!("censored" %in% cols)) { - input$censored = FALSE - } - data.table::setnames(input, "censored", "is_censored") + PROTEIN = PEPTIDE = FEATURE = originalRUN = ABUNDANCE = censored = NULL + is_obs = log2inty = is_censored = LABEL = NULL + + has_censored = is.element("censored", colnames(input)) input = input[, list(protein = as.character(PROTEIN), peptide = as.character(PEPTIDE), feature = as.character(FEATURE), run = as.character(originalRUN), label = as.character(LABEL), - log2inty = ifelse(!(is.na(ABUNDANCE) | is_censored), - ABUNDANCE, NA), - is_censored)] - input[, is_obs := !(is.na(log2inty) | is_censored)] + log2inty = ABUNDANCE, + is_censored = if (has_censored) censored else FALSE)] + # Censored or missing intensities are not observations. + input[is_censored | is.na(log2inty), log2inty := NA] + input[, is_obs := !is.na(log2inty)] input[, is_censored := NULL] - + features_quality = data.table::rbindlist(lapply(split(input, input$label), .flagUninformativeSingleLabel, min_feature_count = min_feature_count)) diff --git a/R/utils_normalize.R b/R/utils_normalize.R index ca6f1f00..936a5e59 100644 --- a/R/utils_normalize.R +++ b/R/utils_normalize.R @@ -61,8 +61,8 @@ MSstatsNormalize = function(input, normalization_method, peptides_dict = NULL, s input[, ABUNDANCE_FRACTION := median(ABUNDANCE_RUN, na.rm = TRUE), by = "FRACTION"] input[, ABUNDANCE := ABUNDANCE - ABUNDANCE_RUN + ABUNDANCE_FRACTION] - input = input[, !(colnames(input) %in% c("ABUNDANCE_RUN", "ABUNDANCE_FRACTION")), - with = FALSE] + data.table::set(input, j = "ABUNDANCE_RUN", value = NULL) + data.table::set(input, j = "ABUNDANCE_FRACTION", value = NULL) getOption("MSstatsLog")("Normalization based on median: OK") input } @@ -255,7 +255,9 @@ MSstatsNormalize = function(input, normalization_method, peptides_dict = NULL, s input[, ABUNDANCE := ABUNDANCE - median_by_run + median_by_fraction] getOption("MSstatsLog")("INFO", "Normalization : normalization with global standards protein - okay") - input[ , !(colnames(input) %in% c("median_by_run", "median_by_fraction")), with = FALSE] + data.table::set(input, j = "median_by_run", value = NULL) + data.table::set(input, j = "median_by_fraction", value = NULL) + input } @@ -282,7 +284,7 @@ MSstatsNormalize = function(input, normalization_method, peptides_dict = NULL, s #' MSstatsMergeFractions = function(input) { ABUNDANCE = INTENSITY = GROUP_ORIGINAL = SUBJECT_ORIGINAL = RUN = NULL - originalRUN = FRACTION = TECHREPLICATE = tmp = merged = newRun = NULL + originalRUN = FRACTION = TECHREPLICATE = tmp = newRun = NULL ncount = FEATURE = NULL input[!is.na(ABUNDANCE) & ABUNDANCE < 0, "ABUNDANCE"] = 0 @@ -338,29 +340,45 @@ MSstatsMergeFractions = function(input) { getOption("MSstatsLog")("ERROR", msg) stop(msg) } else { - match_runs[, merged := "merged"] - match_runs[, newRun := do.call(paste, c(.SD, sep = "_")), - .SDcols = c(1:3, ncol(match_runs))] - match_runs = unique(match_runs[, list(GROUP_ORIGINAL, - SUBJECT_ORIGINAL, - newRun)]) - - input = merge(input, match_runs, - by = c("GROUP_ORIGINAL", "SUBJECT_ORIGINAL"), - all.x = TRUE) + # dcast pivoted fractions into columns, so the fraction columns + # are everything that is not a sample-identifier column. + fraction_cols = setdiff(colnames(match_runs), + c("GROUP_ORIGINAL", "SUBJECT_ORIGINAL")) + # Use the first fraction's run as the sample's representative run. + first_fraction_run = match_runs[[fraction_cols[1]]] + # Build the merged-run name: ___merged. + match_runs[, newRun := paste(GROUP_ORIGINAL, SUBJECT_ORIGINAL, + first_fraction_run, "merged", + sep = "_")] + # Reduce to a (group, subject) -> merged-run lookup table. + match_runs = match_runs[, list(GROUP_ORIGINAL, SUBJECT_ORIGINAL, + newRun)] + # For each input row, find its sample's row in the lookup. + nr_idx = match_runs[input, + on = c("GROUP_ORIGINAL", "SUBJECT_ORIGINAL"), + which = TRUE, mult = "first"] + # Write that merged-run name onto every input row. + data.table::set(input, j = "newRun", + value = match_runs$newRun[nr_idx]) + # Count, per feature/fraction, the rows observed above zero. select_fraction = input[!is.na(ABUNDANCE) & input$ABUNDANCE > 0, list(ncount = .N), by = c("FEATURE", "FRACTION")] + # Keep only feature/fraction combinations seen at least once. select_fraction = select_fraction[ncount != 0] - select_fraction[, tmp := paste(FEATURE, FRACTION, sep = "_")] - input$tmp = paste(input$FEATURE, input$FRACTION, sep = "_") - input = input[tmp %in% select_fraction$tmp, ] - input$originalRUN = input$newRun - input$RUN = input$originalRUN - input$RUN = factor(input$RUN, levels = unique(input$RUN), - labels = seq_along(unique(input$RUN))) - input = input[, !(colnames(input) %in% c('tmp','newRun')), - with = FALSE] + # Mark which input rows belong to a kept combination (NA = none). + keep_idx = select_fraction[input, + on = c("FEATURE", "FRACTION"), + which = TRUE, mult = "first"] + # Drop rows whose feature/fraction was never observed. + input = input[!is.na(keep_idx)] + # The merged run replaces the original per-fraction run. + input[, originalRUN := newRun] + # Renumber RUN as a factor, one level per merged run. + input[, RUN := factor(newRun, levels = unique(newRun), + labels = seq_along(unique(newRun)))] + # Drop the temporary newRun helper column. + data.table::set(input, j = "newRun", value = NULL) } } } diff --git a/R/utils_output.R b/R/utils_output.R index 7b748482..75a7d60b 100644 --- a/R/utils_output.R +++ b/R/utils_output.R @@ -34,22 +34,31 @@ #' output = output = MSstatsSummarizationOutput(input, summarized, processed, #' method, impute, cens) #' -MSstatsSummarizationOutput = function(input, summarized, processed, +MSstatsSummarizationOutput = function(input, summarized, processed, method, impute, censored_symbol) { LABEL = TotalGroupMeasurements = GROUP = Protein = RUN = NULL - - input = .finalizeInput(input, summarized, method, impute, censored_symbol) - summarized = lapply(summarized, function(x) x[[1]]) - summarized = data.table::rbindlist(summarized, fill = TRUE) - if (inherits(summarized, "try-error")) { + + # Summarization failure is signalled by a NULL `summarized` (the caller + # wraps the per-subplot summarization in tryCatch(error = ...) returning + # NULL). Guard before unpacking, otherwise .finalizeInput() below joins on + # an empty predicted_survival and errors. + if (is.null(summarized)) { msg = paste("*** error : can't summarize per subplot with ", method, ".") getOption("MSstatsLog")("ERROR", msg) getOption("MSstatsMsg")("ERROR", msg) rqall = NULL - rqmodelqc = NULL - workpred = NULL } else { + predicted_survival = data.table::rbindlist(lapply(summarized, function(x) x[[2]]), + fill = TRUE) + for (i in seq_along(summarized)) summarized[[i]][[2]] = NULL + input = .finalizeInput(input, predicted_survival, method, impute, censored_symbol) + rm(predicted_survival) + protein_summaries = lapply(summarized, function(x) x[[1]]) + rm(summarized) + summarized = data.table::rbindlist(protein_summaries, fill = TRUE) + rm(protein_summaries) + input[, TotalGroupMeasurements := uniqueN(.SD), by = c("PROTEIN", "GROUP", "LABEL"), .SDcols = c("FEATURE", "originalRUN")] @@ -67,10 +76,9 @@ MSstatsSummarizationOutput = function(input, summarized, processed, by.y = c(merge_col, "PROTEIN", "LABEL")) data.table::setnames(rqall, c("GROUP_ORIGINAL", "SUBJECT_ORIGINAL"), c("GROUP", "SUBJECT"), skip_absent = TRUE) - + rqall$GROUP = factor(as.character(rqall$GROUP)) rqall$Protein = factor(rqall$Protein) - rqmodelqc = summarized$ModelQC } if (is.element("RUN", colnames(rqall)) & !is.null(rqall)) { @@ -82,18 +90,21 @@ MSstatsSummarizationOutput = function(input, summarized, processed, "originalRUN", "censored", "INTENSITY", "ABUNDANCE", "newABUNDANCE", "predicted", "feature_quality", "is_outlier", "remove", "is_labeled_ref"), colnames(input)) - input = input[, output_cols, with = FALSE] - + drop_cols = setdiff(colnames(input), output_cols) + for (col in drop_cols) data.table::set(input, j = col, value = NULL) + if (is.element("remove", colnames(processed))) { - processed = processed[(remove), - intersect(output_cols, + processed = processed[(remove), + intersect(output_cols, colnames(processed)), with = FALSE] input = rbind(input, processed, fill = TRUE) } - list(FeatureLevelData = as.data.frame(input), - ProteinLevelData = as.data.frame(rqall), + data.table::setDF(input) + if (!is.null(rqall)) data.table::setDF(rqall) + list(FeatureLevelData = input, + ProteinLevelData = rqall, SummaryMethod = method) - + } @@ -104,9 +115,9 @@ MSstatsSummarizationOutput = function(input, summarized, processed, #' @param impute if TRUE, censored missing values were imputed #' @param censored_symbol censored missing value indicator #' @keywords internal -.finalizeInput = function(input, summarized, method, impute, censored_symbol) { +.finalizeInput = function(input, predicted_survival, method, impute, censored_symbol) { # if (method == "TMP") { - input = .finalizeTMP(input, censored_symbol, impute, summarized) + input = .finalizeTMP(input, censored_symbol, impute, predicted_survival) # } else { # input = .finalizeLinear(input, censored_symbol) # } @@ -117,21 +128,23 @@ MSstatsSummarizationOutput = function(input, summarized, processed, #' Summary statistics for output of TMP-based summarization #' @inheritParams .finalizeInput #' @keywords internal -.finalizeTMP = function(input, censored_symbol, impute, summarized) { +.finalizeTMP = function(input, censored_symbol, impute, predicted_survival) { NonMissingStats = NumMeasuredFeature = MissingPercentage = LABEL = NULL total_features = more50missing = nonmissing_orig = censored = NULL INTENSITY = newABUNDANCE = NumImputedFeature = NULL - - survival_predictions = lapply(summarized, function(x) x[[2]]) - predicted_survival = data.table::rbindlist(survival_predictions, fill = TRUE) + if (impute) { - cols = intersect(colnames(input), c("newABUNDANCE", - "cen", "RUN", - "FEATURE", "ref_covariate", "LABEL")) - input = merge(input[, colnames(input) != "newABUNDANCE", with = FALSE], - predicted_survival, - by = setdiff(cols, "newABUNDANCE"), - all.x = TRUE) + join_cols = intersect(intersect(colnames(input), + colnames(predicted_survival)), + c("cen", "RUN", "FEATURE", "ref_covariate", + "LABEL")) + data.table::set(input, j = "newABUNDANCE", value = NULL) + idx = predicted_survival[input, on = join_cols, which = TRUE, + mult = "first"] + data.table::set(input, j = "newABUNDANCE", + value = predicted_survival$newABUNDANCE[idx]) + data.table::set(input, j = "predicted", + value = predicted_survival$predicted[idx]) } input[, NonMissingStats := .getNonMissingFilterStats(.SD, censored_symbol)] input[, NumMeasuredFeature := sum(NonMissingStats), @@ -144,7 +157,7 @@ MSstatsSummarizationOutput = function(input, summarized, processed, } else { input[, nonmissing_orig := !is.na(INTENSITY)] } - input[, nonmissing_orig := ifelse(is.na(newABUNDANCE), TRUE, nonmissing_orig)] + input[is.na(newABUNDANCE), nonmissing_orig := TRUE] if (impute) { input[, NumImputedFeature := sum(!nonmissing_orig), by = c("PROTEIN", "RUN", "LABEL")] @@ -175,7 +188,7 @@ MSstatsSummarizationOutput = function(input, summarized, processed, } else { input[, nonmissing_orig := !is.na(INTENSITY)] } - input[, nonmissing_orig := ifelse(is.na(newABUNDANCE), TRUE, nonmissing_orig)] + input[is.na(newABUNDANCE), nonmissing_orig := TRUE] input[, NumImputedFeature := 0] } input diff --git a/inst/tinytest/test_MSstatsMergeFractions.R b/inst/tinytest/test_MSstatsMergeFractions.R new file mode 100644 index 00000000..ece19714 --- /dev/null +++ b/inst/tinytest/test_MSstatsMergeFractions.R @@ -0,0 +1,85 @@ +# Characterization tests for MSstatsMergeFractions -------------------------- +# +# MSstatsMergeFractions collapses the multiple fraction-runs of a sample into a +# single merged run and drops feature/fraction combinations that were never +# observed. The function is hard to read, so these tests pin its observable +# behavior so the in-place-update rewrite (and any future simplification) stays +# equivalent. They focus on the non-TECHREPLICATE branch, which is the part this +# change rewrote; the TECHREPLICATE branch is unchanged from the previous code. + +MSstatsConvert::MSstatsLogsSettings(FALSE) + +# Two subjects, two fractions each. f2 in fraction 2 is all-zero abundance, so +# that (FEATURE, FRACTION) combination has no positive observation. +make_fraction_input <- function(f2_frac2 = c(0.0, 0.0)) { + input <- data.table::data.table( + PROTEIN = "P1", + FEATURE = c("f1", "f1", "f2", "f2", "f1", "f1", "f2", "f2"), + GROUP_ORIGINAL = "G", + SUBJECT_ORIGINAL = c("S1", "S1", "S1", "S1", "S2", "S2", "S2", "S2"), + RUN = factor(c(1, 2, 1, 2, 3, 4, 3, 4)), + originalRUN = c("a1", "a2", "a1", "a2", "b1", "b2", "b1", "b2"), + FRACTION = c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), + INTENSITY = c(10, 20, 30, 40, 50, 60, 70, 80), + ABUNDANCE = c(3.0, 4.0, 2.0, 0.0, 3.5, 4.5, 2.5, 0.0) + ) + # The two f2 / fraction-2 rows (positions 4 and 8) take the parametrized + # abundance. Assigned after construction to avoid referencing a local in a + # data.table() column expression. + input$ABUNDANCE[c(4L, 8L)] <- f2_frac2 + input +} + +# Single fraction -> nothing to merge, returned untouched (besides clamping) --- +single_fraction = data.table::data.table( + PROTEIN = "P1", FEATURE = c("f1", "f2"), + GROUP_ORIGINAL = "G", SUBJECT_ORIGINAL = "S1", + RUN = factor(c(1, 1)), originalRUN = c("a1", "a1"), FRACTION = c(1L, 1L), + INTENSITY = c(10, 20), ABUNDANCE = c(3.0, 4.0)) +res_single = MSstatsMergeFractions(data.table::copy(single_fraction)) +expect_equal(nrow(res_single), 2L, + info = "single-fraction input keeps all rows") +expect_equal(res_single$originalRUN, c("a1", "a1"), + info = "single-fraction input leaves originalRUN untouched") + +# Abundance clamping happens before the fraction check ------------------------ +# ABUNDANCE < 0 becomes 0; an INTENSITY of exactly 1 forces ABUNDANCE to 0. +clamp_input = data.table::data.table( + PROTEIN = "P1", FEATURE = c("f1", "f2"), + GROUP_ORIGINAL = "G", SUBJECT_ORIGINAL = "S1", + RUN = factor(c(1, 1)), originalRUN = c("a1", "a1"), FRACTION = c(1L, 1L), + INTENSITY = c(1, 50), ABUNDANCE = c(2.5, -3.0)) +res_clamp = MSstatsMergeFractions(data.table::copy(clamp_input)) +expect_equal(res_clamp$ABUNDANCE, c(0, 0), + info = "INTENSITY == 1 and negative ABUNDANCE are both clamped to 0") + +# Non-TECHREPLICATE: fractions merge and unobserved combos drop --------------- +res = MSstatsMergeFractions(make_fraction_input()) + +# f2/fraction-2 was all-zero, so those 2 rows are dropped; 6 remain. +expect_equal(nrow(res), 6L, + info = "feature/fraction with no positive abundance is removed") +expect_equal(nrow(res[FEATURE == "f2" & FRACTION == 2L]), 0L, + info = "the unobserved f2/fraction-2 rows are gone") + +# Each sample's fraction-runs collapse to one '___merged'. +expect_equal(sort(unique(res$originalRUN)), + c("G_S1_a1_merged", "G_S2_b1_merged"), + info = "originalRUN is the merged run name, one per sample") + +# RUN is refactored to one level per merged run (2 samples -> 2 levels). +expect_true(is.factor(res$RUN), info = "RUN stays a factor") +expect_equal(nlevels(res$RUN), 2L, + info = "two merged samples give two RUN levels") + +# The temporary newRun column must not leak into the output. +expect_false("newRun" %in% colnames(res), + info = "newRun helper column is removed") + +# Non-TECHREPLICATE: fully observed features are all kept --------------------- +res_keep = MSstatsMergeFractions(make_fraction_input(f2_frac2 = c(1.0, 1.5))) +expect_equal(nrow(res_keep), 8L, + info = "nothing is dropped when every feature/fraction is observed") +expect_equal(sort(unique(paste(res_keep$FEATURE, res_keep$FRACTION))), + c("f1 1", "f1 2", "f2 1", "f2 2"), + info = "all feature/fraction combinations survive") diff --git a/inst/tinytest/test_memory_optimization_copies.R b/inst/tinytest/test_memory_optimization_copies.R new file mode 100644 index 00000000..3f142432 --- /dev/null +++ b/inst/tinytest/test_memory_optimization_copies.R @@ -0,0 +1,445 @@ +# Tests for in-place data.table operations in the dataProcess pipeline: +# column drops, keyed-lookup joins, targeted [i, j := v] writes, and +# the predicted_survival extraction in MSstatsSummarizationOutput. + + +# --- .normalizeMedian: column drops are in place ------------------------------ +# +# .normalizeMedian() adds two temp columns (ABUNDANCE_RUN, ABUNDANCE_FRACTION) +# via :=, uses them, then removes them with data.table::set(j=, value=NULL) +# so the data.table is modified in-place. We verify the address is preserved. + +MSstatsConvert::MSstatsLogsSettings(FALSE) + +# Test 2a: .normalizeMedian preserves data.table identity --- +# +# data.table::address() returns the hex pointer of the data.table object. +# An in-place modification preserves the address; a copy changes it. + +norm_input = data.table::data.table( + PROTEIN = factor(rep(c("P1", "P2"), each = 20)), + PEPTIDE = factor(rep(c("pep1", "pep2"), each = 10, times = 2)), + FEATURE = factor(rep(c("feat1", "feat2"), each = 10, times = 2)), + LABEL = rep("L", 40), + RUN = factor(rep(1:10, 4)), + FRACTION = rep(1L, 40), + ABUNDANCE = rnorm(40, mean = 20, sd = 2), + INTENSITY = 2^rnorm(40, mean = 20, sd = 2), + GROUP_ORIGINAL = rep(c("A", "B"), each = 5, times = 4), + SUBJECT_ORIGINAL = rep(1:10, 4), + TRANSITION = factor(rep("y3_1", 40)) +) + +addr_before = data.table::address(norm_input) +norm_result = MSstats:::.normalizeMedian(norm_input) +addr_after = data.table::address(norm_result) + +# The address should be the same — in-place modification, no copy +expect_equal(addr_before, addr_after, + info = paste(".normalizeMedian should modify in-place (same address).", + "Before:", addr_before, "After:", addr_after)) + +# Temp columns should not exist in the result +expect_false("ABUNDANCE_RUN" %in% colnames(norm_result), + info = "Temp column ABUNDANCE_RUN should be removed") +expect_false("ABUNDANCE_FRACTION" %in% colnames(norm_result), + info = "Temp column ABUNDANCE_FRACTION should be removed") + +# ABUNDANCE should still exist and be modified (normalized) +expect_true("ABUNDANCE" %in% colnames(norm_result), + info = "ABUNDANCE column should still exist after normalization") + + +# Test 2b: .normalizeMedian does not allocate a full-table copy --- +# +# With in-place set(), the only allocations should be the two temp column +# vectors (~2x nrow doubles), not the whole table. + +n_rows = 100000 +big_input = data.table::data.table( + PROTEIN = factor(rep(paste0("P", 1:100), each = n_rows / 100)), + PEPTIDE = factor(rep(paste0("pep", 1:200), each = n_rows / 200)), + FEATURE = factor(rep(paste0("feat", 1:200), each = n_rows / 200)), + LABEL = rep("L", n_rows), + RUN = factor(rep(1:50, n_rows / 50)), + FRACTION = rep(1L, n_rows), + ABUNDANCE = rnorm(n_rows, mean = 20, sd = 2), + INTENSITY = 2^rnorm(n_rows, mean = 20, sd = 2), + GROUP_ORIGINAL = rep(c("A", "B"), each = 25, times = n_rows / 50), + SUBJECT_ORIGINAL = rep(1:50, n_rows / 50), + TRANSITION = factor(rep("y3_1", n_rows)) +) + +# Measure the size of the input table and one column for reference. +# We use these to reason about whether the allocation during +# .normalizeMedian is "2 temp columns worth" vs "full table copy worth". +table_size = as.numeric(object.size(big_input)) +one_col_size = as.numeric(object.size(big_input$ABUNDANCE)) +ncols = ncol(big_input) + +# --- Memory measurement --- +# +# R manages data in "Vcells" (vector cells) — this is where column data +# lives. gc() returns a matrix with memory stats; row 2 = Vcells, +# column 6 = "max used (Mb)" = peak Vcells since last reset. +# +# gc(reset = TRUE) resets the "max used" counter to the current level, +# giving us a clean window to measure just the .normalizeMedian call. +gc(reset = TRUE) +big_result = MSstats:::.normalizeMedian(big_input) +gc_after = gc() + +# Extract peak vector memory (Vcells, max used, in MB). +# We store this for informational purposes but do NOT assert on it +# because gc() accounting is non-deterministic: +# - R may GC the temp columns before we call gc(), or may not +# - Other small allocations (formula parsing, string interning) +# contaminate the measurement +# - Two identical runs can give different numbers +# The address() check below is the reliable, deterministic test. +peak_vcells_mb = gc_after[2, 6] + +# Sanity check on the test data itself (not on the function): +# Verify that 2 temp columns are much smaller than the full table. +# If they were similar sizes, we couldn't distinguish "allocated 2 temp +# columns" from "copied the whole table" and the gc() figure above would be +# meaningless. For our 11-column, 100K-row table: 2 cols ≈ 1.6 MB, full ≈ 6 MB. +# +# This only guards the *informational* gc() measurement, which we do not +# assert on (see note above). object.size() accounting depends on R internals +# out of our control, so a wrong result here emits a message rather than +# failing the test run; the deterministic address() check below is the real test. +if (!(one_col_size * 2 < table_size * 0.5)) { + message("Note: fixture sanity check off -- 2 cols (", + round(one_col_size * 2 / 1e6, 2), " MB) not < 50% of table (", + round(table_size * 0.5 / 1e6, 2), " MB). ", + "gc-based memory figures may be unreliable on this platform.") +} + +# The real test: same address = same R object = no copy was made. +# This is deterministic and reliable, unlike gc() measurements. +expect_equal(data.table::address(big_input), data.table::address(big_result), + info = "Large table: .normalizeMedian should modify in-place") +expect_false("ABUNDANCE_RUN" %in% colnames(big_result), + info = "Large table: temp column ABUNDANCE_RUN should be removed") + + +# --- .finalizeTMP: in-place update from predicted_survival -------------------- +# +# .finalizeTMP replaces the pre-imputation newABUNDANCE column in the full +# feature-level table with imputed values from predicted_survival, using: +# set(input, j = "newABUNDANCE", value = NULL) — remove old column +# input[, c(...) := .(NA_real_, NA_real_)] — add NA placeholders +# input[predicted_survival, ... := ..., on = .] — update matched rows +# +# These tests verify: +# (a) The data.table is modified in-place (address preserved) +# (b) Matched rows get imputed values from predicted_survival +# (c) Unmatched rows are left as NA (equivalent to all.x = TRUE) +# (d) Allocation stays small relative to a full-table copy + +# Test 3a: .finalizeTMP modifies input in-place with correct values --- + +# Build a feature-level input table mimicking post-summarization state. +# Two proteins, 2 features each, 3 runs = 12 rows per label. +n = 12 +finalize_input = data.table::data.table( + PROTEIN = factor(rep(c("P1", "P2"), each = 6)), + PEPTIDE = factor(rep(c("pep1", "pep2"), each = 3, times = 2)), + FEATURE = factor(rep(c("feat1", "feat2"), each = 3, times = 2)), + LABEL = rep("L", n), + GROUP = factor(rep(c("A", "B", "C"), times = 4)), + RUN = factor(rep(1:3, times = 4)), + SUBJECT = factor(rep(1:3, times = 4)), + FRACTION = rep(1L, n), + INTENSITY = 2^rnorm(n, 20, 2), + ABUNDANCE = rnorm(n, 20, 2), + # Pre-imputation newABUNDANCE: some are NA (censored) + newABUNDANCE = c(10.1, NA, 10.5, 11.2, 10.8, NA, + 9.5, NA, 10.2, 11.0, NA, 10.6), + cen = c(1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1), + censored = c(FALSE, TRUE, FALSE, FALSE, FALSE, TRUE, + FALSE, TRUE, FALSE, FALSE, TRUE, FALSE), + originalRUN = factor(rep(paste0("Run", 1:3), times = 4)), + n_obs = rep(3, n), + n_obs_run = rep(2, n), + total_features = rep(2, n), + nonmissing = c(TRUE, FALSE, TRUE, TRUE, TRUE, FALSE, + TRUE, FALSE, TRUE, TRUE, FALSE, TRUE) +) + +# Build predicted_survival: simulates what rbindlist of per-protein survival +# tables looks like. Only contains rows that passed the n_obs/n_obs_run filter +# in MSstatsSummarizeSingleTMP — so fewer rows than input. +# Rows 2, 6, 8, 11 are censored — only some of them appear in predicted_survival +# (e.g., rows filtered out by n_obs <= 1 would be absent). +# Here we include most rows but deliberately EXCLUDE rows for RUN=2, FEATURE=feat2 +# of P2 to test the unmatched (NA) case. +predicted_survival = data.table::data.table( + newABUNDANCE = c(10.1, 8.5, 10.5, 11.2, 10.8, 7.9, + 9.5, 8.2, 10.2, 11.0, 10.6), + cen = c(1, 0, 1, 1, 1, 0, + 1, 0, 1, 1, 1), + RUN = factor(c(1, 2, 3, 1, 2, 3, + 1, 2, 3, 1, 3)), + FEATURE = factor(c(rep("feat1", 3), rep("feat2", 3), + rep("feat1", 3), rep("feat2", 2))), + predicted = c(NA, 8.5, NA, NA, NA, 7.9, + NA, 8.2, NA, NA, NA) +) + +addr_before = data.table::address(finalize_input) + +# Call .finalizeTMP with impute = TRUE. +# This should replace newABUNDANCE and add predicted via in-place join. +result = MSstats:::.finalizeTMP(finalize_input, censored_symbol = "NA", + impute = TRUE, + predicted_survival = predicted_survival) + +addr_after = data.table::address(result) + +# (a) Address preserved — the data.table was modified in-place. +expect_equal(addr_before, addr_after, + info = paste(".finalizeTMP should modify input in-place.", + "Before:", addr_before, "After:", addr_after)) + +# (b) Every input row whose (cen, RUN, FEATURE) key exists in +# predicted_survival must get a non-NA newABUNDANCE, and rows with no matching +# key must be NA. Assert the exact expected count, not merely "> 0" (which would +# still pass if the join silently matched the wrong rows). +join_key = function(dt) paste(dt$cen, dt$RUN, dt$FEATURE, sep = "|") +expected_matched = sum(join_key(result) %in% join_key(predicted_survival)) +matched_count = sum(!is.na(result$newABUNDANCE)) +expect_equal(matched_count, expected_matched, + info = paste("Each matched (cen, RUN, FEATURE) key should yield a", + "non-NA newABUNDANCE. Expected", expected_matched, + "got", matched_count)) + +# (c) Unmatched rows should have NA. +# predicted_survival has 11 rows but input has 12. The missing key +# (cen=0, RUN=2, FEATURE=feat2) has no match, so that row should be NA. +# We check by looking for the key that's absent in predicted_survival. +unmatched = result[cen == 0 & RUN == 2 & FEATURE == "feat2", newABUNDANCE] +if (length(unmatched) > 0) { + expect_true(is.na(unmatched[1]), + info = "Unmatched row should get NA for newABUNDANCE") +} + +# (d) The predicted column should exist. +expect_true("predicted" %in% colnames(result), + info = ".finalizeTMP should add the 'predicted' column") + + +# Test 3b: .finalizeTMP at scale — memory stays low --- +# +# At ~50K rows, .finalizeTMP should modify the input in place; peak Vcells +# should stay close to the input size, not balloon to a multi-table merge. + +n_big = 50000 +n_runs = 50 +n_features = 20 +n_proteins = n_big / (n_runs * n_features) # = 50 + +big_finalize_input = data.table::data.table( + PROTEIN = factor(rep(paste0("P", 1:n_proteins), each = n_runs * n_features)), + FEATURE = factor(rep(paste0("feat", 1:n_features), each = n_runs, times = n_proteins)), + LABEL = rep("L", n_big), + RUN = factor(rep(1:n_runs, times = n_proteins * n_features)), + INTENSITY = 2^rnorm(n_big, 20, 2), + ABUNDANCE = rnorm(n_big, 20, 2), + newABUNDANCE = rnorm(n_big, 20, 2), + cen = sample(c(0, 1), n_big, replace = TRUE, prob = c(0.2, 0.8)), + censored = FALSE, + originalRUN = factor(rep(paste0("Run", 1:n_runs), times = n_proteins * n_features)), + GROUP = factor(rep("A", n_big)), + SUBJECT = factor(rep(1:n_runs, times = n_proteins * n_features)), + n_obs = rep(n_runs, n_big), + n_obs_run = rep(n_features, n_big), + total_features = rep(n_features, n_big), + nonmissing = rep(TRUE, n_big) +) +big_finalize_input[cen == 0, censored := TRUE] +big_finalize_input[cen == 0, newABUNDANCE := NA] + +# predicted_survival: exclude specific (RUN, FEATURE) combinations to +# guarantee some rows in input have no match. Random row sampling doesn't +# work because multiple rows can share the same join key, so excluded rows +# may still match via a duplicate key in the kept rows. +excluded_features = c("feat1", "feat2") +excluded_run = "1" +big_predicted = big_finalize_input[ + !(FEATURE %in% excluded_features & RUN == excluded_run), + .(newABUNDANCE = ifelse(cen == 0, 7.0, newABUNDANCE), + cen, RUN, FEATURE, + predicted = ifelse(cen == 0, 7.0, NA_real_))] + +input_size = as.numeric(object.size(big_finalize_input)) +addr_big_before = data.table::address(big_finalize_input) + +# Reset GC, run .finalizeTMP, measure peak. +gc(reset = TRUE) +big_finalize_result = MSstats:::.finalizeTMP( + big_finalize_input, censored_symbol = "NA", impute = TRUE, + predicted_survival = big_predicted) +gc_after_finalize = gc() + +# Peak Vcells (MB) — informational. +peak_mb_finalize = gc_after_finalize[2, 6] + +# Address preserved = in-place modification, no full-table copy. +expect_equal(addr_big_before, data.table::address(big_finalize_result), + info = paste("Large .finalizeTMP: should modify in-place.", + "Input size:", round(input_size / 1e6, 1), "MB.", + "Peak Vcells:", peak_mb_finalize, "MB")) + +# newABUNDANCE and predicted columns should both exist. +expect_true("newABUNDANCE" %in% colnames(big_finalize_result), + info = "Large .finalizeTMP: newABUNDANCE should exist after join") +expect_true("predicted" %in% colnames(big_finalize_result), + info = "Large .finalizeTMP: predicted should exist after join") + +# Rows with excluded (RUN, FEATURE) keys should have NA for newABUNDANCE, +# because they have no match in predicted_survival. +unmatched_mask = big_finalize_result$FEATURE %in% excluded_features & + big_finalize_result$RUN == excluded_run +na_count = sum(is.na(big_finalize_result$newABUNDANCE[unmatched_mask])) +expect_true(na_count > 0, + info = paste("Rows with excluded keys should have NA newABUNDANCE.", + "Found", na_count, "NAs out of", + sum(unmatched_mask), "unmatched rows")) + + +# --- MSstatsSummarizationOutput / .finalizeTMP: predicted_survival contract --- +# +# MSstatsSummarizationOutput splits the nested 'summarized' list upfront: +# predicted_survival = rbindlist(lapply(summarized, function(x) x[[2]])) +# protein_summaries = lapply(summarized, function(x) x[[1]]) +# rm(summarized) +# .finalizeTMP receives predicted_survival directly (a combined data.table), +# not the full nested list. +# +# These tests verify: +# (a) .finalizeTMP accepts predicted_survival as a data.table +# (b) MSstatsSummarizationOutput correctly decomposes the nested list +# (c) The nested list does not persist through .finalizeTMP + +# Test 4a: .finalizeTMP takes a combined data.table, not a nested list --- +# +# .finalizeTMP's 4th parameter is 'predicted_survival', a data.table. + +# Rebuild a finalize_input fixture since .finalizeTMP modified the previous +# one in-place above. +finalize_input_4 = data.table::data.table( + PROTEIN = factor(rep(c("P1", "P2"), each = 6)), + FEATURE = factor(rep(c("feat1", "feat2"), each = 3, times = 2)), + LABEL = rep("L", 12), + RUN = factor(rep(1:3, times = 4)), + INTENSITY = 2^rnorm(12, 20, 2), + ABUNDANCE = rnorm(12, 20, 2), + newABUNDANCE = c(10.1, NA, 10.5, 11.2, 10.8, NA, + 9.5, NA, 10.2, 11.0, NA, 10.6), + cen = c(1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1), + censored = c(FALSE, TRUE, FALSE, FALSE, FALSE, TRUE, + FALSE, TRUE, FALSE, FALSE, TRUE, FALSE), + originalRUN = factor(rep(paste0("Run", 1:3), times = 4)), + GROUP = factor(rep(c("A", "B", "C"), times = 4)), + SUBJECT = factor(rep(1:3, times = 4)), + n_obs = rep(3, 12), + n_obs_run = rep(2, 12), + total_features = rep(2, 12), + nonmissing = c(TRUE, FALSE, TRUE, TRUE, TRUE, FALSE, + TRUE, FALSE, TRUE, TRUE, FALSE, TRUE) +) + +# predicted_survival as a plain data.table (the new interface). +pred_surv_4 = data.table::data.table( + newABUNDANCE = c(10.1, 8.5, 10.5, 11.2, 10.8, 7.9, + 9.5, 8.2, 10.2, 11.0, 10.6, 8.0), + cen = c(1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0), + RUN = factor(rep(1:3, times = 4)), + FEATURE = factor(rep(c("feat1", "feat2"), each = 3, times = 2)), + predicted = c(NA, 8.5, NA, NA, NA, 7.9, NA, 8.2, NA, NA, NA, 8.0) +) + +# This should work — .finalizeTMP now expects a data.table, not a list. +result_4 = MSstats:::.finalizeTMP(finalize_input_4, censored_symbol = "NA", + impute = TRUE, + predicted_survival = pred_surv_4) +expect_true(data.table::is.data.table(result_4) || is.data.frame(result_4), + info = ".finalizeTMP should return a data.frame/data.table when given predicted_survival directly") +expect_true("newABUNDANCE" %in% colnames(result_4), + info = ".finalizeTMP should produce newABUNDANCE from a data.table input") +expect_true("predicted" %in% colnames(result_4), + info = ".finalizeTMP should produce predicted column from a data.table input") + + +# Test 4b: MSstatsSummarizationOutput correctly splits the nested list --- +# +# Build a minimal 'summarized' in the nested format that the summarization +# loop produces: list(list(protein_result, survival), ...). +# Verify that MSstatsSummarizationOutput decomposes it correctly and +# produces valid output. + +# Use the real pipeline on a small built-in dataset to get a realistic +# summarized list, then verify the output structure. +MSstatsConvert::MSstatsLogsSettings(FALSE) +small_input = MSstatsPrepareForDataProcess(SRMRawData, 2, NULL) +small_input = MSstatsNormalize(small_input, "EQUALIZEMEDIANS") +small_input = MSstatsMergeFractions(small_input) +small_input = MSstatsHandleMissing(small_input, "TMP", TRUE, "NA", 0.999) +small_input = MSstatsSelectFeatures(small_input, "all") +processed = getProcessed(small_input) +small_input = MSstatsPrepareForSummarization(small_input, "TMP", TRUE, "NA", FALSE) +summarized_list = MSstatsSummarizeWithSingleCore(small_input, "TMP", TRUE, "NA", + FALSE, TRUE, 90) + +# Verify summarized_list is a nested list (the format we're testing). +expect_true(is.list(summarized_list), + info = "Summarized should be a list") +expect_true(is.list(summarized_list[[1]]), + info = "Each element should be a list(protein_result, survival)") +expect_equal(length(summarized_list[[1]]), 2, + info = "Each element should have exactly 2 components") + +# MSstatsSummarizationOutput should decompose the nested list and produce +# valid FeatureLevelData / ProteinLevelData. +output = MSstatsSummarizationOutput(small_input, summarized_list, processed, + "TMP", TRUE, "NA") + +expect_true(!is.null(output$FeatureLevelData), + info = "Output should have FeatureLevelData") +expect_true(!is.null(output$ProteinLevelData), + info = "Output should have ProteinLevelData") +expect_true(nrow(output$FeatureLevelData) > 0, + info = "FeatureLevelData should have rows") +expect_true(nrow(output$ProteinLevelData) > 0, + info = "ProteinLevelData should have rows") +expect_true("newABUNDANCE" %in% colnames(output$FeatureLevelData), + info = "FeatureLevelData should contain newABUNDANCE (from imputation)") +expect_true("predicted" %in% colnames(output$FeatureLevelData), + info = "FeatureLevelData should contain predicted column") + + +# Test 4c: predicted_survival alone is smaller than the full nested list --- +# +# .finalizeTMP receives only predicted_survival (the survival half of the +# nested summarized list), not the full list. We verify the combined +# predicted_survival table is smaller than the full nested list. + +# Measure sizes of the two halves from the real summarized list. +survival_tables = lapply(summarized_list, function(x) x[[2]]) +protein_tables = lapply(summarized_list, function(x) x[[1]]) + +size_full_list = as.numeric(object.size(summarized_list)) +size_survival_only = as.numeric(object.size(survival_tables)) +size_protein_only = as.numeric(object.size(protein_tables)) +size_combined_survival = as.numeric( + object.size(data.table::rbindlist(survival_tables, fill = TRUE))) + +# The combined predicted_survival table (what .finalizeTMP receives now) +# should be smaller than the full nested list (what it received before). +expect_true(size_combined_survival < size_full_list, + info = paste("Combined predicted_survival should be smaller than", + "the full nested list.", + "predicted_survival:", size_combined_survival, "bytes.", + "Full nested list:", size_full_list, "bytes.")) diff --git a/man/dot-finalizeInput.Rd b/man/dot-finalizeInput.Rd index 8695d188..9ed07548 100644 --- a/man/dot-finalizeInput.Rd +++ b/man/dot-finalizeInput.Rd @@ -4,18 +4,18 @@ \alias{.finalizeInput} \title{Add summary statistics to dataProcess output} \usage{ -.finalizeInput(input, summarized, method, impute, censored_symbol) +.finalizeInput(input, predicted_survival, method, impute, censored_symbol) } \arguments{ \item{input}{feature-level data} -\item{summarized}{protein-level data (list)} - \item{method}{summary method} \item{impute}{if TRUE, censored missing values were imputed} \item{censored_symbol}{censored missing value indicator} + +\item{summarized}{protein-level data (list)} } \description{ Add summary statistics to dataProcess output diff --git a/man/dot-finalizeTMP.Rd b/man/dot-finalizeTMP.Rd index 2779235c..ed0f5dae 100644 --- a/man/dot-finalizeTMP.Rd +++ b/man/dot-finalizeTMP.Rd @@ -4,7 +4,7 @@ \alias{.finalizeTMP} \title{Summary statistics for output of TMP-based summarization} \usage{ -.finalizeTMP(input, censored_symbol, impute, summarized) +.finalizeTMP(input, censored_symbol, impute, predicted_survival) } \arguments{ \item{input}{feature-level data} @@ -12,8 +12,6 @@ \item{censored_symbol}{censored missing value indicator} \item{impute}{if TRUE, censored missing values were imputed} - -\item{summarized}{protein-level data (list)} } \description{ Summary statistics for output of TMP-based summarization