From 20dc07e3a41254598f6898953d0ef8ea76fb7098 Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Thu, 7 May 2026 14:46:51 -0400 Subject: [PATCH 01/11] filter columns for readr initially --- R/clean_spectronaut.R | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/R/clean_spectronaut.R b/R/clean_spectronaut.R index 1eeb89c..5048f93 100644 --- a/R/clean_spectronaut.R +++ b/R/clean_spectronaut.R @@ -5,7 +5,7 @@ reduceBigSpectronaut <- function(input_file, output_path, filter_by_identified = FALSE, filter_by_qvalue = TRUE, qvalue_cutoff = 0.01, - calculateAnomalyScores=FALSE, + calculateAnomalyScores=FALSE, anomalyModelFeatures=c()) { if (grepl("csv", input_file)) { delim = "," @@ -14,6 +14,21 @@ reduceBigSpectronaut <- function(input_file, output_path, } else { delim <- ";" } + + # Restrict parsing to the columns cleanSpectronautChunk actually consumes. + # Spectronaut exports often have 50+ columns; reading only this subset + # cuts per-chunk peak memory roughly proportionally to the column ratio. + needed_cols <- c("R.FileName", "R.Condition", "R.Replicate", + "PG.ProteinAccessions", "EG.ModifiedSequence", + "FG.LabeledSequence", "FG.Charge", + "F.FrgIon", "F.Charge", + "EG.Identified", "F.ExcludedFromQuantification", + "F.FrgLossType", "PG.Qvalue", "EG.Qvalue", + intensity) + if (calculateAnomalyScores) { + needed_cols <- c(needed_cols, anomalyModelFeatures) + } + spec_chunk <- function(x, pos) cleanSpectronautChunk(x, output_path, intensity, @@ -22,12 +37,13 @@ reduceBigSpectronaut <- function(input_file, output_path, filter_by_qvalue, qvalue_cutoff, pos, - calculateAnomalyScores, + calculateAnomalyScores, anomalyModelFeatures) readr::read_delim_chunked(input_file, readr::DataFrameCallback$new(spec_chunk), delim = delim, - chunk_size = 1e6) + chunk_size = 1e6, + col_select = tidyselect::any_of(needed_cols)) } #' @keywords internal From 5129c7780b1760ac864d5a958e4f44667a01ad47 Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Thu, 7 May 2026 14:50:29 -0400 Subject: [PATCH 02/11] use col_names parameter --- R/clean_spectronaut.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/clean_spectronaut.R b/R/clean_spectronaut.R index 5048f93..971dad6 100644 --- a/R/clean_spectronaut.R +++ b/R/clean_spectronaut.R @@ -43,7 +43,7 @@ reduceBigSpectronaut <- function(input_file, output_path, readr::DataFrameCallback$new(spec_chunk), delim = delim, chunk_size = 1e6, - col_select = tidyselect::any_of(needed_cols)) + col_names = tidyselect::any_of(needed_cols)) } #' @keywords internal From 08b0db20e88bc5dc6573991b3440846203028123 Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Thu, 7 May 2026 14:52:29 -0400 Subject: [PATCH 03/11] fix col_names input --- R/clean_spectronaut.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/clean_spectronaut.R b/R/clean_spectronaut.R index 971dad6..5983005 100644 --- a/R/clean_spectronaut.R +++ b/R/clean_spectronaut.R @@ -43,7 +43,7 @@ reduceBigSpectronaut <- function(input_file, output_path, readr::DataFrameCallback$new(spec_chunk), delim = delim, chunk_size = 1e6, - col_names = tidyselect::any_of(needed_cols)) + col_names = needed_cols) } #' @keywords internal From 5a03986ef0d4672f961b17085e0c6b9e72634b19 Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Thu, 7 May 2026 14:55:10 -0400 Subject: [PATCH 04/11] reduce chunk size --- R/clean_spectronaut.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/clean_spectronaut.R b/R/clean_spectronaut.R index 5983005..f3b66df 100644 --- a/R/clean_spectronaut.R +++ b/R/clean_spectronaut.R @@ -42,7 +42,7 @@ reduceBigSpectronaut <- function(input_file, output_path, readr::read_delim_chunked(input_file, readr::DataFrameCallback$new(spec_chunk), delim = delim, - chunk_size = 1e6, + chunk_size = 1e5, col_names = needed_cols) } From f844758e3129dad17c41a0d88b71da631ae27ac4 Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Thu, 7 May 2026 15:42:51 -0400 Subject: [PATCH 05/11] try arrow csv reader delimted reader --- R/clean_spectronaut.R | 64 +++++++++++++++++++++++++++++-------------- 1 file changed, 44 insertions(+), 20 deletions(-) diff --git a/R/clean_spectronaut.R b/R/clean_spectronaut.R index f3b66df..4c037ea 100644 --- a/R/clean_spectronaut.R +++ b/R/clean_spectronaut.R @@ -8,16 +8,16 @@ reduceBigSpectronaut <- function(input_file, output_path, calculateAnomalyScores=FALSE, anomalyModelFeatures=c()) { if (grepl("csv", input_file)) { - delim = "," + delim <- "," } else if (grepl("tsv|xls", input_file)) { - delim = "\t" + delim <- "\t" } else { delim <- ";" } - # Restrict parsing to the columns cleanSpectronautChunk actually consumes. - # Spectronaut exports often have 50+ columns; reading only this subset - # cuts per-chunk peak memory roughly proportionally to the column ratio. + # Columns cleanSpectronautChunk actually consumes; Arrow's + # convert_options$include_columns drops everything else at parse time so + # we never materialize the ~35 unused columns Spectronaut exports. needed_cols <- c("R.FileName", "R.Condition", "R.Replicate", "PG.ProteinAccessions", "EG.ModifiedSequence", "FG.LabeledSequence", "FG.Charge", @@ -29,21 +29,45 @@ reduceBigSpectronaut <- function(input_file, output_path, needed_cols <- c(needed_cols, anomalyModelFeatures) } - spec_chunk <- function(x, pos) cleanSpectronautChunk(x, - output_path, - intensity, - filter_by_excluded, - filter_by_identified, - filter_by_qvalue, - qvalue_cutoff, - pos, - calculateAnomalyScores, - anomalyModelFeatures) - readr::read_delim_chunked(input_file, - readr::DataFrameCallback$new(spec_chunk), - delim = delim, - chunk_size = 1e5, - col_names = needed_cols) + # Arrow's CSV reader replaces readr::read_delim_chunked. Arrow releases + # per-batch state as soon as a batch is consumed, so peak memory is + # bounded by one record batch instead of growing with the dataset (readr + # keeps a string-interning pool that accumulates across chunks). The + # `delim` switch above already covers comma / tab / semicolon variants; + # Arrow's CSV reader handles all three the same way through + # CsvParseOptions$delimiter. + parse_opts <- arrow::CsvParseOptions$create(delimiter = delim) + convert_opts <- arrow::CsvConvertOptions$create(include_columns = needed_cols) + read_opts <- arrow::CsvReadOptions$create(block_size = 256L * 1024L) + + ds <- arrow::open_dataset( + input_file, + format = "csv", + parse_options = parse_opts, + convert_options = convert_opts, + read_options = read_opts + ) + + reader <- arrow::Scanner$create(ds)$ToRecordBatchReader() + + pos <- 1L + repeat { + batch <- reader$read_next_batch() + if (is.null(batch)) break + chunk_df <- as.data.frame(batch) + cleanSpectronautChunk(chunk_df, + output_path, + intensity, + filter_by_excluded, + filter_by_identified, + filter_by_qvalue, + qvalue_cutoff, + pos, + calculateAnomalyScores, + anomalyModelFeatures) + pos <- pos + nrow(chunk_df) + rm(batch, chunk_df) + } } #' @keywords internal From 53f7a782b1fdcde7d64f562bd2a6b93da29a9fda Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Thu, 7 May 2026 16:02:35 -0400 Subject: [PATCH 06/11] fix column selection --- R/clean_spectronaut.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/clean_spectronaut.R b/R/clean_spectronaut.R index 4c037ea..480e85e 100644 --- a/R/clean_spectronaut.R +++ b/R/clean_spectronaut.R @@ -37,7 +37,7 @@ reduceBigSpectronaut <- function(input_file, output_path, # Arrow's CSV reader handles all three the same way through # CsvParseOptions$delimiter. parse_opts <- arrow::CsvParseOptions$create(delimiter = delim) - convert_opts <- arrow::CsvConvertOptions$create(include_columns = needed_cols) + convert_opts <- arrow::CsvConvertOptions$create() read_opts <- arrow::CsvReadOptions$create(block_size = 256L * 1024L) ds <- arrow::open_dataset( From a08a65b4d03b30907580b841671b24b4332c25e6 Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Thu, 7 May 2026 16:09:14 -0400 Subject: [PATCH 07/11] add progress tracking --- R/clean_spectronaut.R | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/R/clean_spectronaut.R b/R/clean_spectronaut.R index 480e85e..9cd5249 100644 --- a/R/clean_spectronaut.R +++ b/R/clean_spectronaut.R @@ -50,7 +50,9 @@ reduceBigSpectronaut <- function(input_file, output_path, reader <- arrow::Scanner$create(ds)$ToRecordBatchReader() - pos <- 1L + t_start <- Sys.time() + pos <- 1L + batch_idx <- 0L repeat { batch <- reader$read_next_batch() if (is.null(batch)) break @@ -65,9 +67,33 @@ reduceBigSpectronaut <- function(input_file, output_path, pos, calculateAnomalyScores, anomalyModelFeatures) - pos <- pos + nrow(chunk_df) + pos <- pos + nrow(chunk_df) + batch_idx <- batch_idx + 1L + + if (batch_idx %% 1000L == 0L) { + elapsed <- as.numeric(Sys.time() - t_start, units = "secs") + rate <- (pos - 1L) / elapsed + message(sprintf( + "[reduceBigSpectronaut] %d batches | %s rows | %.1fk rows/s | %.0fs elapsed", + batch_idx, + format(pos - 1L, big.mark = ","), + rate / 1000, + elapsed)) + } + rm(batch, chunk_df) } + + if (batch_idx %% 1000L != 0L) { + elapsed <- as.numeric(Sys.time() - t_start, units = "secs") + rate <- (pos - 1L) / elapsed + message(sprintf( + "[reduceBigSpectronaut] done: %d batches | %s rows | %.1fk rows/s | %.0fs elapsed", + batch_idx, + format(pos - 1L, big.mark = ","), + rate / 1000, + elapsed)) + } } #' @keywords internal From 4a6045a462c9548c49a46ef3f710070dbcade0dc Mon Sep 17 00:00:00 2001 From: Rudhik1904 Date: Fri, 15 May 2026 16:21:56 -0500 Subject: [PATCH 08/11] Summary of changes: R/clean_spectronaut.R:9-12: added block_size parameter (default 16L * 1024L * 1024L) with coerce + validation. R/clean_spectronaut.R:44: CsvReadOptions$create now uses the parameter. R/converters.R:120-125: new @param block_size roxygen with the straddling-object workaround note. R/converters.R:148-156: bigSpectronauttoMSstatsFormat gains block_size, plumbed to reduceBigSpectronaut. tests/testthat/test-converters.R:97-163: validation tests (rejects negative/zero/NA/vector/string) + plumbing tests (default forwards 16 MiB, override forwards user's value). man/bigSpectronauttoMSstatsFormat.Rd: regenerated from roxygen. --- R/clean_spectronaut.R | 8 +++- R/converters.R | 13 ++++-- man/bigSpectronauttoMSstatsFormat.Rd | 9 +++- man/dot-prefixedPath.Rd | 24 ++++++++++ tests/testthat/test-converters.R | 69 ++++++++++++++++++++++++++++ 5 files changed, 117 insertions(+), 6 deletions(-) create mode 100644 man/dot-prefixedPath.Rd diff --git a/R/clean_spectronaut.R b/R/clean_spectronaut.R index 9cd5249..a2d05fe 100644 --- a/R/clean_spectronaut.R +++ b/R/clean_spectronaut.R @@ -6,7 +6,11 @@ reduceBigSpectronaut <- function(input_file, output_path, filter_by_qvalue = TRUE, qvalue_cutoff = 0.01, calculateAnomalyScores=FALSE, - anomalyModelFeatures=c()) { + anomalyModelFeatures=c(), + block_size = 16L * 1024L * 1024L) { + block_size <- as.integer(block_size) + stopifnot(length(block_size) == 1L, !is.na(block_size), block_size > 0L) + if (grepl("csv", input_file)) { delim <- "," } else if (grepl("tsv|xls", input_file)) { @@ -38,7 +42,7 @@ reduceBigSpectronaut <- function(input_file, output_path, # CsvParseOptions$delimiter. parse_opts <- arrow::CsvParseOptions$create(delimiter = delim) convert_opts <- arrow::CsvConvertOptions$create() - read_opts <- arrow::CsvReadOptions$create(block_size = 256L * 1024L) + read_opts <- arrow::CsvReadOptions$create(block_size = block_size) ds <- arrow::open_dataset( input_file, diff --git a/R/converters.R b/R/converters.R index 13b4383..2b406ac 100644 --- a/R/converters.R +++ b/R/converters.R @@ -117,6 +117,11 @@ bigFragPipetoMSstatsFormat <- function(input_file, output_file_name, #' @param filter_by_identified if TRUE, will filter by the `EG.Identified` column. #' @param filter_by_qvalue if TRUE, will filter by EG.Qvalue and PG.Qvalue columns. #' @param qvalue_cutoff cutoff which will be used for q-value filtering. +#' @param block_size Arrow CSV reader block size in bytes; each input row must +#' fit inside one block. Defaults to 16 MiB (`16L * 1024L * 1024L`). If you +#' see `Invalid: straddling object straddles two block boundaries` on +#' extra-wide Spectronaut exports, pass a larger value +#' (e.g. `64L * 1024L * 1024L`). #' #' @export #' @@ -143,14 +148,16 @@ bigSpectronauttoMSstatsFormat <- function(input_file, output_file_name, aggregate_psms = FALSE, filter_few_obs = FALSE, remove_annotation = FALSE, - calculateAnomalyScores=FALSE, + calculateAnomalyScores=FALSE, anomalyModelFeatures=c(), - connection = NULL) { + connection = NULL, + block_size = 16L * 1024L * 1024L) { reduced_file <- .prefixedPath("reduce_output_", output_file_name) reduceBigSpectronaut(input_file, reduced_file, intensity, filter_by_excluded, filter_by_identified, filter_by_qvalue, qvalue_cutoff, - calculateAnomalyScores, anomalyModelFeatures) + calculateAnomalyScores, anomalyModelFeatures, + block_size = block_size) msstats_data <- MSstatsPreprocessBig( input_file = reduced_file, output_file_name = output_file_name, diff --git a/man/bigSpectronauttoMSstatsFormat.Rd b/man/bigSpectronauttoMSstatsFormat.Rd index 01706ef..b3e8f0c 100644 --- a/man/bigSpectronauttoMSstatsFormat.Rd +++ b/man/bigSpectronauttoMSstatsFormat.Rd @@ -20,7 +20,8 @@ bigSpectronauttoMSstatsFormat( remove_annotation = FALSE, calculateAnomalyScores = FALSE, anomalyModelFeatures = c(), - connection = NULL + connection = NULL, + block_size = 16L * 1024L * 1024L ) } \arguments{ @@ -63,6 +64,12 @@ using dataProcess function. Only applicable to sparklyr backend.} \item{connection}{Connection to a spark instance created with the `spark_connect` function from `sparklyr` package.} + +\item{block_size}{Arrow CSV reader block size in bytes; each input row must +fit inside one block. Defaults to 16 MiB (`16L * 1024L * 1024L`). If you +see `Invalid: straddling object straddles two block boundaries` on +extra-wide Spectronaut exports, pass a larger value +(e.g. `64L * 1024L * 1024L`).} } \value{ either arrow object or sparklyr table that can be optionally collected diff --git a/man/dot-prefixedPath.Rd b/man/dot-prefixedPath.Rd new file mode 100644 index 0000000..be036ac --- /dev/null +++ b/man/dot-prefixedPath.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{.prefixedPath} +\alias{.prefixedPath} +\title{Build an intermediate output path by prefixing only the basename.} +\usage{ +.prefixedPath(prefix, path) +} +\arguments{ +\item{prefix}{Character scalar prepended to the basename.} + +\item{path}{Output file path supplied by the caller.} +} +\value{ +Character scalar. +} +\description{ +Naive `paste0(prefix, output_file_name)` corrupts paths that contain a +directory (`subdir/out.csv` → `topN_subdir/out.csv`, +`/tmp/out.csv` → `topN_/tmp/out.csv`). Splitting via dirname/basename keeps +the directory component intact so intermediate files land beside the final +output. +} +\keyword{internal} diff --git a/tests/testthat/test-converters.R b/tests/testthat/test-converters.R index 78f6da3..433d584 100644 --- a/tests/testthat/test-converters.R +++ b/tests/testthat/test-converters.R @@ -94,6 +94,75 @@ test_that("bigSpectronauttoMSstatsFormat works correctly", { unlink(paste0("reduce_output_", output_file), recursive = TRUE, force = TRUE) }) +test_that("reduceBigSpectronaut rejects invalid block_size values", { + input_file <- tempfile(fileext = ".csv") + writeLines("a,b\n1,2", input_file) + output_file <- tempfile() + on.exit({ + unlink(input_file, force = TRUE) + unlink(output_file, recursive = TRUE, force = TRUE) + }, add = TRUE) + + expect_error(reduceBigSpectronaut(input_file, output_file, block_size = -1L)) + expect_error(reduceBigSpectronaut(input_file, output_file, block_size = 0L)) + expect_error(reduceBigSpectronaut(input_file, output_file, block_size = NA_integer_)) + expect_error(reduceBigSpectronaut(input_file, output_file, block_size = c(1L, 2L))) + expect_error(suppressWarnings( + reduceBigSpectronaut(input_file, output_file, block_size = "16MB") + )) +}) + +test_that("bigSpectronauttoMSstatsFormat plumbs block_size through to reduceBigSpectronaut", { + captured <- new.env(parent = emptyenv()) + captured$block_size <- NULL + + spy_reduce <- function(input_file, output_path, intensity, filter_by_excluded, + filter_by_identified, filter_by_qvalue, qvalue_cutoff, + calculateAnomalyScores, anomalyModelFeatures, + block_size = 16L * 1024L * 1024L) { + captured$block_size <- block_size + msstats_data <- data.frame( + ProteinName = "P1", PeptideSequence = "PEPTIDE", PrecursorCharge = 2, + FragmentIon = "frag1", ProductCharge = 1, + IsotopeLabelType = "L", Condition = "A", BioReplicate = 1, + Run = "run1", Intensity = 100 + ) + readr::write_csv(msstats_data, output_path) + } + + input_file <- "dummy_spectro_input.csv" + + # Default forwards 16 MiB. + stub(bigSpectronauttoMSstatsFormat, "reduceBigSpectronaut", spy_reduce) + output_file_default <- tempfile(fileext = ".csv") + on.exit({ + unlink(output_file_default, recursive = TRUE, force = TRUE) + unlink(paste0("reduce_output_", basename(output_file_default)), + recursive = TRUE, force = TRUE) + }, add = TRUE) + bigSpectronauttoMSstatsFormat( + input_file = input_file, + output_file_name = output_file_default, + backend = "arrow" + ) + expect_identical(captured$block_size, 16L * 1024L * 1024L) + + # Override forwards the user's value. + output_file_override <- tempfile(fileext = ".csv") + on.exit({ + unlink(output_file_override, recursive = TRUE, force = TRUE) + unlink(paste0("reduce_output_", basename(output_file_override)), + recursive = TRUE, force = TRUE) + }, add = TRUE) + bigSpectronauttoMSstatsFormat( + input_file = input_file, + output_file_name = output_file_override, + backend = "arrow", + block_size = 8L * 1024L * 1024L + ) + expect_identical(captured$block_size, 8L * 1024L * 1024L) +}) + # test_that("bigDIANNtoMSstatsFormat works with real MSstatsConvert tinytest data", { # input_file <- "/Users/rudhikshah/NorthEasternContractWork/MSstatsConvert/inst/tinytest/raw_data/DIANN/diann_input.tsv" # annotation_file <- "/Users/rudhikshah/NorthEasternContractWork/MSstatsConvert/inst/tinytest/raw_data/DIANN/annotation.csv" From c8c835e2fedda039fdfc62d56f7675aaf25d3ccb Mon Sep 17 00:00:00 2001 From: Rudhik1904 Date: Fri, 15 May 2026 18:48:17 -0500 Subject: [PATCH 09/11] =?UTF-8?q?R/clean=5Fspectronaut.R:1-2:=20added=20@i?= =?UTF-8?q?mportFrom=20data.table=20:=3D=20.SD=20setDT=20setnames=20so=20t?= =?UTF-8?q?he=20package=20is=20data.table-aware=20(cedta()).=20R/clean=5Fs?= =?UTF-8?q?pectronaut.R:103-187:=20rewrote=20cleanSpectronautChunk=20in=20?= =?UTF-8?q?data.table:=20setDT(input)=20at=20entry;=20subsequent=20operati?= =?UTF-8?q?ons=20modify=20in=20place=20via=20:=3D.=20Two-step=20rename=20(?= =?UTF-8?q?setnames=20for=20standardize,=20then=20setnames=20with=20skip?= =?UTF-8?q?=5Fabsent=20=3D=20TRUE=20to=20map=20standardized=E2=86=92MSstat?= =?UTF-8?q?s)=20matches=20the=20MSstatsConvert=20family=20pattern.=20Condi?= =?UTF-8?q?tional=20NA=20assignment=20uses=20mask=20form=20dt[cond,=20Inte?= =?UTF-8?q?nsity=20:=3D=20NA=5Freal=5F].=20Q-value=20filters=20preserve=20?= =?UTF-8?q?dplyr::if=5Felse=20NA=20semantics=20via=20explicit=20is.na(EGQv?= =?UTF-8?q?alue)=20|=20EGQvalue=20>=3D=20cutoff.=20Dropped=20the=20leftove?= =?UTF-8?q?r=20dplyr::collect(head(dplyr::select(...)))=20pattern=20?= =?UTF-8?q?=E2=80=94=20was=20a=20no-op=20residue=20from=20a=20prior=20refa?= =?UTF-8?q?ctor.=20Function=20shrank=20from=20~88=20lines=20to=20~64.=20DE?= =?UTF-8?q?SCRIPTION:20:=20added=20data.table=20to=20Imports.=20NAMESPACE:?= =?UTF-8?q?=20regenerated,=20now=20imports=20:=3D,=20.SD,=20setDT,=20setna?= =?UTF-8?q?mes=20from=20data.table.=20tests/testthat/test-converters.R:97-?= =?UTF-8?q?211:=205=20new=20tests=20=E2=80=94=20schema=20smoke=20test,=20f?= =?UTF-8?q?ilter=5Fby=5Fexcluded,=20filter=5Fby=5Fidentified,=20filter=5Fb?= =?UTF-8?q?y=5Fqvalue=20(covering=20the=20NA-q-value=20case),=20and=20FFrg?= =?UTF-8?q?LossType=20row=20drop.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DESCRIPTION | 3 +- NAMESPACE | 4 ++ R/clean_spectronaut.R | 106 ++++++++++++++++++++---------------------- 3 files changed, 57 insertions(+), 56 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 83fdc00..8a6629b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -15,8 +15,9 @@ Description: MSstats package provide tools for preprocessing, summarization and License: Artistic-2.0 Encoding: UTF-8 RoxygenNote: 7.3.3 -Imports: +Imports: arrow, + data.table, DBI, dplyr, MSstats, diff --git a/NAMESPACE b/NAMESPACE index 9ec4823..8312349 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -10,5 +10,9 @@ importFrom(MSstats,groupComparison) importFrom(MSstatsConvert,MSstatsClean) importFrom(MSstatsConvert,MSstatsImport) importFrom(MSstatsConvert,MSstatsMakeAnnotation) +importFrom(data.table,":=") +importFrom(data.table,.SD) +importFrom(data.table,setDT) +importFrom(data.table,setnames) importFrom(utils,head) importFrom(utils,sessionInfo) diff --git a/R/clean_spectronaut.R b/R/clean_spectronaut.R index a2d05fe..ae5a408 100644 --- a/R/clean_spectronaut.R +++ b/R/clean_spectronaut.R @@ -1,3 +1,6 @@ +#' @importFrom data.table := .SD setDT setnames +NULL + #' @keywords internal reduceBigSpectronaut <- function(input_file, output_path, intensity="F.NormalizedPeakArea", @@ -108,85 +111,78 @@ cleanSpectronautChunk = function(input, output_path, filter_by_qvalue = TRUE, qvalue_cutoff = 0.01, pos = NULL, - calculateAnomalyScores=FALSE, + calculateAnomalyScores=FALSE, anomalyModelFeatures=c()) { + data.table::setDT(input) + all_cols <- c("R.FileName", "R.Condition", "R.Replicate", "PG.ProteinAccessions", "EG.ModifiedSequence", "FG.LabeledSequence", "FG.Charge", "F.FrgIon", "F.Charge", "EG.Identified", "F.ExcludedFromQuantification", "F.FrgLossType", "PG.Qvalue", "EG.Qvalue", intensity) - - if (calculateAnomalyScores){ - all_cols <- c(all_cols, anomalyModelFeatures) - } - - cols <- intersect(all_cols, colnames(input)) - input <- dplyr::select(input, all_of(cols)) - input <- dplyr::rename_with(input, .fn = MSstatsConvert:::.standardizeColnames) - new_names <- c("Run", "Condition", "BioReplicate", "ProteinName", "PeptideSequence", "LabeledSequence", "PrecursorCharge", "FragmentIon", "ProductCharge", "Identified", "Excluded", "FFrgLossType", "PGQvalue", "EGQvalue", "Intensity") - if (calculateAnomalyScores){ + if (calculateAnomalyScores) { + all_cols <- c(all_cols, anomalyModelFeatures) new_names <- c(new_names, MSstatsConvert:::.standardizeColnames(anomalyModelFeatures)) } - - # non_standardized = - old_names <- MSstatsConvert:::.standardizeColnames(all_cols) - names(old_names) <- new_names - old_names <- old_names[old_names %in% colnames(input)] - - input <- dplyr::rename(input, !!old_names) - input <- dplyr::mutate(input, Intensity = as.numeric(Intensity)) - - if (is.character(dplyr::pull(dplyr::collect(head(dplyr::select(input, Excluded))), Excluded))) { - input <- dplyr::mutate(input, Excluded = Excluded == "True") + + present_orig <- intersect(all_cols, colnames(input)) + input <- input[, present_orig, with = FALSE] + + # Two-step rename matching the MSstatsConvert family pattern: standardize + # all column names, then map standardized -> MSstats final names. + data.table::setnames(input, MSstatsConvert:::.standardizeColnames(colnames(input))) + std_to_msstats <- stats::setNames(new_names, + MSstatsConvert:::.standardizeColnames(all_cols)) + data.table::setnames(input, + old = names(std_to_msstats), + new = unname(std_to_msstats), + skip_absent = TRUE) + + input[, Intensity := as.numeric(Intensity)] + + if (is.character(input[["Excluded"]])) { + input[, Excluded := Excluded == "True"] } - if (is.element("Identified", colnames(input))) { - if (is.character(dplyr::pull(dplyr::collect(head(dplyr::select(input, Identified))), Identified))) { - input <- dplyr::mutate(input, Identified = Identified == "True") - } + if ("Identified" %in% colnames(input) && is.character(input[["Identified"]])) { + input[, Identified := Identified == "True"] } - + if (filter_by_excluded) { - input <- dplyr::mutate( - input, Intensity = dplyr::if_else(Excluded, NA_real_, Intensity)) - + input[Excluded == TRUE, Intensity := NA_real_] } - if (filter_by_identified) { - input <- dplyr::mutate( - input, Intensity = dplyr::if_else(Identified, Intensity, NA_real_)) + input[Identified == FALSE, Intensity := NA_real_] } - if (filter_by_qvalue) { - input <- dplyr::mutate( - input, - Intensity = dplyr::if_else(EGQvalue < qvalue_cutoff, Intensity, NA_real_)) - input <- dplyr::mutate( - input, - Intensity = dplyr::if_else(PGQvalue < qvalue_cutoff, Intensity, NA_real_)) + # Preserve dplyr::if_else semantics: rows with NA q-values become NA. + input[is.na(EGQvalue) | EGQvalue >= qvalue_cutoff, Intensity := NA_real_] + input[is.na(PGQvalue) | PGQvalue >= qvalue_cutoff, Intensity := NA_real_] } - - input <- dplyr::filter(input, FFrgLossType == "noloss") - if (is.element("LabeledSequence", colnames(input))) { - input <- dplyr::mutate(input, IsLabeled = grepl("Lys8", LabeledSequence) | grepl("Arg10", LabeledSequence)) - input <- dplyr::mutate(input, IsotopeLabelType := dplyr::if_else(IsLabeled, "H", "L")) + + input <- input[FFrgLossType == "noloss"] + + if ("LabeledSequence" %in% colnames(input)) { + input[, IsotopeLabelType := ifelse( + grepl("Lys8", LabeledSequence) | grepl("Arg10", LabeledSequence), + "H", "L")] } else { - input <- dplyr::mutate(input, IsotopeLabelType = "L") + input[, IsotopeLabelType := "L"] } - - select_cols = c("ProteinName", "PeptideSequence", "PrecursorCharge", "FragmentIon", - "ProductCharge", "IsotopeLabelType", "Run", "BioReplicate", "Condition", - "Intensity") - if (calculateAnomalyScores){ - select_cols = c(select_cols, - MSstatsConvert:::.standardizeColnames(anomalyModelFeatures)) + + select_cols <- c("ProteinName", "PeptideSequence", "PrecursorCharge", "FragmentIon", + "ProductCharge", "IsotopeLabelType", "Run", "BioReplicate", "Condition", + "Intensity") + if (calculateAnomalyScores) { + select_cols <- c(select_cols, + MSstatsConvert:::.standardizeColnames(anomalyModelFeatures)) } - - input <- dplyr::select(input, select_cols) + + input <- input[, select_cols, with = FALSE] .writeChunkToFile(input, output_path, pos) NULL } From fdd7476b7d423f9439c96804059aa6a6322b6f4c Mon Sep 17 00:00:00 2001 From: Rudhik1904 Date: Fri, 15 May 2026 18:48:41 -0500 Subject: [PATCH 10/11] add more tests --- tests/testthat/test-converters.R | 118 +++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) diff --git a/tests/testthat/test-converters.R b/tests/testthat/test-converters.R index 433d584..78e0c1b 100644 --- a/tests/testthat/test-converters.R +++ b/tests/testthat/test-converters.R @@ -94,6 +94,124 @@ test_that("bigSpectronauttoMSstatsFormat works correctly", { unlink(paste0("reduce_output_", output_file), recursive = TRUE, force = TRUE) }) +make_spectronaut_input <- function(n = 1L, ...) { + base <- data.frame( + R.FileName = rep("run1", n), + R.Condition = "A", + R.Replicate = 1L, + PG.ProteinAccessions = "P1", + EG.ModifiedSequence = paste0("PEP", seq_len(n)), + FG.LabeledSequence = paste0("PEP", seq_len(n)), + FG.Charge = 2L, + F.FrgIon = "y1", + F.Charge = 1L, + EG.Identified = "True", + F.ExcludedFromQuantification = "False", + F.FrgLossType = "noloss", + PG.Qvalue = 0.001, + EG.Qvalue = 0.001, + F.NormalizedPeakArea = seq_len(n) * 100, + stringsAsFactors = FALSE + ) + overrides <- list(...) + for (col in names(overrides)) base[[col]] <- overrides[[col]] + base +} + +test_that("cleanSpectronautChunk produces the expected MSstats schema", { + input <- make_spectronaut_input(n = 1L) + output_file <- tempfile(fileext = ".csv") + on.exit(unlink(output_file, force = TRUE), add = TRUE) + + MSstatsBig:::cleanSpectronautChunk(input, output_file, pos = 1L) + result <- readr::read_csv(output_file, show_col_types = FALSE) + + expected_cols <- c("ProteinName", "PeptideSequence", "PrecursorCharge", "FragmentIon", + "ProductCharge", "IsotopeLabelType", "Run", "BioReplicate", "Condition", + "Intensity") + expect_setequal(colnames(result), expected_cols) + expect_equal(nrow(result), 1L) + expect_equal(result$Intensity, 100) + expect_equal(result$IsotopeLabelType, "L") +}) + +test_that("cleanSpectronautChunk filter_by_excluded sets Intensity to NA on excluded rows", { + input <- make_spectronaut_input( + n = 2L, + F.ExcludedFromQuantification = c("True", "False"), + F.NormalizedPeakArea = c(100, 200) + ) + output_file <- tempfile(fileext = ".csv") + on.exit(unlink(output_file, force = TRUE), add = TRUE) + + MSstatsBig:::cleanSpectronautChunk(input, output_file, pos = 1L, + filter_by_excluded = TRUE, + filter_by_qvalue = FALSE) + result <- readr::read_csv(output_file, show_col_types = FALSE) + result <- result[order(result$PeptideSequence), ] + + expect_true(is.na(result$Intensity[1])) + expect_equal(result$Intensity[2], 200) +}) + +test_that("cleanSpectronautChunk filter_by_identified sets Intensity to NA on unidentified rows", { + input <- make_spectronaut_input( + n = 2L, + EG.Identified = c("True", "False"), + F.NormalizedPeakArea = c(100, 200) + ) + output_file <- tempfile(fileext = ".csv") + on.exit(unlink(output_file, force = TRUE), add = TRUE) + + MSstatsBig:::cleanSpectronautChunk(input, output_file, pos = 1L, + filter_by_identified = TRUE, + filter_by_qvalue = FALSE) + result <- readr::read_csv(output_file, show_col_types = FALSE) + result <- result[order(result$PeptideSequence), ] + + expect_equal(result$Intensity[1], 100) + expect_true(is.na(result$Intensity[2])) +}) + +test_that("cleanSpectronautChunk filter_by_qvalue NA-aware semantics match dplyr::if_else", { + input <- make_spectronaut_input( + n = 3L, + EG.Qvalue = c(0.001, 0.5, NA_real_), + F.NormalizedPeakArea = c(100, 200, 300) + ) + output_file <- tempfile(fileext = ".csv") + on.exit(unlink(output_file, force = TRUE), add = TRUE) + + MSstatsBig:::cleanSpectronautChunk(input, output_file, pos = 1L, + filter_by_qvalue = TRUE, + qvalue_cutoff = 0.01) + result <- readr::read_csv(output_file, show_col_types = FALSE) + result <- result[order(result$PeptideSequence), ] + + # PEP1: EGQvalue below cutoff -> kept + expect_equal(result$Intensity[1], 100) + # PEP2: EGQvalue above cutoff -> NA + expect_true(is.na(result$Intensity[2])) + # PEP3: EGQvalue NA -> NA (preserves dplyr::if_else semantics) + expect_true(is.na(result$Intensity[3])) +}) + +test_that("cleanSpectronautChunk drops rows where FFrgLossType != noloss", { + input <- make_spectronaut_input( + n = 3L, + F.FrgLossType = c("noloss", "H2O", "noloss") + ) + output_file <- tempfile(fileext = ".csv") + on.exit(unlink(output_file, force = TRUE), add = TRUE) + + MSstatsBig:::cleanSpectronautChunk(input, output_file, pos = 1L, + filter_by_qvalue = FALSE) + result <- readr::read_csv(output_file, show_col_types = FALSE) + + expect_equal(nrow(result), 2L) + expect_setequal(result$PeptideSequence, c("PEP1", "PEP3")) +}) + test_that("reduceBigSpectronaut rejects invalid block_size values", { input_file <- tempfile(fileext = ".csv") writeLines("a,b\n1,2", input_file) From 8188f4e1c90221c8338c7a5a4ef8039422d1348d Mon Sep 17 00:00:00 2001 From: Rudhik1904 Date: Fri, 22 May 2026 16:31:46 -0500 Subject: [PATCH 11/11] temp Commit so I can get the list of col --- R/clean_spectronaut.R | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/R/clean_spectronaut.R b/R/clean_spectronaut.R index ae5a408..198f4d1 100644 --- a/R/clean_spectronaut.R +++ b/R/clean_spectronaut.R @@ -131,6 +131,16 @@ cleanSpectronautChunk = function(input, output_path, } present_orig <- intersect(all_cols, colnames(input)) + if (length(present_orig) == 0L) { + stop(sprintf( + paste0("cleanSpectronautChunk: none of the expected Spectronaut ", + "columns were found in the input batch. ", + "Expected any of: %s. Found: %s. ", + "Check that the file is comma-delimited and that the ", + "Spectronaut export uses the standard column names."), + paste(all_cols, collapse = ", "), + paste(colnames(input), collapse = ", "))) + } input <- input[, present_orig, with = FALSE] # Two-step rename matching the MSstatsConvert family pattern: standardize