From 20dc07e3a41254598f6898953d0ef8ea76fb7098 Mon Sep 17 00:00:00 2001
From: Tony Wu <wu.anthon@northeastern.edu>
Date: Thu, 7 May 2026 14:46:51 -0400
Subject: [PATCH 01/11] filter columns for readr initially

---
 R/clean_spectronaut.R | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/R/clean_spectronaut.R b/R/clean_spectronaut.R
index 1eeb89c..5048f93 100644
--- a/R/clean_spectronaut.R
+++ b/R/clean_spectronaut.R
@@ -5,7 +5,7 @@ reduceBigSpectronaut <- function(input_file, output_path,
                                  filter_by_identified = FALSE,
                                  filter_by_qvalue = TRUE,
                                  qvalue_cutoff = 0.01,
-                                 calculateAnomalyScores=FALSE, 
+                                 calculateAnomalyScores=FALSE,
                                  anomalyModelFeatures=c()) {
   if (grepl("csv", input_file)) {
     delim = ","
@@ -14,6 +14,21 @@ reduceBigSpectronaut <- function(input_file, output_path,
   } else {
     delim <- ";"
   }
+
+  # Restrict parsing to the columns cleanSpectronautChunk actually consumes.
+  # Spectronaut exports often have 50+ columns; reading only this subset
+  # cuts per-chunk peak memory roughly proportionally to the column ratio.
+  needed_cols <- c("R.FileName", "R.Condition", "R.Replicate",
+                   "PG.ProteinAccessions", "EG.ModifiedSequence",
+                   "FG.LabeledSequence", "FG.Charge",
+                   "F.FrgIon", "F.Charge",
+                   "EG.Identified", "F.ExcludedFromQuantification",
+                   "F.FrgLossType", "PG.Qvalue", "EG.Qvalue",
+                   intensity)
+  if (calculateAnomalyScores) {
+    needed_cols <- c(needed_cols, anomalyModelFeatures)
+  }
+
   spec_chunk <- function(x, pos) cleanSpectronautChunk(x,
                                                        output_path,
                                                        intensity,
@@ -22,12 +37,13 @@ reduceBigSpectronaut <- function(input_file, output_path,
                                                        filter_by_qvalue,
                                                        qvalue_cutoff,
                                                        pos,
-                                                       calculateAnomalyScores, 
+                                                       calculateAnomalyScores,
                                                        anomalyModelFeatures)
   readr::read_delim_chunked(input_file,
                             readr::DataFrameCallback$new(spec_chunk),
                             delim = delim,
-                            chunk_size = 1e6)
+                            chunk_size = 1e6,
+                            col_select = tidyselect::any_of(needed_cols))
 }
 
 #' @keywords internal

From 5129c7780b1760ac864d5a958e4f44667a01ad47 Mon Sep 17 00:00:00 2001
From: Tony Wu <wu.anthon@northeastern.edu>
Date: Thu, 7 May 2026 14:50:29 -0400
Subject: [PATCH 02/11] use col_names parameter

---
 R/clean_spectronaut.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/clean_spectronaut.R b/R/clean_spectronaut.R
index 5048f93..971dad6 100644
--- a/R/clean_spectronaut.R
+++ b/R/clean_spectronaut.R
@@ -43,7 +43,7 @@ reduceBigSpectronaut <- function(input_file, output_path,
                             readr::DataFrameCallback$new(spec_chunk),
                             delim = delim,
                             chunk_size = 1e6,
-                            col_select = tidyselect::any_of(needed_cols))
+                            col_names = tidyselect::any_of(needed_cols))
 }
 
 #' @keywords internal

From 08b0db20e88bc5dc6573991b3440846203028123 Mon Sep 17 00:00:00 2001
From: Tony Wu <wu.anthon@northeastern.edu>
Date: Thu, 7 May 2026 14:52:29 -0400
Subject: [PATCH 03/11] fix col_names input

---
 R/clean_spectronaut.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/clean_spectronaut.R b/R/clean_spectronaut.R
index 971dad6..5983005 100644
--- a/R/clean_spectronaut.R
+++ b/R/clean_spectronaut.R
@@ -43,7 +43,7 @@ reduceBigSpectronaut <- function(input_file, output_path,
                             readr::DataFrameCallback$new(spec_chunk),
                             delim = delim,
                             chunk_size = 1e6,
-                            col_names = tidyselect::any_of(needed_cols))
+                            col_names = needed_cols)
 }
 
 #' @keywords internal

From 5a03986ef0d4672f961b17085e0c6b9e72634b19 Mon Sep 17 00:00:00 2001
From: Tony Wu <wu.anthon@northeastern.edu>
Date: Thu, 7 May 2026 14:55:10 -0400
Subject: [PATCH 04/11] reduce chunk size

---
 R/clean_spectronaut.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/clean_spectronaut.R b/R/clean_spectronaut.R
index 5983005..f3b66df 100644
--- a/R/clean_spectronaut.R
+++ b/R/clean_spectronaut.R
@@ -42,7 +42,7 @@ reduceBigSpectronaut <- function(input_file, output_path,
   readr::read_delim_chunked(input_file,
                             readr::DataFrameCallback$new(spec_chunk),
                             delim = delim,
-                            chunk_size = 1e6,
+                            chunk_size = 1e5,
                             col_names = needed_cols)
 }
 

From f844758e3129dad17c41a0d88b71da631ae27ac4 Mon Sep 17 00:00:00 2001
From: Tony Wu <wu.anthon@northeastern.edu>
Date: Thu, 7 May 2026 15:42:51 -0400
Subject: [PATCH 05/11] try arrow csv reader delimted reader

---
 R/clean_spectronaut.R | 64 +++++++++++++++++++++++++++++--------------
 1 file changed, 44 insertions(+), 20 deletions(-)

diff --git a/R/clean_spectronaut.R b/R/clean_spectronaut.R
index f3b66df..4c037ea 100644
--- a/R/clean_spectronaut.R
+++ b/R/clean_spectronaut.R
@@ -8,16 +8,16 @@ reduceBigSpectronaut <- function(input_file, output_path,
                                  calculateAnomalyScores=FALSE,
                                  anomalyModelFeatures=c()) {
   if (grepl("csv", input_file)) {
-    delim = ","
+    delim <- ","
   } else if (grepl("tsv|xls", input_file)) {
-    delim = "\t"
+    delim <- "\t"
   } else {
     delim <- ";"
   }
 
-  # Restrict parsing to the columns cleanSpectronautChunk actually consumes.
-  # Spectronaut exports often have 50+ columns; reading only this subset
-  # cuts per-chunk peak memory roughly proportionally to the column ratio.
+  # Columns cleanSpectronautChunk actually consumes; Arrow's
+  # convert_options$include_columns drops everything else at parse time so
+  # we never materialize the ~35 unused columns Spectronaut exports.
   needed_cols <- c("R.FileName", "R.Condition", "R.Replicate",
                    "PG.ProteinAccessions", "EG.ModifiedSequence",
                    "FG.LabeledSequence", "FG.Charge",
@@ -29,21 +29,45 @@ reduceBigSpectronaut <- function(input_file, output_path,
     needed_cols <- c(needed_cols, anomalyModelFeatures)
   }
 
-  spec_chunk <- function(x, pos) cleanSpectronautChunk(x,
-                                                       output_path,
-                                                       intensity,
-                                                       filter_by_excluded,
-                                                       filter_by_identified,
-                                                       filter_by_qvalue,
-                                                       qvalue_cutoff,
-                                                       pos,
-                                                       calculateAnomalyScores,
-                                                       anomalyModelFeatures)
-  readr::read_delim_chunked(input_file,
-                            readr::DataFrameCallback$new(spec_chunk),
-                            delim = delim,
-                            chunk_size = 1e5,
-                            col_names = needed_cols)
+  # Arrow's CSV reader replaces readr::read_delim_chunked.  Arrow releases
+  # per-batch state as soon as a batch is consumed, so peak memory is
+  # bounded by one record batch instead of growing with the dataset (readr
+  # keeps a string-interning pool that accumulates across chunks).  The
+  # `delim` switch above already covers comma / tab / semicolon variants;
+  # Arrow's CSV reader handles all three the same way through
+  # CsvParseOptions$delimiter.
+  parse_opts   <- arrow::CsvParseOptions$create(delimiter = delim)
+  convert_opts <- arrow::CsvConvertOptions$create(include_columns = needed_cols)
+  read_opts    <- arrow::CsvReadOptions$create(block_size = 256L * 1024L)
+
+  ds <- arrow::open_dataset(
+    input_file,
+    format          = "csv",
+    parse_options   = parse_opts,
+    convert_options = convert_opts,
+    read_options    = read_opts
+  )
+
+  reader <- arrow::Scanner$create(ds)$ToRecordBatchReader()
+
+  pos <- 1L
+  repeat {
+    batch <- reader$read_next_batch()
+    if (is.null(batch)) break
+    chunk_df <- as.data.frame(batch)
+    cleanSpectronautChunk(chunk_df,
+                          output_path,
+                          intensity,
+                          filter_by_excluded,
+                          filter_by_identified,
+                          filter_by_qvalue,
+                          qvalue_cutoff,
+                          pos,
+                          calculateAnomalyScores,
+                          anomalyModelFeatures)
+    pos <- pos + nrow(chunk_df)
+    rm(batch, chunk_df)
+  }
 }
 
 #' @keywords internal

From 53f7a782b1fdcde7d64f562bd2a6b93da29a9fda Mon Sep 17 00:00:00 2001
From: Tony Wu <wu.anthon@northeastern.edu>
Date: Thu, 7 May 2026 16:02:35 -0400
Subject: [PATCH 06/11] fix column selection

---
 R/clean_spectronaut.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/clean_spectronaut.R b/R/clean_spectronaut.R
index 4c037ea..480e85e 100644
--- a/R/clean_spectronaut.R
+++ b/R/clean_spectronaut.R
@@ -37,7 +37,7 @@ reduceBigSpectronaut <- function(input_file, output_path,
   # Arrow's CSV reader handles all three the same way through
   # CsvParseOptions$delimiter.
   parse_opts   <- arrow::CsvParseOptions$create(delimiter = delim)
-  convert_opts <- arrow::CsvConvertOptions$create(include_columns = needed_cols)
+  convert_opts <- arrow::CsvConvertOptions$create()
   read_opts    <- arrow::CsvReadOptions$create(block_size = 256L * 1024L)
 
   ds <- arrow::open_dataset(

From a08a65b4d03b30907580b841671b24b4332c25e6 Mon Sep 17 00:00:00 2001
From: Tony Wu <wu.anthon@northeastern.edu>
Date: Thu, 7 May 2026 16:09:14 -0400
Subject: [PATCH 07/11] add progress tracking

---
 R/clean_spectronaut.R | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/R/clean_spectronaut.R b/R/clean_spectronaut.R
index 480e85e..9cd5249 100644
--- a/R/clean_spectronaut.R
+++ b/R/clean_spectronaut.R
@@ -50,7 +50,9 @@ reduceBigSpectronaut <- function(input_file, output_path,
 
   reader <- arrow::Scanner$create(ds)$ToRecordBatchReader()
 
-  pos <- 1L
+  t_start   <- Sys.time()
+  pos       <- 1L
+  batch_idx <- 0L
   repeat {
     batch <- reader$read_next_batch()
     if (is.null(batch)) break
@@ -65,9 +67,33 @@ reduceBigSpectronaut <- function(input_file, output_path,
                           pos,
                           calculateAnomalyScores,
                           anomalyModelFeatures)
-    pos <- pos + nrow(chunk_df)
+    pos       <- pos + nrow(chunk_df)
+    batch_idx <- batch_idx + 1L
+
+    if (batch_idx %% 1000L == 0L) {
+      elapsed <- as.numeric(Sys.time() - t_start, units = "secs")
+      rate    <- (pos - 1L) / elapsed
+      message(sprintf(
+        "[reduceBigSpectronaut] %d batches | %s rows | %.1fk rows/s | %.0fs elapsed",
+        batch_idx,
+        format(pos - 1L, big.mark = ","),
+        rate / 1000,
+        elapsed))
+    }
+
     rm(batch, chunk_df)
   }
+
+  if (batch_idx %% 1000L != 0L) {
+    elapsed <- as.numeric(Sys.time() - t_start, units = "secs")
+    rate    <- (pos - 1L) / elapsed
+    message(sprintf(
+      "[reduceBigSpectronaut] done: %d batches | %s rows | %.1fk rows/s | %.0fs elapsed",
+      batch_idx,
+      format(pos - 1L, big.mark = ","),
+      rate / 1000,
+      elapsed))
+  }
 }
 
 #' @keywords internal

From 4a6045a462c9548c49a46ef3f710070dbcade0dc Mon Sep 17 00:00:00 2001
From: Rudhik1904 <rudhikshah50@gmail.com>
Date: Fri, 15 May 2026 16:21:56 -0500
Subject: [PATCH 08/11] Summary of changes:

R/clean_spectronaut.R:9-12: added block_size parameter (default 16L * 1024L * 1024L) with coerce + validation.
R/clean_spectronaut.R:44: CsvReadOptions$create now uses the parameter.
R/converters.R:120-125: new @param block_size roxygen with the straddling-object workaround note.
R/converters.R:148-156: bigSpectronauttoMSstatsFormat gains block_size, plumbed to reduceBigSpectronaut.
tests/testthat/test-converters.R:97-163: validation tests (rejects negative/zero/NA/vector/string) + plumbing tests (default forwards 16 MiB, override forwards user's value).
man/bigSpectronauttoMSstatsFormat.Rd: regenerated from roxygen.
---
 R/clean_spectronaut.R                |  8 +++-
 R/converters.R                       | 13 ++++--
 man/bigSpectronauttoMSstatsFormat.Rd |  9 +++-
 man/dot-prefixedPath.Rd              | 24 ++++++++++
 tests/testthat/test-converters.R     | 69 ++++++++++++++++++++++++++++
 5 files changed, 117 insertions(+), 6 deletions(-)
 create mode 100644 man/dot-prefixedPath.Rd

diff --git a/R/clean_spectronaut.R b/R/clean_spectronaut.R
index 9cd5249..a2d05fe 100644
--- a/R/clean_spectronaut.R
+++ b/R/clean_spectronaut.R
@@ -6,7 +6,11 @@ reduceBigSpectronaut <- function(input_file, output_path,
                                  filter_by_qvalue = TRUE,
                                  qvalue_cutoff = 0.01,
                                  calculateAnomalyScores=FALSE,
-                                 anomalyModelFeatures=c()) {
+                                 anomalyModelFeatures=c(),
+                                 block_size = 16L * 1024L * 1024L) {
+  block_size <- as.integer(block_size)
+  stopifnot(length(block_size) == 1L, !is.na(block_size), block_size > 0L)
+
   if (grepl("csv", input_file)) {
     delim <- ","
   } else if (grepl("tsv|xls", input_file)) {
@@ -38,7 +42,7 @@ reduceBigSpectronaut <- function(input_file, output_path,
   # CsvParseOptions$delimiter.
   parse_opts   <- arrow::CsvParseOptions$create(delimiter = delim)
   convert_opts <- arrow::CsvConvertOptions$create()
-  read_opts    <- arrow::CsvReadOptions$create(block_size = 256L * 1024L)
+  read_opts    <- arrow::CsvReadOptions$create(block_size = block_size)
 
   ds <- arrow::open_dataset(
     input_file,
diff --git a/R/converters.R b/R/converters.R
index 13b4383..2b406ac 100644
--- a/R/converters.R
+++ b/R/converters.R
@@ -117,6 +117,11 @@ bigFragPipetoMSstatsFormat <-  function(input_file, output_file_name,
 #' @param filter_by_identified if TRUE, will filter by the `EG.Identified` column.
 #' @param filter_by_qvalue if TRUE, will filter by EG.Qvalue and PG.Qvalue columns.
 #' @param qvalue_cutoff cutoff which will be used for q-value filtering.
+#' @param block_size Arrow CSV reader block size in bytes; each input row must
+#'   fit inside one block. Defaults to 16 MiB (`16L * 1024L * 1024L`). If you
+#'   see `Invalid: straddling object straddles two block boundaries` on
+#'   extra-wide Spectronaut exports, pass a larger value
+#'   (e.g. `64L * 1024L * 1024L`).
 #'
 #' @export
 #'
@@ -143,14 +148,16 @@ bigSpectronauttoMSstatsFormat <-  function(input_file, output_file_name,
                                           aggregate_psms =  FALSE,
                                           filter_few_obs =  FALSE,
                                           remove_annotation =  FALSE,
-                                          calculateAnomalyScores=FALSE, 
+                                          calculateAnomalyScores=FALSE,
                                           anomalyModelFeatures=c(),
-                                          connection =  NULL) {
+                                          connection =  NULL,
+                                          block_size = 16L * 1024L * 1024L) {
   reduced_file <- .prefixedPath("reduce_output_", output_file_name)
   reduceBigSpectronaut(input_file, reduced_file,
                        intensity, filter_by_excluded, filter_by_identified,
                        filter_by_qvalue, qvalue_cutoff,
-                       calculateAnomalyScores, anomalyModelFeatures)
+                       calculateAnomalyScores, anomalyModelFeatures,
+                       block_size = block_size)
   msstats_data <- MSstatsPreprocessBig(
     input_file = reduced_file,
     output_file_name = output_file_name, 
diff --git a/man/bigSpectronauttoMSstatsFormat.Rd b/man/bigSpectronauttoMSstatsFormat.Rd
index 01706ef..b3e8f0c 100644
--- a/man/bigSpectronauttoMSstatsFormat.Rd
+++ b/man/bigSpectronauttoMSstatsFormat.Rd
@@ -20,7 +20,8 @@ bigSpectronauttoMSstatsFormat(
   remove_annotation = FALSE,
   calculateAnomalyScores = FALSE,
   anomalyModelFeatures = c(),
-  connection = NULL
+  connection = NULL,
+  block_size = 16L * 1024L * 1024L
 )
 }
 \arguments{
@@ -63,6 +64,12 @@ using dataProcess function. Only applicable to sparklyr backend.}
 
 \item{connection}{Connection to a spark instance created with the
 `spark_connect` function from `sparklyr` package.}
+
+\item{block_size}{Arrow CSV reader block size in bytes; each input row must
+fit inside one block. Defaults to 16 MiB (`16L * 1024L * 1024L`). If you
+see `Invalid: straddling object straddles two block boundaries` on
+extra-wide Spectronaut exports, pass a larger value
+(e.g. `64L * 1024L * 1024L`).}
 }
 \value{
 either arrow object or sparklyr table that can be optionally collected
diff --git a/man/dot-prefixedPath.Rd b/man/dot-prefixedPath.Rd
new file mode 100644
index 0000000..be036ac
--- /dev/null
+++ b/man/dot-prefixedPath.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{.prefixedPath}
+\alias{.prefixedPath}
+\title{Build an intermediate output path by prefixing only the basename.}
+\usage{
+.prefixedPath(prefix, path)
+}
+\arguments{
+\item{prefix}{Character scalar prepended to the basename.}
+
+\item{path}{Output file path supplied by the caller.}
+}
+\value{
+Character scalar.
+}
+\description{
+Naive `paste0(prefix, output_file_name)` corrupts paths that contain a
+directory (`subdir/out.csv` → `topN_subdir/out.csv`,
+`/tmp/out.csv` → `topN_/tmp/out.csv`). Splitting via dirname/basename keeps
+the directory component intact so intermediate files land beside the final
+output.
+}
+\keyword{internal}
diff --git a/tests/testthat/test-converters.R b/tests/testthat/test-converters.R
index 78f6da3..433d584 100644
--- a/tests/testthat/test-converters.R
+++ b/tests/testthat/test-converters.R
@@ -94,6 +94,75 @@ test_that("bigSpectronauttoMSstatsFormat works correctly", {
   unlink(paste0("reduce_output_", output_file), recursive = TRUE, force = TRUE)
 })
 
+test_that("reduceBigSpectronaut rejects invalid block_size values", {
+  input_file <- tempfile(fileext = ".csv")
+  writeLines("a,b\n1,2", input_file)
+  output_file <- tempfile()
+  on.exit({
+    unlink(input_file, force = TRUE)
+    unlink(output_file, recursive = TRUE, force = TRUE)
+  }, add = TRUE)
+
+  expect_error(reduceBigSpectronaut(input_file, output_file, block_size = -1L))
+  expect_error(reduceBigSpectronaut(input_file, output_file, block_size = 0L))
+  expect_error(reduceBigSpectronaut(input_file, output_file, block_size = NA_integer_))
+  expect_error(reduceBigSpectronaut(input_file, output_file, block_size = c(1L, 2L)))
+  expect_error(suppressWarnings(
+    reduceBigSpectronaut(input_file, output_file, block_size = "16MB")
+  ))
+})
+
+test_that("bigSpectronauttoMSstatsFormat plumbs block_size through to reduceBigSpectronaut", {
+  captured <- new.env(parent = emptyenv())
+  captured$block_size <- NULL
+
+  spy_reduce <- function(input_file, output_path, intensity, filter_by_excluded,
+                        filter_by_identified, filter_by_qvalue, qvalue_cutoff,
+                        calculateAnomalyScores, anomalyModelFeatures,
+                        block_size = 16L * 1024L * 1024L) {
+    captured$block_size <- block_size
+    msstats_data <- data.frame(
+      ProteinName = "P1", PeptideSequence = "PEPTIDE", PrecursorCharge = 2,
+      FragmentIon = "frag1", ProductCharge = 1,
+      IsotopeLabelType = "L", Condition = "A", BioReplicate = 1,
+      Run = "run1", Intensity = 100
+    )
+    readr::write_csv(msstats_data, output_path)
+  }
+
+  input_file <- "dummy_spectro_input.csv"
+
+  # Default forwards 16 MiB.
+  stub(bigSpectronauttoMSstatsFormat, "reduceBigSpectronaut", spy_reduce)
+  output_file_default <- tempfile(fileext = ".csv")
+  on.exit({
+    unlink(output_file_default, recursive = TRUE, force = TRUE)
+    unlink(paste0("reduce_output_", basename(output_file_default)),
+           recursive = TRUE, force = TRUE)
+  }, add = TRUE)
+  bigSpectronauttoMSstatsFormat(
+    input_file = input_file,
+    output_file_name = output_file_default,
+    backend = "arrow"
+  )
+  expect_identical(captured$block_size, 16L * 1024L * 1024L)
+
+  # Override forwards the user's value.
+  output_file_override <- tempfile(fileext = ".csv")
+  on.exit({
+    unlink(output_file_override, recursive = TRUE, force = TRUE)
+    unlink(paste0("reduce_output_", basename(output_file_override)),
+           recursive = TRUE, force = TRUE)
+  }, add = TRUE)
+  bigSpectronauttoMSstatsFormat(
+    input_file = input_file,
+    output_file_name = output_file_override,
+    backend = "arrow",
+    block_size = 8L * 1024L * 1024L
+  )
+  expect_identical(captured$block_size, 8L * 1024L * 1024L)
+})
+
 # test_that("bigDIANNtoMSstatsFormat works with real MSstatsConvert tinytest data", {
 #   input_file <- "/Users/rudhikshah/NorthEasternContractWork/MSstatsConvert/inst/tinytest/raw_data/DIANN/diann_input.tsv"
 #   annotation_file <- "/Users/rudhikshah/NorthEasternContractWork/MSstatsConvert/inst/tinytest/raw_data/DIANN/annotation.csv"

From c8c835e2fedda039fdfc62d56f7675aaf25d3ccb Mon Sep 17 00:00:00 2001
From: Rudhik1904 <rudhikshah50@gmail.com>
Date: Fri, 15 May 2026 18:48:17 -0500
Subject: [PATCH 09/11] =?UTF-8?q?R/clean=5Fspectronaut.R:1-2:=20added=20@i?=
 =?UTF-8?q?mportFrom=20data.table=20:=3D=20.SD=20setDT=20setnames=20so=20t?=
 =?UTF-8?q?he=20package=20is=20data.table-aware=20(cedta()).=20R/clean=5Fs?=
 =?UTF-8?q?pectronaut.R:103-187:=20rewrote=20cleanSpectronautChunk=20in=20?=
 =?UTF-8?q?data.table:=20setDT(input)=20at=20entry;=20subsequent=20operati?=
 =?UTF-8?q?ons=20modify=20in=20place=20via=20:=3D.=20Two-step=20rename=20(?=
 =?UTF-8?q?setnames=20for=20standardize,=20then=20setnames=20with=20skip?=
 =?UTF-8?q?=5Fabsent=20=3D=20TRUE=20to=20map=20standardized=E2=86=92MSstat?=
 =?UTF-8?q?s)=20matches=20the=20MSstatsConvert=20family=20pattern.=20Condi?=
 =?UTF-8?q?tional=20NA=20assignment=20uses=20mask=20form=20dt[cond,=20Inte?=
 =?UTF-8?q?nsity=20:=3D=20NA=5Freal=5F].=20Q-value=20filters=20preserve=20?=
 =?UTF-8?q?dplyr::if=5Felse=20NA=20semantics=20via=20explicit=20is.na(EGQv?=
 =?UTF-8?q?alue)=20|=20EGQvalue=20>=3D=20cutoff.=20Dropped=20the=20leftove?=
 =?UTF-8?q?r=20dplyr::collect(head(dplyr::select(...)))=20pattern=20?=
 =?UTF-8?q?=E2=80=94=20was=20a=20no-op=20residue=20from=20a=20prior=20refa?=
 =?UTF-8?q?ctor.=20Function=20shrank=20from=20~88=20lines=20to=20~64.=20DE?=
 =?UTF-8?q?SCRIPTION:20:=20added=20data.table=20to=20Imports.=20NAMESPACE:?=
 =?UTF-8?q?=20regenerated,=20now=20imports=20:=3D,=20.SD,=20setDT,=20setna?=
 =?UTF-8?q?mes=20from=20data.table.=20tests/testthat/test-converters.R:97-?=
 =?UTF-8?q?211:=205=20new=20tests=20=E2=80=94=20schema=20smoke=20test,=20f?=
 =?UTF-8?q?ilter=5Fby=5Fexcluded,=20filter=5Fby=5Fidentified,=20filter=5Fb?=
 =?UTF-8?q?y=5Fqvalue=20(covering=20the=20NA-q-value=20case),=20and=20FFrg?=
 =?UTF-8?q?LossType=20row=20drop.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 DESCRIPTION           |   3 +-
 NAMESPACE             |   4 ++
 R/clean_spectronaut.R | 106 ++++++++++++++++++++----------------------
 3 files changed, 57 insertions(+), 56 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 83fdc00..8a6629b 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -15,8 +15,9 @@ Description: MSstats package provide tools for preprocessing, summarization and
 License: Artistic-2.0
 Encoding: UTF-8
 RoxygenNote: 7.3.3
-Imports: 
+Imports:
     arrow,
+    data.table,
     DBI,
     dplyr,
     MSstats,
diff --git a/NAMESPACE b/NAMESPACE
index 9ec4823..8312349 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -10,5 +10,9 @@ importFrom(MSstats,groupComparison)
 importFrom(MSstatsConvert,MSstatsClean)
 importFrom(MSstatsConvert,MSstatsImport)
 importFrom(MSstatsConvert,MSstatsMakeAnnotation)
+importFrom(data.table,":=")
+importFrom(data.table,.SD)
+importFrom(data.table,setDT)
+importFrom(data.table,setnames)
 importFrom(utils,head)
 importFrom(utils,sessionInfo)
diff --git a/R/clean_spectronaut.R b/R/clean_spectronaut.R
index a2d05fe..ae5a408 100644
--- a/R/clean_spectronaut.R
+++ b/R/clean_spectronaut.R
@@ -1,3 +1,6 @@
+#' @importFrom data.table := .SD setDT setnames
+NULL
+
 #' @keywords internal
 reduceBigSpectronaut <- function(input_file, output_path,
                                  intensity="F.NormalizedPeakArea",
@@ -108,85 +111,78 @@ cleanSpectronautChunk = function(input, output_path,
                                  filter_by_qvalue = TRUE,
                                  qvalue_cutoff = 0.01,
                                  pos = NULL,
-                                 calculateAnomalyScores=FALSE, 
+                                 calculateAnomalyScores=FALSE,
                                  anomalyModelFeatures=c()) {
+  data.table::setDT(input)
+
   all_cols <- c("R.FileName", "R.Condition", "R.Replicate",
                 "PG.ProteinAccessions", "EG.ModifiedSequence", "FG.LabeledSequence",
                 "FG.Charge", "F.FrgIon", "F.Charge",
                 "EG.Identified", "F.ExcludedFromQuantification", "F.FrgLossType",
                 "PG.Qvalue", "EG.Qvalue", intensity)
-  
-  if (calculateAnomalyScores){
-    all_cols <- c(all_cols, anomalyModelFeatures)
-  }
-  
-  cols <- intersect(all_cols, colnames(input))
-  input <- dplyr::select(input, all_of(cols))
-  input <- dplyr::rename_with(input, .fn = MSstatsConvert:::.standardizeColnames)
-  
   new_names <- c("Run", "Condition", "BioReplicate", "ProteinName",
                  "PeptideSequence", "LabeledSequence", "PrecursorCharge", "FragmentIon",
                  "ProductCharge", "Identified", "Excluded",
                  "FFrgLossType", "PGQvalue", "EGQvalue",
                  "Intensity")
-  if (calculateAnomalyScores){
+  if (calculateAnomalyScores) {
+    all_cols  <- c(all_cols, anomalyModelFeatures)
     new_names <- c(new_names, MSstatsConvert:::.standardizeColnames(anomalyModelFeatures))
   }
-  
-  # non_standardized =
-  old_names <- MSstatsConvert:::.standardizeColnames(all_cols)
-  names(old_names) <- new_names
-  old_names <- old_names[old_names %in% colnames(input)]
-  
-  input <- dplyr::rename(input, !!old_names)
-  input <- dplyr::mutate(input, Intensity = as.numeric(Intensity))
-  
-  if (is.character(dplyr::pull(dplyr::collect(head(dplyr::select(input, Excluded))), Excluded))) {
-    input <- dplyr::mutate(input, Excluded = Excluded == "True")
+
+  present_orig <- intersect(all_cols, colnames(input))
+  input <- input[, present_orig, with = FALSE]
+
+  # Two-step rename matching the MSstatsConvert family pattern: standardize
+  # all column names, then map standardized -> MSstats final names.
+  data.table::setnames(input, MSstatsConvert:::.standardizeColnames(colnames(input)))
+  std_to_msstats <- stats::setNames(new_names,
+                                    MSstatsConvert:::.standardizeColnames(all_cols))
+  data.table::setnames(input,
+                       old = names(std_to_msstats),
+                       new = unname(std_to_msstats),
+                       skip_absent = TRUE)
+
+  input[, Intensity := as.numeric(Intensity)]
+
+  if (is.character(input[["Excluded"]])) {
+    input[, Excluded := Excluded == "True"]
   }
-  if (is.element("Identified", colnames(input))) {
-    if (is.character(dplyr::pull(dplyr::collect(head(dplyr::select(input, Identified))), Identified))) {
-      input <- dplyr::mutate(input, Identified = Identified == "True")
-    }
+  if ("Identified" %in% colnames(input) && is.character(input[["Identified"]])) {
+    input[, Identified := Identified == "True"]
   }
-  
+
   if (filter_by_excluded) {
-    input <- dplyr::mutate(
-      input, Intensity = dplyr::if_else(Excluded, NA_real_, Intensity))
-    
+    input[Excluded == TRUE, Intensity := NA_real_]
   }
-  
   if (filter_by_identified) {
-    input <- dplyr::mutate(
-      input, Intensity = dplyr::if_else(Identified, Intensity, NA_real_))
+    input[Identified == FALSE, Intensity := NA_real_]
   }
-  
   if (filter_by_qvalue) {
-    input <- dplyr::mutate(
-      input,
-      Intensity = dplyr::if_else(EGQvalue < qvalue_cutoff, Intensity, NA_real_))
-    input <- dplyr::mutate(
-      input, 
-      Intensity = dplyr::if_else(PGQvalue < qvalue_cutoff, Intensity, NA_real_))
+    # Preserve dplyr::if_else semantics: rows with NA q-values become NA.
+    input[is.na(EGQvalue) | EGQvalue >= qvalue_cutoff, Intensity := NA_real_]
+    input[is.na(PGQvalue) | PGQvalue >= qvalue_cutoff, Intensity := NA_real_]
   }
-  
-  input <- dplyr::filter(input, FFrgLossType == "noloss")
-  if (is.element("LabeledSequence", colnames(input))) {
-    input <- dplyr::mutate(input, IsLabeled = grepl("Lys8", LabeledSequence) | grepl("Arg10", LabeledSequence))
-    input <- dplyr::mutate(input, IsotopeLabelType := dplyr::if_else(IsLabeled, "H", "L"))
+
+  input <- input[FFrgLossType == "noloss"]
+
+  if ("LabeledSequence" %in% colnames(input)) {
+    input[, IsotopeLabelType := ifelse(
+      grepl("Lys8", LabeledSequence) | grepl("Arg10", LabeledSequence),
+      "H", "L")]
   } else {
-    input <- dplyr::mutate(input, IsotopeLabelType = "L")
+    input[, IsotopeLabelType := "L"]
   }
-  
-  select_cols = c("ProteinName", "PeptideSequence", "PrecursorCharge", "FragmentIon",
-                  "ProductCharge", "IsotopeLabelType", "Run", "BioReplicate", "Condition",
-                  "Intensity")
-  if (calculateAnomalyScores){
-    select_cols = c(select_cols, 
-                    MSstatsConvert:::.standardizeColnames(anomalyModelFeatures))
+
+  select_cols <- c("ProteinName", "PeptideSequence", "PrecursorCharge", "FragmentIon",
+                   "ProductCharge", "IsotopeLabelType", "Run", "BioReplicate", "Condition",
+                   "Intensity")
+  if (calculateAnomalyScores) {
+    select_cols <- c(select_cols,
+                     MSstatsConvert:::.standardizeColnames(anomalyModelFeatures))
   }
-  
-  input <- dplyr::select(input, select_cols)
+
+  input <- input[, select_cols, with = FALSE]
   .writeChunkToFile(input, output_path, pos)
   NULL
 }

From fdd7476b7d423f9439c96804059aa6a6322b6f4c Mon Sep 17 00:00:00 2001
From: Rudhik1904 <rudhikshah50@gmail.com>
Date: Fri, 15 May 2026 18:48:41 -0500
Subject: [PATCH 10/11] add more tests

---
 tests/testthat/test-converters.R | 118 +++++++++++++++++++++++++++++++
 1 file changed, 118 insertions(+)

diff --git a/tests/testthat/test-converters.R b/tests/testthat/test-converters.R
index 433d584..78e0c1b 100644
--- a/tests/testthat/test-converters.R
+++ b/tests/testthat/test-converters.R
@@ -94,6 +94,124 @@ test_that("bigSpectronauttoMSstatsFormat works correctly", {
   unlink(paste0("reduce_output_", output_file), recursive = TRUE, force = TRUE)
 })
 
+make_spectronaut_input <- function(n = 1L, ...) {
+  base <- data.frame(
+    R.FileName = rep("run1", n),
+    R.Condition = "A",
+    R.Replicate = 1L,
+    PG.ProteinAccessions = "P1",
+    EG.ModifiedSequence = paste0("PEP", seq_len(n)),
+    FG.LabeledSequence = paste0("PEP", seq_len(n)),
+    FG.Charge = 2L,
+    F.FrgIon = "y1",
+    F.Charge = 1L,
+    EG.Identified = "True",
+    F.ExcludedFromQuantification = "False",
+    F.FrgLossType = "noloss",
+    PG.Qvalue = 0.001,
+    EG.Qvalue = 0.001,
+    F.NormalizedPeakArea = seq_len(n) * 100,
+    stringsAsFactors = FALSE
+  )
+  overrides <- list(...)
+  for (col in names(overrides)) base[[col]] <- overrides[[col]]
+  base
+}
+
+test_that("cleanSpectronautChunk produces the expected MSstats schema", {
+  input <- make_spectronaut_input(n = 1L)
+  output_file <- tempfile(fileext = ".csv")
+  on.exit(unlink(output_file, force = TRUE), add = TRUE)
+
+  MSstatsBig:::cleanSpectronautChunk(input, output_file, pos = 1L)
+  result <- readr::read_csv(output_file, show_col_types = FALSE)
+
+  expected_cols <- c("ProteinName", "PeptideSequence", "PrecursorCharge", "FragmentIon",
+                     "ProductCharge", "IsotopeLabelType", "Run", "BioReplicate", "Condition",
+                     "Intensity")
+  expect_setequal(colnames(result), expected_cols)
+  expect_equal(nrow(result), 1L)
+  expect_equal(result$Intensity, 100)
+  expect_equal(result$IsotopeLabelType, "L")
+})
+
+test_that("cleanSpectronautChunk filter_by_excluded sets Intensity to NA on excluded rows", {
+  input <- make_spectronaut_input(
+    n = 2L,
+    F.ExcludedFromQuantification = c("True", "False"),
+    F.NormalizedPeakArea = c(100, 200)
+  )
+  output_file <- tempfile(fileext = ".csv")
+  on.exit(unlink(output_file, force = TRUE), add = TRUE)
+
+  MSstatsBig:::cleanSpectronautChunk(input, output_file, pos = 1L,
+                                     filter_by_excluded = TRUE,
+                                     filter_by_qvalue = FALSE)
+  result <- readr::read_csv(output_file, show_col_types = FALSE)
+  result <- result[order(result$PeptideSequence), ]
+
+  expect_true(is.na(result$Intensity[1]))
+  expect_equal(result$Intensity[2], 200)
+})
+
+test_that("cleanSpectronautChunk filter_by_identified sets Intensity to NA on unidentified rows", {
+  input <- make_spectronaut_input(
+    n = 2L,
+    EG.Identified = c("True", "False"),
+    F.NormalizedPeakArea = c(100, 200)
+  )
+  output_file <- tempfile(fileext = ".csv")
+  on.exit(unlink(output_file, force = TRUE), add = TRUE)
+
+  MSstatsBig:::cleanSpectronautChunk(input, output_file, pos = 1L,
+                                     filter_by_identified = TRUE,
+                                     filter_by_qvalue = FALSE)
+  result <- readr::read_csv(output_file, show_col_types = FALSE)
+  result <- result[order(result$PeptideSequence), ]
+
+  expect_equal(result$Intensity[1], 100)
+  expect_true(is.na(result$Intensity[2]))
+})
+
+test_that("cleanSpectronautChunk filter_by_qvalue NA-aware semantics match dplyr::if_else", {
+  input <- make_spectronaut_input(
+    n = 3L,
+    EG.Qvalue = c(0.001, 0.5, NA_real_),
+    F.NormalizedPeakArea = c(100, 200, 300)
+  )
+  output_file <- tempfile(fileext = ".csv")
+  on.exit(unlink(output_file, force = TRUE), add = TRUE)
+
+  MSstatsBig:::cleanSpectronautChunk(input, output_file, pos = 1L,
+                                     filter_by_qvalue = TRUE,
+                                     qvalue_cutoff = 0.01)
+  result <- readr::read_csv(output_file, show_col_types = FALSE)
+  result <- result[order(result$PeptideSequence), ]
+
+  # PEP1: EGQvalue below cutoff -> kept
+  expect_equal(result$Intensity[1], 100)
+  # PEP2: EGQvalue above cutoff -> NA
+  expect_true(is.na(result$Intensity[2]))
+  # PEP3: EGQvalue NA -> NA (preserves dplyr::if_else semantics)
+  expect_true(is.na(result$Intensity[3]))
+})
+
+test_that("cleanSpectronautChunk drops rows where FFrgLossType != noloss", {
+  input <- make_spectronaut_input(
+    n = 3L,
+    F.FrgLossType = c("noloss", "H2O", "noloss")
+  )
+  output_file <- tempfile(fileext = ".csv")
+  on.exit(unlink(output_file, force = TRUE), add = TRUE)
+
+  MSstatsBig:::cleanSpectronautChunk(input, output_file, pos = 1L,
+                                     filter_by_qvalue = FALSE)
+  result <- readr::read_csv(output_file, show_col_types = FALSE)
+
+  expect_equal(nrow(result), 2L)
+  expect_setequal(result$PeptideSequence, c("PEP1", "PEP3"))
+})
+
 test_that("reduceBigSpectronaut rejects invalid block_size values", {
   input_file <- tempfile(fileext = ".csv")
   writeLines("a,b\n1,2", input_file)

From 8188f4e1c90221c8338c7a5a4ef8039422d1348d Mon Sep 17 00:00:00 2001
From: Rudhik1904 <rudhikshah50@gmail.com>
Date: Fri, 22 May 2026 16:31:46 -0500
Subject: [PATCH 11/11] temp Commit so I can get the list of col

---
 R/clean_spectronaut.R | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/R/clean_spectronaut.R b/R/clean_spectronaut.R
index ae5a408..198f4d1 100644
--- a/R/clean_spectronaut.R
+++ b/R/clean_spectronaut.R
@@ -131,6 +131,16 @@ cleanSpectronautChunk = function(input, output_path,
   }
 
   present_orig <- intersect(all_cols, colnames(input))
+  if (length(present_orig) == 0L) {
+    stop(sprintf(
+      paste0("cleanSpectronautChunk: none of the expected Spectronaut ",
+             "columns were found in the input batch. ",
+             "Expected any of: %s. Found: %s. ",
+             "Check that the file is comma-delimited and that the ",
+             "Spectronaut export uses the standard column names."),
+      paste(all_cols, collapse = ", "),
+      paste(colnames(input), collapse = ", ")))
+  }
   input <- input[, present_orig, with = FALSE]
 
   # Two-step rename matching the MSstatsConvert family pattern: standardize