Vitek-Lab · Rudhik1904 · May 7, 2026 · May 7, 2026 · May 7, 2026 · May 7, 2026
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -15,8 +15,9 @@ Description: MSstats package provide tools for preprocessing, summarization and
 License: Artistic-2.0
 Encoding: UTF-8
 RoxygenNote: 7.3.3
-Imports: 
+Imports:
     arrow,
+    data.table,
     DBI,
     dplyr,
     MSstats,

diff --git a/NAMESPACE b/NAMESPACE
@@ -10,5 +10,9 @@ importFrom(MSstats,groupComparison)
 importFrom(MSstatsConvert,MSstatsClean)
 importFrom(MSstatsConvert,MSstatsImport)
 importFrom(MSstatsConvert,MSstatsMakeAnnotation)
+importFrom(data.table,":=")
+importFrom(data.table,.SD)
+importFrom(data.table,setDT)
+importFrom(data.table,setnames)
 importFrom(utils,head)
 importFrom(utils,sessionInfo)
diff --git a/R/clean_spectronaut.R b/R/clean_spectronaut.R
@@ -1,33 +1,106 @@
+#' @importFrom data.table := .SD setDT setnames
+NULL
+
 #' @keywords internal
 reduceBigSpectronaut <- function(input_file, output_path,
                                  intensity="F.NormalizedPeakArea",
                                  filter_by_excluded = FALSE,
                                  filter_by_identified = FALSE,
                                  filter_by_qvalue = TRUE,
                                  qvalue_cutoff = 0.01,
-                                 calculateAnomalyScores=FALSE, 
-                                 anomalyModelFeatures=c()) {
+                                 calculateAnomalyScores=FALSE,
+                                 anomalyModelFeatures=c(),
+                                 block_size = 16L * 1024L * 1024L) {
+  block_size <- as.integer(block_size)
+  stopifnot(length(block_size) == 1L, !is.na(block_size), block_size > 0L)
+
   if (grepl("csv", input_file)) {
-    delim = ","
+    delim <- ","
   } else if (grepl("tsv|xls", input_file)) {
-    delim = "\t"
+    delim <- "\t"
   } else {
     delim <- ";"
   }
-  spec_chunk <- function(x, pos) cleanSpectronautChunk(x,
-                                                       output_path,
-                                                       intensity,
-                                                       filter_by_excluded,
-                                                       filter_by_identified,
-                                                       filter_by_qvalue,
-                                                       qvalue_cutoff,
-                                                       pos,
-                                                       calculateAnomalyScores, 
-                                                       anomalyModelFeatures)
-  readr::read_delim_chunked(input_file,
-                            readr::DataFrameCallback$new(spec_chunk),
-                            delim = delim,
-                            chunk_size = 1e6)
+
+  # Columns cleanSpectronautChunk actually consumes; Arrow's
+  # convert_options$include_columns drops everything else at parse time so
+  # we never materialize the ~35 unused columns Spectronaut exports.
+  needed_cols <- c("R.FileName", "R.Condition", "R.Replicate",
+                   "PG.ProteinAccessions", "EG.ModifiedSequence",
+                   "FG.LabeledSequence", "FG.Charge",
+                   "F.FrgIon", "F.Charge",
+                   "EG.Identified", "F.ExcludedFromQuantification",
+                   "F.FrgLossType", "PG.Qvalue", "EG.Qvalue",
+                   intensity)
+  if (calculateAnomalyScores) {
+    needed_cols <- c(needed_cols, anomalyModelFeatures)
+  }
+
+  # Arrow's CSV reader replaces readr::read_delim_chunked.  Arrow releases
+  # per-batch state as soon as a batch is consumed, so peak memory is
+  # bounded by one record batch instead of growing with the dataset (readr
+  # keeps a string-interning pool that accumulates across chunks).  The
+  # `delim` switch above already covers comma / tab / semicolon variants;
+  # Arrow's CSV reader handles all three the same way through
+  # CsvParseOptions$delimiter.
+  parse_opts   <- arrow::CsvParseOptions$create(delimiter = delim)
+  convert_opts <- arrow::CsvConvertOptions$create()
+  read_opts    <- arrow::CsvReadOptions$create(block_size = block_size)
+
+  ds <- arrow::open_dataset(
+    input_file,
+    format          = "csv",
+    parse_options   = parse_opts,
+    convert_options = convert_opts,
+    read_options    = read_opts
+  )
+
+  reader <- arrow::Scanner$create(ds)$ToRecordBatchReader()
+
+  t_start   <- Sys.time()
+  pos       <- 1L
+  batch_idx <- 0L
+  repeat {
+    batch <- reader$read_next_batch()
+    if (is.null(batch)) break
+    chunk_df <- as.data.frame(batch)
+    cleanSpectronautChunk(chunk_df,
+                          output_path,
+                          intensity,
+                          filter_by_excluded,
+                          filter_by_identified,
+                          filter_by_qvalue,
+                          qvalue_cutoff,
+                          pos,
+                          calculateAnomalyScores,
+                          anomalyModelFeatures)
+    pos       <- pos + nrow(chunk_df)
+    batch_idx <- batch_idx + 1L
+
+    if (batch_idx %% 1000L == 0L) {
+      elapsed <- as.numeric(Sys.time() - t_start, units = "secs")
+      rate    <- (pos - 1L) / elapsed
+      message(sprintf(
+        "[reduceBigSpectronaut] %d batches | %s rows | %.1fk rows/s | %.0fs elapsed",
+        batch_idx,
+        format(pos - 1L, big.mark = ","),
+        rate / 1000,
+        elapsed))
+    }
+
+    rm(batch, chunk_df)
+  }
+
+  if (batch_idx %% 1000L != 0L) {
+    elapsed <- as.numeric(Sys.time() - t_start, units = "secs")
+    rate    <- (pos - 1L) / elapsed
+    message(sprintf(
+      "[reduceBigSpectronaut] done: %d batches | %s rows | %.1fk rows/s | %.0fs elapsed",
+      batch_idx,
+      format(pos - 1L, big.mark = ","),
+      rate / 1000,
+      elapsed))
+  }
 }
 
 #' @keywords internal
@@ -38,85 +111,88 @@ cleanSpectronautChunk = function(input, output_path,
                                  filter_by_qvalue = TRUE,
                                  qvalue_cutoff = 0.01,
                                  pos = NULL,
-                                 calculateAnomalyScores=FALSE, 
+                                 calculateAnomalyScores=FALSE,
                                  anomalyModelFeatures=c()) {
+  data.table::setDT(input)
+
   all_cols <- c("R.FileName", "R.Condition", "R.Replicate",
                 "PG.ProteinAccessions", "EG.ModifiedSequence", "FG.LabeledSequence",
                 "FG.Charge", "F.FrgIon", "F.Charge",
                 "EG.Identified", "F.ExcludedFromQuantification", "F.FrgLossType",
                 "PG.Qvalue", "EG.Qvalue", intensity)
-
-  if (calculateAnomalyScores){
-    all_cols <- c(all_cols, anomalyModelFeatures)
-  }
-
-  cols <- intersect(all_cols, colnames(input))
-  input <- dplyr::select(input, all_of(cols))
-  input <- dplyr::rename_with(input, .fn = MSstatsConvert:::.standardizeColnames)
-
   new_names <- c("Run", "Condition", "BioReplicate", "ProteinName",
                  "PeptideSequence", "LabeledSequence", "PrecursorCharge", "FragmentIon",
                  "ProductCharge", "Identified", "Excluded",
                  "FFrgLossType", "PGQvalue", "EGQvalue",
                  "Intensity")
-  if (calculateAnomalyScores){
+  if (calculateAnomalyScores) {
+    all_cols  <- c(all_cols, anomalyModelFeatures)
     new_names <- c(new_names, MSstatsConvert:::.standardizeColnames(anomalyModelFeatures))
   }
-  
-  # non_standardized =
-  old_names <- MSstatsConvert:::.standardizeColnames(all_cols)
-  names(old_names) <- new_names
-  old_names <- old_names[old_names %in% colnames(input)]
-
-  input <- dplyr::rename(input, !!old_names)
-  input <- dplyr::mutate(input, Intensity = as.numeric(Intensity))
-
-  if (is.character(dplyr::pull(dplyr::collect(head(dplyr::select(input, Excluded))), Excluded))) {
-    input <- dplyr::mutate(input, Excluded = Excluded == "True")
+
+  present_orig <- intersect(all_cols, colnames(input))
+  if (length(present_orig) == 0L) {
+    stop(sprintf(
+      paste0("cleanSpectronautChunk: none of the expected Spectronaut ",
+             "columns were found in the input batch. ",
+             "Expected any of: %s. Found: %s. ",
+             "Check that the file is comma-delimited and that the ",
+             "Spectronaut export uses the standard column names."),
+      paste(all_cols, collapse = ", "),
+      paste(colnames(input), collapse = ", ")))
   }
-  if (is.element("Identified", colnames(input))) {
-    if (is.character(dplyr::pull(dplyr::collect(head(dplyr::select(input, Identified))), Identified))) {
-      input <- dplyr::mutate(input, Identified = Identified == "True")
-    }
+  input <- input[, present_orig, with = FALSE]
+
+  # Two-step rename matching the MSstatsConvert family pattern: standardize
+  # all column names, then map standardized -> MSstats final names.
+  data.table::setnames(input, MSstatsConvert:::.standardizeColnames(colnames(input)))
+  std_to_msstats <- stats::setNames(new_names,
+                                    MSstatsConvert:::.standardizeColnames(all_cols))
+  data.table::setnames(input,
+                       old = names(std_to_msstats),
+                       new = unname(std_to_msstats),
+                       skip_absent = TRUE)
+
+  input[, Intensity := as.numeric(Intensity)]
+
+  if (is.character(input[["Excluded"]])) {
+    input[, Excluded := Excluded == "True"]
+  }
+  if ("Identified" %in% colnames(input) && is.character(input[["Identified"]])) {
+    input[, Identified := Identified == "True"]
   }
-  
+
   if (filter_by_excluded) {
-    input <- dplyr::mutate(
-      input, Intensity = dplyr::if_else(Excluded, NA_real_, Intensity))
-
+    input[Excluded == TRUE, Intensity := NA_real_]
   }
-
   if (filter_by_identified) {
-    input <- dplyr::mutate(
-      input, Intensity = dplyr::if_else(Identified, Intensity, NA_real_))
+    input[Identified == FALSE, Intensity := NA_real_]
   }
-
   if (filter_by_qvalue) {
-    input <- dplyr::mutate(
-      input,
-      Intensity = dplyr::if_else(EGQvalue < qvalue_cutoff, Intensity, NA_real_))
-    input <- dplyr::mutate(
-      input, 
-      Intensity = dplyr::if_else(PGQvalue < qvalue_cutoff, Intensity, NA_real_))
+    # Preserve dplyr::if_else semantics: rows with NA q-values become NA.
+    input[is.na(EGQvalue) | EGQvalue >= qvalue_cutoff, Intensity := NA_real_]
+    input[is.na(PGQvalue) | PGQvalue >= qvalue_cutoff, Intensity := NA_real_]
   }
-
-  input <- dplyr::filter(input, FFrgLossType == "noloss")
-  if (is.element("LabeledSequence", colnames(input))) {
-    input <- dplyr::mutate(input, IsLabeled = grepl("Lys8", LabeledSequence) | grepl("Arg10", LabeledSequence))
-    input <- dplyr::mutate(input, IsotopeLabelType := dplyr::if_else(IsLabeled, "H", "L"))
+
+  input <- input[FFrgLossType == "noloss"]
-  if (filter_by_excluded) {
-    input <- dplyr::mutate(
-      input, Intensity = dplyr::if_else(Excluded, NA_real_, Intensity))
-    
-    input[Excluded == TRUE, Intensity := NA_real_]
-  }
-  
-  if (filter_by_identified) {
-    input <- dplyr::mutate(
-      input, Intensity = dplyr::if_else(Identified, Intensity, NA_real_))
-    input[Identified == FALSE, Intensity := NA_real_]
-  }
-  
-  if (filter_by_qvalue) {
-    input <- dplyr::mutate(
-      input,
-      Intensity = dplyr::if_else(EGQvalue < qvalue_cutoff, Intensity, NA_real_))
-    input <- dplyr::mutate(
-      input, 
-      Intensity = dplyr::if_else(PGQvalue < qvalue_cutoff, Intensity, NA_real_))
-    # Preserve dplyr::if_else semantics: rows with NA q-values become NA.
-    input[is.na(EGQvalue) | EGQvalue >= qvalue_cutoff, Intensity := NA_real_]
-    input[is.na(PGQvalue) | PGQvalue >= qvalue_cutoff, Intensity := NA_real_]
-  }
-  
-  input <- dplyr::filter(input, FFrgLossType == "noloss")
-  if (is.element("LabeledSequence", colnames(input))) {
-    input <- dplyr::mutate(input, IsLabeled = grepl("Lys8", LabeledSequence) | grepl("Arg10", LabeledSequence))
-    input <- dplyr::mutate(input, IsotopeLabelType := dplyr::if_else(IsLabeled, "H", "L"))
-
-  input <- input[FFrgLossType == "noloss"]
+  if (filter_by_excluded && "Excluded" %in% colnames(input)) {
+    input[Excluded == TRUE, Intensity := NA_real_]
+  }
+  if (filter_by_identified && "Identified" %in% colnames(input)) {
+    input[Identified == FALSE, Intensity := NA_real_]
+  }
+  if (filter_by_qvalue) {
+    # Preserve dplyr::if_else semantics: rows with NA q-values become NA.
+    if ("EGQvalue" %in% colnames(input)) {
+      input[is.na(EGQvalue) | EGQvalue >= qvalue_cutoff, Intensity := NA_real_]
+    }
+    if ("PGQvalue" %in% colnames(input)) {
+      input[is.na(PGQvalue) | PGQvalue >= qvalue_cutoff, Intensity := NA_real_]
+    }
+  }
+
+  if ("FFrgLossType" %in% colnames(input)) {
+    input <- input[FFrgLossType == "noloss"]
+  }
-  if (filter_by_excluded) {
-    input <- dplyr::mutate(
-      input, Intensity = dplyr::if_else(Excluded, NA_real_, Intensity))
-    
-    input[Excluded == TRUE, Intensity := NA_real_]
-  }
-  
-  if (filter_by_identified) {
-    input <- dplyr::mutate(
-      input, Intensity = dplyr::if_else(Identified, Intensity, NA_real_))
-    input[Identified == FALSE, Intensity := NA_real_]
-  }
-  
-  if (filter_by_qvalue) {
-    input <- dplyr::mutate(
-      input,
-      Intensity = dplyr::if_else(EGQvalue < qvalue_cutoff, Intensity, NA_real_))
-    input <- dplyr::mutate(
-      input, 
-      Intensity = dplyr::if_else(PGQvalue < qvalue_cutoff, Intensity, NA_real_))
-    # Preserve dplyr::if_else semantics: rows with NA q-values become NA.
-    input[is.na(EGQvalue) | EGQvalue >= qvalue_cutoff, Intensity := NA_real_]
-    input[is.na(PGQvalue) | PGQvalue >= qvalue_cutoff, Intensity := NA_real_]
-  }
-  
-  input <- dplyr::filter(input, FFrgLossType == "noloss")
-  if (is.element("LabeledSequence", colnames(input))) {
-    input <- dplyr::mutate(input, IsLabeled = grepl("Lys8", LabeledSequence) | grepl("Arg10", LabeledSequence))
-    input <- dplyr::mutate(input, IsotopeLabelType := dplyr::if_else(IsLabeled, "H", "L"))
-
-  input <- input[FFrgLossType == "noloss"]
+  if (filter_by_excluded && "Excluded" %in% colnames(input)) {
+    input[Excluded == TRUE, Intensity := NA_real_]
+  }
+  if (filter_by_identified && "Identified" %in% colnames(input)) {
+    input[Identified == FALSE, Intensity := NA_real_]
+  }
+  if (filter_by_qvalue) {
+    # Preserve dplyr::if_else semantics: rows with NA q-values become NA.
+    if ("EGQvalue" %in% colnames(input)) {
+      input[is.na(EGQvalue) | EGQvalue >= qvalue_cutoff, Intensity := NA_real_]
+    }
+    if ("PGQvalue" %in% colnames(input)) {
+      input[is.na(PGQvalue) | PGQvalue >= qvalue_cutoff, Intensity := NA_real_]
+    }
+  }
+
+  if ("FFrgLossType" %in% colnames(input)) {
+    input <- input[FFrgLossType == "noloss"]
+  }
+
+  if ("LabeledSequence" %in% colnames(input)) {
+    input[, IsotopeLabelType := ifelse(
+      grepl("Lys8", LabeledSequence) | grepl("Arg10", LabeledSequence),
+      "H", "L")]
   } else {
-    input <- dplyr::mutate(input, IsotopeLabelType = "L")
+    input[, IsotopeLabelType := "L"]
   }
-  
-  select_cols = c("ProteinName", "PeptideSequence", "PrecursorCharge", "FragmentIon",
-                  "ProductCharge", "IsotopeLabelType", "Run", "BioReplicate", "Condition",
-                  "Intensity")
-  if (calculateAnomalyScores){
-    select_cols = c(select_cols, 
-                    MSstatsConvert:::.standardizeColnames(anomalyModelFeatures))
+
+  select_cols <- c("ProteinName", "PeptideSequence", "PrecursorCharge", "FragmentIon",
+                   "ProductCharge", "IsotopeLabelType", "Run", "BioReplicate", "Condition",
+                   "Intensity")
+  if (calculateAnomalyScores) {
+    select_cols <- c(select_cols,
+                     MSstatsConvert:::.standardizeColnames(anomalyModelFeatures))
   }
-  
-  input <- dplyr::select(input, select_cols)
+
+  input <- input[, select_cols, with = FALSE]
   .writeChunkToFile(input, output_path, pos)
   NULL
 }
diff --git a/R/converters.R b/R/converters.R
@@ -117,6 +117,11 @@ bigFragPipetoMSstatsFormat <-  function(input_file, output_file_name,
 #' @param filter_by_identified if TRUE, will filter by the `EG.Identified` column.
 #' @param filter_by_qvalue if TRUE, will filter by EG.Qvalue and PG.Qvalue columns.
 #' @param qvalue_cutoff cutoff which will be used for q-value filtering.
+#' @param block_size Arrow CSV reader block size in bytes; each input row must
+#'   fit inside one block. Defaults to 16 MiB (`16L * 1024L * 1024L`). If you
+#'   see `Invalid: straddling object straddles two block boundaries` on
+#'   extra-wide Spectronaut exports, pass a larger value
+#'   (e.g. `64L * 1024L * 1024L`).
 #'
 #' @export
 #'
@@ -143,14 +148,16 @@ bigSpectronauttoMSstatsFormat <-  function(input_file, output_file_name,
                                           aggregate_psms =  FALSE,
                                           filter_few_obs =  FALSE,
                                           remove_annotation =  FALSE,
-                                          calculateAnomalyScores=FALSE, 
+                                          calculateAnomalyScores=FALSE,
                                           anomalyModelFeatures=c(),
-                                          connection =  NULL) {
+                                          connection =  NULL,
+                                          block_size = 16L * 1024L * 1024L) {
   reduced_file <- .prefixedPath("reduce_output_", output_file_name)
   reduceBigSpectronaut(input_file, reduced_file,
                        intensity, filter_by_excluded, filter_by_identified,
                        filter_by_qvalue, qvalue_cutoff,
-                       calculateAnomalyScores, anomalyModelFeatures)
+                       calculateAnomalyScores, anomalyModelFeatures,
+                       block_size = block_size)
   msstats_data <- MSstatsPreprocessBig(
     input_file = reduced_file,
     output_file_name = output_file_name, 

diff --git a/man/bigSpectronauttoMSstatsFormat.Rd b/man/bigSpectronauttoMSstatsFormat.Rd
diff --git a/man/dot-prefixedPath.Rd b/man/dot-prefixedPath.Rd