Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@ Description: MSstats package provide tools for preprocessing, summarization and
License: Artistic-2.0
Encoding: UTF-8
RoxygenNote: 7.3.3
Imports:
Imports:
arrow,
data.table,
DBI,
dplyr,
MSstats,
Expand Down
4 changes: 4 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,9 @@ importFrom(MSstats,groupComparison)
importFrom(MSstatsConvert,MSstatsClean)
importFrom(MSstatsConvert,MSstatsImport)
importFrom(MSstatsConvert,MSstatsMakeAnnotation)
importFrom(data.table,":=")
importFrom(data.table,.SD)
importFrom(data.table,setDT)
importFrom(data.table,setnames)
importFrom(utils,head)
importFrom(utils,sessionInfo)
222 changes: 149 additions & 73 deletions R/clean_spectronaut.R
Original file line number Diff line number Diff line change
@@ -1,33 +1,106 @@
#' @importFrom data.table := .SD setDT setnames
NULL

#' @keywords internal
reduceBigSpectronaut <- function(input_file, output_path,
intensity="F.NormalizedPeakArea",
filter_by_excluded = FALSE,
filter_by_identified = FALSE,
filter_by_qvalue = TRUE,
qvalue_cutoff = 0.01,
calculateAnomalyScores=FALSE,
anomalyModelFeatures=c()) {
calculateAnomalyScores=FALSE,
anomalyModelFeatures=c(),
block_size = 16L * 1024L * 1024L) {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. For people who may want to increase their block size to increase the speed of processing, add a recommendation on how to estimate the adequate block size to maximize speed while reducing the risk of the system crashing
  2. Make it more clear in the MSstatsBig / MSstatsConvert documentation on which columns we actually need from users for Spectronaut

block_size <- as.integer(block_size)
stopifnot(length(block_size) == 1L, !is.na(block_size), block_size > 0L)

if (grepl("csv", input_file)) {
delim = ","
delim <- ","
} else if (grepl("tsv|xls", input_file)) {
delim = "\t"
delim <- "\t"
} else {
delim <- ";"
}
spec_chunk <- function(x, pos) cleanSpectronautChunk(x,
output_path,
intensity,
filter_by_excluded,
filter_by_identified,
filter_by_qvalue,
qvalue_cutoff,
pos,
calculateAnomalyScores,
anomalyModelFeatures)
readr::read_delim_chunked(input_file,
readr::DataFrameCallback$new(spec_chunk),
delim = delim,
chunk_size = 1e6)

# Columns cleanSpectronautChunk actually consumes; Arrow's
# convert_options$include_columns drops everything else at parse time so
# we never materialize the ~35 unused columns Spectronaut exports.
needed_cols <- c("R.FileName", "R.Condition", "R.Replicate",
"PG.ProteinAccessions", "EG.ModifiedSequence",
"FG.LabeledSequence", "FG.Charge",
"F.FrgIon", "F.Charge",
"EG.Identified", "F.ExcludedFromQuantification",
"F.FrgLossType", "PG.Qvalue", "EG.Qvalue",
intensity)
if (calculateAnomalyScores) {
needed_cols <- c(needed_cols, anomalyModelFeatures)
}

# Arrow's CSV reader replaces readr::read_delim_chunked. Arrow releases
# per-batch state as soon as a batch is consumed, so peak memory is
# bounded by one record batch instead of growing with the dataset (readr
# keeps a string-interning pool that accumulates across chunks). The
# `delim` switch above already covers comma / tab / semicolon variants;
# Arrow's CSV reader handles all three the same way through
# CsvParseOptions$delimiter.
parse_opts <- arrow::CsvParseOptions$create(delimiter = delim)
convert_opts <- arrow::CsvConvertOptions$create()
read_opts <- arrow::CsvReadOptions$create(block_size = block_size)

ds <- arrow::open_dataset(
input_file,
format = "csv",
parse_options = parse_opts,
convert_options = convert_opts,
read_options = read_opts
)

reader <- arrow::Scanner$create(ds)$ToRecordBatchReader()
Comment on lines +25 to +58

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Verify whether column projection is actually wired in the Arrow path.
rg -n -C2 'needed_cols|CsvConvertOptions\\$create|include_columns|Scanner\\$create|open_dataset\\(' R/clean_spectronaut.R

Repository: Vitek-Lab/MSstatsBig

Length of output: 289


🏁 Script executed:

#!/bin/bash
# First, check for needed_cols usage in the file
echo "=== Searching for needed_cols usage ==="
rg -n 'needed_cols' R/clean_spectronaut.R

echo ""
echo "=== Searching for CsvConvertOptions and include_columns ==="
rg -n 'CsvConvertOptions\|include_columns' R/clean_spectronaut.R

echo ""
echo "=== Checking context around CsvConvertOptions create call ==="
rg -n -B3 -A3 'CsvConvertOptions\$create' R/clean_spectronaut.R

Repository: Vitek-Lab/MSstatsBig

Length of output: 688


🏁 Script executed:

#!/bin/bash
# Check what happens after reader creation
echo "=== Code from line 50 onwards ==="
sed -n '50,80p' R/clean_spectronaut.R

echo ""
echo "=== Complete function to see full context ==="
wc -l R/clean_spectronaut.R

Repository: Vitek-Lab/MSstatsBig

Length of output: 1118


🏁 Script executed:

#!/bin/bash
# Search for include_columns usage in the entire codebase
echo "=== Searching for include_columns in codebase ==="
rg -n 'include_columns' .

echo ""
echo "=== Searching for CsvConvertOptions usage patterns ==="
rg -n 'CsvConvertOptions' . -A2

echo ""
echo "=== Check the comment context more carefully ==="
sed -n '25,50p' R/clean_spectronaut.R

Repository: Vitek-Lab/MSstatsBig

Length of output: 1892


Wire needed_cols into Arrow's column projection.

needed_cols is computed but never applied to CsvConvertOptions, so the reader still parses all CSV columns. This defeats the intended memory/performance reduction on wide Spectronaut exports.

💡 Proposed fix
-  convert_opts <- arrow::CsvConvertOptions$create()
+  convert_opts <- arrow::CsvConvertOptions$create(
+    include_columns = needed_cols
+  )
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@R/clean_spectronaut.R` around lines 25 - 58, The computed needed_cols is
never passed to Arrow, so the CSV reader still parses all columns; update the
CsvConvertOptions usage to include the projection by calling
arrow::CsvConvertOptions$create(include_columns = needed_cols) (or set the
include_columns field on convert_opts after creation) so that convert_opts
includes needed_cols before calling arrow::open_dataset/Scanner$create;
reference the symbols needed_cols and convert_opts (and the call
arrow::CsvConvertOptions$create) and ensure this happens prior to creating
ds/reader.


t_start <- Sys.time()
pos <- 1L
batch_idx <- 0L
repeat {
batch <- reader$read_next_batch()
if (is.null(batch)) break
chunk_df <- as.data.frame(batch)
cleanSpectronautChunk(chunk_df,
output_path,
intensity,
filter_by_excluded,
filter_by_identified,
filter_by_qvalue,
qvalue_cutoff,
pos,
calculateAnomalyScores,
anomalyModelFeatures)
pos <- pos + nrow(chunk_df)
batch_idx <- batch_idx + 1L

if (batch_idx %% 1000L == 0L) {
elapsed <- as.numeric(Sys.time() - t_start, units = "secs")
rate <- (pos - 1L) / elapsed
message(sprintf(
"[reduceBigSpectronaut] %d batches | %s rows | %.1fk rows/s | %.0fs elapsed",
batch_idx,
format(pos - 1L, big.mark = ","),
rate / 1000,
elapsed))
}

rm(batch, chunk_df)
}

if (batch_idx %% 1000L != 0L) {
elapsed <- as.numeric(Sys.time() - t_start, units = "secs")
rate <- (pos - 1L) / elapsed
message(sprintf(
"[reduceBigSpectronaut] done: %d batches | %s rows | %.1fk rows/s | %.0fs elapsed",
batch_idx,
format(pos - 1L, big.mark = ","),
rate / 1000,
elapsed))
}
}

#' @keywords internal
Expand All @@ -38,85 +111,88 @@ cleanSpectronautChunk = function(input, output_path,
filter_by_qvalue = TRUE,
qvalue_cutoff = 0.01,
pos = NULL,
calculateAnomalyScores=FALSE,
calculateAnomalyScores=FALSE,
anomalyModelFeatures=c()) {
data.table::setDT(input)

all_cols <- c("R.FileName", "R.Condition", "R.Replicate",
"PG.ProteinAccessions", "EG.ModifiedSequence", "FG.LabeledSequence",
"FG.Charge", "F.FrgIon", "F.Charge",
"EG.Identified", "F.ExcludedFromQuantification", "F.FrgLossType",
"PG.Qvalue", "EG.Qvalue", intensity)

if (calculateAnomalyScores){
all_cols <- c(all_cols, anomalyModelFeatures)
}

cols <- intersect(all_cols, colnames(input))
input <- dplyr::select(input, all_of(cols))
input <- dplyr::rename_with(input, .fn = MSstatsConvert:::.standardizeColnames)

new_names <- c("Run", "Condition", "BioReplicate", "ProteinName",
"PeptideSequence", "LabeledSequence", "PrecursorCharge", "FragmentIon",
"ProductCharge", "Identified", "Excluded",
"FFrgLossType", "PGQvalue", "EGQvalue",
"Intensity")
if (calculateAnomalyScores){
if (calculateAnomalyScores) {
all_cols <- c(all_cols, anomalyModelFeatures)
new_names <- c(new_names, MSstatsConvert:::.standardizeColnames(anomalyModelFeatures))
}
# non_standardized =
old_names <- MSstatsConvert:::.standardizeColnames(all_cols)
names(old_names) <- new_names
old_names <- old_names[old_names %in% colnames(input)]

input <- dplyr::rename(input, !!old_names)
input <- dplyr::mutate(input, Intensity = as.numeric(Intensity))

if (is.character(dplyr::pull(dplyr::collect(head(dplyr::select(input, Excluded))), Excluded))) {
input <- dplyr::mutate(input, Excluded = Excluded == "True")

present_orig <- intersect(all_cols, colnames(input))
if (length(present_orig) == 0L) {
stop(sprintf(
paste0("cleanSpectronautChunk: none of the expected Spectronaut ",
"columns were found in the input batch. ",
"Expected any of: %s. Found: %s. ",
"Check that the file is comma-delimited and that the ",
"Spectronaut export uses the standard column names."),
paste(all_cols, collapse = ", "),
paste(colnames(input), collapse = ", ")))
}
if (is.element("Identified", colnames(input))) {
if (is.character(dplyr::pull(dplyr::collect(head(dplyr::select(input, Identified))), Identified))) {
input <- dplyr::mutate(input, Identified = Identified == "True")
}
input <- input[, present_orig, with = FALSE]

# Two-step rename matching the MSstatsConvert family pattern: standardize
# all column names, then map standardized -> MSstats final names.
data.table::setnames(input, MSstatsConvert:::.standardizeColnames(colnames(input)))
std_to_msstats <- stats::setNames(new_names,
MSstatsConvert:::.standardizeColnames(all_cols))
data.table::setnames(input,
old = names(std_to_msstats),
new = unname(std_to_msstats),
skip_absent = TRUE)

input[, Intensity := as.numeric(Intensity)]

if (is.character(input[["Excluded"]])) {
input[, Excluded := Excluded == "True"]
}
if ("Identified" %in% colnames(input) && is.character(input[["Identified"]])) {
input[, Identified := Identified == "True"]
}

if (filter_by_excluded) {
input <- dplyr::mutate(
input, Intensity = dplyr::if_else(Excluded, NA_real_, Intensity))

input[Excluded == TRUE, Intensity := NA_real_]
}

if (filter_by_identified) {
input <- dplyr::mutate(
input, Intensity = dplyr::if_else(Identified, Intensity, NA_real_))
input[Identified == FALSE, Intensity := NA_real_]
}

if (filter_by_qvalue) {
input <- dplyr::mutate(
input,
Intensity = dplyr::if_else(EGQvalue < qvalue_cutoff, Intensity, NA_real_))
input <- dplyr::mutate(
input,
Intensity = dplyr::if_else(PGQvalue < qvalue_cutoff, Intensity, NA_real_))
# Preserve dplyr::if_else semantics: rows with NA q-values become NA.
input[is.na(EGQvalue) | EGQvalue >= qvalue_cutoff, Intensity := NA_real_]
input[is.na(PGQvalue) | PGQvalue >= qvalue_cutoff, Intensity := NA_real_]
}

input <- dplyr::filter(input, FFrgLossType == "noloss")
if (is.element("LabeledSequence", colnames(input))) {
input <- dplyr::mutate(input, IsLabeled = grepl("Lys8", LabeledSequence) | grepl("Arg10", LabeledSequence))
input <- dplyr::mutate(input, IsotopeLabelType := dplyr::if_else(IsLabeled, "H", "L"))

input <- input[FFrgLossType == "noloss"]
Comment on lines 165 to +177

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Guard filter steps when selected columns are absent.

After present_orig subsetting, downstream filters still assume Excluded, Identified, EGQvalue, PGQvalue, and FFrgLossType exist. Missing columns here will raise runtime errors.

🛡️ Proposed fix
-  if (filter_by_excluded) {
+  if (filter_by_excluded && "Excluded" %in% colnames(input)) {
     input[Excluded == TRUE, Intensity := NA_real_]
   }
-  if (filter_by_identified) {
+  if (filter_by_identified && "Identified" %in% colnames(input)) {
     input[Identified == FALSE, Intensity := NA_real_]
   }
   if (filter_by_qvalue) {
-    input[is.na(EGQvalue) | EGQvalue >= qvalue_cutoff, Intensity := NA_real_]
-    input[is.na(PGQvalue) | PGQvalue >= qvalue_cutoff, Intensity := NA_real_]
+    if ("EGQvalue" %in% colnames(input)) {
+      input[is.na(EGQvalue) | EGQvalue >= qvalue_cutoff, Intensity := NA_real_]
+    }
+    if ("PGQvalue" %in% colnames(input)) {
+      input[is.na(PGQvalue) | PGQvalue >= qvalue_cutoff, Intensity := NA_real_]
+    }
   }
-
-  input <- input[FFrgLossType == "noloss"]
+  if ("FFrgLossType" %in% colnames(input)) {
+    input <- input[FFrgLossType == "noloss"]
+  }
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
if (filter_by_excluded) {
input <- dplyr::mutate(
input, Intensity = dplyr::if_else(Excluded, NA_real_, Intensity))
input[Excluded == TRUE, Intensity := NA_real_]
}
if (filter_by_identified) {
input <- dplyr::mutate(
input, Intensity = dplyr::if_else(Identified, Intensity, NA_real_))
input[Identified == FALSE, Intensity := NA_real_]
}
if (filter_by_qvalue) {
input <- dplyr::mutate(
input,
Intensity = dplyr::if_else(EGQvalue < qvalue_cutoff, Intensity, NA_real_))
input <- dplyr::mutate(
input,
Intensity = dplyr::if_else(PGQvalue < qvalue_cutoff, Intensity, NA_real_))
# Preserve dplyr::if_else semantics: rows with NA q-values become NA.
input[is.na(EGQvalue) | EGQvalue >= qvalue_cutoff, Intensity := NA_real_]
input[is.na(PGQvalue) | PGQvalue >= qvalue_cutoff, Intensity := NA_real_]
}
input <- dplyr::filter(input, FFrgLossType == "noloss")
if (is.element("LabeledSequence", colnames(input))) {
input <- dplyr::mutate(input, IsLabeled = grepl("Lys8", LabeledSequence) | grepl("Arg10", LabeledSequence))
input <- dplyr::mutate(input, IsotopeLabelType := dplyr::if_else(IsLabeled, "H", "L"))
input <- input[FFrgLossType == "noloss"]
if (filter_by_excluded && "Excluded" %in% colnames(input)) {
input[Excluded == TRUE, Intensity := NA_real_]
}
if (filter_by_identified && "Identified" %in% colnames(input)) {
input[Identified == FALSE, Intensity := NA_real_]
}
if (filter_by_qvalue) {
# Preserve dplyr::if_else semantics: rows with NA q-values become NA.
if ("EGQvalue" %in% colnames(input)) {
input[is.na(EGQvalue) | EGQvalue >= qvalue_cutoff, Intensity := NA_real_]
}
if ("PGQvalue" %in% colnames(input)) {
input[is.na(PGQvalue) | PGQvalue >= qvalue_cutoff, Intensity := NA_real_]
}
}
if ("FFrgLossType" %in% colnames(input)) {
input <- input[FFrgLossType == "noloss"]
}
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@R/clean_spectronaut.R` around lines 155 - 167, The current filter block
assumes columns Excluded, Identified, EGQvalue, PGQvalue, and FFrgLossType
always exist and will error if any are missing; update the logic in
clean_spectronaut.R to guard each filter by checking column presence (e.g., use
"if ('Excluded' %in% names(input))" before the Excluded filter, similarly for
Identified, EGQvalue and PGQvalue before applying qvalue-based NA assignment,
and check for FFrgLossType before subsetting), so each conditional only runs
when its target column exists and the behavior remains unchanged otherwise.


if ("LabeledSequence" %in% colnames(input)) {
input[, IsotopeLabelType := ifelse(
grepl("Lys8", LabeledSequence) | grepl("Arg10", LabeledSequence),
"H", "L")]
} else {
input <- dplyr::mutate(input, IsotopeLabelType = "L")
input[, IsotopeLabelType := "L"]
}
select_cols = c("ProteinName", "PeptideSequence", "PrecursorCharge", "FragmentIon",
"ProductCharge", "IsotopeLabelType", "Run", "BioReplicate", "Condition",
"Intensity")
if (calculateAnomalyScores){
select_cols = c(select_cols,
MSstatsConvert:::.standardizeColnames(anomalyModelFeatures))

select_cols <- c("ProteinName", "PeptideSequence", "PrecursorCharge", "FragmentIon",
"ProductCharge", "IsotopeLabelType", "Run", "BioReplicate", "Condition",
"Intensity")
if (calculateAnomalyScores) {
select_cols <- c(select_cols,
MSstatsConvert:::.standardizeColnames(anomalyModelFeatures))
}
input <- dplyr::select(input, select_cols)

input <- input[, select_cols, with = FALSE]
.writeChunkToFile(input, output_path, pos)
NULL
}
13 changes: 10 additions & 3 deletions R/converters.R
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,11 @@ bigFragPipetoMSstatsFormat <- function(input_file, output_file_name,
#' @param filter_by_identified if TRUE, will filter by the `EG.Identified` column.
#' @param filter_by_qvalue if TRUE, will filter by EG.Qvalue and PG.Qvalue columns.
#' @param qvalue_cutoff cutoff which will be used for q-value filtering.
#' @param block_size Arrow CSV reader block size in bytes; each input row must
#' fit inside one block. Defaults to 16 MiB (`16L * 1024L * 1024L`). If you
#' see `Invalid: straddling object straddles two block boundaries` on
#' extra-wide Spectronaut exports, pass a larger value
#' (e.g. `64L * 1024L * 1024L`).
#'
#' @export
#'
Expand All @@ -143,14 +148,16 @@ bigSpectronauttoMSstatsFormat <- function(input_file, output_file_name,
aggregate_psms = FALSE,
filter_few_obs = FALSE,
remove_annotation = FALSE,
calculateAnomalyScores=FALSE,
calculateAnomalyScores=FALSE,
anomalyModelFeatures=c(),
connection = NULL) {
connection = NULL,
block_size = 16L * 1024L * 1024L) {
reduced_file <- .prefixedPath("reduce_output_", output_file_name)
reduceBigSpectronaut(input_file, reduced_file,
intensity, filter_by_excluded, filter_by_identified,
filter_by_qvalue, qvalue_cutoff,
calculateAnomalyScores, anomalyModelFeatures)
calculateAnomalyScores, anomalyModelFeatures,
block_size = block_size)
msstats_data <- MSstatsPreprocessBig(
input_file = reduced_file,
output_file_name = output_file_name,
Expand Down
9 changes: 8 additions & 1 deletion man/bigSpectronauttoMSstatsFormat.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

24 changes: 24 additions & 0 deletions man/dot-prefixedPath.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading