From 11d4d101ac995c8cfd4489a1305034c3a6196862 Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Tue, 26 May 2026 08:48:32 -0400 Subject: [PATCH 1/3] Fix(Spectronaut): Enable annotation to be added to input * Added annotation = NULL parameter to bigSpectronauttoMSstatsFormat (positional arg #2, mirroring bigDIANNtoMSstatsFormat from #16). * When supplied, the converter merges the annotation onto the output via MSstatsAddAnnotationBig, overriding any Condition / BioReplicate columns that came from R.Condition / R.Replicate. * Required for paired designs and other experimental layouts that Spectronaut's own annotation cannot express. * Added override test under tests/testthat/test-converters.R. See MSstats-ai/todos/active/TODO-MSBig-20260526_bigspectronaut_annotation_param.md Co-Authored-By: Claude --- DESCRIPTION | 2 +- R/converters.R | 61 +++++++++++++++++++++------- man/bigSpectronauttoMSstatsFormat.Rd | 31 ++++++++++++-- man/dot-prefixedPath.Rd | 24 +++++++++++ tests/testthat/test-converters.R | 47 +++++++++++++++++++++ 5 files changed, 147 insertions(+), 18 deletions(-) create mode 100644 man/dot-prefixedPath.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 83fdc00..1371566 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -14,7 +14,6 @@ Description: MSstats package provide tools for preprocessing, summarization and processing larger than memory data sets. License: Artistic-2.0 Encoding: UTF-8 -RoxygenNote: 7.3.3 Imports: arrow, DBI, @@ -31,3 +30,4 @@ Suggests: rmarkdown VignetteBuilder: knitr biocViews: MassSpectrometry, Proteomics, Software +Config/roxygen2/version: 8.0.0 diff --git a/R/converters.R b/R/converters.R index 13b4383..d961a2e 100644 --- a/R/converters.R +++ b/R/converters.R @@ -112,6 +112,15 @@ bigFragPipetoMSstatsFormat <- function(input_file, output_file_name, #' Convert out-of-memory Spectronaut files to MSstats format. 
#' #' @inheritParams MSstatsPreprocessBig +#' @param annotation Optional data.frame with columns `Run`, +#' `BioReplicate`, `Condition` (plus any additional annotation +#' columns). If supplied, the converter merges it onto the output +#' on `Run` and overrides any `Condition` / `BioReplicate` values +#' that came from Spectronaut's `R.Condition` / `R.Replicate` +#' columns. Required when the experimental design cannot be +#' expressed in Spectronaut's own annotation — most notably paired +#' designs, where `BioReplicate` must encode the pairing +#' structure rather than per-sample IDs. #' @param intensity Name of the intensity column to be used in Spectronaut #' @param filter_by_excluded if TRUE, will filter by the `F.ExcludedFromQuantification` column. #' @param filter_by_identified if TRUE, will filter by the `EG.Identified` column. @@ -122,16 +131,32 @@ bigFragPipetoMSstatsFormat <- function(input_file, output_file_name, #' #' @examples #' converted_data <- bigSpectronauttoMSstatsFormat( -#' system.file("extdata", "spectronaut_input.csv", package = "MSstatsBig"), -#' "output_file.csv", -#' backend="arrow") +#' input_file = system.file("extdata", "spectronaut_input.csv", +#' package = "MSstatsBig"), +#' output_file_name = "output_file.csv", +#' backend = "arrow") #' converted_data <- dplyr::collect(converted_data) #' head(converted_data) #' +#' # Override Spectronaut's embedded Condition / BioReplicate with +#' # a custom annotation (e.g. 
for a paired design): +#' annot <- data.frame(Run = unique(converted_data[["Run"]])) +#' annot$BioReplicate <- seq_len(nrow(annot)) +#' annot$Condition <- rep(c("ctrl", "treat"), length.out = nrow(annot)) +#' overridden <- bigSpectronauttoMSstatsFormat( +#' input_file = system.file("extdata", "spectronaut_input.csv", +#' package = "MSstatsBig"), +#' annotation = annot, +#' output_file_name = "output_file.csv", +#' backend = "arrow") +#' head(dplyr::collect(overridden)) +#' #' @return either arrow object or sparklyr table that can be optionally collected #' into memory by using dplyr::collect function. #' -bigSpectronauttoMSstatsFormat <- function(input_file, output_file_name, +bigSpectronauttoMSstatsFormat <- function(input_file, + annotation = NULL, + output_file_name, backend, intensity = "F.NormalizedPeakArea", filter_by_excluded = FALSE, @@ -143,7 +168,7 @@ bigSpectronauttoMSstatsFormat <- function(input_file, output_file_name, aggregate_psms = FALSE, filter_few_obs = FALSE, remove_annotation = FALSE, - calculateAnomalyScores=FALSE, + calculateAnomalyScores=FALSE, anomalyModelFeatures=c(), connection = NULL) { reduced_file <- .prefixedPath("reduce_output_", output_file_name) @@ -153,19 +178,27 @@ bigSpectronauttoMSstatsFormat <- function(input_file, output_file_name, calculateAnomalyScores, anomalyModelFeatures) msstats_data <- MSstatsPreprocessBig( input_file = reduced_file, - output_file_name = output_file_name, - backend = backend, + output_file_name = output_file_name, + backend = backend, max_feature_count = max_feature_count, filter_unique_peptides = filter_unique_peptides, - aggregate_psms = aggregate_psms, - filter_few_obs = filter_few_obs, - remove_annotation = remove_annotation, - calculateAnomalyScores = calculateAnomalyScores, - anomalyModelFeatures = anomalyModelFeatures, + aggregate_psms = aggregate_psms, + filter_few_obs = filter_few_obs, + remove_annotation = remove_annotation, + calculateAnomalyScores = calculateAnomalyScores, + 
anomalyModelFeatures = anomalyModelFeatures, connection = connection) - + + if (!is.null(annotation)) { + msstats_data <- MSstatsAddAnnotationBig(msstats_data, annotation) + if (backend == "arrow") { + unlink(output_file_name, recursive = TRUE, force = TRUE) + arrow::write_dataset(msstats_data, output_file_name, format = "csv") + } + } + return(msstats_data) - + } diff --git a/man/bigSpectronauttoMSstatsFormat.Rd b/man/bigSpectronauttoMSstatsFormat.Rd index 01706ef..e1a3f73 100644 --- a/man/bigSpectronauttoMSstatsFormat.Rd +++ b/man/bigSpectronauttoMSstatsFormat.Rd @@ -6,6 +6,7 @@ \usage{ bigSpectronauttoMSstatsFormat( input_file, + annotation = NULL, output_file_name, backend, intensity = "F.NormalizedPeakArea", @@ -26,6 +27,16 @@ bigSpectronauttoMSstatsFormat( \arguments{ \item{input_file}{name of the input text file in 10-column MSstats format.} +\item{annotation}{Optional data.frame with columns `Run`, +`BioReplicate`, `Condition` (plus any additional annotation +columns). If supplied, the converter merges it onto the output +on `Run` and overrides any `Condition` / `BioReplicate` values +that came from Spectronaut's `R.Condition` / `R.Replicate` +columns. Required when the experimental design cannot be +expressed in Spectronaut's own annotation — most notably paired +designs, where `BioReplicate` must encode the pairing +structure rather than per-sample IDs.} + \item{output_file_name}{name of an output file which will be saved after pre-processing} \item{backend}{"arrow" or "sparklyr". Option "sparklyr" requires a spark installation @@ -73,10 +84,24 @@ Convert out-of-memory Spectronaut files to MSstats format. 
} \examples{ converted_data <- bigSpectronauttoMSstatsFormat( - system.file("extdata", "spectronaut_input.csv", package = "MSstatsBig"), - "output_file.csv", - backend="arrow") + input_file = system.file("extdata", "spectronaut_input.csv", + package = "MSstatsBig"), + output_file_name = "output_file.csv", + backend = "arrow") converted_data <- dplyr::collect(converted_data) head(converted_data) +# Override Spectronaut's embedded Condition / BioReplicate with +# a custom annotation (e.g. for a paired design): +annot <- data.frame(Run = unique(converted_data[["Run"]])) +annot$BioReplicate <- seq_len(nrow(annot)) +annot$Condition <- rep(c("ctrl", "treat"), length.out = nrow(annot)) +overridden <- bigSpectronauttoMSstatsFormat( + input_file = system.file("extdata", "spectronaut_input.csv", + package = "MSstatsBig"), + annotation = annot, + output_file_name = "output_file.csv", + backend = "arrow") +head(dplyr::collect(overridden)) + } diff --git a/man/dot-prefixedPath.Rd b/man/dot-prefixedPath.Rd new file mode 100644 index 0000000..be036ac --- /dev/null +++ b/man/dot-prefixedPath.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{.prefixedPath} +\alias{.prefixedPath} +\title{Build an intermediate output path by prefixing only the basename.} +\usage{ +.prefixedPath(prefix, path) +} +\arguments{ +\item{prefix}{Character scalar prepended to the basename.} + +\item{path}{Output file path supplied by the caller.} +} +\value{ +Character scalar. +} +\description{ +Naive `paste0(prefix, output_file_name)` corrupts paths that contain a +directory (`subdir/out.csv` → `topN_subdir/out.csv`, +`/tmp/out.csv` → `topN_/tmp/out.csv`). Splitting via dirname/basename keeps +the directory component intact so intermediate files land beside the final +output. 
+} +\keyword{internal} diff --git a/tests/testthat/test-converters.R b/tests/testthat/test-converters.R index 78f6da3..51395fa 100644 --- a/tests/testthat/test-converters.R +++ b/tests/testthat/test-converters.R @@ -94,6 +94,53 @@ test_that("bigSpectronauttoMSstatsFormat works correctly", { unlink(paste0("reduce_output_", output_file), recursive = TRUE, force = TRUE) }) +test_that("bigSpectronauttoMSstatsFormat overrides Condition/BioReplicate from annotation", { + # Mock reduce to emit rows tagged with values we can distinguish + # from the supplied annotation — if the override works, + # downstream Condition/BioReplicate must come from `annotation`, + # not from these mocked values. + stub(bigSpectronauttoMSstatsFormat, "reduceBigSpectronaut", function(input_file, output_path, ...) { + msstats_data <- data.frame( + ProteinName = "P1", PeptideSequence = "PEPTIDE", PrecursorCharge = 2, + FragmentIon = "frag1", ProductCharge = 1, + IsotopeLabelType = "L", + Condition = "FROM_SPECTRONAUT", BioReplicate = 999, + Run = rep(c("run1", "run2"), each = 1), + Intensity = c(1000, 2000) + ) + readr::write_csv(msstats_data, output_path) + }) + + input_file <- "dummy_spectro_input.csv" + output_file <- "spectro_output_annot.csv" + + annotation <- data.frame( + Run = c("run1", "run2"), + BioReplicate = c(7L, 8L), + Condition = c("ctrl", "treat"), + stringsAsFactors = FALSE + ) + + processed <- bigSpectronauttoMSstatsFormat( + input_file = input_file, + annotation = annotation, + output_file_name = output_file, + backend = "arrow", + max_feature_count = 1 + ) + result <- dplyr::collect(processed) + result <- result[order(result$Run), ] + + expect_equal(result$Condition, c("ctrl", "treat")) + expect_equal(result$BioReplicate, c(7L, 8L)) + expect_false(any(result$Condition == "FROM_SPECTRONAUT")) + expect_false(any(result$BioReplicate == 999)) + + # Cleanup + unlink(output_file, recursive = TRUE, force = TRUE) + unlink(paste0("reduce_output_", output_file), recursive = TRUE, force 
= TRUE) +}) + # test_that("bigDIANNtoMSstatsFormat works with real MSstatsConvert tinytest data", { # input_file <- "/Users/rudhikshah/NorthEasternContractWork/MSstatsConvert/inst/tinytest/raw_data/DIANN/diann_input.tsv" # annotation_file <- "/Users/rudhikshah/NorthEasternContractWork/MSstatsConvert/inst/tinytest/raw_data/DIANN/annotation.csv" From e78a1596d08459a827e545feb8b799d1f1911393 Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Tue, 26 May 2026 09:00:01 -0400 Subject: [PATCH 2/3] Move annotation arg to end of bigSpectronauttoMSstatsFormat signature * Slotted annotation = NULL just before connection = NULL instead of at position #2, so the pre-existing positional signature (input_file, output_file_name, backend, intensity, ...) keeps working for any external positional callers. * This intentionally diverges from bigDIANNtoMSstatsFormat (#16), which puts annotation at position #2. Backward compatibility was prioritized for the Spectronaut converter because it had a longer pre-annotation life. DIANN can be re-flowed separately if consistency is needed later. * Restored the simpler positional example call (no longer needs named-arg workaround that the position-#2 signature forced). 
See MSstats-ai/todos/active/TODO-MSBig-20260526_bigspectronaut_annotation_param.md Co-Authored-By: Claude --- R/converters.R | 19 +++++++-------- man/bigSpectronauttoMSstatsFormat.Rd | 36 +++++++++++++--------------- 2 files changed, 25 insertions(+), 30 deletions(-) diff --git a/R/converters.R b/R/converters.R index d961a2e..8d543b8 100644 --- a/R/converters.R +++ b/R/converters.R @@ -131,9 +131,8 @@ bigFragPipetoMSstatsFormat <- function(input_file, output_file_name, #' #' @examples #' converted_data <- bigSpectronauttoMSstatsFormat( -#' input_file = system.file("extdata", "spectronaut_input.csv", -#' package = "MSstatsBig"), -#' output_file_name = "output_file.csv", +#' system.file("extdata", "spectronaut_input.csv", package = "MSstatsBig"), +#' "output_file.csv", #' backend = "arrow") #' converted_data <- dplyr::collect(converted_data) #' head(converted_data) @@ -144,19 +143,16 @@ bigFragPipetoMSstatsFormat <- function(input_file, output_file_name, #' annot$BioReplicate <- seq_len(nrow(annot)) #' annot$Condition <- rep(c("ctrl", "treat"), length.out = nrow(annot)) #' overridden <- bigSpectronauttoMSstatsFormat( -#' input_file = system.file("extdata", "spectronaut_input.csv", -#' package = "MSstatsBig"), -#' annotation = annot, -#' output_file_name = "output_file.csv", -#' backend = "arrow") +#' system.file("extdata", "spectronaut_input.csv", package = "MSstatsBig"), +#' "output_file.csv", +#' backend = "arrow", +#' annotation = annot) #' head(dplyr::collect(overridden)) #' #' @return either arrow object or sparklyr table that can be optionally collected #' into memory by using dplyr::collect function. 
#' -bigSpectronauttoMSstatsFormat <- function(input_file, - annotation = NULL, - output_file_name, +bigSpectronauttoMSstatsFormat <- function(input_file, output_file_name, backend, intensity = "F.NormalizedPeakArea", filter_by_excluded = FALSE, @@ -170,6 +166,7 @@ bigSpectronauttoMSstatsFormat <- function(input_file, remove_annotation = FALSE, calculateAnomalyScores=FALSE, anomalyModelFeatures=c(), + annotation = NULL, connection = NULL) { reduced_file <- .prefixedPath("reduce_output_", output_file_name) reduceBigSpectronaut(input_file, reduced_file, diff --git a/man/bigSpectronauttoMSstatsFormat.Rd b/man/bigSpectronauttoMSstatsFormat.Rd index e1a3f73..99379e8 100644 --- a/man/bigSpectronauttoMSstatsFormat.Rd +++ b/man/bigSpectronauttoMSstatsFormat.Rd @@ -6,7 +6,6 @@ \usage{ bigSpectronauttoMSstatsFormat( input_file, - annotation = NULL, output_file_name, backend, intensity = "F.NormalizedPeakArea", @@ -21,22 +20,13 @@ bigSpectronauttoMSstatsFormat( remove_annotation = FALSE, calculateAnomalyScores = FALSE, anomalyModelFeatures = c(), + annotation = NULL, connection = NULL ) } \arguments{ \item{input_file}{name of the input text file in 10-column MSstats format.} -\item{annotation}{Optional data.frame with columns `Run`, -`BioReplicate`, `Condition` (plus any additional annotation -columns). If supplied, the converter merges it onto the output -on `Run` and overrides any `Condition` / `BioReplicate` values -that came from Spectronaut's `R.Condition` / `R.Replicate` -columns. Required when the experimental design cannot be -expressed in Spectronaut's own annotation — most notably paired -designs, where `BioReplicate` must encode the pairing -structure rather than per-sample IDs.} - \item{output_file_name}{name of an output file which will be saved after pre-processing} \item{backend}{"arrow" or "sparklyr". Option "sparklyr" requires a spark installation @@ -72,6 +62,16 @@ using dataProcess function. 
Only applicable to sparklyr backend.} \item{anomalyModelFeatures}{Character vector of column names to be carried through the pipeline} +\item{annotation}{Optional data.frame with columns `Run`, +`BioReplicate`, `Condition` (plus any additional annotation +columns). If supplied, the converter merges it onto the output +on `Run` and overrides any `Condition` / `BioReplicate` values +that came from Spectronaut's `R.Condition` / `R.Replicate` +columns. Required when the experimental design cannot be +expressed in Spectronaut's own annotation — most notably paired +designs, where `BioReplicate` must encode the pairing +structure rather than per-sample IDs.} + \item{connection}{Connection to a spark instance created with the `spark_connect` function from `sparklyr` package.} } @@ -84,9 +84,8 @@ Convert out-of-memory Spectronaut files to MSstats format. } \examples{ converted_data <- bigSpectronauttoMSstatsFormat( - input_file = system.file("extdata", "spectronaut_input.csv", - package = "MSstatsBig"), - output_file_name = "output_file.csv", + system.file("extdata", "spectronaut_input.csv", package = "MSstatsBig"), + "output_file.csv", backend = "arrow") converted_data <- dplyr::collect(converted_data) head(converted_data) @@ -97,11 +96,10 @@ annot <- data.frame(Run = unique(converted_data[["Run"]])) annot$BioReplicate <- seq_len(nrow(annot)) annot$Condition <- rep(c("ctrl", "treat"), length.out = nrow(annot)) overridden <- bigSpectronauttoMSstatsFormat( - input_file = system.file("extdata", "spectronaut_input.csv", - package = "MSstatsBig"), - annotation = annot, - output_file_name = "output_file.csv", - backend = "arrow") + system.file("extdata", "spectronaut_input.csv", package = "MSstatsBig"), + "output_file.csv", + backend = "arrow", + annotation = annot) head(dplyr::collect(overridden)) } From b95ef9305da76fc4b48f83897a35532561c45949 Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Tue, 26 May 2026 09:12:41 -0400 Subject: [PATCH 3/3] Remove commented-out DIANN tests; ignore .positai and .claude --- .Rbuildignore | 2 +
.gitignore | 1 + tests/testthat/test-converters.R | 77 +------------------------------- 3 files changed, 4 insertions(+), 76 deletions(-) diff --git a/.Rbuildignore b/.Rbuildignore index 91114bf..ea84996 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,2 +1,4 @@ ^.*\.Rproj$ ^\.Rproj\.user$ +^\.positai$ +^\.claude$ diff --git a/.gitignore b/.gitignore index bf58646..c2a9478 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ .Ruserdata inst/doc *.Rproj +.positai diff --git a/tests/testthat/test-converters.R b/tests/testthat/test-converters.R index 51395fa..1f9162d 100644 --- a/tests/testthat/test-converters.R +++ b/tests/testthat/test-converters.R @@ -95,10 +95,6 @@ test_that("bigSpectronauttoMSstatsFormat works correctly", { }) test_that("bigSpectronauttoMSstatsFormat overrides Condition/BioReplicate from annotation", { - # Mock reduce to emit rows tagged with values we can distinguish - # from the supplied annotation — if the override works, - # downstream Condition/BioReplicate must come from `annotation`, - # not from these mocked values. stub(bigSpectronauttoMSstatsFormat, "reduceBigSpectronaut", function(input_file, output_path, ...) { msstats_data <- data.frame( ProteinName = "P1", PeptideSequence = "PEPTIDE", PrecursorCharge = 2, @@ -139,75 +135,4 @@ test_that("bigSpectronauttoMSstatsFormat overrides Condition/BioReplicate from a # Cleanup unlink(output_file, recursive = TRUE, force = TRUE) unlink(paste0("reduce_output_", output_file), recursive = TRUE, force = TRUE) -}) - -# test_that("bigDIANNtoMSstatsFormat works with real MSstatsConvert tinytest data", { -# input_file <- "/Users/rudhikshah/NorthEasternContractWork/MSstatsConvert/inst/tinytest/raw_data/DIANN/diann_input.tsv" -# annotation_file <- "/Users/rudhikshah/NorthEasternContractWork/MSstatsConvert/inst/tinytest/raw_data/DIANN/annotation.csv" - -# # Skip test if the local files are not found (e.g. 
on CI/CD) -# skip_if_not(file.exists(input_file), "Local DIANN input file not found") -# skip_if_not(file.exists(annotation_file), "Local annotation file not found") - -# annot <- read.csv(annotation_file) -# output_file <- "real_diann_output.csv" - -# processed <- bigDIANNtoMSstatsFormat( -# input_file = input_file, -# annotation = annot, -# output_file_name = output_file, -# backend = "arrow", -# MBR = FALSE, -# quantificationColumn = "FragmentQuantCorrected", -# max_feature_count = 100, -# filter_unique_peptides = FALSE, -# aggregate_psms = FALSE, -# filter_few_obs = FALSE -# ) - -# result <- dplyr::collect(processed) - -# expect_true(!is.null(result)) -# expect_true(nrow(result) > 0) - -# # Cleanup — outputs may be directories when backend = "arrow" -# unlink(output_file, recursive = TRUE, force = TRUE) -# unlink(paste0("reduce_output_", output_file), recursive = TRUE, force = TRUE) -# unlink(paste0("topN_", output_file), recursive = TRUE, force = TRUE) -# }) - -# test_that("bigDIANNtoMSstatsFormat works with DIANN 2.0 parquet input", { -# input_file <- "/Users/rudhikshah/NorthEasternContractWork/MSstatsConvert/inst/tinytest/raw_data/DIANN/diann_2.0.parquet" -# annotation_file <- "/Users/rudhikshah/NorthEasternContractWork/MSstatsConvert/inst/tinytest/raw_data/DIANN/annotation_diann_2.0.csv" - -# skip_if_not(file.exists(input_file), "Local DIANN 2.0 parquet file not found") -# skip_if_not(file.exists(annotation_file), "Local DIANN 2.0 annotation file not found") -# skip_if_not_installed("arrow") - -# annot <- read.csv(annotation_file) -# output_file <- "diann_2_0_output.csv" - -# processed <- bigDIANNtoMSstatsFormat( -# input_file = input_file, -# annotation = annot, -# output_file_name = output_file, -# backend = "arrow", -# MBR = FALSE, -# quantificationColumn = "auto", -# max_feature_count = 100, -# filter_unique_peptides = FALSE, -# aggregate_psms = FALSE, -# filter_few_obs = FALSE -# ) - -# result <- dplyr::collect(processed) - -# 
expect_true(!is.null(result)) -# expect_true(nrow(result) > 0) - -# # Cleanup — outputs may be directories when backend = "arrow" -# unlink(output_file, recursive = TRUE, force = TRUE) -# unlink(paste0("reduce_output_", output_file), recursive = TRUE, force = TRUE) -# unlink(paste0("topN_", output_file), recursive = TRUE, force = TRUE) -# unlink(paste0("cleaned_", output_file), recursive = TRUE, force = TRUE) -# }) \ No newline at end of file +}) \ No newline at end of file