diff --git a/.Rbuildignore b/.Rbuildignore index 91114bf..ea84996 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,2 +1,4 @@ ^.*\.Rproj$ ^\.Rproj\.user$ +^\.positai$ +^\.claude$ diff --git a/.gitignore b/.gitignore index bf58646..c2a9478 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ .Ruserdata inst/doc *.Rproj +.positai diff --git a/DESCRIPTION b/DESCRIPTION index 83fdc00..1371566 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -14,7 +14,6 @@ Description: MSstats package provide tools for preprocessing, summarization and processing larger than memory data sets. License: Artistic-2.0 Encoding: UTF-8 -RoxygenNote: 7.3.3 Imports: arrow, DBI, @@ -31,3 +30,4 @@ Suggests: rmarkdown VignetteBuilder: knitr biocViews: MassSpectrometry, Proteomics, Software +Config/roxygen2/version: 8.0.0 diff --git a/R/converters.R b/R/converters.R index 13b4383..8d543b8 100644 --- a/R/converters.R +++ b/R/converters.R @@ -112,6 +112,15 @@ bigFragPipetoMSstatsFormat <- function(input_file, output_file_name, #' Convert out-of-memory Spectronaut files to MSstats format. #' #' @inheritParams MSstatsPreprocessBig +#' @param annotation Optional data.frame with columns `Run`, +#' `BioReplicate`, `Condition` (plus any additional annotation +#' columns). If supplied, the converter merges it onto the output +#' on `Run` and overrides any `Condition` / `BioReplicate` values +#' that came from Spectronaut's `R.Condition` / `R.Replicate` +#' columns. Required when the experimental design cannot be +#' expressed in Spectronaut's own annotation — most notably paired +#' designs, where `BioReplicate` must encode the pairing +#' structure rather than per-sample IDs. #' @param intensity Name of the intensity column to be used in Spectronaut #' @param filter_by_excluded if TRUE, will filter by the `F.ExcludedFromQuantification` column. #' @param filter_by_identified if TRUE, will filter by the `EG.Identified` column. @@ -124,10 +133,22 @@ bigFragPipetoMSstatsFormat <- function(input_file, output_file_name, #' converted_data <- bigSpectronauttoMSstatsFormat( #' system.file("extdata", "spectronaut_input.csv", package = "MSstatsBig"), #' "output_file.csv", -#' backend="arrow") +#' backend = "arrow") #' converted_data <- dplyr::collect(converted_data) #' head(converted_data) #' +#' # Override Spectronaut's embedded Condition / BioReplicate with +#' # a custom annotation (e.g. for a paired design): +#' annot <- data.frame(Run = unique(converted_data[["Run"]])) +#' annot$BioReplicate <- seq_len(nrow(annot)) +#' annot$Condition <- rep(c("ctrl", "treat"), length.out = nrow(annot)) +#' overridden <- bigSpectronauttoMSstatsFormat( +#' system.file("extdata", "spectronaut_input.csv", package = "MSstatsBig"), +#' "output_file.csv", +#' backend = "arrow", +#' annotation = annot) +#' head(dplyr::collect(overridden)) +#' #' @return either arrow object or sparklyr table that can be optionally collected #' into memory by using dplyr::collect function. #' @@ -143,8 +164,9 @@ bigSpectronauttoMSstatsFormat <- function(input_file, output_file_name, aggregate_psms = FALSE, filter_few_obs = FALSE, remove_annotation = FALSE, - calculateAnomalyScores=FALSE, + calculateAnomalyScores=FALSE, anomalyModelFeatures=c(), + annotation = NULL, connection = NULL) { reduced_file <- .prefixedPath("reduce_output_", output_file_name) reduceBigSpectronaut(input_file, reduced_file, @@ -153,19 +175,27 @@ bigSpectronauttoMSstatsFormat <- function(input_file, output_file_name, calculateAnomalyScores, anomalyModelFeatures) msstats_data <- MSstatsPreprocessBig( input_file = reduced_file, - output_file_name = output_file_name, - backend = backend, + output_file_name = output_file_name, + backend = backend, max_feature_count = max_feature_count, filter_unique_peptides = filter_unique_peptides, - aggregate_psms = aggregate_psms, - filter_few_obs = filter_few_obs, - remove_annotation = remove_annotation, - calculateAnomalyScores = calculateAnomalyScores, - anomalyModelFeatures = anomalyModelFeatures, + aggregate_psms = aggregate_psms, + filter_few_obs = filter_few_obs, + remove_annotation = remove_annotation, + calculateAnomalyScores = calculateAnomalyScores, + anomalyModelFeatures = anomalyModelFeatures, connection = connection) - + + if (!is.null(annotation)) { + msstats_data <- MSstatsAddAnnotationBig(msstats_data, annotation) + if (backend == "arrow") { + unlink(output_file_name, recursive = TRUE, force = TRUE) + arrow::write_dataset(msstats_data, output_file_name, format = "csv") + } + } + return(msstats_data) - + } diff --git a/man/bigSpectronauttoMSstatsFormat.Rd b/man/bigSpectronauttoMSstatsFormat.Rd index 01706ef..99379e8 100644 --- a/man/bigSpectronauttoMSstatsFormat.Rd +++ b/man/bigSpectronauttoMSstatsFormat.Rd @@ -20,6 +20,7 @@ bigSpectronauttoMSstatsFormat( remove_annotation = FALSE, calculateAnomalyScores = FALSE, anomalyModelFeatures = c(), + annotation = NULL, connection = NULL ) } @@ -61,6 +62,16 @@ using dataProcess function. Only applicable to sparklyr backend.} \item{anomalyModelFeatures}{Character vector of column names to be carried through the pipeline} +\item{annotation}{Optional data.frame with columns `Run`, +`BioReplicate`, `Condition` (plus any additional annotation +columns). If supplied, the converter merges it onto the output +on `Run` and overrides any `Condition` / `BioReplicate` values +that came from Spectronaut's `R.Condition` / `R.Replicate` +columns. Required when the experimental design cannot be +expressed in Spectronaut's own annotation — most notably paired +designs, where `BioReplicate` must encode the pairing +structure rather than per-sample IDs.} + \item{connection}{Connection to a spark instance created with the `spark_connect` function from `sparklyr` package.} } @@ -75,8 +86,20 @@ Convert out-of-memory Spectronaut files to MSstats format. converted_data <- bigSpectronauttoMSstatsFormat( system.file("extdata", "spectronaut_input.csv", package = "MSstatsBig"), "output_file.csv", - backend="arrow") + backend = "arrow") converted_data <- dplyr::collect(converted_data) head(converted_data) +# Override Spectronaut's embedded Condition / BioReplicate with +# a custom annotation (e.g. for a paired design): +annot <- data.frame(Run = unique(converted_data[["Run"]])) +annot$BioReplicate <- seq_len(nrow(annot)) +annot$Condition <- rep(c("ctrl", "treat"), length.out = nrow(annot)) +overridden <- bigSpectronauttoMSstatsFormat( + system.file("extdata", "spectronaut_input.csv", package = "MSstatsBig"), + "output_file.csv", + backend = "arrow", + annotation = annot) +head(dplyr::collect(overridden)) + } diff --git a/man/dot-prefixedPath.Rd b/man/dot-prefixedPath.Rd new file mode 100644 index 0000000..be036ac --- /dev/null +++ b/man/dot-prefixedPath.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{.prefixedPath} +\alias{.prefixedPath} +\title{Build an intermediate output path by prefixing only the basename.} +\usage{ +.prefixedPath(prefix, path) +} +\arguments{ +\item{prefix}{Character scalar prepended to the basename.} + +\item{path}{Output file path supplied by the caller.} +} +\value{ +Character scalar. +} +\description{ +Naive `paste0(prefix, output_file_name)` corrupts paths that contain a +directory (`subdir/out.csv` → `topN_subdir/out.csv`, +`/tmp/out.csv` → `topN_/tmp/out.csv`). Splitting via dirname/basename keeps +the directory component intact so intermediate files land beside the final +output. +} +\keyword{internal} diff --git a/tests/testthat/test-converters.R b/tests/testthat/test-converters.R index 78f6da3..1f9162d 100644 --- a/tests/testthat/test-converters.R +++ b/tests/testthat/test-converters.R @@ -94,73 +94,45 @@ test_that("bigSpectronauttoMSstatsFormat works correctly", { unlink(paste0("reduce_output_", output_file), recursive = TRUE, force = TRUE) }) -# test_that("bigDIANNtoMSstatsFormat works with real MSstatsConvert tinytest data", { -# input_file <- "/Users/rudhikshah/NorthEasternContractWork/MSstatsConvert/inst/tinytest/raw_data/DIANN/diann_input.tsv" -# annotation_file <- "/Users/rudhikshah/NorthEasternContractWork/MSstatsConvert/inst/tinytest/raw_data/DIANN/annotation.csv" - -# # Skip test if the local files are not found (e.g. on CI/CD) -# skip_if_not(file.exists(input_file), "Local DIANN input file not found") -# skip_if_not(file.exists(annotation_file), "Local annotation file not found") - -# annot <- read.csv(annotation_file) -# output_file <- "real_diann_output.csv" - -# processed <- bigDIANNtoMSstatsFormat( -# input_file = input_file, -# annotation = annot, -# output_file_name = output_file, -# backend = "arrow", -# MBR = FALSE, -# quantificationColumn = "FragmentQuantCorrected", -# max_feature_count = 100, -# filter_unique_peptides = FALSE, -# aggregate_psms = FALSE, -# filter_few_obs = FALSE -# ) - -# result <- dplyr::collect(processed) - -# expect_true(!is.null(result)) -# expect_true(nrow(result) > 0) - -# # Cleanup — outputs may be directories when backend = "arrow" -# unlink(output_file, recursive = TRUE, force = TRUE) -# unlink(paste0("reduce_output_", output_file), recursive = TRUE, force = TRUE) -# unlink(paste0("topN_", output_file), recursive = TRUE, force = TRUE) -# }) - -# test_that("bigDIANNtoMSstatsFormat works with DIANN 2.0 parquet input", { -# input_file <- "/Users/rudhikshah/NorthEasternContractWork/MSstatsConvert/inst/tinytest/raw_data/DIANN/diann_2.0.parquet" -# annotation_file <- "/Users/rudhikshah/NorthEasternContractWork/MSstatsConvert/inst/tinytest/raw_data/DIANN/annotation_diann_2.0.csv" - -# skip_if_not(file.exists(input_file), "Local DIANN 2.0 parquet file not found") -# skip_if_not(file.exists(annotation_file), "Local DIANN 2.0 annotation file not found") -# skip_if_not_installed("arrow") - -# annot <- read.csv(annotation_file) -# output_file <- "diann_2_0_output.csv" - -# processed <- bigDIANNtoMSstatsFormat( -# input_file = input_file, -# annotation = annot, -# output_file_name = output_file, -# backend = "arrow", -# MBR = FALSE, -# quantificationColumn = "auto", -# max_feature_count = 100, -# filter_unique_peptides = FALSE, -# aggregate_psms = FALSE, -# filter_few_obs = FALSE -# ) - -# result <- dplyr::collect(processed) - -# expect_true(!is.null(result)) -# expect_true(nrow(result) > 0) - -# # Cleanup — outputs may be directories when backend = "arrow" -# unlink(output_file, recursive = TRUE, force = TRUE) -# unlink(paste0("reduce_output_", output_file), recursive = TRUE, force = TRUE) -# unlink(paste0("topN_", output_file), recursive = TRUE, force = TRUE) -# unlink(paste0("cleaned_", output_file), recursive = TRUE, force = TRUE) -# }) \ No newline at end of file +test_that("bigSpectronauttoMSstatsFormat overrides Condition/BioReplicate from annotation", { + stub(bigSpectronauttoMSstatsFormat, "reduceBigSpectronaut", function(input_file, output_path, ...) { + msstats_data <- data.frame( + ProteinName = "P1", PeptideSequence = "PEPTIDE", PrecursorCharge = 2, + FragmentIon = "frag1", ProductCharge = 1, + IsotopeLabelType = "L", + Condition = "FROM_SPECTRONAUT", BioReplicate = 999, + Run = rep(c("run1", "run2"), each = 1), + Intensity = c(1000, 2000) + ) + readr::write_csv(msstats_data, output_path) + }) + + input_file <- "dummy_spectro_input.csv" + output_file <- "spectro_output_annot.csv" + + annotation <- data.frame( + Run = c("run1", "run2"), + BioReplicate = c(7L, 8L), + Condition = c("ctrl", "treat"), + stringsAsFactors = FALSE + ) + + processed <- bigSpectronauttoMSstatsFormat( + input_file = input_file, + annotation = annotation, + output_file_name = output_file, + backend = "arrow", + max_feature_count = 1 + ) + result <- dplyr::collect(processed) + result <- result[order(result$Run), ] + + expect_equal(result$Condition, c("ctrl", "treat")) + expect_equal(result$BioReplicate, c(7L, 8L)) + expect_false(any(result$Condition == "FROM_SPECTRONAUT")) + expect_false(any(result$BioReplicate == 999)) + + # Cleanup + unlink(output_file, recursive = TRUE, force = TRUE) + unlink(paste0("reduce_output_", output_file), recursive = TRUE, force = TRUE) +}) \ No newline at end of file