Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
^.*\.Rproj$
^\.Rproj\.user$
^\.positai$
^\.claude$
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@
.Ruserdata
inst/doc
*.Rproj
.positai
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ Description: MSstats package provide tools for preprocessing, summarization and
processing larger than memory data sets.
License: Artistic-2.0
Encoding: UTF-8
RoxygenNote: 7.3.3
Imports:
arrow,
DBI,
Expand All @@ -31,3 +30,4 @@ Suggests:
rmarkdown
VignetteBuilder: knitr
biocViews: MassSpectrometry, Proteomics, Software
Config/roxygen2/version: 8.0.0
52 changes: 41 additions & 11 deletions R/converters.R
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,15 @@ bigFragPipetoMSstatsFormat <- function(input_file, output_file_name,
#' Convert out-of-memory Spectronaut files to MSstats format.
#'
#' @inheritParams MSstatsPreprocessBig
#' @param annotation Optional data.frame with columns `Run`,
#' `BioReplicate`, `Condition` (plus any additional annotation
#' columns). If supplied, the converter merges it onto the output
#' on `Run` and overrides any `Condition` / `BioReplicate` values
#' that came from Spectronaut's `R.Condition` / `R.Replicate`
#' columns. Required when the experimental design cannot be
#' expressed in Spectronaut's own annotation — most notably paired
#' designs, where `BioReplicate` must encode the pairing
#' structure rather than per-sample IDs.
#' @param intensity Name of the intensity column to be used in Spectronaut
#' @param filter_by_excluded if TRUE, will filter by the `F.ExcludedFromQuantification` column.
#' @param filter_by_identified if TRUE, will filter by the `EG.Identified` column.
Expand All @@ -124,10 +133,22 @@ bigFragPipetoMSstatsFormat <- function(input_file, output_file_name,
#' converted_data <- bigSpectronauttoMSstatsFormat(
#' system.file("extdata", "spectronaut_input.csv", package = "MSstatsBig"),
#' "output_file.csv",
#' backend="arrow")
#' backend = "arrow")
#' converted_data <- dplyr::collect(converted_data)
#' head(converted_data)
#'
#' # Override Spectronaut's embedded Condition / BioReplicate with
#' # a custom annotation (e.g. for a paired design):
#' annot <- data.frame(Run = unique(converted_data[["Run"]]))
#' annot$BioReplicate <- seq_len(nrow(annot))
#' annot$Condition <- rep(c("ctrl", "treat"), length.out = nrow(annot))
#' overridden <- bigSpectronauttoMSstatsFormat(
#' system.file("extdata", "spectronaut_input.csv", package = "MSstatsBig"),
#' "output_file.csv",
#' backend = "arrow",
#' annotation = annot)
#' head(dplyr::collect(overridden))
#'
#' @return Either an arrow object or a sparklyr table, which can optionally be
#' collected into memory with the `dplyr::collect()` function.
#'
Expand All @@ -143,8 +164,9 @@ bigSpectronauttoMSstatsFormat <- function(input_file, output_file_name,
aggregate_psms = FALSE,
filter_few_obs = FALSE,
remove_annotation = FALSE,
calculateAnomalyScores=FALSE,
calculateAnomalyScores=FALSE,
anomalyModelFeatures=c(),
annotation = NULL,
connection = NULL) {
reduced_file <- .prefixedPath("reduce_output_", output_file_name)
reduceBigSpectronaut(input_file, reduced_file,
Expand All @@ -153,19 +175,27 @@ bigSpectronauttoMSstatsFormat <- function(input_file, output_file_name,
calculateAnomalyScores, anomalyModelFeatures)
msstats_data <- MSstatsPreprocessBig(
input_file = reduced_file,
output_file_name = output_file_name,
backend = backend,
output_file_name = output_file_name,
backend = backend,
max_feature_count = max_feature_count,
filter_unique_peptides = filter_unique_peptides,
aggregate_psms = aggregate_psms,
filter_few_obs = filter_few_obs,
remove_annotation = remove_annotation,
calculateAnomalyScores = calculateAnomalyScores,
anomalyModelFeatures = anomalyModelFeatures,
aggregate_psms = aggregate_psms,
filter_few_obs = filter_few_obs,
remove_annotation = remove_annotation,
calculateAnomalyScores = calculateAnomalyScores,
anomalyModelFeatures = anomalyModelFeatures,
connection = connection)


if (!is.null(annotation)) {
msstats_data <- MSstatsAddAnnotationBig(msstats_data, annotation)
if (backend == "arrow") {
unlink(output_file_name, recursive = TRUE, force = TRUE)
arrow::write_dataset(msstats_data, output_file_name, format = "csv")
}
Comment thread
tonywu1999 marked this conversation as resolved.
}

return(msstats_data)

}


Expand Down
25 changes: 24 additions & 1 deletion man/bigSpectronauttoMSstatsFormat.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

24 changes: 24 additions & 0 deletions man/dot-prefixedPath.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

112 changes: 42 additions & 70 deletions tests/testthat/test-converters.R
Original file line number Diff line number Diff line change
Expand Up @@ -94,73 +94,45 @@ test_that("bigSpectronauttoMSstatsFormat works correctly", {
unlink(paste0("reduce_output_", output_file), recursive = TRUE, force = TRUE)
})

# test_that("bigDIANNtoMSstatsFormat works with real MSstatsConvert tinytest data", {
# input_file <- "/Users/rudhikshah/NorthEasternContractWork/MSstatsConvert/inst/tinytest/raw_data/DIANN/diann_input.tsv"
# annotation_file <- "/Users/rudhikshah/NorthEasternContractWork/MSstatsConvert/inst/tinytest/raw_data/DIANN/annotation.csv"

# # Skip test if the local files are not found (e.g. on CI/CD)
# skip_if_not(file.exists(input_file), "Local DIANN input file not found")
# skip_if_not(file.exists(annotation_file), "Local annotation file not found")

# annot <- read.csv(annotation_file)
# output_file <- "real_diann_output.csv"

# processed <- bigDIANNtoMSstatsFormat(
# input_file = input_file,
# annotation = annot,
# output_file_name = output_file,
# backend = "arrow",
# MBR = FALSE,
# quantificationColumn = "FragmentQuantCorrected",
# max_feature_count = 100,
# filter_unique_peptides = FALSE,
# aggregate_psms = FALSE,
# filter_few_obs = FALSE
# )

# result <- dplyr::collect(processed)

# expect_true(!is.null(result))
# expect_true(nrow(result) > 0)

# # Cleanup — outputs may be directories when backend = "arrow"
# unlink(output_file, recursive = TRUE, force = TRUE)
# unlink(paste0("reduce_output_", output_file), recursive = TRUE, force = TRUE)
# unlink(paste0("topN_", output_file), recursive = TRUE, force = TRUE)
# })

# test_that("bigDIANNtoMSstatsFormat works with DIANN 2.0 parquet input", {
# input_file <- "/Users/rudhikshah/NorthEasternContractWork/MSstatsConvert/inst/tinytest/raw_data/DIANN/diann_2.0.parquet"
# annotation_file <- "/Users/rudhikshah/NorthEasternContractWork/MSstatsConvert/inst/tinytest/raw_data/DIANN/annotation_diann_2.0.csv"

# skip_if_not(file.exists(input_file), "Local DIANN 2.0 parquet file not found")
# skip_if_not(file.exists(annotation_file), "Local DIANN 2.0 annotation file not found")
# skip_if_not_installed("arrow")

# annot <- read.csv(annotation_file)
# output_file <- "diann_2_0_output.csv"

# processed <- bigDIANNtoMSstatsFormat(
# input_file = input_file,
# annotation = annot,
# output_file_name = output_file,
# backend = "arrow",
# MBR = FALSE,
# quantificationColumn = "auto",
# max_feature_count = 100,
# filter_unique_peptides = FALSE,
# aggregate_psms = FALSE,
# filter_few_obs = FALSE
# )

# result <- dplyr::collect(processed)

# expect_true(!is.null(result))
# expect_true(nrow(result) > 0)

# # Cleanup — outputs may be directories when backend = "arrow"
# unlink(output_file, recursive = TRUE, force = TRUE)
# unlink(paste0("reduce_output_", output_file), recursive = TRUE, force = TRUE)
# unlink(paste0("topN_", output_file), recursive = TRUE, force = TRUE)
# unlink(paste0("cleaned_", output_file), recursive = TRUE, force = TRUE)
# })
# Checks that a user-supplied annotation table wins over the Condition /
# BioReplicate values already present in the reduced Spectronaut data.
test_that("bigSpectronauttoMSstatsFormat overrides Condition/BioReplicate from annotation", {
  # Fake reducer: instead of reading any input it writes a two-run table whose
  # Condition / BioReplicate columns hold sentinel values that must not
  # survive the conversion.
  fake_reducer <- function(input_file, output_path, ...) {
    reduced <- data.frame(
      ProteinName = "P1",
      PeptideSequence = "PEPTIDE",
      PrecursorCharge = 2,
      FragmentIon = "frag1",
      ProductCharge = 1,
      IsotopeLabelType = "L",
      Condition = "FROM_SPECTRONAUT",
      BioReplicate = 999,
      Run = c("run1", "run2"),
      Intensity = c(1000, 2000)
    )
    readr::write_csv(reduced, output_path)
  }
  stub(bigSpectronauttoMSstatsFormat, "reduceBigSpectronaut", fake_reducer)

  # The stubbed reducer ignores the input path, so the file need not exist.
  input_file <- "dummy_spectro_input.csv"
  output_file <- "spectro_output_annot.csv"

  # Replacement design, keyed on Run.
  annotation <- data.frame(
    Run = c("run1", "run2"),
    BioReplicate = c(7L, 8L),
    Condition = c("ctrl", "treat"),
    stringsAsFactors = FALSE
  )

  converted <- bigSpectronauttoMSstatsFormat(
    input_file = input_file,
    output_file_name = output_file,
    backend = "arrow",
    max_feature_count = 1,
    annotation = annotation
  )

  # Sort by Run so the row order is deterministic before comparing.
  collected <- dplyr::collect(converted)
  result <- collected[order(collected[["Run"]]), ]

  # Annotation values are present ...
  expect_equal(result[["Condition"]], c("ctrl", "treat"))
  expect_equal(result[["BioReplicate"]], c(7L, 8L))
  # ... and the sentinel values from the reduced data are gone.
  expect_false(any(result[["Condition"]] == "FROM_SPECTRONAUT"))
  expect_false(any(result[["BioReplicate"]] == 999))

  # Cleanup: remove the final output and the intermediate reduced output.
  unlink(
    c(output_file, paste0("reduce_output_", output_file)),
    recursive = TRUE,
    force = TRUE
  )
})
Loading