Vitek-Lab · tonywu1999 · May 26, 2026 · May 26, 2026 · May 26, 2026 · May 26, 2026
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -1,2 +1,4 @@
 ^.*\.Rproj$
 ^\.Rproj\.user$
+^\.positai$
+^\.claude$
diff --git a/.gitignore b/.gitignore
@@ -4,3 +4,4 @@
 .Ruserdata
 inst/doc
 *.Rproj
+.positai
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -14,7 +14,6 @@ Description: MSstats package provide tools for preprocessing, summarization and
     processing larger than memory data sets.
 License: Artistic-2.0
 Encoding: UTF-8
-RoxygenNote: 7.3.3
 Imports: 
     arrow,
     DBI,
@@ -31,3 +30,4 @@ Suggests:
     rmarkdown
 VignetteBuilder: knitr
 biocViews: MassSpectrometry, Proteomics, Software
+Config/roxygen2/version: 8.0.0
diff --git a/R/converters.R b/R/converters.R
@@ -112,6 +112,15 @@ bigFragPipetoMSstatsFormat <-  function(input_file, output_file_name,
 #' Convert out-of-memory Spectronaut files to MSstats format.
 #'
 #' @inheritParams MSstatsPreprocessBig
+#' @param annotation Optional data.frame with columns `Run`,
+#'   `BioReplicate`, `Condition` (plus any additional annotation
+#'   columns). If supplied, it is merged onto the output on `Run`,
+#'   overriding any `Condition` / `BioReplicate` values taken from
+#'   Spectronaut's `R.Condition` / `R.Replicate` columns; with
+#'   `backend = "arrow"`, `output_file_name` is then rewritten.
+#'   Required when Spectronaut's own annotation cannot express the
+#'   design, most notably paired designs, where `BioReplicate` must
+#'   encode the pairing structure rather than per-sample IDs.
 #' @param intensity Name of the intensity column to be used in Spectronaut
 #' @param filter_by_excluded if TRUE, will filter by the `F.ExcludedFromQuantification` column.
 #' @param filter_by_identified if TRUE, will filter by the `EG.Identified` column.
@@ -124,10 +133,22 @@ bigFragPipetoMSstatsFormat <-  function(input_file, output_file_name,
 #' converted_data <- bigSpectronauttoMSstatsFormat(
 #'   system.file("extdata", "spectronaut_input.csv", package = "MSstatsBig"),
 #'   "output_file.csv",
-#'   backend="arrow")
+#'   backend = "arrow")
 #' converted_data <- dplyr::collect(converted_data)
 #' head(converted_data)
 #'
+#' # Override Spectronaut's embedded Condition / BioReplicate with
+#' # a custom annotation (e.g. for a paired design):
+#' annot <- data.frame(Run = unique(converted_data[["Run"]]))
+#' annot$BioReplicate <- seq_len(nrow(annot))
+#' annot$Condition <- rep(c("ctrl", "treat"), length.out = nrow(annot))
+#' overridden <- bigSpectronauttoMSstatsFormat(
+#'   system.file("extdata", "spectronaut_input.csv", package = "MSstatsBig"),
+#'   "output_file.csv",
+#'   backend = "arrow",
+#'   annotation = annot)
+#' head(dplyr::collect(overridden))
+#'
 #' @return either arrow object or sparklyr table that can be optionally collected
 #' into memory by using dplyr::collect function.
 #'
@@ -143,8 +164,9 @@ bigSpectronauttoMSstatsFormat <-  function(input_file, output_file_name,
                                           aggregate_psms =  FALSE,
                                           filter_few_obs =  FALSE,
                                           remove_annotation =  FALSE,
-                                          calculateAnomalyScores=FALSE, 
+                                          calculateAnomalyScores=FALSE,
                                           anomalyModelFeatures=c(),
+                                          annotation = NULL,
                                           connection =  NULL) {
   reduced_file <- .prefixedPath("reduce_output_", output_file_name)
   reduceBigSpectronaut(input_file, reduced_file,
@@ -153,19 +175,27 @@ bigSpectronauttoMSstatsFormat <-  function(input_file, output_file_name,
                        calculateAnomalyScores, anomalyModelFeatures)
   msstats_data <- MSstatsPreprocessBig(
     input_file = reduced_file,
-    output_file_name = output_file_name, 
-    backend = backend, 
+    output_file_name = output_file_name,
+    backend = backend,
     max_feature_count = max_feature_count,
     filter_unique_peptides = filter_unique_peptides,
-    aggregate_psms = aggregate_psms, 
-    filter_few_obs = filter_few_obs, 
-    remove_annotation = remove_annotation, 
-    calculateAnomalyScores = calculateAnomalyScores, 
-    anomalyModelFeatures = anomalyModelFeatures, 
+    aggregate_psms = aggregate_psms,
+    filter_few_obs = filter_few_obs,
+    remove_annotation = remove_annotation,
+    calculateAnomalyScores = calculateAnomalyScores,
+    anomalyModelFeatures = anomalyModelFeatures,
     connection = connection)
-
+
+  if (!is.null(annotation)) {
+    msstats_data <- MSstatsAddAnnotationBig(msstats_data, annotation)
+    if (backend == "arrow") {
+      unlink(output_file_name, recursive = TRUE, force = TRUE)
+      arrow::write_dataset(msstats_data, output_file_name, format = "csv")
+    }
+  }
+
   return(msstats_data)
-  
+
 }
 
 

diff --git a/man/bigSpectronauttoMSstatsFormat.Rd b/man/bigSpectronauttoMSstatsFormat.Rd
diff --git a/man/dot-prefixedPath.Rd b/man/dot-prefixedPath.Rd
diff --git a/tests/testthat/test-converters.R b/tests/testthat/test-converters.R
@@ -94,73 +94,45 @@ test_that("bigSpectronauttoMSstatsFormat works correctly", {
   unlink(paste0("reduce_output_", output_file), recursive = TRUE, force = TRUE)
 })
 
-# test_that("bigDIANNtoMSstatsFormat works with real MSstatsConvert tinytest data", {
-#   input_file <- "/Users/rudhikshah/NorthEasternContractWork/MSstatsConvert/inst/tinytest/raw_data/DIANN/diann_input.tsv"
-#   annotation_file <- "/Users/rudhikshah/NorthEasternContractWork/MSstatsConvert/inst/tinytest/raw_data/DIANN/annotation.csv"
-
-#   # Skip test if the local files are not found (e.g. on CI/CD)
-#   skip_if_not(file.exists(input_file), "Local DIANN input file not found")
-#   skip_if_not(file.exists(annotation_file), "Local annotation file not found")
-
-#   annot <- read.csv(annotation_file)
-#   output_file <- "real_diann_output.csv"
-
-#   processed <- bigDIANNtoMSstatsFormat(
-#     input_file = input_file,
-#     annotation = annot,
-#     output_file_name = output_file,
-#     backend = "arrow",
-#     MBR = FALSE,
-#     quantificationColumn = "FragmentQuantCorrected",
-#     max_feature_count = 100,
-#     filter_unique_peptides = FALSE,
-#     aggregate_psms = FALSE,
-#     filter_few_obs = FALSE
-#   )
-
-#   result <- dplyr::collect(processed)
-
-#   expect_true(!is.null(result))
-#   expect_true(nrow(result) > 0)
-
-#   # Cleanup — outputs may be directories when backend = "arrow"
-#   unlink(output_file, recursive = TRUE, force = TRUE)
-#   unlink(paste0("reduce_output_", output_file), recursive = TRUE, force = TRUE)
-#   unlink(paste0("topN_", output_file), recursive = TRUE, force = TRUE)
-# })
-
-# test_that("bigDIANNtoMSstatsFormat works with DIANN 2.0 parquet input", {
-#   input_file <- "/Users/rudhikshah/NorthEasternContractWork/MSstatsConvert/inst/tinytest/raw_data/DIANN/diann_2.0.parquet"
-#   annotation_file <- "/Users/rudhikshah/NorthEasternContractWork/MSstatsConvert/inst/tinytest/raw_data/DIANN/annotation_diann_2.0.csv"
-
-#   skip_if_not(file.exists(input_file), "Local DIANN 2.0 parquet file not found")
-#   skip_if_not(file.exists(annotation_file), "Local DIANN 2.0 annotation file not found")
-#   skip_if_not_installed("arrow")
-
-#   annot <- read.csv(annotation_file)
-#   output_file <- "diann_2_0_output.csv"
-
-#   processed <- bigDIANNtoMSstatsFormat(
-#     input_file = input_file,
-#     annotation = annot,
-#     output_file_name = output_file,
-#     backend = "arrow",
-#     MBR = FALSE,
-#     quantificationColumn = "auto",
-#     max_feature_count = 100,
-#     filter_unique_peptides = FALSE,
-#     aggregate_psms = FALSE,
-#     filter_few_obs = FALSE
-#   )
-
-#   result <- dplyr::collect(processed)
-
-#   expect_true(!is.null(result))
-#   expect_true(nrow(result) > 0)
-
-#   # Cleanup — outputs may be directories when backend = "arrow"
-#   unlink(output_file, recursive = TRUE, force = TRUE)
-#   unlink(paste0("reduce_output_", output_file), recursive = TRUE, force = TRUE)
-#   unlink(paste0("topN_", output_file), recursive = TRUE, force = TRUE)
-#   unlink(paste0("cleaned_", output_file), recursive = TRUE, force = TRUE)
-# })
+test_that("bigSpectronauttoMSstatsFormat overrides Condition/BioReplicate from annotation", {
+  input_file <- "dummy_spectro_input.csv"
+  output_file <- "spectro_output_annot.csv"
+  # Clean up even if the call below errors; outputs may be directories.
+  on.exit(unlink(c(output_file, paste0("reduce_output_", output_file)),
+                 recursive = TRUE, force = TRUE), add = TRUE)
+
+  # Stub the reduction step: two runs with sentinel Condition/BioReplicate.
+  stub(bigSpectronauttoMSstatsFormat, "reduceBigSpectronaut", function(input_file, output_path, ...) {
+    msstats_data <- data.frame(
+      ProteinName = "P1", PeptideSequence = "PEPTIDE", PrecursorCharge = 2,
+      FragmentIon = "frag1", ProductCharge = 1,
+      IsotopeLabelType = "L",
+      Condition = "FROM_SPECTRONAUT", BioReplicate = 999,
+      Run = c("run1", "run2"),
+      Intensity = c(1000, 2000)
+    )
+    readr::write_csv(msstats_data, output_path)
+  })
+
+  annotation <- data.frame(
+    Run = c("run1", "run2"),
+    BioReplicate = c(7L, 8L),
+    Condition = c("ctrl", "treat"),
+    stringsAsFactors = FALSE
+  )
+
+  processed <- bigSpectronauttoMSstatsFormat(
+    input_file = input_file,
+    annotation = annotation,
+    output_file_name = output_file,
+    backend = "arrow",
+    max_feature_count = 1
+  )
+  result <- dplyr::collect(processed)
+  result <- result[order(result$Run), ]
+
+  # The user-supplied annotation must replace the sentinel values.
+  expect_equal(result$Condition, c("ctrl", "treat"))
+  expect_equal(result$BioReplicate, c(7L, 8L))
+  expect_false(any(result$Condition == "FROM_SPECTRONAUT"))
+  expect_false(any(result$BioReplicate == 999))
+})