parsemsf/R/make_pep_table.R at 6592d37ef0d250e62fff96b34bfe49d33e740889 · benjaminjack/parsemsf · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126

#' Make a data frame of peptides
#'
#' @description Extracts amino acid sequences (without post-translational modifications), assigned protein groups, and quality scores.
#' @param msf_file A file path to a ThermoFisher MSF file.
#' @param min_conf "High", "Medium", or "Low". The minimum peptide confidence level to retrieve from MSF file.
#' @param prot_regex Regular expression where the first group matches a protein name or ID from the protein description. Regex must contain ONE group. The protein description is typically generated from a fasta reference file that was used for the database search.
#' @param collapse If TRUE, peptides that match to multiple protein sequences are collapsed into a single row with multiple protein descriptions and protein IDs in the \code{Proteins} and \code{ProteinID} columns separated by semi-colons (";").
#'
#' @return A data frame of all peptides above the confidence cut-off from a ThermoFisher MSF file.
#'
#' \item{peptide_id}{a unique peptide ID}
#' \item{spectrum_id}{a unique spectrum ID}
#' \item{protein_id}{unique protein group ID to which this peptide maps}
#' \item{protein_desc}{protein description from reference database used to assign peptides to protein groups, parsed according to \code{prot_regex}}
#' \item{sequence}{amino acid sequence (does not show post-translational modifications)}
#' \item{pep_score}{PEP score}
#' \item{q_value}{Q-value score}
#'
#' @export
#'
#' @examples
#' # Read from a path
#'
#' make_pep_table(parsemsf_example("test_db.msf"))
#'
#' # Retrieve full protein description
#'
#' make_pep_table(parsemsf_example("test_db.msf"), prot_regex = "")
#'
#' # ...which is also equivalent to...
#'
#' make_pep_table(parsemsf_example("test_db.msf"), prot_regex = "^(.+)$")
#'
make_pep_table <- function(msf_file,
                           min_conf = "High",
                           prot_regex = "^>([a-zA-Z0-9._]+)\\b",
                           collapse = TRUE) {

  confidence <- switch(min_conf,
                       High = 3,
                       Medium = 2,
                       Low = 1,
                       3)

  if (prot_regex == "") {
    prot_regex = "^(.+)$"
  }

  # Access MSF database file
  # my_db <- dplyr::src_sqlite(msf_file)
  my_db <- DBI::dbConnect(RSQLite::SQLite(), msf_file)
  on.exit(DBI::dbDisconnect(my_db))

  # First check file version (only Proteome Discoverer 1.4.x is supported)
  schema <- tbl(my_db, "SchemaInfo") %>%
    select("SoftwareVersion") %>%
    collect()
  if (!str_detect(as.character(schema$SoftwareVersion[[1]]), "^1\\.4\\..*$")) {
    warning("Only ThermoFisher MSF files generated by Proteome Discoverer 1.4.x are supported. parsemsf functions may produce unexpected results.")
  }

  # This table maps PeptideIDs to ProteinIDs
  PeptidesProteins <- tbl(my_db, "PeptidesProteins") %>%
    select("PeptideID", "ProteinID")

  # Here are the actual peptide sequences with corresponding SpectrumIDs
  Peptides <- tbl(my_db, "Peptides") %>%
    select("PeptideID", "SpectrumID", "ConfidenceLevel", "Sequence") %>%
    filter(.data$ConfidenceLevel >= confidence)

  # Protein descriptions are contained in this table
  ProteinAnnotations <- tbl(my_db, "ProteinAnnotations") %>%
    select("ProteinID", "Description")

  # Build a peptide table
  pep_table <- PeptidesProteins %>%
    inner_join(Peptides, by = c("PeptideID" = "PeptideID")) %>%
    inner_join(ProteinAnnotations, by = "ProteinID") %>%
    collect(n = Inf) %>%
    # Extract protein ID; assumes that protein ID is immediately after ">" and ends with a space
    mutate(Proteins = str_match(.data$Description, prot_regex)[,2]) %>%
    group_by(.data$PeptideID, .data$SpectrumID, .data$Sequence)

  # Collapse peptides that map to multiple proteins into a single row
  if (collapse == TRUE) {
    pep_table <- pep_table %>%
      summarize(
        Proteins = paste0(.data$Proteins, collapse = "; "),
        ProteinID = paste0(.data$ProteinID, collapse = "; ")
      )
  }

  pep_table <- pep_table %>%
    select("PeptideID", "SpectrumID", "Sequence", "Proteins", "ProteinID")
  # Append custom fields
  CustomFields <- tbl(my_db, "CustomDataFields")

  # Grab custom peptide data using SQL because of automatic sqlite typing
  # This assumes that FieldValue will always be a number
  CustomPeptides <- tbl(my_db, sql("SELECT FieldID, PeptideID, CAST(FieldValue as REAL) AS FieldValue FROM CustomDataPeptides"))

  # Spread custom fields as separate columns
  custom_data <- CustomPeptides %>%
    left_join(CustomFields, by = "FieldID") %>%
    select("PeptideID", "DisplayName", "FieldValue") %>%
    collect(n = Inf) %>%
    spread("DisplayName", "FieldValue")

  pep_table <- pep_table %>%
    # Join to peptide table
    left_join(custom_data, by = "PeptideID") %>%
    # Rename columns for a more consistent column name style
    select(
      peptide_id = "PeptideID",
      spectrum_id = "SpectrumID",
      protein_id = "ProteinID",
      protein_desc = "Proteins",
      sequence = "Sequence",
      pep_score = "PEP",
      q_value = "q-Value"
    )

  return(pep_table)

}