-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmake_pep_table.R
More file actions
126 lines (110 loc) · 4.97 KB
/
make_pep_table.R
File metadata and controls
126 lines (110 loc) · 4.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#' Make a data frame of peptides
#'
#' @description Extracts amino acid sequences (without post-translational modifications), assigned protein groups, and quality scores.
#' @param msf_file A file path to a ThermoFisher MSF file.
#' @param min_conf "High", "Medium", or "Low". The minimum peptide confidence level to retrieve from MSF file.
#' @param prot_regex Regular expression where the first group matches a protein name or ID from the protein description. Regex must contain ONE group. The protein description is typically generated from a fasta reference file that was used for the database search.
#' @param collapse If TRUE, peptides that match to multiple protein sequences are collapsed into a single row with multiple protein descriptions and protein IDs in the \code{Proteins} and \code{ProteinID} columns separated by semi-colons (";").
#'
#' @return A data frame of all peptides above the confidence cut-off from a ThermoFisher MSF file.
#'
#' \item{peptide_id}{a unique peptide ID}
#' \item{spectrum_id}{a unique spectrum ID}
#' \item{protein_id}{unique protein group ID to which this peptide maps}
#' \item{protein_desc}{protein description from reference database used to assign peptides to protein groups, parsed according to \code{prot_regex}}
#' \item{sequence}{amino acid sequence (does not show post-translational modifications)}
#' \item{pep_score}{PEP score}
#' \item{q_value}{Q-value score}
#'
#' @export
#'
#' @examples
#' # Read from a path
#'
#' make_pep_table(parsemsf_example("test_db.msf"))
#'
#' # Retrieve full protein description
#'
#' make_pep_table(parsemsf_example("test_db.msf"), prot_regex = "")
#'
#' # ...which is also equivalent to...
#'
#' make_pep_table(parsemsf_example("test_db.msf"), prot_regex = "^(.+)$")
#'
make_pep_table <- function(msf_file,
min_conf = "High",
prot_regex = "^>([a-zA-Z0-9._]+)\\b",
collapse = TRUE) {
confidence <- switch(min_conf,
High = 3,
Medium = 2,
Low = 1,
3)
if (prot_regex == "") {
prot_regex = "^(.+)$"
}
# Access MSF database file
# my_db <- dplyr::src_sqlite(msf_file)
my_db <- DBI::dbConnect(RSQLite::SQLite(), msf_file)
on.exit(DBI::dbDisconnect(my_db))
# First check file version (only Proteome Discoverer 1.4.x is supported)
schema <- tbl(my_db, "SchemaInfo") %>%
select("SoftwareVersion") %>%
collect()
if (!str_detect(as.character(schema$SoftwareVersion[[1]]), "^1\\.4\\..*$")) {
warning("Only ThermoFisher MSF files generated by Proteome Discoverer 1.4.x are supported. parsemsf functions may produce unexpected results.")
}
# This table maps PeptideIDs to ProteinIDs
PeptidesProteins <- tbl(my_db, "PeptidesProteins") %>%
select("PeptideID", "ProteinID")
# Here are the actual peptide sequences with corresponding SpectrumIDs
Peptides <- tbl(my_db, "Peptides") %>%
select("PeptideID", "SpectrumID", "ConfidenceLevel", "Sequence") %>%
filter(.data$ConfidenceLevel >= confidence)
# Protein descriptions are contained in this table
ProteinAnnotations <- tbl(my_db, "ProteinAnnotations") %>%
select("ProteinID", "Description")
# Build a peptide table
pep_table <- PeptidesProteins %>%
inner_join(Peptides, by = c("PeptideID" = "PeptideID")) %>%
inner_join(ProteinAnnotations, by = "ProteinID") %>%
collect(n = Inf) %>%
# Extract protein ID; assumes that protein ID is immediately after ">" and ends with a space
mutate(Proteins = str_match(.data$Description, prot_regex)[,2]) %>%
group_by(.data$PeptideID, .data$SpectrumID, .data$Sequence)
# Collapse peptides that map to multiple proteins into a single row
if (collapse == TRUE) {
pep_table <- pep_table %>%
summarize(
Proteins = paste0(.data$Proteins, collapse = "; "),
ProteinID = paste0(.data$ProteinID, collapse = "; ")
)
}
pep_table <- pep_table %>%
select("PeptideID", "SpectrumID", "Sequence", "Proteins", "ProteinID")
# Append custom fields
CustomFields <- tbl(my_db, "CustomDataFields")
# Grab custom peptide data using SQL because of automatic sqlite typing
# This assumes that FieldValue will always be a number
CustomPeptides <- tbl(my_db, sql("SELECT FieldID, PeptideID, CAST(FieldValue as REAL) AS FieldValue FROM CustomDataPeptides"))
# Spread custom fields as separate columns
custom_data <- CustomPeptides %>%
left_join(CustomFields, by = "FieldID") %>%
select("PeptideID", "DisplayName", "FieldValue") %>%
collect(n = Inf) %>%
spread("DisplayName", "FieldValue")
pep_table <- pep_table %>%
# Join to peptide table
left_join(custom_data, by = "PeptideID") %>%
# Rename columns for a more consistent column name style
select(
peptide_id = "PeptideID",
spectrum_id = "SpectrumID",
protein_id = "ProteinID",
protein_desc = "Proteins",
sequence = "Sequence",
pep_score = "PEP",
q_value = "q-Value"
)
return(pep_table)
}