-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathsegmentPedigree.R
More file actions
269 lines (251 loc) · 9.4 KB
/
segmentPedigree.R
File metadata and controls
269 lines (251 loc) · 9.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
#' Segment Pedigree into Extended Families
#'
#' This function adds an extended family ID variable to a pedigree by segmenting
#' that dataset into independent extended families
#' using the weakly connected components algorithm.
#'
#'
#' @param ped a pedigree dataset. Needs ID, momID, and dadID columns
#' @param personID character. Name of the column in ped for the person ID variable
#' @param momID character. Name of the column in ped for the mother ID variable
#' @param dadID character. Name of the column in ped for the father ID variable
#' @param famID character. Name of the column to be created in ped for the family ID variable
#' @param twinID character. Name of the column in ped for the twin ID variable, if applicable
#' @param overwrite logical. If TRUE, will overwrite existing famID variable if it exists. Default is TRUE.
#' @param ... additional arguments to be passed to \code{\link{ped2com}}
#' @details
#' The general idea of this function is to use person ID, mother ID, and father ID to
#' create an extended family ID such that everyone with the same family ID is in the
#' same (perhaps very extended) pedigree. That is, a pair of people with the same family ID
#' have at least one traceable relation of any length to one another.
#'
#' This function works by turning the pedigree into a mathematical graph using the igraph
#' package. Once in graph form, the function uses weakly connected components to search
#' for all possible relationship paths that could connect anyone in the data to anyone
#' else in the data.
#'
#' @returns
#' A pedigree dataset with one additional column for the newly created extended family ID
#'
#' @export
#'
ped2fam <- function(ped, personID = "ID",
momID = "momID", dadID = "dadID", famID = "famID",
twinID = "twinID",
overwrite = TRUE,
...) {
# Call to wrapper function
.ped2id(
ped = ped, personID = personID, momID = momID, dadID = dadID, famID = famID, twinID = twinID,
type = "parents",
overwrite = overwrite
)
}
.ped2id <- function(ped,
personID = "ID", momID = "momID", dadID = "dadID",
famID = "famID", twinID = "twinID",
type, overwrite = TRUE,
...) {
# Turn pedigree into family
pg <- ped2graph(
ped = ped, personID = personID, momID = momID, dadID = dadID, twinID = twinID,
adjacent = type
)
# Find weakly connected components of graph
wcc <- igraph::components(pg)
# Create famID data.frame
# Convert IDs to numeric, with warning if coercion collapses IDs
uniques <- suppressWarnings(unique(as.numeric(names(wcc$membership))))
keep_string <- FALSE
if (length(uniques) == 1L && is.na(uniques)) {
warning("After converting IDs to numeric, all IDs became NA. This indicates ID coercion collapsed IDs. Please ensure IDs aren't character or factor variables.")
keep_string <- TRUE
} else if (length(uniques) < length(wcc$membership)) {
warning("After converting IDs to numeric, some IDs became NA. This indicates ID coercion collapsed some IDs. Please ensure IDs aren't character or factor variables.")
keep_string <- TRUE
}
if(keep_string==TRUE) {
fam <- data.frame(
V1 = names(wcc$membership),
V2 = wcc$membership
)
} else {
fam <- data.frame(
V1 = as.numeric(names(wcc$membership)),
V2 = wcc$membership
)
}
names(fam) <- c(personID, famID)
if(famID %in% names(ped)) {
if(overwrite) {
overwrite_message <- "be overwritten."
ped[[famID]] <- NULL
} else {
overwrite_message <- "not be overwritten."
fam[[famID]] <- NULL
}
warning(sprintf("The famID variable '%s' already exists in the pedigree. The existing variable will %s", famID, overwrite_message))
}
ped2 <- merge(fam, ped,
by = personID, all.x = FALSE, all.y = TRUE
)
return(ped2)
}
#' Turn a pedigree into a graph
#' @param ped a pedigree dataset. Needs ID, momID, and dadID columns
#' @inheritParams ped2fam
#' @param directed Logical scalar. Default is TRUE. Indicates whether or not to create a directed graph.
#' @param adjacent Character. Relationship that defines adjacency in the graph: parents, mothers, or fathers
#' @details
#' The general idea of this function is to represent a pedigree as a graph using the igraph package.
#'
#' Once in graph form, several common pedigree tasks become much simpler.
#'
#' The \code{adjacent} argument allows for different kinds of graph structures.
#' When using \code{parents} for adjacency, the graph shows all parent-child relationships.
#' When using \code{mother} for adjacency, the graph only shows mother-child relationships.
#' Similarly when using \code{father} for adjacency, only father-child relationships appear in the graph.
#' Construct extended families from the parent graph, maternal lines from the mothers graph,
#' and paternal lines from the fathers graph.
#'
#' @returns
#' A graph
#'
#' @export
#'
ped2graph <- function(ped,
personID = "ID",
momID = "momID",
dadID = "dadID",
twinID = "twinID",
directed = TRUE,
adjacent = c("parents", "mothers", "fathers"),
...) {
# Check ped/data.fram
if (!inherits(ped, "data.frame")) {
stop("ped should be a data.frame or inherit to a data.frame")
}
# Handle adjacent argument
adjacent <- match.arg(tolower(adjacent)[1],
choices = c(
"parents",
"mothers",
"fathers"
)
)
# Check the needed IDs are in the data
if (adjacent == "parents") {
needIds <- c(personID, momID, dadID)
} else if (adjacent == "mothers") {
needIds <- c(personID, momID)
} else if (adjacent == "fathers") {
needIds <- c(personID, dadID)
}
if (!all(c(needIds) %in% names(ped))) {
msg <- paste0(
"The following ID variables are needed but were not found:\n",
paste(needIds[!(c(needIds) %in% names(ped))], collapse = ", "),
"\n",
"Make sure you have the variable names correct."
)
stop(msg)
}
# Create nodes and edges
if (adjacent == "parents") {
nodes <- unique(
stats::na.omit(
as.character(c(ped[[personID]], ped[[momID]], ped[[dadID]]))
)
)
edges <- rbind(
as.matrix(data.frame(
# need to be parent to child for igraph
momID = as.character(ped[[momID]]),
personID = as.character(ped[[personID]])
)),
as.matrix(data.frame(
dadID = as.character(ped[[dadID]]),
personID = as.character(ped[[personID]])
))
)
} else if (adjacent == "mothers") {
nodes <- unique(
stats::na.omit(
as.character(c(ped[[personID]], ped[[momID]]))
)
)
edges <- as.matrix(data.frame(
momID = as.character(ped[[momID]]),
personID = as.character(ped[[personID]])
))
} else if (adjacent == "fathers") {
nodes <- unique(
stats::na.omit(
as.character(c(ped[[personID]], ped[[dadID]]))
)
)
edges <- as.matrix(data.frame(
dadID = as.character(ped[[dadID]]),
personID = as.character(ped[[personID]])
))
}
edges <- edges[stats::complete.cases(edges), ]
# Make graph
pg <- igraph::graph_from_data_frame(
d = edges,
directed = directed, # directed = TRUE looks better
vertices = nodes
)
return(pg)
}
#' Add a maternal line ID variable to a pedigree
#' @inheritParams ped2fam
#' @param matID Character. Maternal line ID variable to be created and added to the pedigree
#' @details
#' Under various scenarios it is useful to know which people in a pedigree
#' belong to the same maternal lines. This function first turns a pedigree
#' into a graph where adjacency is defined by mother-child relationships.
#' Subsequently, the weakly connected components algorithm finds all the
#' separate maternal lines and gives them an ID variable.
#' @seealso [ped2fam()] for creating extended family IDs, and [ped2paternal()]
#' for creating paternal line IDs
#' @export
#'
ped2maternal <- function(ped, personID = "ID",
momID = "momID", dadID = "dadID",
matID = "matID",
twinID = "twinID",
...) {
# Call to wrapper function
.ped2id(
ped = ped, personID = personID, momID = momID,
dadID = dadID, famID = matID, twinID = twinID,
type = "mothers"
)
}
#' Add a paternal line ID variable to a pedigree
#' @inheritParams ped2fam
#' @param patID Character. Paternal line ID variable to be created and added to the pedigree
#' @details
#' Under various scenarios it is useful to know which people in a pedigree
#' belong to the same paternal lines. This function first turns a pedigree
#' into a graph where adjacency is defined by father-child relationships.
#' Subsequently, the weakly connected components algorithm finds all the
#' separate paternal lines and gives them an ID variable.
#' @seealso [ped2fam()] for creating extended family IDs, and [ped2maternal()]
#' for creating maternal line IDs
#' @export
#'
ped2paternal <- function(ped, personID = "ID",
momID = "momID", dadID = "dadID",
patID = "patID",
twinID = "twinID",
...) {
# Call to wrapper function
.ped2id(
ped = ped, personID = personID, momID = momID,
dadID = dadID, famID = patID,
twinID = twinID,
type = "fathers"
)
}