Skip to content

Commit 217e71f

Browse files
Merge pull request #150 from R-Computing-Lab/copilot/feature-simulate-multiple-families
Add `simulatePedigrees()` for simulating multiple families in one call
2 parents 42ae819 + c2fa605 commit 217e71f

7 files changed

Lines changed: 287 additions & 8 deletions

File tree

NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ export(related_coef)
4949
export(repairIDs)
5050
export(repairSex)
5151
export(simulatePedigree)
52+
export(simulatePedigrees)
5253
export(sliceFamilies)
5354
export(summariseFamilies)
5455
export(summariseMatrilines)

NEWS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
* Smarter string ID handling for ped2id
88
* Fixed how different-sized matrices are handled by `com2links()`
99
* Added alignPhenToMatrix function to align phenotypic data to the order of the relatedness matrix
10+
* Added `simulatePedigrees()` function to easily simulate multiple families at once and return them as a single combined data frame
1011

1112
# BGmisc 1.7.0.0
1213
* Fixed bug in parList

R/simulatePedigree.R

Lines changed: 106 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -879,6 +879,7 @@ buildBtwnGenerations_opt <- function(df_Fam,
879879
#' @param ... Additional arguments to be passed to other functions.
880880
#' @inheritParams ped2fam
881881
#' @param spouseID The name of the column that will contain the spouse ID in the output data frame. Default is "spID".
882+
#' @param remap_ids logical. If TRUE, remap all ID columns to sequential integers (1, 2, 3, ...) in row order.
882883
#' @return A \code{data.frame} with each row representing a simulated individual. The columns are as follows:
883884
#' \itemize{
884885
#' \item{fam: The family id of each simulated individual. It is 'fam1' in a single simulated pedigree.}
@@ -914,6 +915,7 @@ simulatePedigree <- function(kpc = 3,
914915
code_male = "M",
915916
code_female = "F",
916917
fam_shift = 1L,
918+
remap_ids = FALSE,
917919
beta = FALSE) {
918920
# SexRatio: ratio of male over female in the offspring setting; used in the between generation combinations
919921
# SexRatio <- sexR / (1 - sexR)
@@ -966,11 +968,24 @@ simulatePedigree <- function(kpc = 3,
966968
df_Fam <- df_Fam[, 1:7]
967969
df_Fam <- df_Fam[!(is.na(df_Fam$pat) & is.na(df_Fam$mat) & is.na(df_Fam$spID)), ]
968970

969-
colnames(df_Fam)[c(2, 4, 5)] <- c(personID, dadID, momID)
971+
names(df_Fam) <- c("fam", personID, "gen", dadID, momID, spouseID, "sex")
970972

971973
# connect the detached members
972974
df_Fam[is.na(df_Fam[[momID]]) & is.na(df_Fam[[dadID]]) & df_Fam$gen > 1, ]
973975

976+
977+
if(remap_ids) {
978+
# Remap all ID columns to sequential integers (1, 2, 3, ...) in row order,
979+
# so the final data frame has tidy consecutive IDs regardless of fam_shift offsets.
980+
old_ids <- rbind( df_Fam[[personID]], df_Fam[[momID]], df_Fam[[dadID]], df_Fam[[spouseID]])
981+
old_ids <- unique(old_ids[!is.na(old_ids)])
982+
id_map <- setNames(seq_along(old_ids), as.character(old_ids))
983+
984+
df_Fam[[personID]] <- as.integer(id_map[as.character( df_Fam[[personID]])])
985+
df_Fam[[momID]] <- as.integer(id_map[as.character( df_Fam[[momID]])])
986+
df_Fam[[dadID]] <- as.integer(id_map[as.character( df_Fam[[dadID]])])
987+
df_Fam[[spouseID]] <- as.integer(id_map[as.character( df_Fam[[spouseID]])])
988+
}
974989
df_Fam
975990
}
976991

@@ -980,3 +995,93 @@ SimPed <- function(...) { # nolint: object_name_linter.
980995
warning("The 'SimPed' function is deprecated. Please use 'simulatePedigree' instead.")
981996
simulatePedigree(...)
982997
}
998+
999+
#' Simulate Multiple Pedigrees
1000+
#'
1001+
#' This function simulates multiple "balanced" pedigrees and returns them
1002+
#' combined into a single data frame. It is a convenience wrapper around
1003+
#' \code{\link{simulatePedigree}} that makes it easy to simulate many families
1004+
#' at once, with unique IDs across all families.
1005+
#'
1006+
#' @param n_fam Integer. Number of families to simulate. Default is 2.
1007+
#' @param remap_ids Logical. If TRUE (default), all ID columns (personID, momID, dadID, spouseID) will be remapped to sequential integers starting at 1 across the combined data frame. This ensures tidy consecutive IDs regardless of fam_shift offsets. If FALSE, IDs will retain their original values from each pedigree simulation, which may include gaps or non-sequential values due to fam_shift.
1008+
#' @inheritParams simulatePedigree
1009+
#' @return A \code{data.frame} containing all simulated individuals from all
1010+
#' families combined, with the same columns as \code{\link{simulatePedigree}}.
1011+
#' The \code{fam} column uniquely identifies each family (e.g., "fam1",
1012+
#' "fam2", ...). Individual IDs are sequential integers starting at 1
1013+
#' (i.e., \code{1:nrow(result)}), and all parent/spouse ID references are
1014+
#' remapped to match.
1015+
#' @export
1016+
#' @examples
1017+
#' set.seed(5)
1018+
#' df_peds <- simulatePedigrees(
1019+
#' n_fam = 3,
1020+
#' kpc = 4,
1021+
#' Ngen = 4,
1022+
#' sexR = .5,
1023+
#' marR = .7
1024+
#' )
1025+
#' summary(df_peds)
1026+
simulatePedigrees <- function(n_fam = 2,
1027+
kpc = 3,
1028+
Ngen = 4,
1029+
sexR = .5,
1030+
marR = 2 / 3,
1031+
rd_kpc = FALSE,
1032+
balancedSex = TRUE,
1033+
balancedMar = TRUE,
1034+
verbose = FALSE,
1035+
personID = "ID",
1036+
momID = "momID",
1037+
dadID = "dadID",
1038+
spouseID = "spouseID",
1039+
code_male = "M",
1040+
code_female = "F",
1041+
remap_ids = TRUE,
1042+
beta = FALSE
1043+
) {
1044+
n_fam <- as.integer(n_fam)
1045+
if (is.na(n_fam) || n_fam < 1L) {
1046+
stop("'n_fam' must be a positive integer.")
1047+
}
1048+
ped_list <- vector("list", n_fam)
1049+
for (i in seq_len(n_fam)) {
1050+
ped_i <- simulatePedigree(
1051+
kpc = kpc,
1052+
Ngen = Ngen,
1053+
sexR = sexR,
1054+
marR = marR,
1055+
rd_kpc = rd_kpc,
1056+
balancedSex = balancedSex,
1057+
balancedMar = balancedMar,
1058+
verbose = verbose,
1059+
personID = personID,
1060+
momID = momID,
1061+
dadID = dadID,
1062+
spouseID = spouseID,
1063+
code_male = code_male,
1064+
code_female = code_female,
1065+
fam_shift = i,
1066+
remap_ids = FALSE, # Keep original IDs for now; we'll remap after combining.
1067+
beta = beta
1068+
)
1069+
ped_i$fam <- paste0("fam", i)
1070+
ped_list[[i]] <- ped_i
1071+
}
1072+
combined <- data.table::rbindlist(ped_list) |> as.data.frame()
1073+
names(combined) <- c("fam", personID, "gen", dadID, momID, spouseID, "sex")
1074+
if(remap_ids) {
1075+
# Remap all ID columns to sequential integers (1, 2, 3, ...) in row order,
1076+
# so the final data frame has tidy consecutive IDs regardless of fam_shift offsets.
1077+
old_ids <- rbind(combined[[personID]], combined[[momID]], combined[[dadID]], combined[[spouseID]])
1078+
old_ids <- unique(old_ids[!is.na(old_ids)])
1079+
id_map <- setNames(seq_along(old_ids), as.character(old_ids))
1080+
1081+
combined[[personID]] <- as.integer(id_map[as.character(combined[[personID]])])
1082+
combined[[momID]] <- as.integer(id_map[as.character(combined[[momID]])])
1083+
combined[[dadID]] <- as.integer(id_map[as.character(combined[[dadID]])])
1084+
combined[[spouseID]] <- as.integer(id_map[as.character(combined[[spouseID]])])
1085+
}
1086+
combined
1087+
}

man/simulatePedigree.Rd

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/simulatePedigrees.Rd

Lines changed: 110 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/test-simulatePedigree.R

Lines changed: 65 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -126,9 +126,8 @@ test_that("simulated pedigree generates expected data structure when sexR is imb
126126
expect_equal(max(results$gen), Ngen, tolerance = strict_tolerance)
127127

128128
# expect there to be parents in each for all generations except the first one
129-
filter_parents <- results %>%
130-
group_by(gen) %>%
131-
summarize(num_parents = sum(!is.na(dadID), na.rm = TRUE) + sum(!is.na(momID), na.rm = TRUE))
129+
filter_parents <- dplyr::group_by(results, gen) %>%
130+
dplyr::summarize(num_parents = sum(!is.na(dadID), na.rm = TRUE) + sum(!is.na(momID), na.rm = TRUE))
132131

133132
expect_true(all(filter_parents$num_parents[filter_parents$gen > 1] > 0), info = paste0("Beta option: ", beta))
134133
expect_true(all(filter_parents$num_parents[filter_parents$gen == 1] == 0), info = paste0("Beta option: ", beta))
@@ -199,9 +198,8 @@ test_that("simulated pedigree generates expected data structure but supply var n
199198
expect_lt(sex_mean_male, sex_mean_female)
200199

201200
# expect there to be parents in each for all generations except the first one
202-
filter_parents <- results %>%
203-
group_by(gen) %>%
204-
summarize(num_parents = sum(!is.na(dadID), na.rm = TRUE) + sum(!is.na(momID), na.rm = TRUE))
201+
filter_parents <- dplyr::group_by(results, gen) %>%
202+
dplyr::summarize(num_parents = sum(!is.na(dadID), na.rm = TRUE) + sum(!is.na(momID), na.rm = TRUE))
205203

206204
expect_true(all(filter_parents$num_parents[filter_parents$gen > 1] > 0), info = paste0("Beta option: ", beta))
207205
expect_true(all(filter_parents$num_parents[filter_parents$gen == 1] == 0), info = paste0("Beta option: ", beta))
@@ -289,3 +287,64 @@ test_that("simulatePedigree accepts string aliases for beta parameter", {
289287
"not yet implemented"
290288
)
291289
})
290+
291+
test_that("simulatePedigrees returns combined data frame for multiple families", {
292+
set.seed(5)
293+
n_fam <- 3
294+
results <- simulatePedigrees(n_fam = n_fam, kpc = 3, Ngen = 4, marR = 0.6)
295+
296+
# Should return a data frame
297+
expect_s3_class(results, "data.frame")
298+
299+
# Should have exactly n_fam unique family IDs
300+
fam_ids <- unique(results$fam)
301+
expect_setequal(fam_ids, paste0("fam", seq_len(n_fam)))
302+
303+
# All person IDs should be unique across families
304+
expect_equal(length(unique(results$ID)), nrow(results))
305+
306+
# Should have standard pedigree columns
307+
expect_true(all(c("fam", "ID", "gen", "dadID", "momID", "spouseID", "sex") %in% colnames(results)))
308+
})
309+
310+
test_that("simulatePedigrees with n_fam = 1 matches simulatePedigree structure", {
311+
set.seed(42)
312+
result_multi <- simulatePedigrees(n_fam = 1, kpc = 3, Ngen = 4, marR = 0.6)
313+
314+
set.seed(42)
315+
result_single <- simulatePedigree(kpc = 3, Ngen = 4, marR = 0.6, fam_shift = 1L)
316+
317+
# Both should have the same number of rows and columns
318+
expect_equal(nrow(result_multi), nrow(result_single))
319+
expect_equal(ncol(result_multi), ncol(result_single))
320+
})
321+
322+
test_that("simulatePedigrees returns sequential IDs starting at 1", {
323+
set.seed(5)
324+
results <- simulatePedigrees(n_fam = 3, kpc = 3, Ngen = 4, marR = 0.6)
325+
326+
# Person IDs should be close to 1:nrow(results) spouse might change this but they should still be sequential and unique
327+
expect_equal(sort(results$ID), seq_len(nrow(results)))
328+
329+
# All parent/spouse references should be within the ID range (or NA)
330+
valid_ids <- seq_len(nrow(results))
331+
expect_true(all(is.na(results$momID) | results$momID %in% valid_ids))
332+
expect_true(all(is.na(results$dadID) | results$dadID %in% valid_ids))
333+
expect_true(all(is.na(results$spouseID) | results$spouseID %in% valid_ids))
334+
})
335+
336+
test_that("simulatePedigrees works with beta = TRUE", {
337+
set.seed(5)
338+
n_fam <- 2
339+
results <- simulatePedigrees(n_fam = n_fam, kpc = 3, Ngen = 4, marR = 0.6, beta = TRUE)
340+
341+
expect_s3_class(results, "data.frame")
342+
expect_equal(length(unique(results$fam)), n_fam)
343+
expect_equal(length(unique(results$ID)), nrow(results))
344+
})
345+
346+
test_that("simulatePedigrees validates n_fam input", {
347+
expect_error(simulatePedigrees(n_fam = 0), "'n_fam' must be a positive integer")
348+
expect_error(simulatePedigrees(n_fam = -1), "'n_fam' must be a positive integer")
349+
expect_error(simulatePedigrees(n_fam = NA), "'n_fam' must be a positive integer")
350+
})

vignettes/v2_pedigree.Rmd

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ df_ped_3$fam <- NULL
9797
df_ped_3$ID <- df_ped_3$ID / 100
9898
df_ped_3$dadID <- df_ped_3$dadID / 100
9999
df_ped_3$momID <- df_ped_3$momID / 100
100-
df_ped_3$spID <- df_ped_3$spID / 100
100+
df_ped_3$spouseID <- df_ped_3$spouseID / 100
101101
df_ped_4$famID <- 2
102102
df_ped_4$fam <- NULL
103103

0 commit comments

Comments
 (0)