diff --git a/NAMESPACE b/NAMESPACE index 6a7305c5..cfdbe433 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -49,6 +49,7 @@ export(related_coef) export(repairIDs) export(repairSex) export(simulatePedigree) +export(simulatePedigrees) export(sliceFamilies) export(summariseFamilies) export(summariseMatrilines) diff --git a/NEWS.md b/NEWS.md index 93e04c5f..f125a57d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -7,6 +7,7 @@ * Smarter string ID handling for ped2id * Fixed how different-sized matrices are handled by `com2links()` * Added alignPhenToMatrix function to align phenotypic data to the order of the relatedness matrix +* Added `simulatePedigrees()` function to easily simulate multiple families at once and return them as a single combined data frame # BGmisc 1.7.0.0 * Fixed bug in parList diff --git a/R/simulatePedigree.R b/R/simulatePedigree.R index f3c170b6..f48cd0d3 100644 --- a/R/simulatePedigree.R +++ b/R/simulatePedigree.R @@ -879,6 +879,7 @@ buildBtwnGenerations_opt <- function(df_Fam, #' @param ... Additional arguments to be passed to other functions. #' @inheritParams ped2fam #' @param spouseID The name of the column that will contain the spouse ID in the output data frame. Default is "spID". +#' @param remap_ids logical. If TRUE, remap all ID columns to sequential integers (1, 2, 3, ...) in row order. #' @return A \code{data.frame} with each row representing a simulated individual. The columns are as follows: #' \itemize{ #' \item{fam: The family id of each simulated individual. It is 'fam1' in a single simulated pedigree.} @@ -914,6 +915,7 @@ simulatePedigree <- function(kpc = 3, code_male = "M", code_female = "F", fam_shift = 1L, + remap_ids = FALSE, beta = FALSE) { # SexRatio: ratio of male over female in the offspring setting; used in the between generation combinations # SexRatio <- sexR / (1 - sexR) @@ -966,11 +968,24 @@ simulatePedigree <- function(kpc = 3, df_Fam <- df_Fam[, 1:7] df_Fam <- df_Fam[!(is.na(df_Fam$pat) & is.na(df_Fam$mat) & is.na(df_Fam$spID)), ] - colnames(df_Fam)[c(2, 4, 5)] <- c(personID, dadID, momID) + names(df_Fam) <- c("fam", personID, "gen", dadID, momID, spouseID, "sex") # connect the detached members df_Fam[is.na(df_Fam[[momID]]) & is.na(df_Fam[[dadID]]) & df_Fam$gen > 1, ] + + if(remap_ids) { + # Remap all ID columns to sequential integers (1, 2, 3, ...) in row order, + # so the final data frame has tidy consecutive IDs regardless of fam_shift offsets. + old_ids <- rbind( df_Fam[[personID]], df_Fam[[momID]], df_Fam[[dadID]], df_Fam[[spouseID]]) + old_ids <- unique(old_ids[!is.na(old_ids)]) + id_map <- setNames(seq_along(old_ids), as.character(old_ids)) + + df_Fam[[personID]] <- as.integer(id_map[as.character( df_Fam[[personID]])]) + df_Fam[[momID]] <- as.integer(id_map[as.character( df_Fam[[momID]])]) + df_Fam[[dadID]] <- as.integer(id_map[as.character( df_Fam[[dadID]])]) + df_Fam[[spouseID]] <- as.integer(id_map[as.character( df_Fam[[spouseID]])]) + } df_Fam } @@ -980,3 +995,93 @@ SimPed <- function(...) { # nolint: object_name_linter. warning("The 'SimPed' function is deprecated. Please use 'simulatePedigree' instead.") simulatePedigree(...) } + +#' Simulate Multiple Pedigrees +#' +#' This function simulates multiple "balanced" pedigrees and returns them +#' combined into a single data frame. It is a convenience wrapper around +#' \code{\link{simulatePedigree}} that makes it easy to simulate many families +#' at once, with unique IDs across all families. +#' +#' @param n_fam Integer. Number of families to simulate. Default is 2. +#' @param remap_ids Logical. If TRUE (default), all ID columns (personID, momID, dadID, spouseID) will be remapped to sequential integers starting at 1 across the combined data frame. This ensures tidy consecutive IDs regardless of fam_shift offsets. If FALSE, IDs will retain their original values from each pedigree simulation, which may include gaps or non-sequential values due to fam_shift. +#' @inheritParams simulatePedigree +#' @return A \code{data.frame} containing all simulated individuals from all +#' families combined, with the same columns as \code{\link{simulatePedigree}}. +#' The \code{fam} column uniquely identifies each family (e.g., "fam1", +#' "fam2", ...). Individual IDs are sequential integers starting at 1 +#' (i.e., \code{1:nrow(result)}), and all parent/spouse ID references are +#' remapped to match. +#' @export +#' @examples +#' set.seed(5) +#' df_peds <- simulatePedigrees( +#' n_fam = 3, +#' kpc = 4, +#' Ngen = 4, +#' sexR = .5, +#' marR = .7 +#' ) +#' summary(df_peds) +simulatePedigrees <- function(n_fam = 2, + kpc = 3, + Ngen = 4, + sexR = .5, + marR = 2 / 3, + rd_kpc = FALSE, + balancedSex = TRUE, + balancedMar = TRUE, + verbose = FALSE, + personID = "ID", + momID = "momID", + dadID = "dadID", + spouseID = "spouseID", + code_male = "M", + code_female = "F", + remap_ids = TRUE, + beta = FALSE + ) { + n_fam <- as.integer(n_fam) + if (is.na(n_fam) || n_fam < 1L) { + stop("'n_fam' must be a positive integer.") + } + ped_list <- vector("list", n_fam) + for (i in seq_len(n_fam)) { + ped_i <- simulatePedigree( + kpc = kpc, + Ngen = Ngen, + sexR = sexR, + marR = marR, + rd_kpc = rd_kpc, + balancedSex = balancedSex, + balancedMar = balancedMar, + verbose = verbose, + personID = personID, + momID = momID, + dadID = dadID, + spouseID = spouseID, + code_male = code_male, + code_female = code_female, + fam_shift = i, + remap_ids = FALSE, # Keep original IDs for now; we'll remap after combining. + beta = beta + ) + ped_i$fam <- paste0("fam", i) + ped_list[[i]] <- ped_i + } + combined <- data.table::rbindlist(ped_list) |> as.data.frame() + names(combined) <- c("fam", personID, "gen", dadID, momID, spouseID, "sex") +if(remap_ids) { + # Remap all ID columns to sequential integers (1, 2, 3, ...) in row order, + # so the final data frame has tidy consecutive IDs regardless of fam_shift offsets. + old_ids <- rbind(combined[[personID]], combined[[momID]], combined[[dadID]], combined[[spouseID]]) + old_ids <- unique(old_ids[!is.na(old_ids)]) + id_map <- setNames(seq_along(old_ids), as.character(old_ids)) + + combined[[personID]] <- as.integer(id_map[as.character(combined[[personID]])]) + combined[[momID]] <- as.integer(id_map[as.character(combined[[momID]])]) + combined[[dadID]] <- as.integer(id_map[as.character(combined[[dadID]])]) + combined[[spouseID]] <- as.integer(id_map[as.character(combined[[spouseID]])]) +} + combined +} diff --git a/man/simulatePedigree.Rd b/man/simulatePedigree.Rd index d56ab72c..df600970 100644 --- a/man/simulatePedigree.Rd +++ b/man/simulatePedigree.Rd @@ -26,6 +26,7 @@ simulatePedigree( code_male = "M", code_female = "F", fam_shift = 1L, + remap_ids = FALSE, beta = FALSE ) @@ -76,6 +77,8 @@ current version.} \item{fam_shift}{An integer to shift the person ID. Default is 1L. This is useful when simulating multiple pedigrees to avoid ID conflicts.} +\item{remap_ids}{logical. If TRUE, remap all ID columns to sequential integers (1, 2, 3, ...) in row order.} + \item{beta}{logical or character. Controls which algorithm version to use: \itemize{ \item{\code{FALSE}, \code{"base"}, or \code{"original"} (default): Use the original algorithm. diff --git a/man/simulatePedigrees.Rd b/man/simulatePedigrees.Rd new file mode 100644 index 00000000..0ae193d5 --- /dev/null +++ b/man/simulatePedigrees.Rd @@ -0,0 +1,110 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/simulatePedigree.R +\name{simulatePedigrees} +\alias{simulatePedigrees} +\title{Simulate Multiple Pedigrees} +\usage{ +simulatePedigrees( + n_fam = 2, + kpc = 3, + Ngen = 4, + sexR = 0.5, + marR = 2/3, + rd_kpc = FALSE, + balancedSex = TRUE, + balancedMar = TRUE, + verbose = FALSE, + personID = "ID", + momID = "momID", + dadID = "dadID", + spouseID = "spouseID", + code_male = "M", + code_female = "F", + remap_ids = TRUE, + beta = FALSE +) +} +\arguments{ +\item{n_fam}{Integer. Number of families to simulate. Default is 2.} + +\item{kpc}{Number of kids per couple. An integer >= 2 that determines how +many kids each fertilized mated couple will have in the pedigree. Default +value is 3. Returns an error when kpc equals 1.} + +\item{Ngen}{Number of generations. An integer >= 2 that determines how many +generations the simulated pedigree will have. The first generation is always +a fertilized couple. The last generation has no mated individuals.} + +\item{sexR}{Sex ratio of offspring. A numeric value ranging from 0 to 1 that +determines the proportion of males in all offspring in this pedigree. For +instance, 0.4 means 40 percent of the offspring will be male.} + +\item{marR}{Mating rate. A numeric value ranging from 0 to 1 which determines +the proportion of mated (fertilized) couples in the pedigree within each +generation. For instance, marR = 0.5 suggests 50 percent of the offspring in + a specific generation will be mated and have their offspring.} + +\item{rd_kpc}{logical. If TRUE, the number of kids per mate will be randomly +generated from a poisson distribution with mean kpc. If FALSE, the number of +kids per mate will be fixed at kpc.} + +\item{balancedSex}{Not fully developed yet. Always \code{TRUE} in the +current version.} + +\item{balancedMar}{Not fully developed yet. Always \code{TRUE} in the +current version.} + +\item{verbose}{logical If TRUE, message progress through stages of algorithm} + +\item{personID}{character. Name of the column in ped for the person ID variable} + +\item{momID}{character. Name of the column in ped for the mother ID variable} + +\item{dadID}{character. Name of the column in ped for the father ID variable} + +\item{spouseID}{The name of the column that will contain the spouse ID in the output data frame. Default is "spID".} + +\item{code_male}{The value to use for males. Default is "M"} + +\item{code_female}{The value to use for females. Default is "F"} + +\item{remap_ids}{Logical. If TRUE (default), all ID columns (personID, momID, dadID, spouseID) will be remapped to sequential integers starting at 1 across the combined data frame. This ensures tidy consecutive IDs regardless of fam_shift offsets. If FALSE, IDs will retain their original values from each pedigree simulation, which may include gaps or non-sequential values due to fam_shift.} + +\item{beta}{logical or character. Controls which algorithm version to use: +\itemize{ + \item{\code{FALSE}, \code{"base"}, or \code{"original"} (default): Use the original algorithm. + Slower but ensures exact reproducibility with set.seed().} + \item{\code{TRUE} or \code{"optimized"}: Use the optimized algorithm with 4-5x speedup. + Produces statistically equivalent results but not identical to base version + due to different random number consumption. Recommended for large simulations + where speed matters more than exact reproducibility.} +} +Note: Both versions are mathematically correct and produce valid pedigrees with the +same statistical properties (sex ratios, mating rates, etc.). The optimized version +uses vectorized operations instead of loops, making it much faster for large pedigrees.} +} +\value{ +A \code{data.frame} containing all simulated individuals from all + families combined, with the same columns as \code{\link{simulatePedigree}}. + The \code{fam} column uniquely identifies each family (e.g., "fam1", + "fam2", ...). Individual IDs are sequential integers starting at 1 + (i.e., \code{1:nrow(result)}), and all parent/spouse ID references are + remapped to match. +} +\description{ +This function simulates multiple "balanced" pedigrees and returns them +combined into a single data frame. It is a convenience wrapper around +\code{\link{simulatePedigree}} that makes it easy to simulate many families +at once, with unique IDs across all families. +} +\examples{ +set.seed(5) +df_peds <- simulatePedigrees( + n_fam = 3, + kpc = 4, + Ngen = 4, + sexR = .5, + marR = .7 +) +summary(df_peds) +} diff --git a/tests/testthat/test-makeLinks.R b/tests/testthat/test-makeLinks.R index f2340485..59e0847a 100644 --- a/tests/testthat/test-makeLinks.R +++ b/tests/testthat/test-makeLinks.R @@ -355,6 +355,15 @@ test_that("com2links handles mismatched matrix dimensions by subsetting to small # Only IDs from the smaller matrix should appear all_output_ids <- unique(c(result_mismatch$ID1, result_mismatch$ID2)) expect_true(all(all_output_ids %in% as.numeric(dimnames(ad_small)[[1]]))) + expect_true(all(all_output_ids %in% as.numeric(dimnames(mit_ped_matrix)[[1]]))) + expect_true(all(all_output_ids %in% as.numeric(dimnames(cn_ped_matrix)[[1]]))) + + # Check that the number of unique IDs in the output matches the number of IDs in the smallest matrix + expect_equal(length(all_output_ids), length(subset_ids)) + + # check that full matrix has more unique IDs than the smaller matrix + expect_true(length(unique(c(dimnames(mit_ped_matrix)[[1]], + dimnames(cn_ped_matrix)[[1]]))) > length(subset_ids)) }) test_that("com2links mismatched dimensions with two matrices", { diff --git a/tests/testthat/test-simulatePedigree.R b/tests/testthat/test-simulatePedigree.R index 7c88145c..0ae04a5e 100644 --- a/tests/testthat/test-simulatePedigree.R +++ b/tests/testthat/test-simulatePedigree.R @@ -126,9 +126,8 @@ test_that("simulated pedigree generates expected data structure when sexR is imb expect_equal(max(results$gen), Ngen, tolerance = strict_tolerance) # expect there to be parents in each for all generations except the first one - filter_parents <- results %>% - group_by(gen) %>% - summarize(num_parents = sum(!is.na(dadID), na.rm = TRUE) + sum(!is.na(momID), na.rm = TRUE)) + filter_parents <- dplyr::group_by(results, gen) %>% + dplyr::summarize(num_parents = sum(!is.na(dadID), na.rm = TRUE) + sum(!is.na(momID), na.rm = TRUE)) expect_true(all(filter_parents$num_parents[filter_parents$gen > 1] > 0), info = paste0("Beta option: ", beta)) expect_true(all(filter_parents$num_parents[filter_parents$gen == 1] == 0), info = paste0("Beta option: ", beta)) @@ -199,9 +198,8 @@ test_that("simulated pedigree generates expected data structure but supply var n expect_lt(sex_mean_male, sex_mean_female) # expect there to be parents in each for all generations except the first one - filter_parents <- results %>% - group_by(gen) %>% - summarize(num_parents = sum(!is.na(dadID), na.rm = TRUE) + sum(!is.na(momID), na.rm = TRUE)) + filter_parents <- dplyr::group_by(results, gen) %>% + dplyr::summarize(num_parents = sum(!is.na(dadID), na.rm = TRUE) + sum(!is.na(momID), na.rm = TRUE)) expect_true(all(filter_parents$num_parents[filter_parents$gen > 1] > 0), info = paste0("Beta option: ", beta)) expect_true(all(filter_parents$num_parents[filter_parents$gen == 1] == 0), info = paste0("Beta option: ", beta)) @@ -289,3 +287,64 @@ test_that("simulatePedigree accepts string aliases for beta parameter", { "not yet implemented" ) }) + +test_that("simulatePedigrees returns combined data frame for multiple families", { + set.seed(5) + n_fam <- 3 + results <- simulatePedigrees(n_fam = n_fam, kpc = 3, Ngen = 4, marR = 0.6) + + # Should return a data frame + expect_s3_class(results, "data.frame") + + # Should have exactly n_fam unique family IDs + fam_ids <- unique(results$fam) + expect_setequal(fam_ids, paste0("fam", seq_len(n_fam))) + + # All person IDs should be unique across families + expect_equal(length(unique(results$ID)), nrow(results)) + + # Should have standard pedigree columns + expect_true(all(c("fam", "ID", "gen", "dadID", "momID", "spouseID", "sex") %in% colnames(results))) +}) + +test_that("simulatePedigrees with n_fam = 1 matches simulatePedigree structure", { + set.seed(42) + result_multi <- simulatePedigrees(n_fam = 1, kpc = 3, Ngen = 4, marR = 0.6) + + set.seed(42) + result_single <- simulatePedigree(kpc = 3, Ngen = 4, marR = 0.6, fam_shift = 1L) + + # Both should have the same number of rows and columns + expect_equal(nrow(result_multi), nrow(result_single)) + expect_equal(ncol(result_multi), ncol(result_single)) +}) + +test_that("simulatePedigrees returns sequential IDs starting at 1", { + set.seed(5) + results <- simulatePedigrees(n_fam = 3, kpc = 3, Ngen = 4, marR = 0.6) + + # Person IDs should be close to 1:nrow(results) spouse might change this but they should still be sequential and unique + expect_equal(sort(results$ID), seq_len(nrow(results))) + + # All parent/spouse references should be within the ID range (or NA) + valid_ids <- seq_len(nrow(results)) + expect_true(all(is.na(results$momID) | results$momID %in% valid_ids)) + expect_true(all(is.na(results$dadID) | results$dadID %in% valid_ids)) + expect_true(all(is.na(results$spouseID) | results$spouseID %in% valid_ids)) +}) + +test_that("simulatePedigrees works with beta = TRUE", { + set.seed(5) + n_fam <- 2 + results <- simulatePedigrees(n_fam = n_fam, kpc = 3, Ngen = 4, marR = 0.6, beta = TRUE) + + expect_s3_class(results, "data.frame") + expect_equal(length(unique(results$fam)), n_fam) + expect_equal(length(unique(results$ID)), nrow(results)) +}) + +test_that("simulatePedigrees validates n_fam input", { + expect_error(simulatePedigrees(n_fam = 0), "'n_fam' must be a positive integer") + expect_error(simulatePedigrees(n_fam = -1), "'n_fam' must be a positive integer") + expect_error(simulatePedigrees(n_fam = NA), "'n_fam' must be a positive integer") +}) diff --git a/vignettes/v2_pedigree.Rmd b/vignettes/v2_pedigree.Rmd index b68b21f2..e0d0e1ae 100644 --- a/vignettes/v2_pedigree.Rmd +++ b/vignettes/v2_pedigree.Rmd @@ -97,7 +97,7 @@ df_ped_3$fam <- NULL df_ped_3$ID <- df_ped_3$ID / 100 df_ped_3$dadID <- df_ped_3$dadID / 100 df_ped_3$momID <- df_ped_3$momID / 100 -df_ped_3$spID <- df_ped_3$spID / 100 +df_ped_3$spouseID <- df_ped_3$spouseID / 100 df_ped_4$famID <- 2 df_ped_4$fam <- NULL