From 8bf36d6c2ed260dd4be075342c3453d3397ca63c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 13 Apr 2026 20:35:59 +0000 Subject: [PATCH 1/9] Initial plan From b6de8f3c367feb6d80fac5b615341b947abaaf69 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 13 Apr 2026 20:39:43 +0000 Subject: [PATCH 2/9] Add simulatePedigrees() function to simulate multiple families at once Agent-Logs-Url: https://github.com/R-Computing-Lab/BGmisc/sessions/9367c84a-deb8-4f13-a855-367a204cc843 Co-authored-by: smasongarrison <6001608+smasongarrison@users.noreply.github.com> --- NAMESPACE | 1 + NEWS.md | 1 + R/simulatePedigree.R | 66 ++++++++++++++++ man/simulatePedigrees.Rd | 105 +++++++++++++++++++++++++ tests/testthat/test-simulatePedigree.R | 45 +++++++++++ 5 files changed, 218 insertions(+) create mode 100644 man/simulatePedigrees.Rd diff --git a/NAMESPACE b/NAMESPACE index 6a7305c5..cfdbe433 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -49,6 +49,7 @@ export(related_coef) export(repairIDs) export(repairSex) export(simulatePedigree) +export(simulatePedigrees) export(sliceFamilies) export(summariseFamilies) export(summariseMatrilines) diff --git a/NEWS.md b/NEWS.md index 93e04c5f..f125a57d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -7,6 +7,7 @@ * Smarter string ID handling for ped2id * Fixed how different-sized matrices are handled by `com2links()` * Added alignPhenToMatrix function to align phenotypic data to the order of the relatedness matrix +* Added `simulatePedigrees()` function to easily simulate multiple families at once and return them as a single combined data frame # BGmisc 1.7.0.0 * Fixed bug in parList diff --git a/R/simulatePedigree.R b/R/simulatePedigree.R index f3c170b6..33965c15 100644 --- a/R/simulatePedigree.R +++ b/R/simulatePedigree.R @@ -980,3 +980,69 @@ SimPed <- function(...) { # nolint: object_name_linter. warning("The 'SimPed' function is deprecated. Please use 'simulatePedigree' instead.") simulatePedigree(...) } + +#' Simulate Multiple Pedigrees +#' +#' This function simulates multiple "balanced" pedigrees and returns them +#' combined into a single data frame. It is a convenience wrapper around +#' \code{\link{simulatePedigree}} that makes it easy to simulate many families +#' at once, with unique IDs across all families. +#' +#' @param n_fam Integer. Number of families to simulate. Default is 2. +#' @inheritParams simulatePedigree +#' @return A \code{data.frame} containing all simulated individuals from all +#' families combined, with the same columns as \code{\link{simulatePedigree}}. +#' The \code{fam} column uniquely identifies each family (e.g., "fam1", +#' "fam2", ...). Individual IDs are guaranteed to be unique across families. +#' @export +#' @examples +#' set.seed(5) +#' df_peds <- simulatePedigrees( +#' n_fam = 3, +#' kpc = 4, +#' Ngen = 4, +#' sexR = .5, +#' marR = .7 +#' ) +#' summary(df_peds) +simulatePedigrees <- function(n_fam = 2, + kpc = 3, + Ngen = 4, + sexR = .5, + marR = 2 / 3, + rd_kpc = FALSE, + balancedSex = TRUE, + balancedMar = TRUE, + verbose = FALSE, + personID = "ID", + momID = "momID", + dadID = "dadID", + spouseID = "spouseID", + code_male = "M", + code_female = "F", + beta = FALSE) { + ped_list <- vector("list", n_fam) + for (i in seq_len(n_fam)) { + ped_i <- simulatePedigree( + kpc = kpc, + Ngen = Ngen, + sexR = sexR, + marR = marR, + rd_kpc = rd_kpc, + balancedSex = balancedSex, + balancedMar = balancedMar, + verbose = verbose, + personID = personID, + momID = momID, + dadID = dadID, + spouseID = spouseID, + code_male = code_male, + code_female = code_female, + fam_shift = i, + beta = beta + ) + ped_i$fam <- paste0("fam", i) + ped_list[[i]] <- ped_i + } + do.call(rbind, ped_list) +} diff --git a/man/simulatePedigrees.Rd b/man/simulatePedigrees.Rd new file mode 100644 index 00000000..2f06ac50 --- /dev/null +++ b/man/simulatePedigrees.Rd @@ -0,0 +1,105 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/simulatePedigree.R +\name{simulatePedigrees} +\alias{simulatePedigrees} +\title{Simulate Multiple Pedigrees} +\usage{ +simulatePedigrees( + n_fam = 2, + kpc = 3, + Ngen = 4, + sexR = 0.5, + marR = 2/3, + rd_kpc = FALSE, + balancedSex = TRUE, + balancedMar = TRUE, + verbose = FALSE, + personID = "ID", + momID = "momID", + dadID = "dadID", + spouseID = "spouseID", + code_male = "M", + code_female = "F", + beta = FALSE +) +} +\arguments{ +\item{n_fam}{Integer. Number of families to simulate. Default is 2.} + +\item{kpc}{Number of kids per couple. An integer >= 2 that determines how +many kids each fertilized mated couple will have in the pedigree. Default +value is 3. Returns an error when kpc equals 1.} + +\item{Ngen}{Number of generations. An integer >= 2 that determines how many +generations the simulated pedigree will have. The first generation is always +a fertilized couple. The last generation has no mated individuals.} + +\item{sexR}{Sex ratio of offspring. A numeric value ranging from 0 to 1 that +determines the proportion of males in all offspring in this pedigree. For +instance, 0.4 means 40 percent of the offspring will be male.} + +\item{marR}{Mating rate. A numeric value ranging from 0 to 1 which determines +the proportion of mated (fertilized) couples in the pedigree within each +generation. For instance, marR = 0.5 suggests 50 percent of the offspring in + a specific generation will be mated and have their offspring.} + +\item{rd_kpc}{logical. If TRUE, the number of kids per mate will be randomly +generated from a poisson distribution with mean kpc. If FALSE, the number of +kids per mate will be fixed at kpc.} + +\item{balancedSex}{Not fully developed yet. Always \code{TRUE} in the +current version.} + +\item{balancedMar}{Not fully developed yet. Always \code{TRUE} in the +current version.} + +\item{verbose}{logical If TRUE, message progress through stages of algorithm} + +\item{personID}{character. Name of the column in ped for the person ID variable} + +\item{momID}{character. Name of the column in ped for the mother ID variable} + +\item{dadID}{character. Name of the column in ped for the father ID variable} + +\item{spouseID}{The name of the column that will contain the spouse ID in the output data frame. Default is "spouseID".} + +\item{code_male}{The value to use for males. Default is "M"} + +\item{code_female}{The value to use for females. Default is "F"} + +\item{beta}{logical or character. Controls which algorithm version to use: +\itemize{ + \item{\code{FALSE}, \code{"base"}, or \code{"original"} (default): Use the original algorithm. + Slower but ensures exact reproducibility with set.seed().} + \item{\code{TRUE} or \code{"optimized"}: Use the optimized algorithm with 4-5x speedup. + Produces statistically equivalent results but not identical to base version + due to different random number consumption. Recommended for large simulations + where speed matters more than exact reproducibility.} +} +Note: Both versions are mathematically correct and produce valid pedigrees with the +same statistical properties (sex ratios, mating rates, etc.). The optimized version +uses vectorized operations instead of loops, making it much faster for large pedigrees.} +} +\value{ +A \code{data.frame} containing all simulated individuals from all + families combined, with the same columns as \code{\link{simulatePedigree}}. + The \code{fam} column uniquely identifies each family (e.g., "fam1", + "fam2", ...). Individual IDs are guaranteed to be unique across families. +} +\description{ +This function simulates multiple "balanced" pedigrees and returns them +combined into a single data frame. It is a convenience wrapper around +\code{\link{simulatePedigree}} that makes it easy to simulate many families +at once, with unique IDs across all families. +} +\examples{ +set.seed(5) +df_peds <- simulatePedigrees( + n_fam = 3, + kpc = 4, + Ngen = 4, + sexR = .5, + marR = .7 +) +summary(df_peds) +} diff --git a/tests/testthat/test-simulatePedigree.R b/tests/testthat/test-simulatePedigree.R index 7c88145c..81d3f2a9 100644 --- a/tests/testthat/test-simulatePedigree.R +++ b/tests/testthat/test-simulatePedigree.R @@ -289,3 +289,48 @@ test_that("simulatePedigree accepts string aliases for beta parameter", { "not yet implemented" ) }) + +test_that("simulatePedigrees returns combined data frame for multiple families", { + set.seed(5) + n_fam <- 3 + results <- simulatePedigrees(n_fam = n_fam, kpc = 3, Ngen = 4, marR = 0.6) + + # Should return a data frame + expect_s3_class(results, "data.frame") + + # Should have exactly n_fam unique family IDs + fam_ids <- unique(results$fam) + expect_equal(length(fam_ids), n_fam) + expect_true(all(paste0("fam", seq_len(n_fam)) %in% fam_ids)) + + # All person IDs should be unique across families + expect_equal(length(unique(results$ID)), nrow(results)) + + # Should have standard pedigree columns + expect_true(all(c("fam", "ID", "gen", "dadID", "momID", "spouseID", "sex") %in% colnames(results))) +}) + +test_that("simulatePedigrees with n_fam = 1 matches simulatePedigree structure", { + set.seed(42) + result_multi <- simulatePedigrees(n_fam = 1, kpc = 3, Ngen = 4, marR = 0.6) + + set.seed(42) + result_single <- simulatePedigree(kpc = 3, Ngen = 4, marR = 0.6, fam_shift = 1L) + + # Both should have the same number of rows and columns + expect_equal(nrow(result_multi), nrow(result_single)) + expect_equal(ncol(result_multi), ncol(result_single)) + + # The IDs should match + expect_equal(result_multi$ID, result_single$ID) +}) + +test_that("simulatePedigrees works with beta = TRUE", { + set.seed(5) + n_fam <- 2 + results <- simulatePedigrees(n_fam = n_fam, kpc = 3, Ngen = 4, marR = 0.6, beta = TRUE) + + expect_s3_class(results, "data.frame") + expect_equal(length(unique(results$fam)), n_fam) + expect_equal(length(unique(results$ID)), nrow(results)) +}) From 34f818b57e0f5673b3512d5fcc50ab276f706c06 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 13 Apr 2026 20:40:45 +0000 Subject: [PATCH 3/9] Address code review: input validation, use rbindlist, fix test assertions Agent-Logs-Url: https://github.com/R-Computing-Lab/BGmisc/sessions/9367c84a-deb8-4f13-a855-367a204cc843 Co-authored-by: smasongarrison <6001608+smasongarrison@users.noreply.github.com> --- R/simulatePedigree.R | 6 +++++- tests/testthat/test-simulatePedigree.R | 9 +++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/R/simulatePedigree.R b/R/simulatePedigree.R index 33965c15..d48c8ad4 100644 --- a/R/simulatePedigree.R +++ b/R/simulatePedigree.R @@ -1021,6 +1021,10 @@ simulatePedigrees <- function(n_fam = 2, code_male = "M", code_female = "F", beta = FALSE) { + n_fam <- as.integer(n_fam) + if (is.na(n_fam) || n_fam < 1L) { + stop("'n_fam' must be a positive integer.") + } ped_list <- vector("list", n_fam) for (i in seq_len(n_fam)) { ped_i <- simulatePedigree( @@ -1044,5 +1048,5 @@ simulatePedigrees <- function(n_fam = 2, ped_i$fam <- paste0("fam", i) ped_list[[i]] <- ped_i } - do.call(rbind, ped_list) + data.table::rbindlist(ped_list) |> as.data.frame() } diff --git a/tests/testthat/test-simulatePedigree.R b/tests/testthat/test-simulatePedigree.R index 81d3f2a9..5f3e2873 100644 --- a/tests/testthat/test-simulatePedigree.R +++ b/tests/testthat/test-simulatePedigree.R @@ -300,8 +300,7 @@ test_that("simulatePedigrees returns combined data frame for multiple families", # Should have exactly n_fam unique family IDs fam_ids <- unique(results$fam) - expect_equal(length(fam_ids), n_fam) - expect_true(all(paste0("fam", seq_len(n_fam)) %in% fam_ids)) + expect_setequal(fam_ids, paste0("fam", seq_len(n_fam))) # All person IDs should be unique across families expect_equal(length(unique(results$ID)), nrow(results)) @@ -334,3 +333,9 @@ test_that("simulatePedigrees works with beta = TRUE", { expect_equal(length(unique(results$fam)), n_fam) expect_equal(length(unique(results$ID)), nrow(results)) }) + +test_that("simulatePedigrees validates n_fam input", { + expect_error(simulatePedigrees(n_fam = 0), "'n_fam' must be a positive integer") + expect_error(simulatePedigrees(n_fam = -1), "'n_fam' must be a positive integer") + expect_error(simulatePedigrees(n_fam = NA), "'n_fam' must be a positive integer") +}) From f3910a31231030f7bb31f6b3fe2a616213e270a4 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 13 Apr 2026 20:41:33 +0000 Subject: [PATCH 4/9] Fix minor punctuation in man/simulatePedigrees.Rd documentation Agent-Logs-Url: https://github.com/R-Computing-Lab/BGmisc/sessions/9367c84a-deb8-4f13-a855-367a204cc843 Co-authored-by: smasongarrison <6001608+smasongarrison@users.noreply.github.com> --- man/simulatePedigrees.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/simulatePedigrees.Rd b/man/simulatePedigrees.Rd index 2f06ac50..6397da55 100644 --- a/man/simulatePedigrees.Rd +++ b/man/simulatePedigrees.Rd @@ -53,7 +53,7 @@ current version.} \item{balancedMar}{Not fully developed yet. Always \code{TRUE} in the current version.} -\item{verbose}{logical If TRUE, message progress through stages of algorithm} +\item{verbose}{logical. If TRUE, message progress through stages of algorithm} \item{personID}{character. Name of the column in ped for the person ID variable} From 1c193a46f13957383c1c786fa2ce645c927c0a83 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 13 Apr 2026 20:48:41 +0000 Subject: [PATCH 5/9] feat: remap IDs to sequential integers in simulatePedigrees() Agent-Logs-Url: https://github.com/R-Computing-Lab/BGmisc/sessions/44f2511b-4beb-420a-924c-bd42c5f1de1f Co-authored-by: smasongarrison <6001608+smasongarrison@users.noreply.github.com> --- R/simulatePedigree.R | 24 ++++++++++++++++++++++-- tests/testthat/test-simulatePedigree.R | 15 +++++++++++++-- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/R/simulatePedigree.R b/R/simulatePedigree.R index d48c8ad4..5e1fa4c0 100644 --- a/R/simulatePedigree.R +++ b/R/simulatePedigree.R @@ -993,7 +993,9 @@ SimPed <- function(...) { # nolint: object_name_linter. #' @return A \code{data.frame} containing all simulated individuals from all #' families combined, with the same columns as \code{\link{simulatePedigree}}. #' The \code{fam} column uniquely identifies each family (e.g., "fam1", -#' "fam2", ...). Individual IDs are guaranteed to be unique across families. +#' "fam2", ...). Individual IDs are sequential integers starting at 1 +#' (i.e., \code{1:nrow(result)}), and all parent/spouse ID references are +#' remapped to match. #' @export #' @examples #' set.seed(5) @@ -1048,5 +1050,23 @@ simulatePedigrees <- function(n_fam = 2, ped_i$fam <- paste0("fam", i) ped_list[[i]] <- ped_i } - data.table::rbindlist(ped_list) |> as.data.frame() + combined <- data.table::rbindlist(ped_list) |> as.data.frame() + + # Remap all ID columns to sequential integers (1, 2, 3, ...) in row order, + # so the final data frame has tidy consecutive IDs regardless of fam_shift offsets. + old_ids <- combined[[personID]] + id_map <- setNames(seq_along(old_ids), as.character(old_ids)) + + remap_col <- function(x) { + out <- id_map[as.character(x)] + out[is.na(x)] <- NA_integer_ + as.integer(out) + } + + combined[[personID]] <- as.integer(id_map[as.character(old_ids)]) + combined[[momID]] <- remap_col(combined[[momID]]) + combined[[dadID]] <- remap_col(combined[[dadID]]) + combined[[spouseID]] <- remap_col(combined[[spouseID]]) + + combined } diff --git a/tests/testthat/test-simulatePedigree.R b/tests/testthat/test-simulatePedigree.R index 5f3e2873..c2b5d66b 100644 --- a/tests/testthat/test-simulatePedigree.R +++ b/tests/testthat/test-simulatePedigree.R @@ -319,9 +319,20 @@ test_that("simulatePedigrees with n_fam = 1 matches simulatePedigree structure", # Both should have the same number of rows and columns expect_equal(nrow(result_multi), nrow(result_single)) expect_equal(ncol(result_multi), ncol(result_single)) +}) + +test_that("simulatePedigrees returns sequential IDs starting at 1", { + set.seed(5) + results <- simulatePedigrees(n_fam = 3, kpc = 3, Ngen = 4, marR = 0.6) + + # Person IDs should be exactly 1:nrow + expect_equal(results$ID, seq_len(nrow(results))) - # The IDs should match - expect_equal(result_multi$ID, result_single$ID) + # All parent/spouse references should be within the ID range (or NA) + valid_ids <- seq_len(nrow(results)) + expect_true(all(is.na(results$momID) | results$momID %in% valid_ids)) + expect_true(all(is.na(results$dadID) | results$dadID %in% valid_ids)) + expect_true(all(is.na(results$spouseID) | results$spouseID %in% valid_ids)) }) test_that("simulatePedigrees works with beta = TRUE", { From 32684f627bdf927174903d74ceb2d857b0bb2094 Mon Sep 17 00:00:00 2001 From: Mason Garrison Date: Mon, 13 Apr 2026 17:07:15 -0400 Subject: [PATCH 6/9] Squashed commit of the following: commit 42ae819f65bba25c79cbff86dfadca0a2b53f065 Author: Mason Garrison Date: Mon Apr 13 16:08:31 2026 -0400 Update test-makeLinks.R --- tests/testthat/test-makeLinks.R | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/testthat/test-makeLinks.R b/tests/testthat/test-makeLinks.R index f2340485..59e0847a 100644 --- a/tests/testthat/test-makeLinks.R +++ b/tests/testthat/test-makeLinks.R @@ -355,6 +355,15 @@ test_that("com2links handles mismatched matrix dimensions by subsetting to small # Only IDs from the smaller matrix should appear all_output_ids <- unique(c(result_mismatch$ID1, result_mismatch$ID2)) expect_true(all(all_output_ids %in% as.numeric(dimnames(ad_small)[[1]]))) + expect_true(all(all_output_ids %in% as.numeric(dimnames(mit_ped_matrix)[[1]]))) + expect_true(all(all_output_ids %in% as.numeric(dimnames(cn_ped_matrix)[[1]]))) + + # Check that the number of unique IDs in the output matches the number of IDs in the smallest matrix + expect_equal(length(all_output_ids), length(subset_ids)) + + # check that full matrix has more unique IDs than the smaller matrix + expect_true(length(unique(c(dimnames(mit_ped_matrix)[[1]], + dimnames(cn_ped_matrix)[[1]]))) > length(subset_ids)) }) test_that("com2links mismatched dimensions with two matrices", { From b33820a56cd766bb9e1aef3acc01dd420200a8a5 Mon Sep 17 00:00:00 2001 From: Mason Garrison Date: Mon, 13 Apr 2026 17:35:57 -0400 Subject: [PATCH 7/9] smarter id handling --- R/simulatePedigree.R | 45 +++++++++++++++++--------- man/simulatePedigree.Rd | 3 ++ man/simulatePedigrees.Rd | 11 +++++-- tests/testthat/test-simulatePedigree.R | 16 ++++----- 4 files changed, 49 insertions(+), 26 deletions(-) diff --git a/R/simulatePedigree.R b/R/simulatePedigree.R index 5e1fa4c0..f48cd0d3 100644 --- a/R/simulatePedigree.R +++ b/R/simulatePedigree.R @@ -879,6 +879,7 @@ buildBtwnGenerations_opt <- function(df_Fam, #' @param ... Additional arguments to be passed to other functions. #' @inheritParams ped2fam #' @param spouseID The name of the column that will contain the spouse ID in the output data frame. Default is "spID". +#' @param remap_ids logical. If TRUE, remap all ID columns to sequential integers (1, 2, 3, ...) in row order. #' @return A \code{data.frame} with each row representing a simulated individual. The columns are as follows: #' \itemize{ #' \item{fam: The family id of each simulated individual. It is 'fam1' in a single simulated pedigree.} @@ -914,6 +915,7 @@ simulatePedigree <- function(kpc = 3, code_male = "M", code_female = "F", fam_shift = 1L, + remap_ids = FALSE, beta = FALSE) { # SexRatio: ratio of male over female in the offspring setting; used in the between generation combinations # SexRatio <- sexR / (1 - sexR) @@ -966,11 +968,24 @@ simulatePedigree <- function(kpc = 3, df_Fam <- df_Fam[, 1:7] df_Fam <- df_Fam[!(is.na(df_Fam$pat) & is.na(df_Fam$mat) & is.na(df_Fam$spID)), ] - colnames(df_Fam)[c(2, 4, 5)] <- c(personID, dadID, momID) + names(df_Fam) <- c("fam", personID, "gen", dadID, momID, spouseID, "sex") # connect the detached members df_Fam[is.na(df_Fam[[momID]]) & is.na(df_Fam[[dadID]]) & df_Fam$gen > 1, ] + + if(remap_ids) { + # Remap all ID columns to sequential integers (1, 2, 3, ...) in row order, + # so the final data frame has tidy consecutive IDs regardless of fam_shift offsets. + old_ids <- rbind( df_Fam[[personID]], df_Fam[[momID]], df_Fam[[dadID]], df_Fam[[spouseID]]) + old_ids <- unique(old_ids[!is.na(old_ids)]) + id_map <- setNames(seq_along(old_ids), as.character(old_ids)) + + df_Fam[[personID]] <- as.integer(id_map[as.character( df_Fam[[personID]])]) + df_Fam[[momID]] <- as.integer(id_map[as.character( df_Fam[[momID]])]) + df_Fam[[dadID]] <- as.integer(id_map[as.character( df_Fam[[dadID]])]) + df_Fam[[spouseID]] <- as.integer(id_map[as.character( df_Fam[[spouseID]])]) + } df_Fam } @@ -989,6 +1004,7 @@ SimPed <- function(...) { # nolint: object_name_linter. #' at once, with unique IDs across all families. #' #' @param n_fam Integer. Number of families to simulate. Default is 2. +#' @param remap_ids Logical. If TRUE (default), all ID columns (personID, momID, dadID, spouseID) will be remapped to sequential integers starting at 1 across the combined data frame. This ensures tidy consecutive IDs regardless of fam_shift offsets. If FALSE, IDs will retain their original values from each pedigree simulation, which may include gaps or non-sequential values due to fam_shift. #' @inheritParams simulatePedigree #' @return A \code{data.frame} containing all simulated individuals from all #' families combined, with the same columns as \code{\link{simulatePedigree}}. @@ -1022,7 +1038,9 @@ simulatePedigrees <- function(n_fam = 2, spouseID = "spouseID", code_male = "M", code_female = "F", - beta = FALSE) { + remap_ids = TRUE, + beta = FALSE + ) { n_fam <- as.integer(n_fam) if (is.na(n_fam) || n_fam < 1L) { stop("'n_fam' must be a positive integer.") @@ -1045,28 +1063,25 @@ simulatePedigrees <- function(n_fam = 2, code_male = code_male, code_female = code_female, fam_shift = i, + remap_ids = FALSE, # Keep original IDs for now; we'll remap after combining. beta = beta ) ped_i$fam <- paste0("fam", i) ped_list[[i]] <- ped_i } combined <- data.table::rbindlist(ped_list) |> as.data.frame() - + names(combined) <- c("fam", personID, "gen", dadID, momID, spouseID, "sex") +if(remap_ids) { # Remap all ID columns to sequential integers (1, 2, 3, ...) in row order, # so the final data frame has tidy consecutive IDs regardless of fam_shift offsets. - old_ids <- combined[[personID]] + old_ids <- rbind(combined[[personID]], combined[[momID]], combined[[dadID]], combined[[spouseID]]) + old_ids <- unique(old_ids[!is.na(old_ids)]) id_map <- setNames(seq_along(old_ids), as.character(old_ids)) - remap_col <- function(x) { - out <- id_map[as.character(x)] - out[is.na(x)] <- NA_integer_ - as.integer(out) - } - - combined[[personID]] <- as.integer(id_map[as.character(old_ids)]) - combined[[momID]] <- remap_col(combined[[momID]]) - combined[[dadID]] <- remap_col(combined[[dadID]]) - combined[[spouseID]] <- remap_col(combined[[spouseID]]) - + combined[[personID]] <- as.integer(id_map[as.character(combined[[personID]])]) + combined[[momID]] <- as.integer(id_map[as.character(combined[[momID]])]) + combined[[dadID]] <- as.integer(id_map[as.character(combined[[dadID]])]) + combined[[spouseID]] <- as.integer(id_map[as.character(combined[[spouseID]])]) +} combined } diff --git a/man/simulatePedigree.Rd b/man/simulatePedigree.Rd index d56ab72c..df600970 100644 --- a/man/simulatePedigree.Rd +++ b/man/simulatePedigree.Rd @@ -26,6 +26,7 @@ simulatePedigree( code_male = "M", code_female = "F", fam_shift = 1L, + remap_ids = FALSE, beta = FALSE ) @@ -76,6 +77,8 @@ current version.} \item{fam_shift}{An integer to shift the person ID. Default is 1L. This is useful when simulating multiple pedigrees to avoid ID conflicts.} +\item{remap_ids}{logical. If TRUE, remap all ID columns to sequential integers (1, 2, 3, ...) in row order.} + \item{beta}{logical or character. Controls which algorithm version to use: \itemize{ \item{\code{FALSE}, \code{"base"}, or \code{"original"} (default): Use the original algorithm. diff --git a/man/simulatePedigrees.Rd b/man/simulatePedigrees.Rd index 6397da55..0ae193d5 100644 --- a/man/simulatePedigrees.Rd +++ b/man/simulatePedigrees.Rd @@ -20,6 +20,7 @@ simulatePedigrees( spouseID = "spouseID", code_male = "M", code_female = "F", + remap_ids = TRUE, beta = FALSE ) } @@ -53,7 +54,7 @@ current version.} \item{balancedMar}{Not fully developed yet. Always \code{TRUE} in the current version.} -\item{verbose}{logical. If TRUE, message progress through stages of algorithm} +\item{verbose}{logical If TRUE, message progress through stages of algorithm} \item{personID}{character. Name of the column in ped for the person ID variable} @@ -61,12 +62,14 @@ current version.} \item{dadID}{character. Name of the column in ped for the father ID variable} -\item{spouseID}{The name of the column that will contain the spouse ID in the output data frame. Default is "spouseID".} +\item{spouseID}{The name of the column that will contain the spouse ID in the output data frame. Default is "spID".} \item{code_male}{The value to use for males. Default is "M"} \item{code_female}{The value to use for females. Default is "F"} +\item{remap_ids}{Logical. If TRUE (default), all ID columns (personID, momID, dadID, spouseID) will be remapped to sequential integers starting at 1 across the combined data frame. This ensures tidy consecutive IDs regardless of fam_shift offsets. If FALSE, IDs will retain their original values from each pedigree simulation, which may include gaps or non-sequential values due to fam_shift.} + \item{beta}{logical or character. Controls which algorithm version to use: \itemize{ \item{\code{FALSE}, \code{"base"}, or \code{"original"} (default): Use the original algorithm. @@ -84,7 +87,9 @@ uses vectorized operations instead of loops, making it much faster for large ped A \code{data.frame} containing all simulated individuals from all families combined, with the same columns as \code{\link{simulatePedigree}}. The \code{fam} column uniquely identifies each family (e.g., "fam1", - "fam2", ...). Individual IDs are guaranteed to be unique across families. + "fam2", ...). Individual IDs are sequential integers starting at 1 + (i.e., \code{1:nrow(result)}), and all parent/spouse ID references are + remapped to match. } \description{ This function simulates multiple "balanced" pedigrees and returns them diff --git a/tests/testthat/test-simulatePedigree.R b/tests/testthat/test-simulatePedigree.R index c2b5d66b..86044c01 100644 --- a/tests/testthat/test-simulatePedigree.R +++ b/tests/testthat/test-simulatePedigree.R @@ -126,9 +126,9 @@ test_that("simulated pedigree generates expected data structure when sexR is imb expect_equal(max(results$gen), Ngen, tolerance = strict_tolerance) # expect there to be parents in each for all generations except the first one - filter_parents <- results %>% - group_by(gen) %>% - summarize(num_parents = sum(!is.na(dadID), na.rm = TRUE) + sum(!is.na(momID), na.rm = TRUE)) + filter_parents <- results |> + dplyr::group_by(gen) |> + dplyr::summarize(num_parents = sum(!is.na(dadID), na.rm = TRUE) + sum(!is.na(momID), na.rm = TRUE)) expect_true(all(filter_parents$num_parents[filter_parents$gen > 1] > 0), info = paste0("Beta option: ", beta)) expect_true(all(filter_parents$num_parents[filter_parents$gen == 1] == 0), info = paste0("Beta option: ", beta)) @@ -199,9 +199,9 @@ test_that("simulated pedigree generates expected data structure but supply var n expect_lt(sex_mean_male, sex_mean_female) # expect there to be parents in each for all generations except the first one - filter_parents <- results %>% - group_by(gen) %>% - summarize(num_parents = sum(!is.na(dadID), na.rm = TRUE) + sum(!is.na(momID), na.rm = TRUE)) + filter_parents <- results |> + dplyr::group_by(gen) |> + dplyr::summarize(num_parents = sum(!is.na(dadID), na.rm = TRUE) + sum(!is.na(momID), na.rm = TRUE)) expect_true(all(filter_parents$num_parents[filter_parents$gen > 1] > 0), info = paste0("Beta option: ", beta)) expect_true(all(filter_parents$num_parents[filter_parents$gen == 1] == 0), info = paste0("Beta option: ", beta)) @@ -325,8 +325,8 @@ test_that("simulatePedigrees returns sequential IDs starting at 1", { set.seed(5) results <- simulatePedigrees(n_fam = 3, kpc = 3, Ngen = 4, marR = 0.6) - # Person IDs should be exactly 1:nrow - expect_equal(results$ID, seq_len(nrow(results))) + # Person IDs should be close to 1:nrow(results) spouse might change this but they should still be sequential and unique + expect_equal(sort(results$ID), seq_len(nrow(results))) # All parent/spouse references should be within the ID range (or NA) valid_ids <- seq_len(nrow(results)) From 4455fc703e921da22f6841048ffcc79636cfe964 Mon Sep 17 00:00:00 2001 From: Mason Garrison Date: Mon, 13 Apr 2026 17:47:47 -0400 Subject: [PATCH 8/9] pipes! --- tests/testthat/test-simulatePedigree.R | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test-simulatePedigree.R b/tests/testthat/test-simulatePedigree.R index 86044c01..0ae04a5e 100644 --- a/tests/testthat/test-simulatePedigree.R +++ b/tests/testthat/test-simulatePedigree.R @@ -126,8 +126,7 @@ test_that("simulated pedigree generates expected data structure when sexR is imb expect_equal(max(results$gen), Ngen, tolerance = strict_tolerance) # expect there to be parents in each for all generations except the first one - filter_parents <- results |> - dplyr::group_by(gen) |> + filter_parents <- dplyr::group_by(results, gen) %>% dplyr::summarize(num_parents = sum(!is.na(dadID), na.rm = TRUE) + sum(!is.na(momID), na.rm = TRUE)) expect_true(all(filter_parents$num_parents[filter_parents$gen > 1] > 0), info = paste0("Beta option: ", beta)) @@ -199,8 +198,7 @@ test_that("simulated pedigree generates expected data structure but supply var n expect_lt(sex_mean_male, sex_mean_female) # expect there to be parents in each for all generations except the first one - filter_parents <- results |> - dplyr::group_by(gen) |> + filter_parents <- dplyr::group_by(results, gen) %>% dplyr::summarize(num_parents = sum(!is.na(dadID), na.rm = TRUE) + sum(!is.na(momID), na.rm = TRUE)) expect_true(all(filter_parents$num_parents[filter_parents$gen > 1] > 0), info = paste0("Beta option: ", beta)) From c2fa6057244bff2da054786ef0d0995d45835df1 Mon Sep 17 00:00:00 2001 From: Mason Garrison Date: Mon, 13 Apr 2026 18:03:27 -0400 Subject: [PATCH 9/9] Update v2_pedigree.Rmd --- vignettes/v2_pedigree.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/v2_pedigree.Rmd b/vignettes/v2_pedigree.Rmd index b68b21f2..e0d0e1ae 100644 --- a/vignettes/v2_pedigree.Rmd +++ b/vignettes/v2_pedigree.Rmd @@ -97,7 +97,7 @@ df_ped_3$fam <- NULL df_ped_3$ID <- df_ped_3$ID / 100 df_ped_3$dadID <- df_ped_3$dadID / 100 df_ped_3$momID <- df_ped_3$momID / 100 -df_ped_3$spID <- df_ped_3$spID / 100 +df_ped_3$spouseID <- df_ped_3$spouseID / 100 df_ped_4$famID <- 2 df_ped_4$fam <- NULL