Skip to content

Commit b33820a

Browse files
smarter id handling
1 parent 32684f6 commit b33820a

4 files changed

Lines changed: 49 additions & 26 deletions

File tree

R/simulatePedigree.R

Lines changed: 30 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -879,6 +879,7 @@ buildBtwnGenerations_opt <- function(df_Fam,
879879
#' @param ... Additional arguments to be passed to other functions.
880880
#' @inheritParams ped2fam
881881
#' @param spouseID The name of the column that will contain the spouse ID in the output data frame. Default is "spID".
882+
#' @param remap_ids logical. If TRUE, remap all ID columns to sequential integers (1, 2, 3, ...) in row order.
882883
#' @return A \code{data.frame} with each row representing a simulated individual. The columns are as follows:
883884
#' \itemize{
884885
#' \item{fam: The family id of each simulated individual. It is 'fam1' in a single simulated pedigree.}
@@ -914,6 +915,7 @@ simulatePedigree <- function(kpc = 3,
914915
code_male = "M",
915916
code_female = "F",
916917
fam_shift = 1L,
918+
remap_ids = FALSE,
917919
beta = FALSE) {
918920
# SexRatio: ratio of male over female in the offspring setting; used in the between generation combinations
919921
# SexRatio <- sexR / (1 - sexR)
@@ -966,11 +968,24 @@ simulatePedigree <- function(kpc = 3,
966968
df_Fam <- df_Fam[, 1:7]
967969
df_Fam <- df_Fam[!(is.na(df_Fam$pat) & is.na(df_Fam$mat) & is.na(df_Fam$spID)), ]
968970

969-
colnames(df_Fam)[c(2, 4, 5)] <- c(personID, dadID, momID)
971+
names(df_Fam) <- c("fam", personID, "gen", dadID, momID, spouseID, "sex")
970972

971973
# connect the detached members
972974
df_Fam[is.na(df_Fam[[momID]]) & is.na(df_Fam[[dadID]]) & df_Fam$gen > 1, ]
973975

976+
977+
if(remap_ids) {
978+
# Remap all ID columns to sequential integers (1, 2, 3, ...) in row order,
979+
# so the final data frame has tidy consecutive IDs regardless of fam_shift offsets.
980+
old_ids <- rbind( df_Fam[[personID]], df_Fam[[momID]], df_Fam[[dadID]], df_Fam[[spouseID]])
981+
old_ids <- unique(old_ids[!is.na(old_ids)])
982+
id_map <- setNames(seq_along(old_ids), as.character(old_ids))
983+
984+
df_Fam[[personID]] <- as.integer(id_map[as.character( df_Fam[[personID]])])
985+
df_Fam[[momID]] <- as.integer(id_map[as.character( df_Fam[[momID]])])
986+
df_Fam[[dadID]] <- as.integer(id_map[as.character( df_Fam[[dadID]])])
987+
df_Fam[[spouseID]] <- as.integer(id_map[as.character( df_Fam[[spouseID]])])
988+
}
974989
df_Fam
975990
}
976991

@@ -989,6 +1004,7 @@ SimPed <- function(...) { # nolint: object_name_linter.
9891004
#' at once, with unique IDs across all families.
9901005
#'
9911006
#' @param n_fam Integer. Number of families to simulate. Default is 2.
1007+
#' @param remap_ids Logical. If TRUE (default), all ID columns (personID, momID, dadID, spouseID) will be remapped to sequential integers starting at 1 across the combined data frame. This ensures tidy consecutive IDs regardless of fam_shift offsets. If FALSE, IDs will retain their original values from each pedigree simulation, which may include gaps or non-sequential values due to fam_shift.
9921008
#' @inheritParams simulatePedigree
9931009
#' @return A \code{data.frame} containing all simulated individuals from all
9941010
#' families combined, with the same columns as \code{\link{simulatePedigree}}.
@@ -1022,7 +1038,9 @@ simulatePedigrees <- function(n_fam = 2,
10221038
spouseID = "spouseID",
10231039
code_male = "M",
10241040
code_female = "F",
1025-
beta = FALSE) {
1041+
remap_ids = TRUE,
1042+
beta = FALSE
1043+
) {
10261044
n_fam <- as.integer(n_fam)
10271045
if (is.na(n_fam) || n_fam < 1L) {
10281046
stop("'n_fam' must be a positive integer.")
@@ -1045,28 +1063,25 @@ simulatePedigrees <- function(n_fam = 2,
10451063
code_male = code_male,
10461064
code_female = code_female,
10471065
fam_shift = i,
1066+
remap_ids = FALSE, # Keep original IDs for now; we'll remap after combining.
10481067
beta = beta
10491068
)
10501069
ped_i$fam <- paste0("fam", i)
10511070
ped_list[[i]] <- ped_i
10521071
}
10531072
combined <- data.table::rbindlist(ped_list) |> as.data.frame()
1054-
1073+
names(combined) <- c("fam", personID, "gen", dadID, momID, spouseID, "sex")
1074+
if(remap_ids) {
10551075
# Remap all ID columns to sequential integers (1, 2, 3, ...) in row order,
10561076
# so the final data frame has tidy consecutive IDs regardless of fam_shift offsets.
1057-
old_ids <- combined[[personID]]
1077+
old_ids <- rbind(combined[[personID]], combined[[momID]], combined[[dadID]], combined[[spouseID]])
1078+
old_ids <- unique(old_ids[!is.na(old_ids)])
10581079
id_map <- setNames(seq_along(old_ids), as.character(old_ids))
10591080

1060-
remap_col <- function(x) {
1061-
out <- id_map[as.character(x)]
1062-
out[is.na(x)] <- NA_integer_
1063-
as.integer(out)
1064-
}
1065-
1066-
combined[[personID]] <- as.integer(id_map[as.character(old_ids)])
1067-
combined[[momID]] <- remap_col(combined[[momID]])
1068-
combined[[dadID]] <- remap_col(combined[[dadID]])
1069-
combined[[spouseID]] <- remap_col(combined[[spouseID]])
1070-
1081+
combined[[personID]] <- as.integer(id_map[as.character(combined[[personID]])])
1082+
combined[[momID]] <- as.integer(id_map[as.character(combined[[momID]])])
1083+
combined[[dadID]] <- as.integer(id_map[as.character(combined[[dadID]])])
1084+
combined[[spouseID]] <- as.integer(id_map[as.character(combined[[spouseID]])])
1085+
}
10711086
combined
10721087
}

man/simulatePedigree.Rd

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/simulatePedigrees.Rd

Lines changed: 8 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/test-simulatePedigree.R

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -126,9 +126,9 @@ test_that("simulated pedigree generates expected data structure when sexR is imb
126126
expect_equal(max(results$gen), Ngen, tolerance = strict_tolerance)
127127

128128
# expect there to be parents in each for all generations except the first one
129-
filter_parents <- results %>%
130-
group_by(gen) %>%
131-
summarize(num_parents = sum(!is.na(dadID), na.rm = TRUE) + sum(!is.na(momID), na.rm = TRUE))
129+
filter_parents <- results |>
130+
dplyr::group_by(gen) |>
131+
dplyr::summarize(num_parents = sum(!is.na(dadID), na.rm = TRUE) + sum(!is.na(momID), na.rm = TRUE))
132132

133133
expect_true(all(filter_parents$num_parents[filter_parents$gen > 1] > 0), info = paste0("Beta option: ", beta))
134134
expect_true(all(filter_parents$num_parents[filter_parents$gen == 1] == 0), info = paste0("Beta option: ", beta))
@@ -199,9 +199,9 @@ test_that("simulated pedigree generates expected data structure but supply var n
199199
expect_lt(sex_mean_male, sex_mean_female)
200200

201201
# expect there to be parents in each for all generations except the first one
202-
filter_parents <- results %>%
203-
group_by(gen) %>%
204-
summarize(num_parents = sum(!is.na(dadID), na.rm = TRUE) + sum(!is.na(momID), na.rm = TRUE))
202+
filter_parents <- results |>
203+
dplyr::group_by(gen) |>
204+
dplyr::summarize(num_parents = sum(!is.na(dadID), na.rm = TRUE) + sum(!is.na(momID), na.rm = TRUE))
205205

206206
expect_true(all(filter_parents$num_parents[filter_parents$gen > 1] > 0), info = paste0("Beta option: ", beta))
207207
expect_true(all(filter_parents$num_parents[filter_parents$gen == 1] == 0), info = paste0("Beta option: ", beta))
@@ -325,8 +325,8 @@ test_that("simulatePedigrees returns sequential IDs starting at 1", {
325325
set.seed(5)
326326
results <- simulatePedigrees(n_fam = 3, kpc = 3, Ngen = 4, marR = 0.6)
327327

328-
# Person IDs should be exactly 1:nrow
329-
expect_equal(results$ID, seq_len(nrow(results)))
328+
# Person IDs should be close to 1:nrow(results) spouse might change this but they should still be sequential and unique
329+
expect_equal(sort(results$ID), seq_len(nrow(results)))
330330

331331
# All parent/spouse references should be within the ID range (or NA)
332332
valid_ids <- seq_len(nrow(results))

0 commit comments

Comments
 (0)