Skip to content

Commit 90b1bbe

Browse files
Merge pull request #106 from R-Computing-Lab/dev_main
Add customizable sex coding to checkParentIDs
2 parents 2e38d01 + fb30e5b commit 90b1bbe

8 files changed

Lines changed: 314 additions & 48 deletions

File tree

NEWS.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1-
# BGmisc beta 1.5.2
1+
# BGmisc 1.5.2
22
* More flexible ID generation for simulatePedigree
33
* Created ped2gen function to extract generation information from pedigree data.frames
44
* Added tests for ped2gen
55
* Fixed handling of character ID variables leading to a warning in ped2fam
66
* Added famIDs to phantom parents
7+
* Tweaked how sex coding is handled to allow for unknown sex
78

89
# BGmisc 1.5.1
910
## CRAN submission

R/checkParents.R

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,12 @@
1212
#' @param repairsex A logical flag indicating whether to attempt repairs on sex of the parents
1313
#' @param addphantoms A logical flag indicating whether to add phantom parents for missing parent IDs.
1414
#' @param parentswithoutrow A logical flag indicating whether to add parents without a row in the pedigree.
15-
#' @param famID Character. Column name for family IDs.
16-
#' @param personID Character. Column name for individual IDs.
15+
#' @param famID Character. Column name for family IDs.
16+
#' @param personID Character. Column name for individual IDs.
1717
#' @param momID Character. Column name for maternal IDs.
1818
#' @param dadID Character. Column name for paternal IDs.
19-
#'
19+
#' @param code_male The code value used to represent male sex in the 'sex' column of \code{ped}.
20+
#' @param code_female The code value used to represent female sex in the 'sex' column of \code{ped}.
2021
#'
2122
#' @return Depending on the value of `repair`, either a list containing validation results or a repaired dataframe is returned.
2223
#' @examples
@@ -32,7 +33,10 @@ checkParentIDs <- function(ped, verbose = FALSE, repair = FALSE,
3233
famID = "famID",
3334
personID = "ID",
3435
momID = "momID",
35-
dadID = "dadID") {
36+
dadID = "dadID",
37+
code_male = NULL,
38+
code_female = NULL
39+
) {
3640
# Standardize column names in the input dataframe
3741
ped <- standardizeColnames(ped, verbose = verbose)
3842

@@ -87,18 +91,28 @@ checkParentIDs <- function(ped, verbose = FALSE, repair = FALSE,
8791
cat("Step 2: Determining the if moms are the same sex and dads are same sex\n")
8892
}
8993
# Determine modal sex values for moms and dads
94+
95+
96+
9097
mom_results <- checkParentSex(ped, parent_col = "momID", verbose = verbose)
9198
dad_results <- checkParentSex(ped, parent_col = "dadID", verbose = verbose)
9299

93100
validation_results$mom_sex <- mom_results$unique_sexes
94101
validation_results$dad_sex <- dad_results$unique_sexes
95-
validation_results$female_var <- mom_results$modal_sex
96-
validation_results$male_var <- dad_results$modal_sex
102+
97103
validation_results$wrong_sex_moms <- mom_results$inconsistent_parents
98104
validation_results$wrong_sex_dads <- dad_results$inconsistent_parents
99105
validation_results$female_moms <- mom_results$all_same_sex
100106
validation_results$male_dads <- dad_results$all_same_sex
101-
107+
if (!is.null(code_male) && !is.null(code_female)) {
108+
validation_results$male_var <- code_male
109+
validation_results$female_var <- code_female
110+
validation_results$sex_code_source <- "user_provided_codes"
111+
} else {
112+
validation_results$female_var <- mom_results$modal_sex
113+
validation_results$male_var <- dad_results$modal_sex
114+
validation_results$sex_code_source <- "modal_parent_sex"
115+
}
102116
# Are any parents in both momID and dadID?
103117
momdad <- intersect(ped$dadID, ped$momID)
104118
if (length(momdad) > 0 && !is.na(momdad)) {

R/checkSex.R

Lines changed: 81 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,14 @@
1717
#'
1818
#' @details This function uses the terms 'male' and 'female' in a biological context, referring to chromosomal and other biologically-based characteristics necessary for constructing genetic pedigrees. The biological aspect of sex used in genetic analysis (genotype) is distinct from the broader, richer concept of gender identity (phenotype).
1919
#'
20-
#' We recognize the importance of using language and methodologies that affirm and respect the full spectrum of gender identities.
20+
#' We recognize the importance of using language and methodologies that affirm and respect the full spectrum of gender identities.
2121
#' The developers of this package express unequivocal support for folx in the transgender
2222
#' and LGBTQ+ communities.
2323
#'
2424
#' @param ped A dataframe representing the pedigree data with a 'sex' column.
2525
#' @param code_male The current code used to represent males in the 'sex' column.
2626
#' @param code_female The current code used to represent females in the 'sex' column. If both are NULL, no recoding is performed.
27+
#' @param code_unknown The current code used to represent unknown or ambiguous sex in the 'sex' column. Can be NA to indicate that missing values should be treated as unknown. If NULL and both code_male and code_female are provided, values not matching either will be inferred as unknown.
2728
#' @param verbose A logical flag indicating whether to print progress and validation messages to the console.
2829
#' @param repair A logical flag indicating whether to attempt repairs on the sex coding.
2930
#' @param momID The column name for maternal IDs. Default is "momID".
@@ -37,7 +38,10 @@
3738
#' }
3839
#' @export
3940
#'
40-
checkSex <- function(ped, code_male = NULL, code_female = NULL, verbose = FALSE, repair = FALSE,
41+
checkSex <- function(ped, code_male = NULL,
42+
code_female = NULL,
43+
code_unknown = NULL,
44+
verbose = FALSE, repair = FALSE,
4145
momID = "momID",
4246
dadID = "dadID") {
4347
# Standardize column names in the input dataframe
@@ -61,7 +65,6 @@ checkSex <- function(ped, code_male = NULL, code_female = NULL, verbose = FALSE,
6165
}
6266

6367

64-
6568
# Are there multiple sexes/genders in the list of dads and moms?
6669

6770
dad_results <- checkParentSex(ped, parent_col = dadID, verbose = verbose)
@@ -92,7 +95,11 @@ checkSex <- function(ped, code_male = NULL, code_female = NULL, verbose = FALSE,
9295

9396
if (validation_results$sex_length == 2) {
9497
# Recode all dads to the most frequent male value
95-
ped <- recodeSex(ped, code_male = validation_results$most_frequent_sex_dad)
98+
ped <- recodeSex(ped,
99+
code_male = validation_results$most_frequent_sex_dad,
100+
code_female = validation_results$most_frequent_sex_mom,
101+
code_unknown = code_unknown
102+
)
96103
# Count and record the change
97104
num_changes <- sum(original_ped$sex != ped$sex)
98105
# Record the change and the count
@@ -128,8 +135,16 @@ checkSex <- function(ped, code_male = NULL, code_female = NULL, verbose = FALSE,
128135
#' @export
129136
#'
130137
#' @seealso \code{\link{checkSex}}
131-
repairSex <- function(ped, verbose = FALSE, code_male = NULL, code_female = NULL) {
132-
checkSex(ped = ped, verbose = verbose, repair = TRUE, code_male = code_male, code_female = code_female)
138+
repairSex <- function(ped, verbose = FALSE,
139+
code_male = NULL,
140+
code_female = NULL,
141+
code_unknown = NULL) {
142+
checkSex(
143+
ped = ped, verbose = verbose, repair = TRUE,
144+
code_male = code_male,
145+
code_female = code_female,
146+
code_unknown = code_unknown
147+
)
133148
}
134149

135150
#' Recodes Sex Variable in a Pedigree Dataframe
@@ -142,51 +157,76 @@ repairSex <- function(ped, verbose = FALSE, code_male = NULL, code_female = NULL
142157
#' @param recode_na The value to use for missing values. Default is NA_character_
143158
#' @param recode_male The value to use for males. Default is "M"
144159
#' @param recode_female The value to use for females. Default is "F"
160+
#' @param recode_unknown The value to use for unknown values. Default is "U"
145161
#' @inherit checkSex details
146162
#' @return A modified version of the input data.frame \code{ped}, containing an additional or modified 'sex_recode' column where the 'sex' values are recoded according to \code{code_male}. NA values in the 'sex' column are preserved.
147163
#' @export
148164

149165
recodeSex <- function(
150-
ped, verbose = FALSE, code_male = NULL, code_na = NULL, code_female = NULL,
151-
recode_male = "M", recode_female = "F", recode_na = NA_character_) {
166+
ped, verbose = FALSE, code_male = NULL, code_na = NULL, code_female = NULL,
167+
code_unknown = NULL,
168+
recode_male = "M",
169+
recode_female = "F",
170+
recode_unknown = "U",
171+
recode_na = NA_character_
172+
) {
173+
if (is.null(code_male) && is.null(code_female)) {
174+
if (verbose == TRUE) {
175+
warning("Both code male and code female are empty. No recoding was done.")
176+
}
177+
return(ped)
178+
}
179+
# First, set any code_na values to NA
152180
if (!is.null(code_na)) {
153181
ped$sex[ped$sex == code_na] <- NA
154182
}
155-
# Recode as "F" or "M" based on code_male, preserving NAs
156-
if (!is.null(code_male) && !is.null(code_female)) {
157-
# Initialize sex_recode as NA, preserving the length of the 'sex' column
158-
ped$sex_recode <- recode_na
159-
ped$sex_recode[ped$sex == code_female] <- recode_female
160-
ped$sex_recode[ped$sex == code_male] <- recode_male
161-
# Overwriting temp recode variable
162-
ped$sex <- ped$sex_recode
163-
ped$sex_recode <- NULL
164-
} else if (!is.null(code_male) && is.null(code_female)) {
165-
# Initialize sex_recode as NA, preserving the length of the 'sex' column
166-
ped$sex_recode <- recode_na
167-
ped$sex_recode[ped$sex != code_male & !is.na(ped$sex)] <- recode_female
183+
184+
# Initialize sex_recode as NA, preserving the length of the 'sex' column
185+
ped$sex_recode <- recode_na
186+
187+
188+
if (!is.null(code_male)) {
168189
ped$sex_recode[ped$sex == code_male] <- recode_male
169-
# Overwriting temp recode variable
170-
ped$sex <- ped$sex_recode
171-
ped$sex_recode <- NULL
172-
} else if (is.null(code_male) && !is.null(code_female)) {
173-
# Initialize sex_recode as NA, preserving the length of the 'sex' column
174-
ped$sex_recode <- recode_na
175-
ped$sex_recode[ped$sex != code_female & !is.na(ped$sex)] <- recode_male
190+
}
191+
if (!is.null(code_female)) {
176192
ped$sex_recode[ped$sex == code_female] <- recode_female
177-
# Overwriting temp recode variable
178-
ped$sex <- ped$sex_recode
179-
ped$sex_recode <- NULL
180-
} else {
181-
if (verbose == TRUE) {
182-
warning("Both code male and code female are empty. No recoding was done.")
193+
}
194+
195+
# handle unknown codes
196+
if (!is.null(code_unknown) && !is.na(code_unknown)) {
197+
ped$sex_recode[ped$sex == code_unknown] <- recode_unknown
198+
} else if (!is.null(code_unknown) && is.na(code_unknown)) {
199+
ped$sex_recode[is.na(ped$sex)] <- recode_unknown
200+
} else if (!is.null(code_male) && !is.null(code_female)) {
201+
ped$sex_recode[!ped$sex %in% c(code_male, code_female) & !is.na(ped$sex)] <- recode_unknown
202+
}
203+
204+
205+
# Handle cases where only one of code
206+
# just male
207+
if (!is.null(code_male) && is.null(code_female)) {
208+
if (!is.null(code_unknown)) {
209+
ped$sex_recode[ped$sex != code_male & !is.na(ped$sex) & ped$sex != code_unknown] <- recode_female
210+
} else if (is.null(code_unknown)) {
211+
ped$sex_recode[ped$sex != code_male & !is.na(ped$sex)] <- recode_female
183212
}
184213
}
214+
# just female
215+
if (is.null(code_male) && !is.null(code_female)) {
216+
if (!is.null(code_unknown)) {
217+
ped$sex_recode[ped$sex != code_female & !is.na(ped$sex) & ped$sex != code_unknown] <- recode_male
218+
} else if (is.null(code_unknown)) {
219+
ped$sex_recode[ped$sex != code_female & !is.na(ped$sex)] <- recode_male
220+
}
221+
}
222+
223+
# Overwriting temp recode variable
224+
ped$sex <- ped$sex_recode
225+
ped$sex_recode <- NULL
185226
return(ped)
186227
}
187228

188229

189-
190230
#' Check Parental Role Sex Consistency
191231
#'
192232
#' Validates sex coding consistency for a given parental role (momID or dadID).
@@ -225,9 +265,15 @@ checkParentSex <- function(ped, parent_col, sex_col = "sex", verbose = FALSE) {
225265
# Store the most frequent sex for moms and dads
226266
modal_sex <- names(sort(table(parent_sexes), decreasing = TRUE))[1]
227267

268+
if (all(is.na(modal_sex)) && verbose == TRUE) {
269+
cat(paste0("All parents in role ", parent_col, " have missing sex values.\n"))
270+
}
271+
228272
# Type coercion based on ped$sex type
229273
if (is.numeric(ped[[sex_col]])) {
230274
modal_sex <- as.numeric(modal_sex)
275+
} else if (is.character(ped[[sex_col]])) {
276+
modal_sex <- as.character(modal_sex)
231277
}
232278

233279
# List ids for dads that are female, moms that are male

man/checkParentIDs.Rd

Lines changed: 7 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/checkSex.Rd

Lines changed: 4 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/recodeSex.Rd

Lines changed: 7 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/repairSex.Rd

Lines changed: 10 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)