@@ -13,20 +13,10 @@ library(Matrix)
1313library(dplyr )
1414
1515spec <- matrix (c(
16- ' inputFile' , ' -f' , 1 , ' character' ,
17- ' mergeSpeciesWithHybrids' , ' -m' , 0 , ' logical' ,
18- ' performKinshipValidation' , ' -v' , 0 , ' logical'
16+ ' inputFile' , ' -f' , 1 , ' character'
1917), ncol = 4 , byrow = TRUE )
2018opts <- getopt(spec , commandArgs(trailingOnly = TRUE ))
2119
22- if (is.null(opts $ mergeSpeciesWithHybrids )){
23- opts $ mergeSpeciesWithHybrids <- FALSE
24- }
25-
26- if (is.null(opts $ performKinshipValidation )){
27- opts $ performKinshipValidation <- FALSE
28- }
29-
3020allPed <- read.table(opts $ inputFile , quote = " \" " )
3121colnames(allPed )<- c(' Id' , ' Dam' , ' Sire' , ' Gender' , ' Species' )
3222
@@ -43,147 +33,10 @@ if (any(allPed$Species == 'Unknown')) {
4333 print(paste0(' There are ' , sum(allPed $ Species == ' Unknown' ), ' Ids with species = Unknown' ))
4434}
4535
46- # The purpose of this function is to handle instances where there was cross-breeding.
47- # While this is probably rare, it can occur. When this happens, simply merge the entire species together and process as one unit.
48- # This ensures that all relevant ancestors from both species are present
49- generateSpeciesToProcess <- function (allPed , mergeSpeciesWithHybrids ) {
50- hybridParents <- dplyr :: bind_rows(
51- merge(allPed , allPed , by.x = ' Dam' , by.y = ' Id' ) %> % filter(Species.x != Species.y ) %> % select(Id , Dam , Species.x , Species.y ) %> % rename(Parent = Dam ) %> % mutate(ParentType = ' Dam' ),
52- merge(allPed , allPed , by.x = ' Sire' , by.y = ' Id' ) %> % filter(Species.x != Species.y ) %> % select(Id , Sire , Species.x , Species.y ) %> % rename(Parent = Sire ) %> % mutate(ParentType = ' Sire' )
53- )
54-
55- if (nrow(hybridParents ) > 0 ) {
56- print(paste0(' There were ' , nrow(hybridParents ), ' records with parents of a different species' ))
57- }
58-
59- if (mergeSpeciesWithHybrids && nrow(hybridParents ) > 0 ) {
60- speciesGroups <- as.list(as.character(unique(allPed $ Species )))
61- toMerge <- unique(hybridParents [c(' Species.x' , ' Species.y' )])
62- for (idx in 1 : nrow(toMerge )){
63- speciesToCollapse <- as.character(unlist(toMerge [idx ,,drop = TRUE ]))
64- matchingIdx <- sapply(speciesGroups , function (x ){
65- return (length(intersect(speciesToCollapse , x )) > 0 )
66- })
67-
68- speciesToCollapse <- sort(unique(c(speciesToCollapse , unlist(speciesGroups [matchingIdx ]))))
69- speciesGroups <- speciesGroups [! matchingIdx ]
70- speciesGroups <- append(speciesGroups , list (speciesToCollapse ))
71- }
72-
73- print(' These species will be merged for processing:' )
74- invisible (lapply(speciesGroups , function (x ){
75- if (length(x ) > 1 ) {
76- print(x )
77- }
78- }))
79-
80- return (speciesGroups )
81- } else {
82- return (unique(allPed $ Species [allPed $ Species != ' Unknown' ]))
83- }
84- }
85-
86- validateExpectedKinshipSubset <- function (dataToTest , expectedValues , errorRows , testReciprocal = TRUE ) {
87- if (nrow(dataToTest ) == 0 || nrow(expectedValues ) == 0 ) {
88- return (errorRows )
89- }
90-
91- # Generate the reciprocal of relationships as well:
92- if (testReciprocal ) {
93- ret2 <- data.frame (Id = expectedValues $ Id2 , Id2 = expectedValues $ Id , Relationship = expectedValues $ Relationship , ExpectedCoefficient = expectedValues $ ExpectedCoefficient )
94- ret2 $ Relationship <- sapply(expectedValues $ Relationship , function (x ){
95- x <- unlist(strsplit(x , split = ' /' ))
96- if (length(x ) == 1 ) {
97- return (x )
98- }
99-
100- return (paste0(x [2 ], ' /' , x [1 ]))
101- })
102- expectedValues <- dplyr :: bind_rows(expectedValues , ret2 )
103- rm(ret2 )
104- }
105-
106- dat <- merge(dataToTest , expectedValues , by = c(' Id' , ' Id2' ), all.x = T , all.y = T ) %> %
107- filter(! is.na(ExpectedCoefficient )) %> %
108- filter(is.na(coefficient ) | coefficient < ExpectedCoefficient )
109-
110- if (nrow(dat ) == 0 ) {
111- return (errorRows )
112- }
113-
114- if (all(is.null(errorRows ))) {
115- return (dat )
116- }
117-
118- return (dplyr :: bind_rows(errorRows , dat ))
119- }
120-
121- validateExpectedKinship <- function (pedDf , dataToTest ) {
122- errorRows <- NULL
123-
124- # See reference: https://en.wikipedia.org/wiki/Coefficient_of_relationship#Kinship_coefficient
125- self <- data.frame (Id = pedDf $ Id , Id2 = pedDf $ Id , Relationship = ' Self' , ExpectedCoefficient = 0.5 )
126- errorRows <- validateExpectedKinshipSubset(dataToTest , self , errorRows , testReciprocal = FALSE )
127- rm(self )
128-
129- parentChild <- dplyr :: bind_rows(
130- data.frame (Id = pedDf $ Id , Id2 = pedDf $ Dam , Relationship = ' Child/Parent' ),
131- data.frame (Id = pedDf $ Id , Id2 = pedDf $ Sire , Relationship = ' Child/Parent' )
132- ) %> % filter(! is.na(Id2 )) %> % mutate(ExpectedCoefficient = 0.25 )
133- errorRows <- validateExpectedKinshipSubset(dataToTest , parentChild , errorRows )
134- rm(parentChild )
135-
136- grandParentOffspring1 <- merge(pedDf [! is.na(pedDf $ Dam ),], pedDf , by.x = c(' Dam' ), by.y = c(' Id' ), all.x = F , all.y = F )
137- grandParentOffspring1 <- dplyr :: bind_rows(
138- grandParentOffspring1 %> % select(Id , Dam.y ) %> % filter(! is.na(Dam.y )) %> % rename(Id2 = Dam.y ) %> % mutate(Relationship = ' Grandchild/Maternal Granddam' ),
139- grandParentOffspring1 %> % select(Id , Sire.y ) %> % filter(! is.na(Sire.y )) %> % rename(Id2 = Sire.y ) %> % mutate(Relationship = ' Grandchild/Maternal Grandsire' )
140- ) %> % mutate(ExpectedCoefficient = 0.125 )
141- errorRows <- validateExpectedKinshipSubset(dataToTest , grandParentOffspring1 , errorRows )
142- rm(grandParentOffspring1 )
143-
144- grandParentOffspring2 <- merge(pedDf [! is.na(pedDf $ Sire ),], pedDf , by.x = c(' Sire' ), by.y = c(' Id' ), all.x = F , all.y = F )
145- grandParentOffspring2 <- dplyr :: bind_rows(
146- grandParentOffspring2 %> % select(Id , Dam.y ) %> % filter(! is.na(Dam.y )) %> % rename(Id2 = Dam.y ) %> % mutate(Relationship = ' Grandchild/Paternal Granddam' ),
147- grandParentOffspring2 %> % select(Id , Sire.y ) %> % filter(! is.na(Sire.y )) %> % rename(Id2 = Sire.y ) %> % mutate(Relationship = ' Grandchild/Paternal Grandsire' )
148- ) %> % mutate(ExpectedCoefficient = 0.125 )
149- errorRows <- validateExpectedKinshipSubset(dataToTest , grandParentOffspring2 , errorRows )
150- rm(grandParentOffspring2 )
151-
152- fullSibs <- merge(pedDf [! is.na(pedDf $ Dam ) & ! is.na(pedDf $ Sire ),], pedDf [! is.na(pedDf $ Dam ) & ! is.na(pedDf $ Sire ),], by = c(' Sire' , ' Dam' ), all.x = F , all.y = F ) %> %
153- select(Id.x , Id.y ) %> %
154- rename(Id = Id.x , Id2 = Id.y ) %> %
155- filter(Id != Id2 ) %> %
156- mutate(Relationship = ' Full sib' , ExpectedCoefficient = 0.25 )
157- errorRows <- validateExpectedKinshipSubset(dataToTest , fullSibs , errorRows )
158- rm(fullSibs )
159-
160- halfSibs1 <- merge(pedDf [! is.na(pedDf $ Dam ),], pedDf [! is.na(pedDf $ Dam ),], by = c(' Dam' ), all.x = F , all.y = F ) %> %
161- filter(Sire.x != Sire.y ) %> %
162- select(Id.x , Id.y ) %> %
163- rename(Id = Id.x , Id2 = Id.y ) %> %
164- filter(Id != Id2 ) %> %
165- mutate(Relationship = ' Half sib' , ExpectedCoefficient = 0.125 )
166- errorRows <- validateExpectedKinshipSubset(dataToTest , halfSibs1 , errorRows )
167- rm(halfSibs1 )
168-
169- halfSibs2 <- merge(pedDf [! is.na(pedDf $ Sire ),], pedDf [! is.na(pedDf $ Sire ),], by = c(' Sire' ), all.x = F , all.y = F ) %> %
170- filter(Dam.x != Dam.y ) %> %
171- select(Id.x , Id.y ) %> %
172- rename(Id = Id.x , Id2 = Id.y ) %> %
173- filter(Id != Id2 ) %> %
174- mutate(Relationship = ' Half sib' , ExpectedCoefficient = 0.125 )
175- errorRows <- validateExpectedKinshipSubset(dataToTest , halfSibs2 , errorRows )
176- rm(halfSibs2 )
177-
178- return (errorRows )
179- }
180-
181- speciesToProcess <- generateSpeciesToProcess(allPed , opts $ mergeSpeciesWithHybrids )
182-
18336newRecords <- NULL
184- for (speciesSet in speciesToProcess ){
185- allRecordsForSpecies <- allPed [allPed $ Species %in% speciesSet ,]
186- print(paste0(' Processing species set : ' , paste0( speciesSet , collapse = ' , ' ) , ' , with ' , nrow(allRecordsForSpecies ), ' IDs' ))
37+ for (species in unique( allPed $ Species ) ){
38+ allRecordsForSpecies <- allPed [allPed $ Species %in% species ,]
39+ print(paste0(' Processing species: ' , species , ' , with ' , nrow(allRecordsForSpecies ), ' IDs' ))
18740 if (nrow(allRecordsForSpecies ) == 1 ) {
18841 print(' single record, skipping' )
18942 newRecords <- dplyr :: bind_rows(newRecords ,data.frame (Id = allRecordsForSpecies $ Id , Id2 = allRecordsForSpecies $ Id , coefficient = 0.5 , Species = allRecordsForSpecies $ Species ))
@@ -219,19 +72,6 @@ for (speciesSet in speciesToProcess){
21972 temp.tri <- merge(temp.tri , allRecordsForSpecies [c(' Id' , ' Species' )], by = ' Id' , all.x = TRUE )
22073
22174 newRecords <- dplyr :: bind_rows(newRecords ,temp.tri )
222-
223- # NOTE: perform per-species to save memory
224- if (opts $ performKinshipValidation ) {
225- print(' Validating coefficients against expected values' )
226- errorRows <- validateExpectedKinship(allRecordsForSpecies , temp.tri )
227- if (! all(is.null(errorRows ))) {
228- fileName <- paste0(' kinshipErrors_' , paste0(speciesSet , collapse = ' .' ), ' .txt' )
229- print(paste0(' There were unexpected kinship values! See the file ' , fileName , ' for more information' ))
230- write.table(newRecords , file = fileName , row.names = FALSE , quote = FALSE , sep = ' \t ' )
231- } else {
232- print(' All coefficients were within expected ranges from predicted values' )
233- }
234- }
23575}
23676
23777# write TSV to disk
0 commit comments