PRDM9-Variants/compile-publication-variants.md at main · hgibling/PRDM9-Variants

Step 2. Compile znf sequences, get list of unique znfs, and give them temporary standardized names

Append author & year to each znf name and save in new file:

for FILE in intermediate-files/*znf-sequences*
do
NAME=$(basename ${FILE%-znf*})
awk -v NAME="$NAME" '{print NAME "_" $1 "\t" $2}' $FILE >> intermediate-files/publication-znf-sequences.tsv
done

Check which sequences are identical/unique and give standardized names:

# done in R

library(tidyr)
library(dplyr)
library(stringr)

pub.znf.seqs <- read.table("intermediate-files/publication-znf-sequences.tsv", header=F,
                      col.names=c("PubZnfName", "Sequence"))

# get all unique sequences and give standard names
znf.sequences <- pub.znf.seqs %>%
  # separate author from znf name
  separate(PubZnfName, sep="_", into=c("Publication", "ZnfName"), extra="merge") %>%
  mutate(Publication=sub("-", ".", Publication)) %>%
  mutate(Publication=str_to_title(Publication)) %>%
  # shift Oliver 2009 sequences to match the rest
  mutate(Sequence=ifelse(Publication=="Oliver.2009", sub("TGCAGGGAG", "", Sequence), Sequence)) %>%
  mutate(Sequence=ifelse(Publication=="Oliver.2009", sub("$", "TGCAGGGAG", Sequence), Sequence)) %>%
  # pivot wider to obtain unique znf sequences as rows
  pivot_wider(names_from=Publication, values_from=ZnfName) %>%
  # arrange in publication order (not including Oliver, since Berg is better known)
  arrange(Berg.2010, Berg.2011, Borel.2012, Jeffreys.2013, Hussin.2013, Alleva.2021) %>%
  # give temp standard names
  mutate(StandardName=paste0("z", str_pad(row_number(), 3, pad="0")), .before=1)

# Save full mapping info
write.table(znf.sequences, "intermediate-files/standardized-znf-sequences-map-step2.tsv", row.names=F, quote=F, sep="\t")

# Save just standard name and sequence
write.table(znf.sequences %>% select(StandardName, Sequence), "intermediate-files/standardized-znf-sequences-step2.tsv", row.names=F, quote=F, sep="\t", col.names=F)

Step 3. Compile allele znf content, get list of unique alleles, and give them temporary standardized names

Append author & year to each allele name and save in new file:

# exclude files with unnamed znfs
for FILE in intermediate-files/*allele-znf-content.tsv
do
NAME=$(basename ${FILE%-allele*})
awk -v NAME="$NAME" '{print NAME "\t" $1 "\t" $2}' $FILE >> intermediate-files/publication-allele-znf-content.tsv
done

Convert publication znf names to standardized names:

# done in R

library(tidyr)
library(dplyr)
library(stringr)

pub.znf.map <- read.table("intermediate-files/standardized-znf-sequences-map-step2.tsv", header=T)
pub.allele.znfs <- read.table("intermediate-files/publication-allele-znf-content.tsv",
                         header=F, col.names=c("Publication", "AlleleName", "ZnfContent"))

pub.znf.map.long <- pub.znf.map %>%
  select(-Sequence) %>%
  pivot_longer(cols=contains("20"), names_to="Publication", values_to="PubZnfName",
               values_drop_na=T)

# All unique sequences
allele.znfs <- pub.allele.znfs %>%
  mutate(Publication=sub("-", ".", Publication)) %>%
  mutate(Publication=str_to_title(Publication)) %>%
  # separate znfs with spacer
  mutate(ZnfContent=case_when(
    Publication=="Alleva.2021" ~ gsub("(.{2})", "\\1_\\2", ZnfContent),
    Publication!="Alleva.2021" ~ gsub("(.)", "\\1_\\2", ZnfContent))) %>%
  mutate(ZnfContent=sub("_$", "", ZnfContent)) %>%
  separate_rows(ZnfContent, sep="_") %>%
  rename(PubZnfName=ZnfContent) %>% 
  # Ponting.2011 used lowercase version of berg names, so convert to join with berg
  mutate(PubZnfName=case_when(
    Publication=="Ponting.2011" ~ toupper(PubZnfName),
    TRUE ~ PubZnfName)) %>%
  # duplicate Berg.2010 znfs to make a copy for Ponting.2011
  # assume Jeffreys.2013 N znf is the same as Berg.2010 N znf and add
  left_join(bind_rows(pub.znf.map.long,
                      pub.znf.map.long %>% filter(Publication=="Berg.2010") %>%
                        mutate(Publication="Ponting.2011"),
                      pub.znf.map.long %>% filter(Publication=="Berg.2010", PubZnfName=="N") %>%
                        mutate(Publication="Jeffreys.2013"))) %>%
    group_by(Publication, AlleleName) %>%
    summarize(StandardZnfContent=str_c(StandardName, collapse="_")) %>%
  # remove Ponting.2011 allele L24 and Borel.2012 allele I since they are incorrect
  filter(!(Publication=="Ponting.2011" & AlleleName=="L24"),
         !(Publication=="Borel.2012" & AlleleName=="I")) %>%
  pivot_wider(names_from=Publication, values_from=AlleleName) %>%
  # sort in publication order
  arrange(Berg.2010, Berg.2011, Ponting.2011, Borel.2012, Jeffreys.2013, Hussin.2013, Alleva.2021) %>%
  # give temporary standard names
  mutate(StandardName=paste0("p", str_pad(row_number(), 3, pad="0")), .before=1)

# Save full mapping info
write.table(allele.znfs, "intermediate-files/standardized-allele-znf-content-map-step3.tsv", row.names=F, quote=F, sep="\t")

# Save just standardized name and znf content
write.table(allele.znfs %>% select(StandardName, StandardZnfContent), "intermediate-files/standardized-allele-znf-content-step3.tsv", row.names=F, quote=F, sep="\t", col.names=F)

Step 4. Convert allele DNA sequences to temporary standardized znf names to confirm known alleles and identify ones not in current list

Append author & year to each allele name and save in new file:

for FILE in intermediate-files/*allele-sequences.tsv
do
NAME=$(basename ${FILE%-allele*})
awk -v NAME="$NAME" '{print NAME "\t" $1 "\t" $2}' $FILE >> intermediate-files/publication-allele-sequences.tsv
done

Replace znf sequences with temporary standardized znf names:

cp intermediate-files/publication-allele-sequences.tsv intermediate-files/publication-allele-sequences-standardized-step4.tsv

while read ZNF SEQUENCE
do
sed -i '' "s/$SEQUENCE/$ZNF\_/g" intermediate-files/publication-allele-sequences-standardized-step4.tsv
done < intermediate-files/standardized-znf-sequences-step2.tsv

# add _ between remaining nucs and named znfs & remove trailing _ after final named znf
sed -i '' -e 's/\([ACGT]\)\([Zz]\)/\1_\2/g' -e 's/\_$//' intermediate-files/publication-allele-sequences-standardized-step4.tsv

See if sequences that still have unnamed nucleotide chunks can be separated into 84bp chunks

# done in R

library(tidyr)
library(dplyr)

allele.seqs.znf.converted <- read.table("intermediate-files/publication-allele-sequences-standardized-step4.tsv", 
                                        header=F, col.names=c("Publication", "AlleleName", "ZnfContent"))
pub.znf.map <- read.table("intermediate-files/standardized-znf-sequences-map-step2.tsv", header=T)

unknown <- allele.seqs.znf.converted %>%
  # get only seqs with nucleotides left
  filter(grepl("[ACGT]", ZnfContent)) %>%
  separate_rows(ZnfContent, sep="_") %>%
  # remove known znfs
  filter(!grepl("Z", ZnfContent)) %>%
  mutate(ZnfLength=nchar(ZnfContent)) %>%
  # try to split seqs at common ending seq
  mutate(ZnfContent=ifelse(ZnfLength>84, 
                           gsub("CAGGGAG", "CAGGGAG_", ZnfContent), 
                           ZnfContent)) %>%
  separate_rows(ZnfContent, sep="_") %>%
  mutate(ZnfLength=nchar(ZnfContent)) %>%
  filter(ZnfLength!=0)
  # several nuc chunks do not have expected 84bp length

# one allele has 85bp nuc chunk instead of 84, could be 1bp insertion error
check.seq <- unknown %>%
  filter(AlleleName=="chr5:23527530:FN.4") %>%
  select(ZnfContent) %>%
  unname()

# calculate levenshtein distances between 85bp seq and known znf seqs
pub.znf.map %>%
  select(StandardName, Sequence) %>%
  mutate(LevDistanceToUnknown=adist(Sequence, check.seq)) %>%
  filter(LevDistanceToUnknown==min(LevDistanceToUnknown)) %>%
  as_tibble()

# two possible sequences are only off by 1
# StandardName Sequence                                  LevDistanceToUnknow…
# 1 z006         TGTGGGCGGGGCTTTAGCAATAAGTCACACCTCCTCAGACACCAGA…             1
# 2 z048         TGTGGGCGGGGCTTTAGAGATAAGTCACACCTCCTCAGACACCAGA…             1

# too ambiguous to declare, so ignore these three alleles

Three sequences from Beyter 2021 have unidentified sequences that don't have the expected lengths of 84bp. Ignore alleles chr5:23527530:FN.0, chr5:23527530:FN.1, and chr5:23527530:FN.4 for now.

Add any additional allele sequences to main list:

# done in R

library(tidyr)
library(dplyr)
library(stringr)

allele.seqs.znf.converted <- read.table("intermediate-files/publication-allele-sequences-standardized-step4.tsv", 
                                        header=F, col.names=c("Publication", "AlleleName", "StandardZnfContent"))
allele.znf.map <- read.table("intermediate-files/standardized-allele-znf-content-map-step3.tsv", 
                                  header=T)


allele.znfs.updated <- allele.seqs.znf.converted %>%
  # remove seqs with nucleotides left
  filter(!grepl("[ACGT]", StandardZnfContent)) %>%
  mutate(Publication=sub("-", ".", Publication)) %>%
  mutate(Publication=str_to_title(Publication)) %>%
  pivot_wider(names_from=Publication, values_from=AlleleName) %>%
  # join with original list of standardized allele znf content
  full_join(allele.znf.map) %>%
  arrange(StandardName) %>%
  # add temp standardized name to new allele
  mutate(StandardName=ifelse(is.na(StandardName), 
                             paste0("P", str_pad(row_number(), 3, pad="0")), 
                             StandardName)) %>%
  relocate(StandardName, .before=StandardZnfContent) %>%
  relocate(Beyter.2021, .after=Berg.2011)

# write updated files
write.table(allele.znfs.updated, "intermediate-files/standardized-allele-znf-content-map-step4.tsv", row.names=F, quote=F, sep="\t")

write.table(allele.znfs.updated %>% select(StandardName, StandardZnfContent), "intermediate-files/standardized-allele-znf-content-step4.tsv", row.names=F, quote=F, sep="\t", col.names=F)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Step 2. Compile znf sequences, get list of unique znfs, and give them temporary standardized names

Step 3. Compile allele znf content, get list of unique alleles, and give them temporary standardized names

Step 4. Convert allele DNA sequences to temporary standardized znf names to confirm known alleles and identify ones not in current list

FilesExpand file tree

compile-publication-variants.md

Latest commit

History

compile-publication-variants.md

File metadata and controls

Step 2. Compile znf sequences, get list of unique znfs, and give them temporary standardized names

Step 3. Compile allele znf content, get list of unique alleles, and give them temporary standardized names

Step 4. Convert allele DNA sequences to temporary standardized znf names to confirm known alleles and identify ones not in current list