-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprepare_common_snps.sh
More file actions
executable file
·103 lines (85 loc) · 5.61 KB
/
prepare_common_snps.sh
File metadata and controls
executable file
·103 lines (85 loc) · 5.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/bin/bash
# Prepares a file mapping RefSeq genome assembly identifiers to chromosome
# names, and downloads/prepares the 1000 Genomes Project Phase 3 variant sites
# plus the latest NCBI dbSNP set (~30 GB; download time ~45 min).
# These files will be used to filter out common SNPs from called variants.
#
# Usage: prepare_common_snps.sh local|hpc
#   local - run containerized steps with Docker
#   hpc   - run containerized steps with Singularity

# -u: fail on unset variables; -o pipefail: a pipeline fails if any stage fails.
# -e is deliberately NOT set: several grep/zgrep stages legitimately return
# non-zero when a pattern finds no match, which would abort the script.
set -uo pipefail

# ${1:-} so the check still works (instead of crashing under set -u) when the
# script is invoked with no argument at all.
if [ "${1:-}" != "local" ] && [ "${1:-}" != "hpc" ]; then
  # Usage/diagnostic output belongs on stderr.
  echo "Specify either 'local' or 'hpc' as an argument to the script, ex., $0 local" >&2
  exit 1
fi
DATA_DIR=/data

# Build a RefSeq-accession -> chromosome-name mapping (e.g. NC_000001.11 -> chr1)
# from the NCBI assembly report. Used later for filtering common SNPs from
# called variants. Paths are prefixed with "." so the data lands under ./data
# on the host while containers see it mounted at /data.
ASSEMBLY_DIR=.${DATA_DIR}/assembly_mapping
ASSEMBLY_VERSION=GCF_000001405.40_GRCh38.p14
ASSEMBLY_REPORT_LINK=https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/${ASSEMBLY_VERSION}/${ASSEMBLY_VERSION}_assembly_report.txt
ASSEMBLY_REPORT_FILE=assembly_report_file.tsv
MAPPING_FILE=gcf_refseq2chr.tsv

mkdir -p "${ASSEMBLY_DIR}"
if [ ! -f "${ASSEMBLY_DIR}/${MAPPING_FILE}" ]; then
  echo "Download the assembly report file to prepare a file containing the mapping of RefSeq genome assembly ids to chromosome names ..."
  # Abort on download failure; otherwise an empty/partial report would yield a
  # broken mapping file that later runs would treat as valid (cached) output.
  wget -O "${ASSEMBLY_DIR}/${ASSEMBLY_REPORT_FILE}" "${ASSEMBLY_REPORT_LINK}" \
    || { echo "ERROR: failed to download ${ASSEMBLY_REPORT_LINK}" >&2; exit 1; }
  # Skip comment lines; keep rows with a GenBank accession ($5 != "na");
  # emit: <RefSeq accession (col 7)> TAB <sequence name (col 1)>.
  awk -F '\t' '$0 !~ /^#/ && $5 != "na" {print $7 "\t" $1}' \
    "${ASSEMBLY_DIR}/${ASSEMBLY_REPORT_FILE}" > "${ASSEMBLY_DIR}/${MAPPING_FILE}"
  rm "${ASSEMBLY_DIR}/${ASSEMBLY_REPORT_FILE}"
fi
################
# Download the latest NCBI dbSNP release and prepare a two-column
# (chromosome TAB position) file of common variants (AF >= 0.01) for
# filtering common SNPs from called variants.
DBSNP_DIR=.${DATA_DIR}/ncbi_dbsnp
DBSNP_LINK=https://ftp.ncbi.nih.gov/snp/latest_release/VCF/GCF_000001405.40.gz
DBSNP_FILE=ncbi_dbsnp_latest.gz
FINAL_FILE=ncbi_dbsnp_latest.common.chr.tsv

mkdir -p "${DBSNP_DIR}"
# If the final file is not available perform the following steps
if [ ! -f "${DBSNP_DIR}/${FINAL_FILE}" ]; then
  # First check if the compressed prepared files are available and uncompress
  # and concatenate them to the final file if they are
  if [ -f "${DBSNP_DIR}/${FINAL_FILE}.1.gz" ] && [ -f "${DBSNP_DIR}/${FINAL_FILE}.2.gz" ]; then
    echo "The ${DBSNP_DIR}/${FINAL_FILE}.1.gz and ${DBSNP_DIR}/${FINAL_FILE}.2.gz file are detected. Unpacking and concatenating ..."
    gunzip -c "${DBSNP_DIR}/${FINAL_FILE}.1.gz" > "${DBSNP_DIR}/${FINAL_FILE}"
    gunzip -c "${DBSNP_DIR}/${FINAL_FILE}.2.gz" >> "${DBSNP_DIR}/${FINAL_FILE}"
  # If those files are not available then perform all the necessary steps to
  # obtain and prepare the final file
  else
    echo "Downloading gz compressed NBCI dbsnp database file (> 27.5 G, 10-20 min) ..."
    wget -O "${DBSNP_DIR}/${DBSNP_FILE}" "${DBSNP_LINK}" \
      || { echo "ERROR: failed to download ${DBSNP_LINK}" >&2; exit 1; }
    # Intermediate VCF holding the header plus all records flagged COMMON.
    COMMON_VCF=${DBSNP_DIR}/${DBSNP_FILE%.*}.common.vcf
    echo "Prepare file for filtering called variants (~10-15 min; requires RAM; can be helped through periodic cache dropping) ..."
    zgrep "^#" "${DBSNP_DIR}/${DBSNP_FILE}" > "${COMMON_VCF}"
    echo "Label COMMON for variants with AF>=0.01 (1%) (10-15 min; requires RAM; can be helped through periodic cache dropping) ... "
    zgrep ";COMMON" "${DBSNP_DIR}/${DBSNP_FILE}" >> "${COMMON_VCF}"
    # Get chr and position for common variants (AF >= 0.01).
    echo "Prepare a file with common variants replacing refseq ids with corresponding chromosome ids in the NCBI dbsnp file ..."
    # Single awk pass: load the refseq->chr map, then for each record whose
    # CHROM column is a mapped accession emit "<chr>\t<pos>".  This replaces
    # the original per-chromosome grep|sed|cut loop, which (a) let the dots in
    # RefSeq accessions act as regex wildcards, (b) also matched accessions
    # embedded in ##contig=<ID=...> header lines, leaking them into the
    # output, and (c) rescanned the whole multi-GB file once per chromosome.
    awk -F '\t' 'NR==FNR {map[$1]=$2; next} ($1 in map) {print map[$1] "\t" $2}' \
      "${ASSEMBLY_DIR}/${MAPPING_FILE}" "${COMMON_VCF}" \
      > "${DBSNP_DIR}/${FINAL_FILE}"
  fi
fi
#####
# Downloading a variant call format (VCF) file derived from the 1000 Genomes
# Project Phase 3 data, aligned to the GRCh38 (hg38) human genome reference.
# The final prepared file with variants with allele frequency (AF) >= 0.01
# will be used for variant filtering.
GPP_DIR=${DATA_DIR}/gpp_vcf
GPP_LINK=https://storage.googleapis.com/gcp-public-data--broad-references/hg38/v0/1000G_phase3_v4_20130502.sites.hg38.vcf
GPP_FILE=1000G_phase3_sites.hg38.vcf

mkdir -p ".${GPP_DIR}"
if [ ! -f ".${GPP_DIR}/${GPP_FILE}" ]; then
  echo "Downloading the VCF file derived from the 1000 Genomes Project Phase 3 data and its index (5.3G + 1.09M; ~4-5 min) ..."
  wget -O ".${GPP_DIR}/${GPP_FILE}" "${GPP_LINK}" \
    || { echo "ERROR: failed to download ${GPP_LINK}" >&2; exit 1; }
  wget -O ".${GPP_DIR}/${GPP_FILE}.idx" "${GPP_LINK}.idx" \
    || { echo "ERROR: failed to download ${GPP_LINK}.idx" >&2; exit 1; }
  echo "Filter variants with allele frequency (AF) ≥ 0.01 to be used for variant filtering ..."
  # Docker (and Singularity, reliably) require an ABSOLUTE host path for bind
  # mounts; the original relative ".${DATA_DIR}" (./data) is rejected by
  # "docker run -v", so resolve it against the current working directory.
  HOST_DATA_DIR="$(pwd)${DATA_DIR}"
  if [ "$1" == "local" ] && command -v docker &> /dev/null; then
    echo "Docker found. Running with Docker..."
    docker run --rm -w "${GPP_DIR}" -v "${HOST_DATA_DIR}:${DATA_DIR}" \
      community.wave.seqera.io/library/bcftools_gatk4:d89f6490b3de65be \
      bcftools view -i 'INFO/AF>=0.01' "${GPP_DIR}/${GPP_FILE}" -o "${GPP_DIR}/${GPP_FILE%.*}.af01.vcf"
  elif [ "$1" == "hpc" ] && command -v singularity &> /dev/null; then
    echo "Singularity found. Running with Singularity..."
    singularity exec --pwd "${GPP_DIR}" --bind "${HOST_DATA_DIR}:${DATA_DIR}" \
      docker://community.wave.seqera.io/library/bcftools_gatk4:d89f6490b3de65be \
      bcftools view -i 'INFO/AF>=0.01' "${GPP_DIR}/${GPP_FILE}" -o "${GPP_DIR}/${GPP_FILE%.*}.af01.vcf"
  else
    echo "No supported containerization tool matching argument '$1' was found in the environment." >&2
    # Fail loudly: without a container runtime the filtered VCF is never
    # produced, so exiting 0 here would mask the failure from callers.
    exit 1
  fi
fi