Skip to content

Commit f79e512

Browse files
committed
adding script to download data
1 parent a79cb02 commit f79e512

1 file changed

Lines changed: 239 additions & 0 deletions

File tree

downloadData.sh

Lines changed: 239 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,239 @@
1+
#!/bin/bash
## This script creates one sub-directory per dataset (Atlas/, Recombination/,
## Introgression/, Populations/, Reference/) in the output directory and
## downloads the data used in this study, each into its own sub-directory.
# Enable extended glob patterns.
# NOTE(review): no extended pattern is visible in this script - confirm this
# option is still needed.
shopt -s extglob

# Set the download directory (same as working directory by default):
workingdir="$(pwd)"
outputdir="${workingdir}"

# Setting option for downloading only specific dataset (default: all).
# steps2run lists the steps to execute; runonlystep stays empty unless the
# --runonlystep option is given on the command line.
declare -a steps2run=(step1 step2 step3 step4 step5)
runonlystep=""

# Help menu:

#######################################
# Print the help text of this script.
# Globals:   none
# Arguments: none ($0 is used as the script name in the examples)
# Outputs:   the help text on stdout
# NOTE(review): the script prints this help and exits when it is called
# without any argument, so the bare "Usage: $0" form below does not start a
# download on its own - confirm which behaviour is intended.
#######################################
usage() {
  # Unquoted delimiter on purpose: $0 must be expanded in the text.
  cat <<EOF

Download Data Help Section:
===========================

Usage: $0

Or for more options:

Example: $0 -o ~/path/to/my/directory/

Optional arguments:
    -o|--outputdir <path>     : Directory path for downloaded data
    --runonlystep <string>    : Indicate a specific step to run (see below)
    -h|--help                 : Display this help message

This script will download all 5 datasets by default. To download only
a specific dataset, designate a step number when running the script.
The following step numbers are valid options:

    step1 : downloads the Human Genome Dating atlas dataset from https://human.genome.dating/
    step2 : downloads recombination maps for GRCh37 from
            https://github.com/joepickrell/1000-genomes-genetic-maps
    step3 : downloads introgression map files from http://dical-admix.sourceforge.net/
    step4 : downloads population-specific annotation file from
            http://ftp.ensembl.org/pub/grch37/release-103/variation/gvf/homo_sapiens/
    step5 : downloads GRCh37 reference genome files from
            https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.25/

EOF
}

# Show the help text and stop when the script is called without any argument.
if [[ -z "$*" ]]; then
  usage
  exit 1
fi


# Parsing command-line arguments:

# The --long form needs the enhanced (util-linux) getopt. Its output is
# shell-quoted, which is why it is fed back through "eval set --" below.
if ! OPTIONS=$(getopt -o o:h --long outputdir:,runonlystep:,help -n 'download error' -- "$@"); then
  echo " "
  echo "Could not parse options (see above) ..." >&2
  usage
  exit 1
fi

eval set -- "$OPTIONS"

while true; do
  case "$1" in
    -o|--outputdir)
      # getopt takes the next word as the value even when it looks like an
      # option (e.g. "-o -h"), so values starting with "-" are rejected.
      case "$2" in
        -*) echo "Please designate an output directory when using -o" >&2; usage; exit 1 ;;
        *) outputdir=$2; shift 2 ;;
      esac ;;

    --runonlystep)
      case "$2" in
        -*) echo "Please designate the download step to execute (step1 ... step5)" >&2; usage; exit 1 ;;
        *) runonlystep=$2; shift 2 ;;
      esac ;;

    -h|--help)
      # Help was asked for explicitly, so this is a successful exit.
      usage; exit 0 ;;

    --) shift; break ;;

    *) echo "Unknown option or error" >&2; usage; exit 1 ;;

  esac
done

# Control which step to run:

# A non-empty runonlystep replaces the default step list. Several steps may be
# given as one whitespace-separated string. read (instead of an unquoted
# array assignment) splits on whitespace without expanding glob characters;
# -d '' makes it read the whole value, and it then returns non-zero at end of
# input, hence the "|| true".
if [[ -n "$runonlystep" ]]; then
  read -r -d '' -a steps2run <<< "$runonlystep" || true
fi

# One flag per step: 1 = run the step, 0 = skip it.
runstep1=0
runstep2=0
runstep3=0
runstep4=0
runstep5=0
for step in "${steps2run[@]}"; do
  case "$step" in
    step1) runstep1=1 ;;
    step2) runstep2=1 ;;
    step3) runstep3=1 ;;
    step4) runstep4=1 ;;
    step5) runstep5=1 ;;
    *)
      # An unknown name would otherwise be ignored and nothing downloaded.
      echo "Unknown step '$step'. Valid steps are: step1 step2 step3 step4 step5" >&2
      exit 1 ;;
  esac
done

# Step 0
#
echo ""
echo "=============================================================================="
echo " Downloading datasets for the Wang etal., 2022 paper ..."
echo "=============================================================================="
echo ""

# Create the output directory when it does not exist yet, then create one
# sub-directory per dataset inside it.
if [[ ! -d "$outputdir" ]]; then
  echo ""
  echo "... new output directory $outputdir will be created in your working directory."
  # -p so that a nested path (a/b/c) can be created in one go.
  if ! mkdir -p -- "$outputdir"; then
    echo "Could not create output directory $outputdir" >&2
    exit 1
  fi
fi

# Stop if the directory cannot be entered; otherwise the sub-directories
# below would be created in whatever directory the script happens to be in.
if ! cd -- "$outputdir"; then
  echo "Could not enter output directory $outputdir" >&2
  exit 1
fi
mkdir -p Atlas Recombination Introgression Populations Reference || exit 1

# Step 1
# Downloads the per-chromosome atlas files (chromosomes 1 to 22) into Atlas/
# and decompresses them. Exits with status 1 if any file could not be fetched.
if [[ $runstep1 == 1 ]]; then
  echo ""
  echo "=============================================================================="
  echo "Step 1 Downloading atlas SNP dating files ..."
  echo "=============================================================================="
  echo ""

  # Return to the start directory first so that a relative output directory
  # still resolves correctly.
  cd "${workingdir}" || exit 1
  cd "${outputdir}/Atlas" || exit 1

  atlas_failed=0
  for i in {1..22}; do
    # Chromosomes whose decompressed file is already present are skipped.
    if [[ ! -e "atlas.chr${i}.csv" ]]; then
      # -O overwrites a partial .gz left by an interrupted run instead of
      # letting wget store the new download under a ".1" suffix.
      if ! wget -O "atlas.chr${i}.csv.gz" "https://human.genome.dating/bulk/atlas.chr${i}.csv.gz" \
        || ! gunzip -f "atlas.chr${i}.csv.gz"; then
        rm -f -- "atlas.chr${i}.csv.gz"
        echo "Failed to download atlas.chr${i}.csv.gz" >&2
        atlas_failed=1
      fi
    fi
  done

  # Keep going through all chromosomes, but do not report success if any
  # of them is missing.
  if [[ $atlas_failed == 1 ]]; then
    echo "Some atlas files could not be downloaded" >&2
    exit 1
  fi

  echo "Atlas files downloaded"
  echo ""
fi

# Step 2
# Downloads the HapMap II GRCh37 recombination hotspot archive into
# Recombination/ and extracts it. Exits with status 1 on failure.
if [[ $runstep2 == 1 ]]; then
  echo ""
  echo "=============================================================================="
  echo "Step 2 Downloading GRCH37 recombination maps ..."
  echo "=============================================================================="
  echo ""

  # Return to the start directory first so that a relative output directory
  # still resolves correctly.
  cd "${workingdir}" || exit 1
  cd "${outputdir}/Recombination" || exit 1

  recomb_archive=HapmapII_GRCh37_RecombinationHotspots.tar.gz
  if [[ ! -e "$recomb_archive" ]]; then
    # Download under a temporary name: the presence of the final archive name
    # is what marks this step as done, so a truncated file from an
    # interrupted transfer must never carry that name.
    if ! wget -O "${recomb_archive}.part" "ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/working/20110106_recombination_hotspots/${recomb_archive}"; then
      rm -f -- "${recomb_archive}.part"
      echo "Failed to download ${recomb_archive}" >&2
      exit 1
    fi
    mv -- "${recomb_archive}.part" "$recomb_archive" || exit 1
    if ! tar -xzvf "$recomb_archive"; then
      echo "Failed to extract ${recomb_archive}" >&2
      exit 1
    fi
  fi

  echo "Recombination maps downloaded"
  echo ""
fi

# Step 3
# Nothing is downloaded in this step: it only prints where the Neanderthal
# introgression files can be fetched manually.
if [[ $runstep3 == 1 ]]; then
  echo ""
  echo "=============================================================================="
  echo "Step 3 Downloading Neanderthal introgression files ..."
  echo "=============================================================================="
  echo ""

  # Return to the start directory first so that a relative output directory
  # still resolves correctly.
  cd "${workingdir}" || exit 1
  cd "${outputdir}/Introgression" || exit 1

  echo "
Actually, I couldn't put a script here to download this data directly.
The Neanderthal introgression files from Steinrucken et al., 2018
are hosted on a google drive. You can download it yourself following
this link here:

https://drive.google.com/drive/folders/175ae-y9Q9Q6FQN6kQS6iGduzeAdHQZXY
"
  echo ""
fi

# Step 4
# Downloads the 1000 Genomes phase 3 GVF annotation file into Populations/
# and decompresses it. Exits with status 1 on failure.
if [[ $runstep4 == 1 ]]; then
  echo ""
  echo "=============================================================================="
  echo "Step 4 Downloading population-specific annotations ..."
  echo "=============================================================================="
  echo ""

  # Return to the start directory first so that a relative output directory
  # still resolves correctly.
  cd "${workingdir}" || exit 1
  cd "${outputdir}/Populations" || exit 1

  gvf_file=1000GENOMES-phase_3.gvf
  # Skipped when the decompressed file is already present.
  if [[ ! -e "$gvf_file" ]]; then
    # -O overwrites a partial .gz left by an interrupted run instead of
    # letting wget store the new download under a ".1" suffix.
    if ! wget -O "${gvf_file}.gz" "http://ftp.ensembl.org/pub/grch37/release-103/variation/gvf/homo_sapiens/${gvf_file}.gz" \
      || ! gunzip -f "${gvf_file}.gz"; then
      rm -f -- "${gvf_file}.gz"
      echo "Failed to download ${gvf_file}.gz" >&2
      exit 1
    fi
  fi

  echo "Population-specific GVF file downloaded"
  echo ""
fi

# Step 5
# Downloads the GRCh37.p13 reference genome FASTA into Reference/ and
# decompresses it. Exits with status 1 on failure.
if [[ $runstep5 == 1 ]]; then
  echo ""
  echo "=============================================================================="
  echo "Step 5 Downloading GRCh37 reference genome ..."
  echo "=============================================================================="
  echo ""

  # Return to the start directory first so that a relative output directory
  # still resolves correctly.
  cd "${workingdir}" || exit 1
  cd "${outputdir}/Reference" || exit 1

  ref_fasta=GCF_000001405.25_GRCh37.p13_genomic.fna
  # Skipped when the decompressed file is already present.
  if [[ ! -e "$ref_fasta" ]]; then
    # -O overwrites a partial .gz left by an interrupted run instead of
    # letting wget store the new download under a ".1" suffix.
    if ! wget -O "${ref_fasta}.gz" "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.25_GRCh37.p13/${ref_fasta}.gz" \
      || ! gunzip -f "${ref_fasta}.gz"; then
      rm -f -- "${ref_fasta}.gz"
      echo "Failed to download ${ref_fasta}.gz" >&2
      exit 1
    fi
  fi

  echo "Reference genome file downloaded"
  echo ""
fi

# Closing banner, printed once all selected steps have been processed.
echo ""
echo "=============================================================================="
echo "Files downloaded."
echo "=============================================================================="
echo ""

# Explicit status so the result does not depend on the last command run.
exit 0

0 commit comments

Comments
 (0)