Skip to content

Commit 7832314

Browse files
committed
add hg38 HELA/K562 20M simulations scripts
add scripts to generate hg38 versions of the HELA and K562 datasets at 20M
1 parent 387b28f commit 7832314

5 files changed

Lines changed: 109 additions & 9 deletions

File tree

paper/SyntheticStrain/depth_simulations.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,5 @@ hg19_HELA 2M 18000
2020
hg19_HELA 5M 19000
2121
hg19_HELA 10M 20000
2222
hg19_HELA 20M 21000
23-
23+
hg38_K562 20M 16000
24+
hg38_HELA 20M 21000

paper/SyntheticStrain/job/generate_synthetic_genomes.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
ADDSNPS=scripts/add_VCF_into_Genomic_FASTA.py
66
SVCF=../db/sacCer3_VCF/full_VCF
77
HVCF=../db/hg19_VCF
8+
H38VCF=../db/VCF_hg38
89
[ -d synthetic_genome ] || mkdir synthetic_genome
910

1011
python $ADDSNPS -f ../input/sacCer3.fa -v $SVCF/RM11-1A.gatk.vcf > synthetic_genome/sacCer3_RM11-1A.fa
@@ -13,6 +14,9 @@ python $ADDSNPS -f ../input/sacCer3.fa -v $SVCF/CEN.PK2-1Ca.gatk.vcf > synthetic
1314
python $ADDSNPS -f ../input/hg19.fa -v $HVCF/K562.vcf > synthetic_genome/hg19_K562.fa
1415
python $ADDSNPS -f ../input/hg19.fa -v $HVCF/HELA.vcf > synthetic_genome/hg19_HELA.fa
1516

17+
python $ADDSNPS -f ../input/hg38.fa -v $H38VCF/K562.vcf > synthetic_genome/hg38_K562.fa
18+
python $ADDSNPS -f ../input/hg38.fa -v $H38VCF/HELA.vcf > synthetic_genome/hg38_HELA.fa
19+
1620
for FASTA in `ls synthetic_genome/*.fa`;
1721
do
1822
samtools faidx $FASTA
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#!/bin/bash
2+
#PBS -l nodes=1:ppn=4
3+
#PBS -l pmem=16gb
4+
#PBS -l walltime=03:00:00
5+
#PBS -A open
6+
#PBS -o logs/depth.hg38.HELA.20M.log.out
7+
#PBS -e logs/depth.hg38.HELA.20M.log.err
8+
#PBS -t 1-1000
9+
10+
# FIRST CHANGE PATH TO EXECUTE
11+
WRK=/path/to/GenoPipe/paper/SyntheticStrain
12+
cd $WRK
13+
14+
module load bedtools
15+
module load bwa
16+
module load samtools
17+
module load anaconda3
18+
source activate my-genopipe-env
19+
20+
INFO=`sed "24q;d" depth_simulations.txt`
21+
STRAIN=`awk '{print $1}' <(echo $INFO)`
22+
DEPTH=`awk '{print $2}' <(echo $INFO)`
23+
BASE=`awk '{print $3}' <(echo $INFO)`
24+
REF=`awk -F"_" '{print $1}' <(echo $STRAIN)`
25+
26+
GENOME=synthetic_genome/$STRAIN.fa
27+
OUTPUT=results/$STRAIN\_$DEPTH
28+
SEED=$(($BASE+$PBS_ARRAYID))
29+
30+
start=`date +%s`
31+
bash ../scripts/simulate.sh -i $PBS_ARRAYID -d $DEPTH -s $SEED -g $GENOME -o $OUTPUT
32+
bash ../scripts/align.sh -i $PBS_ARRAYID -g ../input/$REF.fa -o $OUTPUT -t 4
33+
end=`date +%s`
34+
runtime=$((end-start))
35+
echo "${STRAIN} ${DEPTH} simulate in ${runtime}"
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#!/bin/bash
2+
#PBS -l nodes=1:ppn=4
3+
#PBS -l pmem=16gb
4+
#PBS -l walltime=03:00:00
5+
#PBS -A open
6+
#PBS -o logs/depth.hg38.K562.20M.log.out
7+
#PBS -e logs/depth.hg38.K562.20M.log.err
8+
#PBS -t 1-1000
9+
10+
# FIRST CHANGE PATH TO EXECUTE
11+
WRK=/path/to/GenoPipe/paper/SyntheticStrain
12+
cd $WRK
13+
14+
module load bedtools
15+
module load bwa
16+
module load samtools
17+
module load anaconda3
18+
source activate my-genopipe-env
19+
20+
INFO=`sed "23q;d" depth_simulations.txt`
21+
STRAIN=`awk '{print $1}' <(echo $INFO)`
22+
DEPTH=`awk '{print $2}' <(echo $INFO)`
23+
BASE=`awk '{print $3}' <(echo $INFO)`
24+
REF=`awk -F"_" '{print $1}' <(echo $STRAIN)`
25+
26+
GENOME=synthetic_genome/$STRAIN.fa
27+
OUTPUT=results/$STRAIN\_$DEPTH
28+
SEED=$(($BASE+$PBS_ARRAYID))
29+
30+
start=`date +%s`
31+
bash ../scripts/simulate.sh -i $PBS_ARRAYID -d $DEPTH -s $SEED -g $GENOME -o $OUTPUT
32+
bash ../scripts/align.sh -i $PBS_ARRAYID -g ../input/$REF.fa -o $OUTPUT -t 4
33+
end=`date +%s`
34+
runtime=$((end-start))
35+
echo "${STRAIN} ${DEPTH} simulate in ${runtime}"

paper/setup.sh

Lines changed: 33 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -48,13 +48,10 @@ if [[ ! -f $YGENOME ]]; then
4848
echo "Parsing genome..."
4949
YPARSER=../EpitopeID/utility_scripts/genome_data/parsers/parse_sacCer3_Genome_FASTA.pl
5050
perl $YPARSER S288C_reference_genome_R64-1-1_20110203/S288C_reference_sequence_R64-1-1_20110203.fsa $YGENOME
51-
echo "Complete"
5251
echo "BWA Indexing genome..."
5352
bwa index $YGENOME
54-
echo "Complete"
5553
echo "Bowtie2 Indexing genome..."
5654
bowtie2-build $YGENOME $YGENOME
57-
echo "Complete"
5855
rm S288C_reference_genome_R64-1-1_20110203.tgz
5956
rm -r S288C_reference_genome_R64-1-1_20110203/
6057
fi
@@ -76,17 +73,39 @@ if [[ ! -f $HGENOME ]]; then
7673
chmod 777 twoBitToFa
7774
echo "Converting 2bit to fa..."
7875
./twoBitToFa hg19.2bit $HGENOME.raw
79-
echo "Complete"
8076
echo "Strip haplotypes..."
8177
python scripts/parse_hg19_Genome_FASTA.py $HGENOME.raw > $HGENOME
82-
echo "Complete"
8378
echo "BWA Indexing genome..."
8479
bwa index $HGENOME
80+
echo "Bowtie2 Indexing genome..."
8581
bowtie2-build $HGENOME $HGENOME
8682
echo "Complete"
8783
rm twoBitToFa hg19.2bit $HGENOME.raw
8884
fi
8985

86+
87+
# Download Human Genome (hg38)
88+
HGENOME=input/hg38.fa
89+
if [[ ! -f $HGENOME ]]; then
90+
echo "**Human genome not found, downloading to $HGENOME..."
91+
wget -N http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.2bit
92+
# Check for the existence of twoBitToFa, download if not present and make globally executable
93+
if ! command -v twoBitToFa; then
94+
unameOUT="$(uname -s)"
95+
if [ $unameOUT == "Darwin" ]; then
96+
wget -N http://hgdownload.soe.ucsc.edu/admin/exe/macOSX.x86_64/twoBitToFa
97+
else
98+
wget -N http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64.v369/twoBitToFa
99+
fi
100+
fi
101+
chmod 777 twoBitToFa
102+
echo "Converting 2bit to fa..."
103+
./twoBitToFa hg38.2bit $HGENOME
104+
echo "BWA Indexing genome..."
105+
bwa index $HGENOME
106+
rm twoBitToFa hg38.2bit
107+
fi
108+
90109
# Build Yeast EpiID with R500
91110
YEPIDB=db/sacCer3_EpiID
92111
[ -d $YEPIDB ] || cp -r ../EpitopeID/sacCer3_EpiID/ $YEPIDB
@@ -95,10 +114,8 @@ if [ ! -f $YEPIDB/FASTA_genome/genome.fa ]; then
95114
cp input/sacCer3.fa $YEPIDB/FASTA_genome/genome.fa
96115
echo "BWA Indexing genome..."
97116
bwa index $YEPIDB/FASTA_genome/genome.fa
98-
echo "Complete"
99117
echo "Bowtie2 Indexing genome..."
100118
bowtie2-build $YEPIDB/FASTA_genome/genome.fa $YEPIDB/FASTA_genome/genome.fa
101-
echo "Complete"
102119
fi
103120
echo "Setup Random Epitope for Yeast EpiID..."
104121
cp input/RAND_500.fa $YEPIDB/FASTA_tag/Tag_DB/
@@ -124,7 +141,6 @@ if [ ! -f $HEPIDB/FASTA_genome/genome.fa ]; then
124141
# echo "Complete"
125142
echo "Bowtie2 Indexing genome..."
126143
bowtie2-build $HEPIDB/FASTA_genome/genome.fa $HEPIDB/FASTA_genome/genome.fa
127-
echo "Complete"
128144
fi
129145
echo "Setup Random Epitope for Human EpiID..."
130146
cp input/RAND_500.fa $HEPIDB/FASTA_tag/Tag_DB/
@@ -167,9 +183,18 @@ ln -s ../../DeletionID/sacCer3_Del
167183
cd $WRK
168184

169185
# Add Yeast & Human StrainID DB to paper/db
186+
cd $WRK/../StrainID
187+
tar -xvf hg38_VCF
188+
tar -xvf hg38_ENCODE
189+
cd utility_scripts
190+
bash generate_hg38_VariantDB.#!/bin/sh
191+
mv hg38_DepMap ../
170192
cd $WRK/db
171193
ln -s ../../StrainID/sacCer3_VCF
172194
ln -s ../../StrainID/hg19_VCF
195+
ln -s ../../StrainID/hg38_VCF
196+
ln -s ../../StrainID/hg38_ENCODE
197+
ln -s ../../StrainID/hg38_DepMap
173198
cd $WRK
174199

175200
# Setup color-space index for yeast genome

0 commit comments

Comments
 (0)