Skip to content

Commit 908adc3

Browse files
authored
Merge pull request #14 from CEGRcode/datasets
SyntheticStrain change sacCer3 depths and tally simulation results
2 parents b41fa80 + 83c7abb commit 908adc3

43 files changed

Lines changed: 1147 additions & 133 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

paper/SyntheticStrain/depth_simulations.txt

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
1-
sacCer3_CEN.PK2-1Ca 500K 0
2-
sacCer3_CEN.PK2-1Ca 1M 1000
3-
sacCer3_CEN.PK2-1Ca 2M 2000
4-
sacCer3_CEN.PK2-1Ca 3M 3000
5-
sacCer3_CEN.PK2-1Ca 4M 4000
6-
sacCer3_CEN.PK2-1Ca 5M 5000
7-
sacCer3_RM11-1A 500K 6000
8-
sacCer3_RM11-1A 1M 7000
9-
sacCer3_RM11-1A 2M 8000
10-
sacCer3_RM11-1A 3M 9000
11-
sacCer3_RM11-1A 4M 10000
12-
sacCer3_RM11-1A 5M 11000
1+
sacCer3_CEN.PK2-1Ca 10K 0
2+
sacCer3_CEN.PK2-1Ca 50K 1000
3+
sacCer3_CEN.PK2-1Ca 100K 2000
4+
sacCer3_CEN.PK2-1Ca 500K 3000
5+
sacCer3_CEN.PK2-1Ca 1M 4000
6+
sacCer3_CEN.PK2-1Ca 2M 5000
7+
sacCer3_RM11-1A 10K 6000
8+
sacCer3_RM11-1A 50K 7000
9+
sacCer3_RM11-1A 100K 8000
10+
sacCer3_RM11-1A 500K 9000
11+
sacCer3_RM11-1A 1M 10000
12+
sacCer3_RM11-1A 2M 11000
1313
hg19_K562 1M 12000
1414
hg19_K562 2M 13000
1515
hg19_K562 5M 14000

paper/SyntheticStrain/job/generate_synthetic_genomes.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
#!/bin/bash
22

3-
# This script makes genomes to simulate from. Two yeast and two human genomes each with variants from VCF incorporated into sequence
3+
# This script makes the genomes to simulate from: two yeast and two human genomes, each with a different variant profile based on their respective full-sized VCF files
44

55
ADDSNPS=scripts/add_VCF_into_Genomic_FASTA.py
6-
SVCF=../db/sacCer3_VCF
6+
SVCF=../db/sacCer3_VCF/full_VCF
77
HVCF=../db/hg19_VCF
88
[ -d synthetic_genome ] || mkdir synthetic_genome
99

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
#!/bin/bash
#PBS -l nodes=1:ppn=6
#PBS -l pmem=24gb
#PBS -l walltime=00:05:00
#PBS -A open
#PBS -o logs/depth.sid.CENPK.100K.log.out
#PBS -e logs/depth.sid.CENPK.100K.log.err
#PBS -t 1-1000

# StrainID depth simulation for one row of depth_simulations.txt.
# Each array task (PBS_ARRAYID, 1-1000 per the -t directive above) links one
# simulated BAM and its index into a private temp directory, runs
# identify-Strain.sh on that directory, and writes the tool's stdout to
# Simulation_<id>.std and the `time` report (plus the tool's stderr) to
# Simulation_<id>.time under results/<LOCUS>_<DEPTH>/ID.

module load gcc/8.3.1
module load bedtools/2.27.1
module load bwa/0.7.15
module load samtools/1.5
module load anaconda3
source activate genopipe

# FIRST CHANGE PATH TO EXECUTE
WRK=/path/to/GenoPipe/paper/SyntheticStrain
# Row of depth_simulations.txt handled by this script; also tags the temp dir
LINE=3

# Print a message on stdout (where the original messages went) and fail the
# task with a non-zero status so the scheduler does not record it as a success.
die() {
  echo "$*"
  exit 1
}

# Without an array index the seed arithmetic and every path below are invalid
: "${PBS_ARRAYID:?PBS_ARRAYID is not set; submit this script as a PBS array job}"

cd "$WRK" || die "Cannot change directory to $WRK. Exiting."
# -p: many array tasks start at once, so the directory may appear between a
# test and a plain mkdir
mkdir -p logs

# Whitespace-separated columns: locus (<REF>_<strain>), depth, seed base
read -r LOCUS DEPTH BASE _ < <(sed "${LINE}q;d" depth_simulations.txt)
if [ -z "$LOCUS" ] || [ -z "$DEPTH" ] || [ -z "$BASE" ]; then
  die "Line $LINE of depth_simulations.txt is missing or incomplete. Exiting."
fi

# Reference build is the part of the locus before the first underscore
REF=${LOCUS%%_*}
SEED=$((BASE + PBS_ARRAYID))

OUTPUT=$WRK/results/${LOCUS}_${DEPTH}
BAM=$OUTPUT/BAM/Simulation_$PBS_ARRAYID.bam
TEMP=$WRK/temp$LINE-$PBS_ARRAYID

GENOME=$WRK/../input/$REF.fa
DATABASE=$WRK/../db/${REF}_VCF
GENOPIPE=$WRK/../..

# Validate the inputs before creating anything, so a failed task leaves no
# stray directories behind
#Check that BAM file was generated first
[ -f "$BAM" ] || die "BAM input for ${LOCUS}_${DEPTH}_${PBS_ARRAYID} does not exist. Exiting."
#Check that BAM Index file exists
[ -f "$BAM.bai" ] || die "BAI missing for ${LOCUS}_${DEPTH}_${PBS_ARRAYID}. Exiting."

mkdir -p "$OUTPUT/ID" "$TEMP" || die "Cannot create $OUTPUT/ID or $TEMP. Exiting."
# Clean-up: remove the temp directory on every exit path, including failures
trap 'rm -rf -- "$TEMP"' EXIT

# Set-up Temp directory
echo "$BAM"
# -f so a link left over from an interrupted earlier run does not block this one
ln -sf "$BAM" "$BAM.bai" "$TEMP/" || die "Cannot link BAM input into $TEMP. Exiting."

## Execute Single StrainID and record time
cd "$GENOPIPE/StrainID" || die "Cannot change directory to $GENOPIPE/StrainID. Exiting."
echo "**Begin executing StrainID for ${LOCUS}_${DEPTH}..."
{ time bash identify-Strain.sh -i "$TEMP" -g "$GENOME" -v "$DATABASE" -o "$OUTPUT/ID" -s "$SEED" > "$OUTPUT/ID/Simulation_$PBS_ARRAYID.std" ; } 2> "$OUTPUT/ID/Simulation_$PBS_ARRAYID.time"
echo "...single StrainID for ${LOCUS} ${DEPTH} finished."
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
#!/bin/bash
#PBS -l nodes=1:ppn=6
#PBS -l pmem=24gb
#PBS -l walltime=00:05:00
#PBS -A open
#PBS -o logs/depth.sid.CENPK.10K.log.out
#PBS -e logs/depth.sid.CENPK.10K.log.err
#PBS -t 1-1000

# StrainID depth simulation for one row of depth_simulations.txt.
# Each array task (PBS_ARRAYID, 1-1000 per the -t directive above) links one
# simulated BAM and its index into a private temp directory, runs
# identify-Strain.sh on that directory, and writes the tool's stdout to
# Simulation_<id>.std and the `time` report (plus the tool's stderr) to
# Simulation_<id>.time under results/<LOCUS>_<DEPTH>/ID.

module load gcc/8.3.1
module load bedtools/2.27.1
module load bwa/0.7.15
module load samtools/1.5
module load anaconda3
source activate genopipe

# FIRST CHANGE PATH TO EXECUTE
WRK=/path/to/GenoPipe/paper/SyntheticStrain
# Row of depth_simulations.txt handled by this script; also tags the temp dir
LINE=1

# Print a message on stdout (where the original messages went) and fail the
# task with a non-zero status so the scheduler does not record it as a success.
die() {
  echo "$*"
  exit 1
}

# Without an array index the seed arithmetic and every path below are invalid
: "${PBS_ARRAYID:?PBS_ARRAYID is not set; submit this script as a PBS array job}"

cd "$WRK" || die "Cannot change directory to $WRK. Exiting."
# -p: many array tasks start at once, so the directory may appear between a
# test and a plain mkdir
mkdir -p logs

# Whitespace-separated columns: locus (<REF>_<strain>), depth, seed base
read -r LOCUS DEPTH BASE _ < <(sed "${LINE}q;d" depth_simulations.txt)
if [ -z "$LOCUS" ] || [ -z "$DEPTH" ] || [ -z "$BASE" ]; then
  die "Line $LINE of depth_simulations.txt is missing or incomplete. Exiting."
fi

# Reference build is the part of the locus before the first underscore
REF=${LOCUS%%_*}
SEED=$((BASE + PBS_ARRAYID))

OUTPUT=$WRK/results/${LOCUS}_${DEPTH}
BAM=$OUTPUT/BAM/Simulation_$PBS_ARRAYID.bam
TEMP=$WRK/temp$LINE-$PBS_ARRAYID

GENOME=$WRK/../input/$REF.fa
DATABASE=$WRK/../db/${REF}_VCF
GENOPIPE=$WRK/../..

# Validate the inputs before creating anything, so a failed task leaves no
# stray directories behind
#Check that BAM file was generated first
[ -f "$BAM" ] || die "BAM input for ${LOCUS}_${DEPTH}_${PBS_ARRAYID} does not exist. Exiting."
#Check that BAM Index file exists
[ -f "$BAM.bai" ] || die "BAI missing for ${LOCUS}_${DEPTH}_${PBS_ARRAYID}. Exiting."

mkdir -p "$OUTPUT/ID" "$TEMP" || die "Cannot create $OUTPUT/ID or $TEMP. Exiting."
# Clean-up: remove the temp directory on every exit path, including failures
trap 'rm -rf -- "$TEMP"' EXIT

# Set-up Temp directory
echo "$BAM"
# -f so a link left over from an interrupted earlier run does not block this one
ln -sf "$BAM" "$BAM.bai" "$TEMP/" || die "Cannot link BAM input into $TEMP. Exiting."

## Execute Single StrainID and record time
cd "$GENOPIPE/StrainID" || die "Cannot change directory to $GENOPIPE/StrainID. Exiting."
echo "**Begin executing StrainID for ${LOCUS}_${DEPTH}..."
{ time bash identify-Strain.sh -i "$TEMP" -g "$GENOME" -v "$DATABASE" -o "$OUTPUT/ID" -s "$SEED" > "$OUTPUT/ID/Simulation_$PBS_ARRAYID.std" ; } 2> "$OUTPUT/ID/Simulation_$PBS_ARRAYID.time"
echo "...single StrainID for ${LOCUS} ${DEPTH} finished."
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
#!/bin/bash
#PBS -l nodes=1:ppn=6
#PBS -l pmem=24gb
#PBS -l walltime=00:05:00
#PBS -A open
#PBS -o logs/depth.sid.CENPK.1M.log.out
#PBS -e logs/depth.sid.CENPK.1M.log.err
#PBS -t 1-1000

# StrainID depth simulation for one row of depth_simulations.txt.
# Each array task (PBS_ARRAYID, 1-1000 per the -t directive above) links one
# simulated BAM and its index into a private temp directory, runs
# identify-Strain.sh on that directory, and writes the tool's stdout to
# Simulation_<id>.std and the `time` report (plus the tool's stderr) to
# Simulation_<id>.time under results/<LOCUS>_<DEPTH>/ID.

module load gcc/8.3.1
module load bedtools/2.27.1
module load bwa/0.7.15
module load samtools/1.5
module load anaconda3
source activate genopipe

# FIRST CHANGE PATH TO EXECUTE
WRK=/path/to/GenoPipe/paper/SyntheticStrain
# Row of depth_simulations.txt handled by this script; also tags the temp dir
LINE=5

# Print a message on stdout (where the original messages went) and fail the
# task with a non-zero status so the scheduler does not record it as a success.
die() {
  echo "$*"
  exit 1
}

# Without an array index the seed arithmetic and every path below are invalid
: "${PBS_ARRAYID:?PBS_ARRAYID is not set; submit this script as a PBS array job}"

cd "$WRK" || die "Cannot change directory to $WRK. Exiting."
# -p: many array tasks start at once, so the directory may appear between a
# test and a plain mkdir
mkdir -p logs

# Whitespace-separated columns: locus (<REF>_<strain>), depth, seed base
read -r LOCUS DEPTH BASE _ < <(sed "${LINE}q;d" depth_simulations.txt)
if [ -z "$LOCUS" ] || [ -z "$DEPTH" ] || [ -z "$BASE" ]; then
  die "Line $LINE of depth_simulations.txt is missing or incomplete. Exiting."
fi

# Reference build is the part of the locus before the first underscore
REF=${LOCUS%%_*}
SEED=$((BASE + PBS_ARRAYID))

OUTPUT=$WRK/results/${LOCUS}_${DEPTH}
BAM=$OUTPUT/BAM/Simulation_$PBS_ARRAYID.bam
TEMP=$WRK/temp$LINE-$PBS_ARRAYID

GENOME=$WRK/../input/$REF.fa
DATABASE=$WRK/../db/${REF}_VCF
GENOPIPE=$WRK/../..

# Validate the inputs before creating anything, so a failed task leaves no
# stray directories behind
#Check that BAM file was generated first
[ -f "$BAM" ] || die "BAM input for ${LOCUS}_${DEPTH}_${PBS_ARRAYID} does not exist. Exiting."
#Check that BAM Index file exists
[ -f "$BAM.bai" ] || die "BAI missing for ${LOCUS}_${DEPTH}_${PBS_ARRAYID}. Exiting."

mkdir -p "$OUTPUT/ID" "$TEMP" || die "Cannot create $OUTPUT/ID or $TEMP. Exiting."
# Clean-up: remove the temp directory on every exit path, including failures
trap 'rm -rf -- "$TEMP"' EXIT

# Set-up Temp directory
echo "$BAM"
# -f so a link left over from an interrupted earlier run does not block this one
ln -sf "$BAM" "$BAM.bai" "$TEMP/" || die "Cannot link BAM input into $TEMP. Exiting."

## Execute Single StrainID and record time
cd "$GENOPIPE/StrainID" || die "Cannot change directory to $GENOPIPE/StrainID. Exiting."
echo "**Begin executing StrainID for ${LOCUS}_${DEPTH}..."
{ time bash identify-Strain.sh -i "$TEMP" -g "$GENOME" -v "$DATABASE" -o "$OUTPUT/ID" -s "$SEED" > "$OUTPUT/ID/Simulation_$PBS_ARRAYID.std" ; } 2> "$OUTPUT/ID/Simulation_$PBS_ARRAYID.time"
echo "...single StrainID for ${LOCUS} ${DEPTH} finished."
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
#!/bin/bash
#PBS -l nodes=1:ppn=6
#PBS -l pmem=24gb
#PBS -l walltime=00:05:00
#PBS -A open
#PBS -o logs/depth.sid.CENPK.2M.log.out
#PBS -e logs/depth.sid.CENPK.2M.log.err
#PBS -t 1-1000

# StrainID depth simulation for one row of depth_simulations.txt.
# Each array task (PBS_ARRAYID, 1-1000 per the -t directive above) links one
# simulated BAM and its index into a private temp directory, runs
# identify-Strain.sh on that directory, and writes the tool's stdout to
# Simulation_<id>.std and the `time` report (plus the tool's stderr) to
# Simulation_<id>.time under results/<LOCUS>_<DEPTH>/ID.

module load gcc/8.3.1
module load bedtools/2.27.1
module load bwa/0.7.15
module load samtools/1.5
module load anaconda3
source activate genopipe

# FIRST CHANGE PATH TO EXECUTE
WRK=/path/to/GenoPipe/paper/SyntheticStrain
# Row of depth_simulations.txt handled by this script; also tags the temp dir
LINE=6

# Print a message on stdout (where the original messages went) and fail the
# task with a non-zero status so the scheduler does not record it as a success.
die() {
  echo "$*"
  exit 1
}

# Without an array index the seed arithmetic and every path below are invalid
: "${PBS_ARRAYID:?PBS_ARRAYID is not set; submit this script as a PBS array job}"

cd "$WRK" || die "Cannot change directory to $WRK. Exiting."
# -p: many array tasks start at once, so the directory may appear between a
# test and a plain mkdir
mkdir -p logs

# Whitespace-separated columns: locus (<REF>_<strain>), depth, seed base
read -r LOCUS DEPTH BASE _ < <(sed "${LINE}q;d" depth_simulations.txt)
if [ -z "$LOCUS" ] || [ -z "$DEPTH" ] || [ -z "$BASE" ]; then
  die "Line $LINE of depth_simulations.txt is missing or incomplete. Exiting."
fi

# Reference build is the part of the locus before the first underscore
REF=${LOCUS%%_*}
SEED=$((BASE + PBS_ARRAYID))

OUTPUT=$WRK/results/${LOCUS}_${DEPTH}
BAM=$OUTPUT/BAM/Simulation_$PBS_ARRAYID.bam
TEMP=$WRK/temp$LINE-$PBS_ARRAYID

GENOME=$WRK/../input/$REF.fa
DATABASE=$WRK/../db/${REF}_VCF
GENOPIPE=$WRK/../..

# Validate the inputs before creating anything, so a failed task leaves no
# stray directories behind
#Check that BAM file was generated first
[ -f "$BAM" ] || die "BAM input for ${LOCUS}_${DEPTH}_${PBS_ARRAYID} does not exist. Exiting."
#Check that BAM Index file exists
[ -f "$BAM.bai" ] || die "BAI missing for ${LOCUS}_${DEPTH}_${PBS_ARRAYID}. Exiting."

mkdir -p "$OUTPUT/ID" "$TEMP" || die "Cannot create $OUTPUT/ID or $TEMP. Exiting."
# Clean-up: remove the temp directory on every exit path, including failures
trap 'rm -rf -- "$TEMP"' EXIT

# Set-up Temp directory
echo "$BAM"
# -f so a link left over from an interrupted earlier run does not block this one
ln -sf "$BAM" "$BAM.bai" "$TEMP/" || die "Cannot link BAM input into $TEMP. Exiting."

## Execute Single StrainID and record time
cd "$GENOPIPE/StrainID" || die "Cannot change directory to $GENOPIPE/StrainID. Exiting."
echo "**Begin executing StrainID for ${LOCUS}_${DEPTH}..."
{ time bash identify-Strain.sh -i "$TEMP" -g "$GENOME" -v "$DATABASE" -o "$OUTPUT/ID" -s "$SEED" > "$OUTPUT/ID/Simulation_$PBS_ARRAYID.std" ; } 2> "$OUTPUT/ID/Simulation_$PBS_ARRAYID.time"
echo "...single StrainID for ${LOCUS} ${DEPTH} finished."
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
#!/bin/bash
#PBS -l nodes=1:ppn=6
#PBS -l pmem=24gb
#PBS -l walltime=00:05:00
#PBS -A open
#PBS -o logs/depth.sid.CENPK.500K.log.out
#PBS -e logs/depth.sid.CENPK.500K.log.err
#PBS -t 1-1000

# StrainID depth simulation for one row of depth_simulations.txt.
# Each array task (PBS_ARRAYID, 1-1000 per the -t directive above) links one
# simulated BAM and its index into a private temp directory, runs
# identify-Strain.sh on that directory, and writes the tool's stdout to
# Simulation_<id>.std and the `time` report (plus the tool's stderr) to
# Simulation_<id>.time under results/<LOCUS>_<DEPTH>/ID.

module load gcc/8.3.1
module load bedtools/2.27.1
module load bwa/0.7.15
module load samtools/1.5
module load anaconda3
source activate genopipe

# FIRST CHANGE PATH TO EXECUTE
WRK=/path/to/GenoPipe/paper/SyntheticStrain
# Row of depth_simulations.txt handled by this script; also tags the temp dir
LINE=4

# Print a message on stdout (where the original messages went) and fail the
# task with a non-zero status so the scheduler does not record it as a success.
die() {
  echo "$*"
  exit 1
}

# Without an array index the seed arithmetic and every path below are invalid
: "${PBS_ARRAYID:?PBS_ARRAYID is not set; submit this script as a PBS array job}"

cd "$WRK" || die "Cannot change directory to $WRK. Exiting."
# -p: many array tasks start at once, so the directory may appear between a
# test and a plain mkdir
mkdir -p logs

# Whitespace-separated columns: locus (<REF>_<strain>), depth, seed base
read -r LOCUS DEPTH BASE _ < <(sed "${LINE}q;d" depth_simulations.txt)
if [ -z "$LOCUS" ] || [ -z "$DEPTH" ] || [ -z "$BASE" ]; then
  die "Line $LINE of depth_simulations.txt is missing or incomplete. Exiting."
fi

# Reference build is the part of the locus before the first underscore
REF=${LOCUS%%_*}
SEED=$((BASE + PBS_ARRAYID))

OUTPUT=$WRK/results/${LOCUS}_${DEPTH}
BAM=$OUTPUT/BAM/Simulation_$PBS_ARRAYID.bam
TEMP=$WRK/temp$LINE-$PBS_ARRAYID

GENOME=$WRK/../input/$REF.fa
DATABASE=$WRK/../db/${REF}_VCF
GENOPIPE=$WRK/../..

# Validate the inputs before creating anything, so a failed task leaves no
# stray directories behind
#Check that BAM file was generated first
[ -f "$BAM" ] || die "BAM input for ${LOCUS}_${DEPTH}_${PBS_ARRAYID} does not exist. Exiting."
#Check that BAM Index file exists
[ -f "$BAM.bai" ] || die "BAI missing for ${LOCUS}_${DEPTH}_${PBS_ARRAYID}. Exiting."

mkdir -p "$OUTPUT/ID" "$TEMP" || die "Cannot create $OUTPUT/ID or $TEMP. Exiting."
# Clean-up: remove the temp directory on every exit path, including failures
trap 'rm -rf -- "$TEMP"' EXIT

# Set-up Temp directory
echo "$BAM"
# -f so a link left over from an interrupted earlier run does not block this one
ln -sf "$BAM" "$BAM.bai" "$TEMP/" || die "Cannot link BAM input into $TEMP. Exiting."

## Execute Single StrainID and record time
cd "$GENOPIPE/StrainID" || die "Cannot change directory to $GENOPIPE/StrainID. Exiting."
echo "**Begin executing StrainID for ${LOCUS}_${DEPTH}..."
{ time bash identify-Strain.sh -i "$TEMP" -g "$GENOME" -v "$DATABASE" -o "$OUTPUT/ID" -s "$SEED" > "$OUTPUT/ID/Simulation_$PBS_ARRAYID.std" ; } 2> "$OUTPUT/ID/Simulation_$PBS_ARRAYID.time"
echo "...single StrainID for ${LOCUS} ${DEPTH} finished."

0 commit comments

Comments
 (0)