Skip to content

Commit 661a974

Browse files
authored
Merge pull request #9 from CEGRcode/validation
Validation-simulate data for StrainID: Commit scripts for generating simulated data only--scripts running StrainID on this data still to come.
2 parents e269303 + 04e0f2e commit 661a974

28 files changed

Lines changed: 986 additions & 13 deletions

paper/.gitignore

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,24 +2,27 @@ input/hg19.fa*
22
input/sacCer3.fa*
33
db/
44
SyntheticEpitope/synthetic_genome/
5-
SyntheticDeletion/synthetic_genome/
5+
SyntheticEpitope/logs/*.err-*
6+
SyntheticEpitope/logs/*.out-*
67
SyntheticEpitope/results/sacCer3*
78
SyntheticEpitope/results/hg19*
8-
SyntheticDeletion/results/sacCer3*
99
SyntheticEpitope/results/mix_*
10+
ENCODEdata-eGFP/logs/*.out-*
11+
ENCODEdata-eGFP/logs/*.err-*
12+
ENCODEdata-eGFP/results/FASTQ
13+
ENCODEdata-eGFP/results/ID
14+
SyntheticDeletion/synthetic_genome/
1015
SyntheticDeletion/logs/*.err-*
1116
SyntheticDeletion/logs/*.out-*
12-
SyntheticEpitope/logs/*.err-*
13-
SyntheticEpitope/logs/*.out-*
14-
SyntheticStrain/logs/*.err-*
15-
SyntheticStrain/logs/*.out-*
17+
SyntheticDeletion/results/sacCer3*
1618
YKOC-wgs/logs/*.err-*
1719
YKOC-wgs/logs/*.out-*
1820
YKOC-wgs/results/FASTQ
1921
YKOC-wgs/results/BAM
2022
YKOC-wgs/results/ID
2123
YKOC-wgs/results/BedGraphs
22-
ENCODEdata-eGFP/logs/*.out-*
23-
ENCODEdata-eGFP/logs/*.err-*
24-
ENCODEdata-eGFP/results/FASTQ
25-
ENCODEdata-eGFP/results/ID
24+
SyntheticStrain/synthetic_genome/
25+
SyntheticStrain/logs/*.err-*
26+
SyntheticStrain/logs/*.out-*
27+
SyntheticStrain/results/sacCer3*
28+
SyntheticStrain/results/hg19*

paper/SyntheticStrain/README

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@ nucleotides with the variant nucleotides.
1010

1111
# Sequencing-Depth Test:
1212
For each synthetic genome, simulate 1000 datasets at each of the following read counts:
13-
-sacCer3 genomes sequence at 10K, 100K, 1M, and 10M reads
14-
-hg19 genomes sequence at 100K, 1M, 10M, and 50M reads
13+
-sacCer3 genomes sequence at 500K, 1M, 2M, 3M, 4M, and 5M reads
14+
-hg19 genomes sequence at 1M, 2M, 5M, 10M, and 20M reads
1515
Run them through StrainID and evaluate how often StrainID identifies the
1616
correct strain. Time StrainID to measure its execution speed.
1717

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
sacCer3_CEN.PK2-1Ca 500K 0
2+
sacCer3_CEN.PK2-1Ca 1M 1000
3+
sacCer3_CEN.PK2-1Ca 2M 2000
4+
sacCer3_CEN.PK2-1Ca 3M 3000
5+
sacCer3_CEN.PK2-1Ca 4M 4000
6+
sacCer3_CEN.PK2-1Ca 5M 5000
7+
sacCer3_RM11-1A 500K 6000
8+
sacCer3_RM11-1A 1M 7000
9+
sacCer3_RM11-1A 2M 8000
10+
sacCer3_RM11-1A 3M 9000
11+
sacCer3_RM11-1A 4M 10000
12+
sacCer3_RM11-1A 5M 11000
13+
hg19_K562 1M 12000
14+
hg19_K562 2M 13000
15+
hg19_K562 5M 14000
16+
hg19_K562 10M 15000
17+
hg19_K562 20M 16000
18+
hg19_HELA 1M 17000
19+
hg19_HELA 2M 18000
20+
hg19_HELA 5M 19000
21+
hg19_HELA 10M 20000
22+
hg19_HELA 20M 21000
23+
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
#!/bin/bash

# This script makes genomes to simulate from: two yeast (sacCer3) and two human
# (hg19) genomes, each with the variants from a strain VCF incorporated into
# the reference sequence by add_VCF_into_Genomic_FASTA.py.
#
# All paths are relative to the directory the script is launched from.
# Output: one FASTA per strain under synthetic_genome/.

ADDSNPS=scripts/add_VCF_into_Genomic_FASTA.py
SVCF=../db/sacCer3_VCF
HVCF=../db/hg19_VCF
OUTDIR=synthetic_genome

mkdir -p "$OUTDIR" || exit 1

#######################################
# Write one variant-substituted genome.
# Arguments: $1 - reference FASTA
#            $2 - VCF with the strain's variants
#            $3 - output FASTA path
# Exits non-zero (after removing the partial output) if the conversion fails,
# so a truncated FASTA is never mistaken for a finished genome.
#######################################
make_genome() {
  local reference=$1
  local vcf=$2
  local output=$3

  if ! python "$ADDSNPS" -f "$reference" -v "$vcf" > "$output"; then
    echo "ERROR: could not build $output from $reference and $vcf" >&2
    rm -f -- "$output"
    exit 1
  fi
}

make_genome ../input/sacCer3.fa "$SVCF/RM11-1A.gatk.vcf" "$OUTDIR/sacCer3_RM11-1A.fa"
make_genome ../input/sacCer3.fa "$SVCF/CEN.PK2-1Ca.gatk.vcf" "$OUTDIR/sacCer3_CEN.PK2-1Ca.fa"

make_genome ../input/hg19.fa "$HVCF/K562.vcf" "$OUTDIR/hg19_K562.fa"
make_genome ../input/hg19.fa "$HVCF/HELA.vcf" "$OUTDIR/hg19_HELA.fa"
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
#!/bin/bash
#PBS -l nodes=1:ppn=4
#PBS -l pmem=16gb
#PBS -l walltime=00:05:00
#PBS -A open
#PBS -o logs/depth.CENPK.1M.log.out
#PBS -e logs/depth.CENPK.1M.log.err
#PBS -t 1-1000

# Sequencing-depth simulation, submitted as a PBS array job (tasks 1-1000).
# Each task takes its strain, depth and seed base from line 2 of
# depth_simulations.txt, runs ../scripts/simulate.sh with seed = base + array
# index, runs ../scripts/align.sh against the matching reference genome, and
# prints the elapsed wall-clock seconds for the two steps.

# FIRST CHANGE PATH TO EXECUTE
WRK=/path/to/GenoPipe/paper/SyntheticStrain
# Every path below is relative to WRK, so stop if the placeholder was not edited.
cd "$WRK" || { echo "ERROR: cannot cd to $WRK" >&2; exit 1; }

module load gcc/8.3.1
module load bedtools/2.27.1
module load bwa/0.7.15
module load samtools/1.5
module load anaconda3
source activate genopipe

# The array index comes from PBS (-t); without it no seed can be derived.
: "${PBS_ARRAYID:?PBS_ARRAYID is not set; submit this script as an array job}"

# Line 2 of the table holds: <strain> <depth> <seed base>
read -r STRAIN DEPTH BASE _ < <(sed "2q;d" depth_simulations.txt)
if [[ -z "$STRAIN" || -z "$DEPTH" || -z "$BASE" ]]; then
  echo "ERROR: line 2 of depth_simulations.txt is missing or incomplete" >&2
  exit 1
fi
# The reference name is the part of the strain name before the first underscore.
REF=${STRAIN%%_*}

GENOME="synthetic_genome/$STRAIN.fa"
OUTPUT="results/${STRAIN}_${DEPTH}"
SEED=$((BASE + PBS_ARRAYID))

start=$(date +%s)
# Abort on failure so a broken run is never reported as a valid timing.
bash ../scripts/simulate.sh -i "$PBS_ARRAYID" -d "$DEPTH" -s "$SEED" -g "$GENOME" -o "$OUTPUT" \
  || { echo "ERROR: simulate.sh failed for ${STRAIN} ${DEPTH} task ${PBS_ARRAYID}" >&2; exit 1; }
bash ../scripts/align.sh -i "$PBS_ARRAYID" -g "../input/$REF.fa" -o "$OUTPUT" -t 4 \
  || { echo "ERROR: align.sh failed for ${STRAIN} ${DEPTH} task ${PBS_ARRAYID}" >&2; exit 1; }
end=$(date +%s)
runtime=$((end - start))
echo "${STRAIN} ${DEPTH} simulate in ${runtime}"
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
#!/bin/bash
#PBS -l nodes=1:ppn=4
#PBS -l pmem=16gb
#PBS -l walltime=00:10:00
#PBS -A open
#PBS -o logs/depth.CENPK.2M.log.out
#PBS -e logs/depth.CENPK.2M.log.err
#PBS -t 1-1000

# Sequencing-depth simulation, submitted as a PBS array job (tasks 1-1000).
# Each task takes its strain, depth and seed base from line 3 of
# depth_simulations.txt, runs ../scripts/simulate.sh with seed = base + array
# index, runs ../scripts/align.sh against the matching reference genome, and
# prints the elapsed wall-clock seconds for the two steps.

# FIRST CHANGE PATH TO EXECUTE
WRK=/path/to/GenoPipe/paper/SyntheticStrain
# Every path below is relative to WRK, so stop if the placeholder was not edited.
cd "$WRK" || { echo "ERROR: cannot cd to $WRK" >&2; exit 1; }

module load gcc/8.3.1
module load bedtools/2.27.1
module load bwa/0.7.15
module load samtools/1.5
module load anaconda3
source activate genopipe

# The array index comes from PBS (-t); without it no seed can be derived.
: "${PBS_ARRAYID:?PBS_ARRAYID is not set; submit this script as an array job}"

# Line 3 of the table holds: <strain> <depth> <seed base>
read -r STRAIN DEPTH BASE _ < <(sed "3q;d" depth_simulations.txt)
if [[ -z "$STRAIN" || -z "$DEPTH" || -z "$BASE" ]]; then
  echo "ERROR: line 3 of depth_simulations.txt is missing or incomplete" >&2
  exit 1
fi
# The reference name is the part of the strain name before the first underscore.
REF=${STRAIN%%_*}

GENOME="synthetic_genome/$STRAIN.fa"
OUTPUT="results/${STRAIN}_${DEPTH}"
SEED=$((BASE + PBS_ARRAYID))

start=$(date +%s)
# Abort on failure so a broken run is never reported as a valid timing.
bash ../scripts/simulate.sh -i "$PBS_ARRAYID" -d "$DEPTH" -s "$SEED" -g "$GENOME" -o "$OUTPUT" \
  || { echo "ERROR: simulate.sh failed for ${STRAIN} ${DEPTH} task ${PBS_ARRAYID}" >&2; exit 1; }
bash ../scripts/align.sh -i "$PBS_ARRAYID" -g "../input/$REF.fa" -o "$OUTPUT" -t 4 \
  || { echo "ERROR: align.sh failed for ${STRAIN} ${DEPTH} task ${PBS_ARRAYID}" >&2; exit 1; }
end=$(date +%s)
runtime=$((end - start))
echo "${STRAIN} ${DEPTH} simulate in ${runtime}"
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
#!/bin/bash
#PBS -l nodes=1:ppn=4
#PBS -l pmem=16gb
#PBS -l walltime=00:10:00
#PBS -A open
#PBS -o logs/depth.CENPK.3M.log.out
#PBS -e logs/depth.CENPK.3M.log.err
#PBS -t 1-1000

# Sequencing-depth simulation, submitted as a PBS array job (tasks 1-1000).
# Each task takes its strain, depth and seed base from line 4 of
# depth_simulations.txt, runs ../scripts/simulate.sh with seed = base + array
# index, runs ../scripts/align.sh against the matching reference genome, and
# prints the elapsed wall-clock seconds for the two steps.

# FIRST CHANGE PATH TO EXECUTE
WRK=/path/to/GenoPipe/paper/SyntheticStrain
# Every path below is relative to WRK, so stop if the placeholder was not edited.
cd "$WRK" || { echo "ERROR: cannot cd to $WRK" >&2; exit 1; }

module load gcc/8.3.1
module load bedtools/2.27.1
module load bwa/0.7.15
module load samtools/1.5
module load anaconda3
source activate genopipe

# The array index comes from PBS (-t); without it no seed can be derived.
: "${PBS_ARRAYID:?PBS_ARRAYID is not set; submit this script as an array job}"

# Line 4 of the table holds: <strain> <depth> <seed base>
read -r STRAIN DEPTH BASE _ < <(sed "4q;d" depth_simulations.txt)
if [[ -z "$STRAIN" || -z "$DEPTH" || -z "$BASE" ]]; then
  echo "ERROR: line 4 of depth_simulations.txt is missing or incomplete" >&2
  exit 1
fi
# The reference name is the part of the strain name before the first underscore.
REF=${STRAIN%%_*}

GENOME="synthetic_genome/$STRAIN.fa"
OUTPUT="results/${STRAIN}_${DEPTH}"
SEED=$((BASE + PBS_ARRAYID))

start=$(date +%s)
# Abort on failure so a broken run is never reported as a valid timing.
bash ../scripts/simulate.sh -i "$PBS_ARRAYID" -d "$DEPTH" -s "$SEED" -g "$GENOME" -o "$OUTPUT" \
  || { echo "ERROR: simulate.sh failed for ${STRAIN} ${DEPTH} task ${PBS_ARRAYID}" >&2; exit 1; }
bash ../scripts/align.sh -i "$PBS_ARRAYID" -g "../input/$REF.fa" -o "$OUTPUT" -t 4 \
  || { echo "ERROR: align.sh failed for ${STRAIN} ${DEPTH} task ${PBS_ARRAYID}" >&2; exit 1; }
end=$(date +%s)
runtime=$((end - start))
echo "${STRAIN} ${DEPTH} simulate in ${runtime}"
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
#!/bin/bash
#PBS -l nodes=1:ppn=4
#PBS -l pmem=16gb
#PBS -l walltime=00:15:00
#PBS -A open
#PBS -o logs/depth.CENPK.4M.log.out
#PBS -e logs/depth.CENPK.4M.log.err
#PBS -t 1-1000

# Sequencing-depth simulation, submitted as a PBS array job (tasks 1-1000).
# Each task takes its strain, depth and seed base from line 5 of
# depth_simulations.txt, runs ../scripts/simulate.sh with seed = base + array
# index, runs ../scripts/align.sh against the matching reference genome, and
# prints the elapsed wall-clock seconds for the two steps.

# FIRST CHANGE PATH TO EXECUTE
WRK=/path/to/GenoPipe/paper/SyntheticStrain
# Every path below is relative to WRK, so stop if the placeholder was not edited.
cd "$WRK" || { echo "ERROR: cannot cd to $WRK" >&2; exit 1; }

module load gcc/8.3.1
module load bedtools/2.27.1
module load bwa/0.7.15
module load samtools/1.5
module load anaconda3
source activate genopipe

# The array index comes from PBS (-t); without it no seed can be derived.
: "${PBS_ARRAYID:?PBS_ARRAYID is not set; submit this script as an array job}"

# Line 5 of the table holds: <strain> <depth> <seed base>
read -r STRAIN DEPTH BASE _ < <(sed "5q;d" depth_simulations.txt)
if [[ -z "$STRAIN" || -z "$DEPTH" || -z "$BASE" ]]; then
  echo "ERROR: line 5 of depth_simulations.txt is missing or incomplete" >&2
  exit 1
fi
# The reference name is the part of the strain name before the first underscore.
REF=${STRAIN%%_*}

GENOME="synthetic_genome/$STRAIN.fa"
OUTPUT="results/${STRAIN}_${DEPTH}"
SEED=$((BASE + PBS_ARRAYID))

start=$(date +%s)
# Abort on failure so a broken run is never reported as a valid timing.
bash ../scripts/simulate.sh -i "$PBS_ARRAYID" -d "$DEPTH" -s "$SEED" -g "$GENOME" -o "$OUTPUT" \
  || { echo "ERROR: simulate.sh failed for ${STRAIN} ${DEPTH} task ${PBS_ARRAYID}" >&2; exit 1; }
bash ../scripts/align.sh -i "$PBS_ARRAYID" -g "../input/$REF.fa" -o "$OUTPUT" -t 4 \
  || { echo "ERROR: align.sh failed for ${STRAIN} ${DEPTH} task ${PBS_ARRAYID}" >&2; exit 1; }
end=$(date +%s)
runtime=$((end - start))
echo "${STRAIN} ${DEPTH} simulate in ${runtime}"
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
#!/bin/bash
#PBS -l nodes=1:ppn=4
#PBS -l pmem=16gb
#PBS -l walltime=00:05:00
#PBS -A open
#PBS -o logs/depth.CENPK.500K.log.out
#PBS -e logs/depth.CENPK.500K.log.err
#PBS -t 1-1000

# Sequencing-depth simulation, submitted as a PBS array job (tasks 1-1000).
# Each task takes its strain, depth and seed base from line 1 of
# depth_simulations.txt, runs ../scripts/simulate.sh with seed = base + array
# index, runs ../scripts/align.sh against the matching reference genome, and
# prints the elapsed wall-clock seconds for the two steps.

# FIRST CHANGE PATH TO EXECUTE
WRK=/path/to/GenoPipe/paper/SyntheticStrain
# Every path below is relative to WRK, so stop if the placeholder was not edited.
cd "$WRK" || { echo "ERROR: cannot cd to $WRK" >&2; exit 1; }

module load gcc/8.3.1
module load bedtools/2.27.1
module load bwa/0.7.15
module load samtools/1.5
module load anaconda3
source activate genopipe

# The array index comes from PBS (-t); without it no seed can be derived.
: "${PBS_ARRAYID:?PBS_ARRAYID is not set; submit this script as an array job}"

# Line 1 of the table holds: <strain> <depth> <seed base>
read -r STRAIN DEPTH BASE _ < <(sed "1q;d" depth_simulations.txt)
if [[ -z "$STRAIN" || -z "$DEPTH" || -z "$BASE" ]]; then
  echo "ERROR: line 1 of depth_simulations.txt is missing or incomplete" >&2
  exit 1
fi
# The reference name is the part of the strain name before the first underscore.
REF=${STRAIN%%_*}

GENOME="synthetic_genome/$STRAIN.fa"
OUTPUT="results/${STRAIN}_${DEPTH}"
SEED=$((BASE + PBS_ARRAYID))

start=$(date +%s)
# Abort on failure so a broken run is never reported as a valid timing.
bash ../scripts/simulate.sh -i "$PBS_ARRAYID" -d "$DEPTH" -s "$SEED" -g "$GENOME" -o "$OUTPUT" \
  || { echo "ERROR: simulate.sh failed for ${STRAIN} ${DEPTH} task ${PBS_ARRAYID}" >&2; exit 1; }
bash ../scripts/align.sh -i "$PBS_ARRAYID" -g "../input/$REF.fa" -o "$OUTPUT" -t 4 \
  || { echo "ERROR: align.sh failed for ${STRAIN} ${DEPTH} task ${PBS_ARRAYID}" >&2; exit 1; }
end=$(date +%s)
runtime=$((end - start))
echo "${STRAIN} ${DEPTH} simulate in ${runtime}"
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
#!/bin/bash
#PBS -l nodes=1:ppn=4
#PBS -l pmem=16gb
#PBS -l walltime=00:30:00
#PBS -A open
#PBS -o logs/depth.CENPK.5M.log.out
#PBS -e logs/depth.CENPK.5M.log.err
#PBS -t 1-1000

# Sequencing-depth simulation, submitted as a PBS array job (tasks 1-1000).
# Each task takes its strain, depth and seed base from line 6 of
# depth_simulations.txt, runs ../scripts/simulate.sh with seed = base + array
# index, runs ../scripts/align.sh against the matching reference genome, and
# prints the elapsed wall-clock seconds for the two steps.

# FIRST CHANGE PATH TO EXECUTE
WRK=/path/to/GenoPipe/paper/SyntheticStrain
# Every path below is relative to WRK, so stop if the placeholder was not edited.
cd "$WRK" || { echo "ERROR: cannot cd to $WRK" >&2; exit 1; }

module load gcc/8.3.1
module load bedtools/2.27.1
module load bwa/0.7.15
module load samtools/1.5
module load anaconda3
source activate genopipe

# The array index comes from PBS (-t); without it no seed can be derived.
: "${PBS_ARRAYID:?PBS_ARRAYID is not set; submit this script as an array job}"

# Line 6 of the table holds: <strain> <depth> <seed base>
read -r STRAIN DEPTH BASE _ < <(sed "6q;d" depth_simulations.txt)
if [[ -z "$STRAIN" || -z "$DEPTH" || -z "$BASE" ]]; then
  echo "ERROR: line 6 of depth_simulations.txt is missing or incomplete" >&2
  exit 1
fi
# The reference name is the part of the strain name before the first underscore.
REF=${STRAIN%%_*}

GENOME="synthetic_genome/$STRAIN.fa"
OUTPUT="results/${STRAIN}_${DEPTH}"
SEED=$((BASE + PBS_ARRAYID))

start=$(date +%s)
# Abort on failure so a broken run is never reported as a valid timing.
bash ../scripts/simulate.sh -i "$PBS_ARRAYID" -d "$DEPTH" -s "$SEED" -g "$GENOME" -o "$OUTPUT" \
  || { echo "ERROR: simulate.sh failed for ${STRAIN} ${DEPTH} task ${PBS_ARRAYID}" >&2; exit 1; }
bash ../scripts/align.sh -i "$PBS_ARRAYID" -g "../input/$REF.fa" -o "$OUTPUT" -t 4 \
  || { echo "ERROR: align.sh failed for ${STRAIN} ${DEPTH} task ${PBS_ARRAYID}" >&2; exit 1; }
end=$(date +%s)
runtime=$((end - start))
echo "${STRAIN} ${DEPTH} simulate in ${runtime}"

0 commit comments

Comments
 (0)