Skip to content

Commit b0cb6d1

Browse files
committed
update mixture contamination scripts
Switch the mixture simulations to a simulation-index-oriented run (one job array index per simulation, rather than one per mixture ratio). The new scripts are modified to be more similar to the depth and epitopeid template PBS scripts. The simulation script was also updated to force overwriting of gzip files so that simulations can be conveniently re-run, and the check for completion was removed in favor of always overwriting existing output.
1 parent 7843ba1 commit b0cb6d1

5 files changed

Lines changed: 133 additions & 123 deletions

File tree

paper/SyntheticEpitope/job/run_EpitopeID_on_mix_human.pbs

Lines changed: 45 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -3,56 +3,60 @@
33
#PBS -l pmem=16gb
44
#PBS -l walltime=01:00:00
55
#PBS -A open
6-
#PBS -o logs/mix.human.10M.eid.log.out
7-
#PBS -e logs/mix.human.10M.eid.log.err
8-
#PBS -t 90,80,70,60,50,40,30,20,10
6+
#PBS -o logs/mix.human.50M.eid.log.out
7+
#PBS -e logs/mix.human.50M.eid.log.err
8+
#PBS -t 1-1000
99

1010
# Each index of the job array t:[1,1000] represents a different simulation index (Simulation_<index>). For that index, this script executes EpitopeID against the mixed simulated dataset at each titration (90, 80, ..., 10) and records the runtime of each run.
1111

1212
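# This script does not check that the FASTQ files have been generated first; for each titration it symlinks Simulation_<index>_R1/_R2.fastq.gz into a temporary directory and executes EpitopeID on that directory.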
1313

1414
WRK=/path/to/GenoPipe/paper/SyntheticEpitope
15+
THREADS=6
16+
EPITOPEID=$WRK/../../EpitopeID
1517
cd $WRK
1618

17-
module load bedtools/2.27.1
18-
module load bwa/0.7.15
19-
module load python/2.7.14-anaconda5.0.1
19+
module load bedtools
20+
module load samtools
21+
module load anaconda3
22+
source activate genopipe
2023

21-
DEPTH=10M
24+
REF=hg19
25+
# DEPTH=20M
26+
DEPTH=50M
27+
EPITOPE=R500
2228
AFACTOR=CTCF
2329
BFACTOR=POLR2H
2430

25-
RESULT=$WRK/results/mix_human_$DEPTH/$PBS_ARRAYID\_$AFACTOR\_$BFACTOR
26-
[ -d $RESULT/ID ] || mkdir $RESULT/ID
27-
echo $RESULT
28-
#Check that all FASTQ files were generated first
29-
if [ $(ls $RESULT/FASTQ/*.fastq.gz |wc -l ) -lt 2000 ];
30-
then
31-
NFASTQ=`ls $OUTPUT/FASTQ/*.fastq.gz |wc -l`
32-
echo "Insufficient simulations for mix_yeast_${DEPTH}. Only have $NFASTQ FASTQ files, make sure you generate all 2000 before running EpitopeID"
33-
exit
34-
fi
35-
36-
GENOPIPE=$WRK/../..
37-
cd $GENOPIPE/EpitopeID
38-
## Execute Mass EpitopeID and record time
39-
echo "**Begin executing EpitopeID for mix_yeast_${DEPTH}..."
40-
start=`date +%s`
41-
bash identify-Epitope.sh -i $RESULT/FASTQ -o $RESULT/ID -d $GENOPIPE/paper/db/hg19_EpiID -t 4
42-
end=`date +%s`
43-
runtime=$((end-start))
44-
MESSAGE="...mass EpitopeID for mix_human_${DEPTH} finished in ${runtime}"
45-
echo $MESSAGE
46-
cd $WRK
47-
48-
## Calculate detection
49-
CALCULATE=scripts/calculate_detection_Stats.pl
50-
echo "**Calculate detection of $GENE..."
51-
echo $MESSAGE > $RESULT/runtime.txt
52-
head -n 9999 $RESULT/ID/*-ID.tab | perl $CALCULATE - $AFACTOR $RESULT/$AFACTOR\_$DEPTH\_detectionStats.txt
53-
head -n 9999 $RESULT/ID/*-ID.tab | perl $CALCULATE - $BFACTOR $RESULT/$BFACTOR\_$DEPTH\_detectionStats.txt
54-
grep ${AFACTOR^^} $RESULT/ID/*-ID.tab | cut -d':' -f1 | sort | uniq > $RESULT/$AFACTOR\_hits.txt
55-
grep ${BFACTOR^^} $RESULT/ID/*-ID.tab | cut -d':' -f1 | sort | uniq > $RESULT/$BFACTOR\_hits.txt
56-
comm -12 $RESULT/$AFACTOR\_hits.txt $RESULT/$BFACTOR\_hits.txt > $RESULT/$AFACTOR\_and_$BFACTOR\_hits.txt
57-
echo "Calculated"
58-
31+
DATABASE=$WRK/../db/$REF\_EpiID
32+
[ -d logs ] || mkdir logs
33+
34+
# Titrate out various ratios
35+
for PER in 90 80 70 60 50 40 30 20 10
36+
do
37+
# Construct results directory pathname
38+
RESULTS=$WRK/results/mix_human/$DEPTH/$PER\_$AFACTOR\_$BFACTOR
39+
[ -d $RESULTS/ID ] || mkdir $RESULTS/ID
40+
[ -d $RESULTS/runtime ] || mkdir $RESULTS/runtime
41+
# Log the results directory for this titration (the variable assigned above is RESULTS, not RESULT)
echo "$RESULTS"
42+
43+
# Set-up Temp directory
44+
TEMP=$WRK/temp-$PER-$PBS_ARRAYID-$DEPTH
45+
[ -d $TEMP ] || mkdir $TEMP
46+
47+
cd $TEMP
48+
FQ=$RESULTS/FASTQ/Simulation_$PBS_ARRAYID
49+
[ -f Simulation_$PBS_ARRAYID\_R1.fastq.gz ] || ln -s $FQ\_R1.fastq.gz
50+
[ -f Simulation_$PBS_ARRAYID\_R2.fastq.gz ] || ln -s $FQ\_R2.fastq.gz
51+
52+
# Execute EpitopeID and record time
53+
cd $EPITOPEID
54+
start=`date +%s`
55+
bash identify-Epitope.sh -i $TEMP -o $RESULTS/ID -d $DATABASE -t $THREADS
56+
end=`date +%s`
57+
runtime=$((end-start))
58+
echo "Experiment $EXPERIMENT.$DEPTH.$PBS_ARRAYID ($RESULTS) finished in ${runtime}" > $RESULTS/runtime/Simulation_$PBS_ARRAYID.runtime
59+
60+
# Clean-up
61+
rm -r $TEMP
62+
done

paper/SyntheticEpitope/job/run_EpitopeID_on_mix_yeast.pbs

Lines changed: 49 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -3,57 +3,60 @@
33
#PBS -l pmem=16gb
44
#PBS -l walltime=01:00:00
55
#PBS -A open
6-
#PBS -o logs/mix.yeast.eid.log.out
7-
#PBS -e logs/mix.yeast.eid.log.err
8-
#PBS -t 90,80,70,60,50,40,30,20,10
6+
#PBS -o logs/mix.yeast.1M.eid.log.out
7+
#PBS -e logs/mix.yeast.1M.eid.log.err
8+
#PBS -t 1-1000
99

1010
# Each index of the job array t:[1,1000] represents a different simulation index (Simulation_<index>). For that index, this script executes EpitopeID against the mixed simulated dataset at each titration (90, 80, ..., 10) and records the runtime of each run.
1111

1212
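# This script does not check that the FASTQ files have been generated first; for each titration it symlinks Simulation_<index>_R1/_R2.fastq.gz into a temporary directory and executes EpitopeID on that directory.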
1313

1414
WRK=/path/to/GenoPipe/paper/SyntheticEpitope
15+
THREADS=6
16+
EPITOPEID=$WRK/../../EpitopeID
1517
cd $WRK
1618

17-
module load bedtools/2.27.1
18-
module load bwa/0.7.15
19-
module load python/2.7.14-anaconda5.0.1
20-
module load samtools/1.5
21-
22-
DEPTH=100K
23-
AFACTOR=Reb1
24-
BFACTOR=Rap1
25-
26-
RESULT=$WRK/results/mix_yeast_$DEPTH/$PBS_ARRAYID\_$AFACTOR\_$BFACTOR
27-
[ -d $RESULT/ID ] || mkdir $RESULT/ID
28-
echo $RESULT
29-
#Check that all FASTQ files were generated first
30-
if [ $(ls $RESULT/FASTQ/*.fastq.gz |wc -l ) -lt 2000 ];
31-
then
32-
NFASTQ=`ls $OUTPUT/FASTQ/*.fastq.gz |wc -l`
33-
echo "Insufficient simulations for mix_yeast_${DEPTH}. Only have $NFASTQ FASTQ files, make sure you generate all 2000 before running EpitopeID"
34-
exit
35-
fi
36-
37-
GENOPIPE=$WRK/../..
38-
cd $GENOPIPE/EpitopeID
39-
## Execute Mass EpitopeID and record time
40-
echo "**Begin executing EpitopeID for mix_yeast_${DEPTH}..."
41-
start=`date +%s`
42-
bash identify-Epitope.sh -i $RESULT/FASTQ -o $RESULT/ID -d $GENOPIPE/paper/db/sacCer3_EpiID -t 4
43-
end=`date +%s`
44-
runtime=$((end-start))
45-
MESSAGE="...mass EpitopeID for mix_yeast_${DEPTH} finished in ${runtime}"
46-
echo $MESSAGE
47-
cd $WRK
48-
49-
## Calculate detection
50-
CALCULATE=scripts/calculate_detection_Stats.pl
51-
echo "**Calculate detection of $GENE..."
52-
echo $MESSAGE > $RESULT/runtime.txt
53-
head -n 9999 $RESULT/ID/*-ID.tab | perl $CALCULATE - $AFACTOR $RESULT/$AFACTOR\_$DEPTH\_detectionStats.txt
54-
head -n 9999 $RESULT/ID/*-ID.tab | perl $CALCULATE - $BFACTOR $RESULT/$BFACTOR\_$DEPTH\_detectionStats.txt
55-
grep ${AFACTOR^^} $RESULT/ID/*-ID.tab | cut -d':' -f1 | sort | uniq > $RESULT/$AFACTOR\_hits.txt
56-
grep ${BFACTOR^^} $RESULT/ID/*-ID.tab | cut -d':' -f1 | sort | uniq > $RESULT/$BFACTOR\_hits.txt
57-
comm -12 $RESULT/$AFACTOR\_hits.txt $RESULT/$BFACTOR\_hits.txt > $RESULT/$AFACTOR\_and_$BFACTOR\_hits.txt
58-
echo "Calculated"
59-
19+
module load bedtools
20+
module load samtools
21+
module load anaconda3
22+
source activate genopipe
23+
24+
REF=sacCer3
25+
# DEPTH=100K
26+
DEPTH=1M
27+
EPITOPE=R500
28+
AFACTOR=Rap1
29+
BFACTOR=Reb1
30+
31+
DATABASE=$WRK/../db/$REF\_EpiID
32+
[ -d logs ] || mkdir logs
33+
34+
# Titrate out various ratios
35+
for PER in 90 80 70 60 50 40 30 20 10
36+
do
37+
# Construct results directory pathname
38+
RESULTS=$WRK/results/mix_yeast/$DEPTH/$PER\_$AFACTOR\_$BFACTOR
39+
[ -d $RESULTS/ID ] || mkdir $RESULTS/ID
40+
[ -d $RESULTS/runtime ] || mkdir $RESULTS/runtime
41+
# Log the results directory for this titration (the variable assigned above is RESULTS, not RESULT)
echo "$RESULTS"
42+
43+
# Set-up Temp directory
44+
TEMP=$WRK/temp-$PER-$PBS_ARRAYID-$DEPTH
45+
[ -d $TEMP ] || mkdir $TEMP
46+
47+
cd $TEMP
48+
FQ=$RESULTS/FASTQ/Simulation_$PBS_ARRAYID
49+
[ -f Simulation_$PBS_ARRAYID\_R1.fastq.gz ] || ln -s $FQ\_R1.fastq.gz
50+
[ -f Simulation_$PBS_ARRAYID\_R2.fastq.gz ] || ln -s $FQ\_R2.fastq.gz
51+
52+
# Execute EpitopeID and record time
53+
cd $EPITOPEID
54+
start=`date +%s`
55+
bash identify-Epitope.sh -i $TEMP -o $RESULTS/ID -d $DATABASE -t $THREADS
56+
end=`date +%s`
57+
runtime=$((end-start))
58+
echo "Experiment $EXPERIMENT.$DEPTH.$PBS_ARRAYID ($RESULTS) finished in ${runtime}" > $RESULTS/runtime/Simulation_$PBS_ARRAYID.runtime
59+
60+
# Clean-up
61+
rm -r $TEMP
62+
done

paper/SyntheticEpitope/job/run_mix_human.pbs

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
#PBS -l pmem=16gb
44
#PBS -l walltime=02:00:00
55
#PBS -A open
6-
#PBS -o logs/mix.human.10M.log.out
7-
#PBS -e logs/mix.human.10M.log.err
6+
#PBS -o logs/mix.human.50M.log.out
7+
#PBS -e logs/mix.human.50M.log.err
88
#PBS -t 1-1000
99

1010
# Required Software
@@ -14,19 +14,19 @@
1414
WRK=/path/to/GenoPipe/paper/SyntheticEpitope
1515
cd $WRK
1616

17-
DEPTH=10M
18-
A_STRAIN=hg19_CTCF-Nterm
19-
B_STRAIN=hg19_POLR2H-Nterm
17+
REF=hg19
18+
# DEPTH=20M
19+
DEPTH=50M
20+
EPITOPE=R500
21+
AFACTOR=CTCF
22+
BFACTOR=POLR2H
2023

21-
AFACTOR=`echo $A_STRAIN | awk -F'-' '{print $1}' | awk -F'_' '{print $2}'`
22-
BFACTOR=`echo $B_STRAIN | awk -F'-' '{print $1}' | awk -F'_' '{print $2}'`
23-
24-
OUTPUT=results/mix_human_$DEPTH
25-
[ -d $OUTPUT ] || mkdir $OUTPUT
24+
OUTPUT=results/mix_human/$DEPTH
25+
[ -d $OUTPUT ] || mkdir -p $OUTPUT
2626

2727
MIX=../scripts/mix_fastq.sh
28-
ABASE=results/$A_STRAIN\_$DEPTH/FASTQ/Simulation_$PBS_ARRAYID
29-
BBASE=results/$B_STRAIN\_$DEPTH/FASTQ/Simulation_$PBS_ARRAYID
28+
ABASE=results/$REF/$DEPTH/$EPITOPE/$AFACTOR/FASTQ/Simulation_$PBS_ARRAYID
29+
BBASE=results/$REF/$DEPTH/$EPITOPE/$BFACTOR/FASTQ/Simulation_$PBS_ARRAYID
3030

3131
start=`date +%s`
3232
# Titrate out various ratios
@@ -40,4 +40,3 @@ done
4040
end=`date +%s`
4141
runtime=$((end-start))
4242
# Report which two factors were mixed and how long it took.
# NOTE(review): the A_STRAIN/B_STRAIN assignments were removed from this script; AFACTOR/BFACTOR now hold the factor names.
echo "Mix ${AFACTOR} and ${BFACTOR} from depth at ${DEPTH} simulated in ${runtime}"
43-

paper/SyntheticEpitope/job/run_mix_yeast.pbs

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
#PBS -l pmem=16gb
44
#PBS -l walltime=00:05:00
55
#PBS -A open
6-
#PBS -o logs/mix.yeast.100K.log.out
7-
#PBS -e logs/mix.yeast.100K.log.err
6+
#PBS -o logs/mix.yeast.1M.log.out
7+
#PBS -e logs/mix.yeast.1M.log.err
88
#PBS -t 1-1000
99

1010
# Required Software
@@ -14,19 +14,19 @@
1414
WRK=/path/to/GenoPipe/paper/SyntheticEpitope
1515
cd $WRK
1616

17-
DEPTH=100K
18-
A_STRAIN=sacCer3_Reb1-Cterm
19-
B_STRAIN=sacCer3_Rap1-Nterm
17+
REF=sacCer3
18+
# DEPTH=100K
19+
DEPTH=1M
20+
EPITOPE=R500
21+
AFACTOR=Rap1
22+
BFACTOR=Reb1
2023

21-
AFACTOR=`echo $A_STRAIN | awk -F'-' '{print $1}' | awk -F'_' '{print $2}'`
22-
BFACTOR=`echo $B_STRAIN | awk -F'-' '{print $1}' | awk -F'_' '{print $2}'`
23-
24-
OUTPUT=results/mix_yeast_$DEPTH
25-
[ -d $OUTPUT ] || mkdir $OUTPUT
24+
OUTPUT=results/mix_yeast/$DEPTH
25+
[ -d $OUTPUT ] || mkdir -p $OUTPUT
2626

2727
MIX=../scripts/mix_fastq.sh
28-
ABASE=results/$A_STRAIN\_$DEPTH/FASTQ/Simulation_$PBS_ARRAYID
29-
BBASE=results/$B_STRAIN\_$DEPTH/FASTQ/Simulation_$PBS_ARRAYID
28+
ABASE=results/$REF/$DEPTH/$EPITOPE/$AFACTOR/FASTQ/Simulation_$PBS_ARRAYID
29+
BBASE=results/$REF/$DEPTH/$EPITOPE/$BFACTOR/FASTQ/Simulation_$PBS_ARRAYID
3030

3131
start=`date +%s`
3232
# Titrate out various ratios
@@ -40,4 +40,3 @@ done
4040
end=`date +%s`
4141
runtime=$((end-start))
4242
# Report which two factors were mixed and how long it took.
# NOTE(review): the A_STRAIN/B_STRAIN assignments were removed from this script; AFACTOR/BFACTOR now hold the factor names.
echo "Mix ${AFACTOR} and ${BFACTOR} from depth at ${DEPTH} simulated in ${runtime}"
43-

paper/scripts/mix_fastq.sh

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99
# gzip
1010
# seqtk
1111

12+
module load anaconda3
13+
source activate genopipe
14+
1215
usage()
1316
{
1417
echo 'bash mix_fastq.sh -p <percentOfA> -A <fastqBaseA> -B <fastqBaseB> -o <outBase>'
@@ -51,13 +54,14 @@ echo "FASTQ_B = ${BFQ} (${B_NUM})"
5154
echo "OUTPUT = ${OFQ}"
5255

5356
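# Check that the files haven't already been generated (disabled: the check below is commented out so existing outputs are always regenerated and overwritten)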
54-
if [[ ! -f $OFQ\_readnames.txt && \
55-
-f $OFQ\_R1.fastq.gz && \
56-
-f $OFQ\_R2.fastq.gz ]];
57-
then
58-
echo "Both R1 and R2 files have already been generated for ${OFQ}"
59-
exit
60-
fi
57+
#if [[ ! -f $OFQ\_readnames.txt && \
58+
# -f $OFQ\_R1.fastq.gz && \
59+
# -f $OFQ\_R2.fastq.gz ]];
60+
#then
61+
# echo "Both R1 and R2 files have already been generated for ${OFQ}"
62+
# exit
63+
#fi
64+
6165
# Cleanup part-generated FQs if indicator file present from a previous run
6266
[ -f $OFQ\_readnames.txt ] && rm $OFQ\_R1.fastq*
6367
[ -f $OFQ\_readnames.txt ] && rm $OFQ\_R2.fastq*
@@ -86,6 +90,7 @@ seqtk subseq $AFQ\_R2.fastq.gz $OFQ\_readnames.txt > $OFQ\_R2.fastq
8690
seqtk subseq $BFQ\_R2.fastq.gz $OFQ\_readnames.txt >> $OFQ\_R2.fastq
8791

8892
# Zip and clean-up files
89-
gzip $OFQ\_R1.fastq
90-
gzip $OFQ\_R2.fastq
91-
rm $OFQ\_readnames.txt
93+
gzip -f $OFQ\_R1.fastq
94+
gzip -f $OFQ\_R2.fastq
95+
96+
#rm $OFQ\_readnames.txt

0 commit comments

Comments
 (0)