update mixture contamination scripts

owlang · owlang · commit b0cb6d143c91 · 2023-02-17T16:12:52.000-05:00
Switch the mixture simulations from one job per mixture ratio to one job per simulation index: each array job now loops over every titration ratio for its own simulation index. These new scripts are modified to more closely resemble the depth and EpitopeID template PBS scripts. The mixing script (mix_fastq.sh) was also updated to force gzip to overwrite existing files so that simulations can be conveniently re-run, and its check for already-completed output was removed in favor of always overwriting.
diff --git a/paper/SyntheticEpitope/job/run_EpitopeID_on_mix_human.pbs b/paper/SyntheticEpitope/job/run_EpitopeID_on_mix_human.pbs
@@ -3,56 +3,60 @@
 #PBS -l pmem=16gb
 #PBS -l walltime=01:00:00
 #PBS -A open
-#PBS -o logs/mix.human.10M.eid.log.out
-#PBS -e logs/mix.human.10M.eid.log.err
-#PBS -t 90,80,70,60,50,40,30,20,10
+#PBS -o logs/mix.human.50M.eid.log.out
+#PBS -e logs/mix.human.50M.eid.log.err
+#PBS -t 1-1000
 
 # Each index of the job array t:[1,16] represents a different set of mixed simulations. This script will execute EpitopeID against all all simulated datasets at various titrations and use the perl script to calculate the detection statistics. Each index corresponds to a different titration mix.
 
 # This script will check that 2000 FASTQ files have been generated before executing EpitopeID.
 
 WRK=/path/to/GenoPipe/paper/SyntheticEpitope
+THREADS=6
+EPITOPEID=$WRK/../../EpitopeID
 cd $WRK
 
-module load bedtools/2.27.1
-module load bwa/0.7.15
-module load python/2.7.14-anaconda5.0.1
+module load bedtools
+module load samtools
+module load anaconda3
+source activate genopipe
 
-DEPTH=10M
+REF=hg19
+# DEPTH=20M
+DEPTH=50M
+EPITOPE=R500
 AFACTOR=CTCF
 BFACTOR=POLR2H
 
-RESULT=$WRK/results/mix_human_$DEPTH/$PBS_ARRAYID\_$AFACTOR\_$BFACTOR
-[ -d $RESULT/ID ] || mkdir $RESULT/ID
-echo $RESULT
-#Check that all FASTQ files were genertaed first
-if [ $(ls $RESULT/FASTQ/*.fastq.gz |wc -l ) -lt 2000 ];
-then
-	NFASTQ=`ls $OUTPUT/FASTQ/*.fastq.gz |wc -l`
-	echo "Insufficient simulations for mix_yeast_${DEPTH}. Only have $NFASTQ FASTQ files, make sure you generate all 2000 before running EpitopeID"
-	exit
-fi
-
-GENOPIPE=$WRK/../..
-cd $GENOPIPE/EpitopeID
-## Execute Mass EpitopeID and record time
-echo "**Begin executing EpitopeID for mix_yeast_${DEPTH}..."
-start=`date +%s`
-bash identify-Epitope.sh -i $RESULT/FASTQ -o $RESULT/ID -d $GENOPIPE/paper/db/hg19_EpiID -t 4
-end=`date +%s`
-runtime=$((end-start))
-MESSAGE="...mass EpitopeID for mix_human_${DEPTH} finished in ${runtime}"
-echo $MESSAGE
-cd $WRK
-
-## Calculate detection
-CALCULATE=scripts/calculate_detection_Stats.pl
-echo "**Calculate detection of $GENE..."
-echo $MESSAGE > $RESULT/runtime.txt
-head -n 9999 $RESULT/ID/*-ID.tab | perl $CALCULATE - $AFACTOR $RESULT/$AFACTOR\_$DEPTH\_detectionStats.txt
-head -n 9999 $RESULT/ID/*-ID.tab | perl $CALCULATE - $BFACTOR $RESULT/$BFACTOR\_$DEPTH\_detectionStats.txt
-grep ${AFACTOR^^} $RESULT/ID/*-ID.tab | cut -d':' -f1 | sort | uniq > $RESULT/$AFACTOR\_hits.txt
-grep ${BFACTOR^^} $RESULT/ID/*-ID.tab | cut -d':' -f1 | sort | uniq > $RESULT/$BFACTOR\_hits.txt
-comm -12 $RESULT/$AFACTOR\_hits.txt $RESULT/$BFACTOR\_hits.txt > $RESULT/$AFACTOR\_and_$BFACTOR\_hits.txt
-echo "Calculated"
-
+DATABASE=$WRK/../db/$REF\_EpiID
+[ -d logs ] || mkdir logs
+
+# Titrate out various ratios
+for PER in 90 80 70 60 50 40 30 20 10
+do
+	# Construct results directory pathname
+	RESULTS=$WRK/results/mix_human/$DEPTH/$PER\_$AFACTOR\_$BFACTOR
+	[ -d $RESULTS/ID ] || mkdir $RESULTS/ID
+	[ -d $RESULTS/runtime ] || mkdir $RESULTS/runtime
+	echo $RESULT
+
+	# Set-up Temp directory
+	TEMP=$WRK/temp-$PER-$PBS_ARRAYID-$DEPTH
+	[ -d $TEMP ] || mkdir $TEMP
+
+	cd $TEMP
+	FQ=$RESULTS/FASTQ/Simulation_$PBS_ARRAYID
+	[ -f Simulation_$PBS_ARRAYID\_R1.fastq.gz ] || ln -s $FQ\_R1.fastq.gz
+	[ -f Simulation_$PBS_ARRAYID\_R2.fastq.gz ] || ln -s $FQ\_R2.fastq.gz
+
+	# Execute EpitopeID and record time
+	cd $EPITOPEID
+	start=`date +%s`
+	bash identify-Epitope.sh -i $TEMP -o $RESULTS/ID -d $DATABASE -t $THREADS
+	end=`date +%s`
+	runtime=$((end-start))
+	echo "Experiment $EXPERIMENT.$DEPTH.$PBS_ARRAYID ($RESULTS) finished in ${runtime}" > $RESULTS/runtime/Simulation_$PBS_ARRAYID.runtime
+
+	# Clean-up
+	rm -r $TEMP
+done
diff --git a/paper/SyntheticEpitope/job/run_EpitopeID_on_mix_yeast.pbs b/paper/SyntheticEpitope/job/run_EpitopeID_on_mix_yeast.pbs
@@ -3,57 +3,60 @@
 #PBS -l pmem=16gb
 #PBS -l walltime=01:00:00
 #PBS -A open
-#PBS -o logs/mix.yeast.eid.log.out
-#PBS -e logs/mix.yeast.eid.log.err
-#PBS -t 90,80,70,60,50,40,30,20,10
+#PBS -o logs/mix.yeast.1M.eid.log.out
+#PBS -e logs/mix.yeast.1M.eid.log.err
+#PBS -t 1-1000
 
 # Each index of the job array t:[1,16] represents a different set of mixed simulations. This script will execute EpitopeID against all all simulated datasets at various titrations and use the perl script to calculate the detection statistics. Each index corresponds to a different titration mix.
 
 # This script will check that 2000 FASTQ files have been generated before executing EpitopeID.
 
 WRK=/path/to/GenoPipe/paper/SyntheticEpitope
+THREADS=6
+EPITOPEID=$WRK/../../EpitopeID
 cd $WRK
 
-module load bedtools/2.27.1
-module load bwa/0.7.15
-module load python/2.7.14-anaconda5.0.1
-module load samtools/1.5
-
-DEPTH=100K
-AFACTOR=Reb1
-BFACTOR=Rap1
-
-RESULT=$WRK/results/mix_yeast_$DEPTH/$PBS_ARRAYID\_$AFACTOR\_$BFACTOR
-[ -d $RESULT/ID ] || mkdir $RESULT/ID
-echo $RESULT
-#Check that all FASTQ files were genertaed first
-if [ $(ls $RESULT/FASTQ/*.fastq.gz |wc -l ) -lt 2000 ];
-then
-	NFASTQ=`ls $OUTPUT/FASTQ/*.fastq.gz |wc -l`
-	echo "Insufficient simulations for mix_yeast_${DEPTH}. Only have $NFASTQ FASTQ files, make sure you generate all 2000 before running EpitopeID"
-	exit
-fi
-
-GENOPIPE=$WRK/../..
-cd $GENOPIPE/EpitopeID
-## Execute Mass EpitopeID and record time
-echo "**Begin executing EpitopeID for mix_yeast_${DEPTH}..."
-start=`date +%s`
-bash identify-Epitope.sh -i $RESULT/FASTQ -o $RESULT/ID -d $GENOPIPE/paper/db/sacCer3_EpiID -t 4
-end=`date +%s`
-runtime=$((end-start))
-MESSAGE="...mass EpitopeID for mix_yeast_${DEPTH} finished in ${runtime}"
-echo $MESSAGE
-cd $WRK
-
-## Calculate detection
-CALCULATE=scripts/calculate_detection_Stats.pl
-echo "**Calculate detection of $GENE..."
-echo $MESSAGE > $RESULT/runtime.txt
-head -n 9999 $RESULT/ID/*-ID.tab | perl $CALCULATE - $AFACTOR $RESULT/$AFACTOR\_$DEPTH\_detectionStats.txt
-head -n 9999 $RESULT/ID/*-ID.tab | perl $CALCULATE - $BFACTOR $RESULT/$BFACTOR\_$DEPTH\_detectionStats.txt
-grep ${AFACTOR^^} $RESULT/ID/*-ID.tab | cut -d':' -f1 | sort | uniq > $RESULT/$AFACTOR\_hits.txt
-grep ${BFACTOR^^} $RESULT/ID/*-ID.tab | cut -d':' -f1 | sort | uniq > $RESULT/$BFACTOR\_hits.txt
-comm -12 $RESULT/$AFACTOR\_hits.txt $RESULT/$BFACTOR\_hits.txt > $RESULT/$AFACTOR\_and_$BFACTOR\_hits.txt
-echo "Calculated"
-
+module load bedtools
+module load samtools
+module load anaconda3
+source activate genopipe
+
+REF=sacCer3
+# DEPTH=100K
+DEPTH=1M
+EPITOPE=R500
+AFACTOR=Rap1
+BFACTOR=Reb1
+
+DATABASE=$WRK/../db/$REF\_EpiID
+[ -d logs ] || mkdir logs
+
+# Titrate out various ratios
+for PER in 90 80 70 60 50 40 30 20 10
+do
+	# Construct results directory pathname
+	RESULTS=$WRK/results/mix_yeast/$DEPTH/$PER\_$AFACTOR\_$BFACTOR
+	[ -d $RESULTS/ID ] || mkdir $RESULTS/ID
+	[ -d $RESULTS/runtime ] || mkdir $RESULTS/runtime
+	echo $RESULT
+
+	# Set-up Temp directory
+	TEMP=$WRK/temp-$PER-$PBS_ARRAYID-$DEPTH
+	[ -d $TEMP ] || mkdir $TEMP
+
+	cd $TEMP
+	FQ=$RESULTS/FASTQ/Simulation_$PBS_ARRAYID
+	[ -f Simulation_$PBS_ARRAYID\_R1.fastq.gz ] || ln -s $FQ\_R1.fastq.gz
+	[ -f Simulation_$PBS_ARRAYID\_R2.fastq.gz ] || ln -s $FQ\_R2.fastq.gz
+
+	# Execute EpitopeID and record time
+	cd $EPITOPEID
+	start=`date +%s`
+	bash identify-Epitope.sh -i $TEMP -o $RESULTS/ID -d $DATABASE -t $THREADS
+	end=`date +%s`
+	runtime=$((end-start))
+	echo "Experiment $EXPERIMENT.$DEPTH.$PBS_ARRAYID ($RESULTS) finished in ${runtime}" > $RESULTS/runtime/Simulation_$PBS_ARRAYID.runtime
+
+	# Clean-up
+	rm -r $TEMP
+done
diff --git a/paper/SyntheticEpitope/job/run_mix_human.pbs b/paper/SyntheticEpitope/job/run_mix_human.pbs
@@ -3,8 +3,8 @@
 #PBS -l pmem=16gb
 #PBS -l walltime=02:00:00
 #PBS -A open
-#PBS -o logs/mix.human.10M.log.out
-#PBS -e logs/mix.human.10M.log.err
+#PBS -o logs/mix.human.50M.log.out
+#PBS -e logs/mix.human.50M.log.err
 #PBS -t 1-1000
 
 # Required Software
@@ -14,19 +14,19 @@
 WRK=/path/to/GenoPipe/paper/SyntheticEpitope
 cd $WRK
 
-DEPTH=10M
-A_STRAIN=hg19_CTCF-Nterm
-B_STRAIN=hg19_POLR2H-Nterm
+REF=hg19
+# DEPTH=20M
+DEPTH=50M
+EPITOPE=R500
+AFACTOR=CTCF
+BFACTOR=POLR2H
 
-AFACTOR=`echo $A_STRAIN | awk -F'-' '{print $1}' | awk -F'_' '{print $2}'`
-BFACTOR=`echo $B_STRAIN | awk -F'-' '{print $1}' | awk -F'_' '{print $2}'`
-
-OUTPUT=results/mix_human_$DEPTH
-[ -d $OUTPUT ] || mkdir $OUTPUT
+OUTPUT=results/mix_human/$DEPTH
+[ -d $OUTPUT ] || mkdir -p $OUTPUT
 
 MIX=../scripts/mix_fastq.sh
-ABASE=results/$A_STRAIN\_$DEPTH/FASTQ/Simulation_$PBS_ARRAYID
-BBASE=results/$B_STRAIN\_$DEPTH/FASTQ/Simulation_$PBS_ARRAYID
+ABASE=results/$REF/$DEPTH/$EPITOPE/$AFACTOR/FASTQ/Simulation_$PBS_ARRAYID
+BBASE=results/$REF/$DEPTH/$EPITOPE/$BFACTOR/FASTQ/Simulation_$PBS_ARRAYID
 
 start=`date +%s`
 # Titrate out various ratios
@@ -40,4 +40,3 @@ done
 end=`date +%s`
 runtime=$((end-start))
 echo "Mix ${A_STRAIN} and ${B_STRAIN} from depth at ${DEPTH} simulated in ${runtime}"
-
diff --git a/paper/SyntheticEpitope/job/run_mix_yeast.pbs b/paper/SyntheticEpitope/job/run_mix_yeast.pbs
@@ -3,8 +3,8 @@
 #PBS -l pmem=16gb
 #PBS -l walltime=00:05:00
 #PBS -A open
-#PBS -o logs/mix.yeast.100K.log.out
-#PBS -e logs/mix.yeast.100K.log.err
+#PBS -o logs/mix.yeast.1M.log.out
+#PBS -e logs/mix.yeast.1M.log.err
 #PBS -t 1-1000
 
 # Required Software
@@ -14,19 +14,19 @@
 WRK=/path/to/GenoPipe/paper/SyntheticEpitope
 cd $WRK
 
-DEPTH=100K
-A_STRAIN=sacCer3_Reb1-Cterm
-B_STRAIN=sacCer3_Rap1-Nterm
+REF=sacCer3
+# DEPTH=100K
+DEPTH=1M
+EPITOPE=R500
+AFACTOR=Rap1
+BFACTOR=Reb1
 
-AFACTOR=`echo $A_STRAIN | awk -F'-' '{print $1}' | awk -F'_' '{print $2}'`
-BFACTOR=`echo $B_STRAIN | awk -F'-' '{print $1}' | awk -F'_' '{print $2}'`
-
-OUTPUT=results/mix_yeast_$DEPTH
-[ -d $OUTPUT ] || mkdir $OUTPUT
+OUTPUT=results/mix_yeast/$DEPTH
+[ -d $OUTPUT ] || mkdir -p $OUTPUT
 
 MIX=../scripts/mix_fastq.sh
-ABASE=results/$A_STRAIN\_$DEPTH/FASTQ/Simulation_$PBS_ARRAYID
-BBASE=results/$B_STRAIN\_$DEPTH/FASTQ/Simulation_$PBS_ARRAYID
+ABASE=results/$REF/$DEPTH/$EPITOPE/$AFACTOR/FASTQ/Simulation_$PBS_ARRAYID
+BBASE=results/$REF/$DEPTH/$EPITOPE/$BFACTOR/FASTQ/Simulation_$PBS_ARRAYID
 
 start=`date +%s`
 # Titrate out various ratios
@@ -40,4 +40,3 @@ done
 end=`date +%s`
 runtime=$((end-start))
 echo "Mix ${A_STRAIN} and ${B_STRAIN} from depth at ${DEPTH} simulated in ${runtime}"
-
diff --git a/paper/scripts/mix_fastq.sh b/paper/scripts/mix_fastq.sh
@@ -9,6 +9,9 @@
 # gzip
 # seqtk
 
+module load anaconda3
+source activate genopipe
+
 usage()
 {
 	echo 'bash mix_fastq.sh -p <percentOfA> -A <fastqBaseA> -B <fastqBaseB> -o <outBase>'
@@ -51,13 +54,14 @@ echo "FASTQ_B = ${BFQ} (${B_NUM})"
 echo "OUTPUT = ${OFQ}"
 
 # Check that the files haven't already been generated
-if [[ ! -f $OFQ\_readnames.txt && \
-	-f $OFQ\_R1.fastq.gz && \
-	-f $OFQ\_R2.fastq.gz ]];
-then
-	echo "Both R1 and R2 files have already been generated for ${OFQ}"
-	exit
-fi
+#if [[ ! -f $OFQ\_readnames.txt && \
+#	-f $OFQ\_R1.fastq.gz && \
+#	-f $OFQ\_R2.fastq.gz ]];
+#then
+#	echo "Both R1 and R2 files have already been generated for ${OFQ}"
+#	exit
+#fi
+
 # Cleanup part-generated FQs if indicator file present from a previous run
 [ -f $OFQ\_readnames.txt ] && rm $OFQ\_R1.fastq*
 [ -f $OFQ\_readnames.txt ] && rm $OFQ\_R2.fastq*
@@ -86,6 +90,7 @@ seqtk subseq $AFQ\_R2.fastq.gz $OFQ\_readnames.txt > $OFQ\_R2.fastq
 seqtk subseq $BFQ\_R2.fastq.gz $OFQ\_readnames.txt >> $OFQ\_R2.fastq
 
 # Zip and clean-up files
-gzip $OFQ\_R1.fastq
-gzip $OFQ\_R2.fastq
-rm $OFQ\_readnames.txt
+gzip -f $OFQ\_R1.fastq
+gzip -f $OFQ\_R2.fastq
+
+#rm $OFQ\_readnames.txt