Skip to content

Commit 4e7eb30

Browse files
authored
Merge branch 'master' into validation
2 parents 6012acd + f8b9c64 commit 4e7eb30

13 files changed

Lines changed: 7339 additions & 0 deletions

paper/.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,7 @@ SyntheticStrain/logs/*.err-*
1515
SyntheticStrain/logs/*.out-*
1616
YKOC-wgs/logs/*.err-*
1717
YKOC-wgs/logs/*.out-*
18+
ENCODEdata-eGFP/logs/*.out-*
19+
ENCODEdata-eGFP/logs/*.err-*
20+
ENCODEdata-eGFP/results/FASTQ
21+
ENCODEdata-eGFP/results/ID

paper/ENCODEdata-eGFP/210227_sample_metadata.txt

Lines changed: 3443 additions & 0 deletions
Large diffs are not rendered by default.

paper/ENCODEdata-eGFP/README

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,17 @@
11
# Run EpitopeID on ENCODE data and evaluate EpitopeID's performance
22

3+
4+
# Reference files
5+
ENCODE metadata was pulled on February 27, 2021 using the `scripts/get_metadata.py`
6+
script that pulls all Genetic Modification accessions with category=insertion and
7+
purpose=tagging. These are used to pull Biosample accessions with the organism=
8+
human constraint. The File accessions with type=FASTQ are finally pulled with the
9+
filter that they are associated with Libraries that come from one of these Biosample
10+
accessions and their assay=[ChIPseq|DNAseq]. They are saved with all relevant
11+
metadata to the `210227_sample_metadata.txt` file.
12+
13+
Command used: python scripts/get_metadata.py > 210227_sample_metadata.txt
14+
315
# Download ENCODE eGFP data and run through EpitopeID
416
Use the files.txt with ENCODE accessions to download the data to the data directory.
517
Then make sure the LAP-tag is in the hg19_EpiID and run the data through EpitopeID.
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
#!/bin/bash
#PBS -l nodes=1:ppn=4
#PBS -l pmem=16gb
#PBS -l walltime=00:05:00
#PBS -l feature=rhel7
#PBS -A open
#PBS -o logs/download.data.log.out
#PBS -e logs/download.data.log.err
#PBS -t 1-3443

# Download one ENCODE FASTQ per PBS array task and verify its md5 checksum.
# Each line of $METADATA describes one file: column 1 = ENCFF accession,
# column 2 = read number, column 3 = URL of the paired Read1 file,
# column 5 = expected md5sum.

# NOTE(review): sibling files in this commit use "ENCODEdata-eGFP" as the
# directory name (.gitignore, get_results script); normalized the placeholder
# path to match — confirm against the actual checkout location.
WRK=/path/to/GenoPipe/paper/ENCODEdata-eGFP
cd "$WRK" || exit 1

[ -d logs ] || mkdir logs
[ -d results/FASTQ ] || mkdir -p results/FASTQ

# Pull the metadata line for this array index (sed "<N>q;d" prints line N only).
METADATA=210227_sample_metadata.txt
INFO=$(sed "${PBS_ARRAYID}q;d" "$METADATA")
ENCFF=$(echo "$INFO" | awk '{print $1}')
#echo $INFO

# Read1 files are named after their own accession; Read2 files are named after
# their Read1 mate, whose accession is parsed out of the URL in column 3.
FASTQ=${ENCFF}_R1.fastq.gz
if [[ $(echo "$INFO" | awk '{print $2}') =~ 2 ]]; then
	FASTQ=$(echo "$INFO" | awk '{print $3}' | awk -F"/" '{print $3}')_R2.fastq.gz
fi

# ENCODE data download (skipped when the renamed file is already present)
cd results/FASTQ || exit 1
if [[ ! -f "$FASTQ" ]]; then
	echo "Fetching from https://www.encodeproject.org/files/$ENCFF/@@download/$ENCFF.fastq.gz"
	wget "https://www.encodeproject.org/files/$ENCFF/@@download/$ENCFF.fastq.gz"
	mv "$ENCFF.fastq.gz" "$FASTQ"
else
	echo "$FASTQ already downloaded..."
fi

# Checksum of resulting FASTQ against the md5 recorded in the metadata
MD5SUM=$(echo "$INFO" | awk '{print $5}')
if [[ $(md5sum "$FASTQ") =~ $MD5SUM ]]; then
	echo "($PBS_ARRAYID) $FASTQ passed."
else
	echo "($PBS_ARRAYID) $FASTQ md5checksum failed!"
fi
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
#!/bin/bash
#PBS -l nodes=1:ppn=4
#PBS -l pmem=16gb
#PBS -l walltime=03:30:00
#PBS -l feature=rhel7
#PBS -A open
#PBS -o logs/eid.log.out
#PBS -e logs/eid.log.err
#PBS -t 1-3443

# Run EpitopeID on one downloaded ENCODE FASTQ per PBS array task, skipping
# Read2 entries (EpitopeID picks up the R2 mate from the same temp directory)
# and samples whose ID output already exists.

module load gcc/8.3.1
module load bedtools/2.27.1
module load bwa/0.7.15
#module load python/3.6.8
module load samtools/1.5
module load anaconda3
source activate genopipe
# genopipe env includes Python=3.9.0 and scipy

# NOTE(review): sibling files in this commit use "ENCODEdata-eGFP" as the
# directory name; normalized the placeholder path to match — confirm against
# the actual checkout location.
WRK=/path/to/GenoPipe/paper/ENCODEdata-eGFP
cd "$WRK" || exit 1

# Store directory paths
DATABASE=$WRK/../db/hg19_EpiID
GENOPIPE=$WRK/../../
FASTQ=$WRK/results/FASTQ
ID=$WRK/results/ID

[ -d logs ] || mkdir logs
[ -d "$ID" ] || mkdir -p "$ID"

# Parse metadata: line N of the file drives array task N.
METADATA=210227_sample_metadata.txt
INFO=$(sed "${PBS_ARRAYID}q;d" "$METADATA")
ENCFF=$(echo "$INFO" | awk '{print $1}')
R1=${ENCFF}_R1.fastq.gz
R2=${ENCFF}_R2.fastq.gz
#echo $INFO

# Skip Read2 or if ID file already generated
if [[ $(echo "$INFO" | awk '{print $2}') =~ 2 ]]; then
	echo "Skip Read2 entries..."
	exit
fi
if [[ -f "$ID/${ENCFF}_R1-ID.tab" ]]; then
	echo "ID already generated ($ENCFF). Exiting.."
	exit
fi

# Set-up Temp directory holding symlinks to just this sample's FASTQ file(s)
TEMP=$WRK/temp$PBS_ARRAYID
[ -d "$TEMP" ] || mkdir "$TEMP"
cd "$TEMP" || exit 1
ln -s "$FASTQ/$R1"
[ -f "$FASTQ/$R2" ] && ln -s "$FASTQ/$R2"

## Execute Single EpitopeID and record time
# NOTE(review): job requests ppn=4 but EpitopeID is launched with -t 6
# threads, oversubscribing the allocation — confirm intended thread count.
cd "$GENOPIPE/EpitopeID" || exit 1
echo "(2) Begin executing EpitopeID for $R1..."
start=$(date +%s)
bash identify-Epitope.sh -i "$TEMP" -o "$ID" -d "$DATABASE" -t 6
end=$(date +%s)
runtime=$((end-start))
echo "...single EpitopeID for ($PBS_ARRAYID) $R1 finished in ${runtime}"

# Clean-up
rm -r "$TEMP"
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Summarize EpitopeID performance on the ENCODE data: feed the sample metadata
# and the per-sample ID calls to scripts/analyze_encode_results.py, writing the
# summary to results/eGFPandFLAG_results_analyzed.txt (stderr captured alongside).

WRK=/path/to/GenoPipe/paper/ENCODEdata-eGFP
cd "$WRK" || exit 1

module load anaconda3
source activate genopipe
#module load python/3.6.8
python -V

# Get success rate based on metadata file and results of EpitopeID
METADATA=210227_sample_metadata.txt
CALCULATE=scripts/analyze_encode_results.py
python "$CALCULATE" -m "$METADATA" -i results/ID \
	> results/eGFPandFLAG_results_analyzed.txt \
	2> results/eGFPandFLAG_results_analyzed.err
15+

paper/ENCODEdata-eGFP/logs/README

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# logfiles from STDERR and STDOUT of running job files go here
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# Downloaded FASTQ files and EpitopeID results go here

0 commit comments

Comments
 (0)