Skip to content

Commit 4e7eb30

Browse files
authored
Merge branch 'master' into validation
2 parents 6012acd + f8b9c64 commit 4e7eb30

13 files changed

Lines changed: 7339 additions & 0 deletions

paper/.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,7 @@ SyntheticStrain/logs/*.err-*
1515
SyntheticStrain/logs/*.out-*
1616
YKOC-wgs/logs/*.err-*
1717
YKOC-wgs/logs/*.out-*
18+
ENCODEdata-eGFP/logs/*.out-*
19+
ENCODEdata-eGFP/logs/*.err-*
20+
ENCODEdata-eGFP/results/FASTQ
21+
ENCODEdata-eGFP/results/ID

paper/ENCODEdata-eGFP/210227_sample_metadata.txt

Lines changed: 3443 additions & 0 deletions
Large diffs are not rendered by default.

paper/ENCODEdata-eGFP/README

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,17 @@
11
# Run EpitopeID on ENCODE data and evaluate EpitopeID's performance
22

3+
4+
# Reference files
5+
ENCODE metadata was pulled on February 27, 2021 using the `scripts/get_metadata.py`
6+
script that pulls all Genetic Modification accessions with category=insertion and
7+
purpose=tagging. These are used to pull Biosample accessions with the organism=
8+
human constraint. The File accessions with type=FASTQ are finally pulled with the
9+
filter that they are associated with Libraries that come from one of these Biosample
10+
accessions and their assay=[ChIPseq|DNAseq]. They are saved with all relevant
11+
metadata to the `210227_sample_metadata.txt` file.
12+
13+
Command used: python scripts/get_metadata.py > 210227_sample_metadata.txt
14+
315
# Download ENCODE eGFP data and run through EpitopeID
416
Use the files.txt with ENCODE accessions to download the data to the data directory.
517
Then make sure the LAP-tag is in the hg19_EpiID and run the data through EpitopeID.
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
#!/bin/bash
#PBS -l nodes=1:ppn=4
#PBS -l pmem=16gb
#PBS -l walltime=00:05:00
#PBS -l feature=rhel7
#PBS -A open
#PBS -o logs/download.data.log.out
#PBS -e logs/download.data.log.err
#PBS -t 1-3443

# Download one ENCODE FASTQ per PBS array task and verify its md5 checksum.
# Each line of $METADATA describes one file: column 1 = ENCFF accession,
# column 2 = read number, column 3 = URL of the paired Read1 file,
# column 5 = expected md5sum.

# NOTE(review): sibling files in this commit use "ENCODEdata-eGFP" as the
# directory name (.gitignore, get_results script); normalized the placeholder
# path to match — confirm against the actual checkout location.
WRK=/path/to/GenoPipe/paper/ENCODEdata-eGFP
cd "$WRK" || exit 1

[ -d logs ] || mkdir logs
[ -d results/FASTQ ] || mkdir -p results/FASTQ

# Pull the metadata line for this array index (sed "<N>q;d" prints line N only).
METADATA=210227_sample_metadata.txt
INFO=$(sed "${PBS_ARRAYID}q;d" "$METADATA")
ENCFF=$(echo "$INFO" | awk '{print $1}')
#echo $INFO

# Read1 files are named after their own accession; Read2 files are named after
# their Read1 mate, whose accession is parsed out of the URL in column 3.
FASTQ=${ENCFF}_R1.fastq.gz
if [[ $(echo "$INFO" | awk '{print $2}') =~ 2 ]]; then
	FASTQ=$(echo "$INFO" | awk '{print $3}' | awk -F"/" '{print $3}')_R2.fastq.gz
fi

# ENCODE data download (skipped when the renamed file is already present)
cd results/FASTQ || exit 1
if [[ ! -f "$FASTQ" ]]; then
	echo "Fetching from https://www.encodeproject.org/files/$ENCFF/@@download/$ENCFF.fastq.gz"
	wget "https://www.encodeproject.org/files/$ENCFF/@@download/$ENCFF.fastq.gz"
	mv "$ENCFF.fastq.gz" "$FASTQ"
else
	echo "$FASTQ already downloaded..."
fi

# Checksum of resulting FASTQ against the md5 recorded in the metadata
MD5SUM=$(echo "$INFO" | awk '{print $5}')
if [[ $(md5sum "$FASTQ") =~ $MD5SUM ]]; then
	echo "($PBS_ARRAYID) $FASTQ passed."
else
	echo "($PBS_ARRAYID) $FASTQ md5checksum failed!"
fi
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
#!/bin/bash
#PBS -l nodes=1:ppn=4
#PBS -l pmem=16gb
#PBS -l walltime=03:30:00
#PBS -l feature=rhel7
#PBS -A open
#PBS -o logs/eid.log.out
#PBS -e logs/eid.log.err
#PBS -t 1-3443

# Run EpitopeID on one downloaded ENCODE FASTQ per PBS array task, skipping
# Read2 entries (EpitopeID picks up the R2 mate from the same temp directory)
# and samples whose ID output already exists.

module load gcc/8.3.1
module load bedtools/2.27.1
module load bwa/0.7.15
#module load python/3.6.8
module load samtools/1.5
module load anaconda3
source activate genopipe
# genopipe env includes Python=3.9.0 and scipy

# NOTE(review): sibling files in this commit use "ENCODEdata-eGFP" as the
# directory name; normalized the placeholder path to match — confirm against
# the actual checkout location.
WRK=/path/to/GenoPipe/paper/ENCODEdata-eGFP
cd "$WRK" || exit 1

# Store directory paths
DATABASE=$WRK/../db/hg19_EpiID
GENOPIPE=$WRK/../../
FASTQ=$WRK/results/FASTQ
ID=$WRK/results/ID

[ -d logs ] || mkdir logs
[ -d "$ID" ] || mkdir -p "$ID"

# Parse metadata: line N of the file drives array task N.
METADATA=210227_sample_metadata.txt
INFO=$(sed "${PBS_ARRAYID}q;d" "$METADATA")
ENCFF=$(echo "$INFO" | awk '{print $1}')
R1=${ENCFF}_R1.fastq.gz
R2=${ENCFF}_R2.fastq.gz
#echo $INFO

# Skip Read2 or if ID file already generated
if [[ $(echo "$INFO" | awk '{print $2}') =~ 2 ]]; then
	echo "Skip Read2 entries..."
	exit
fi
if [[ -f "$ID/${ENCFF}_R1-ID.tab" ]]; then
	echo "ID already generated ($ENCFF). Exiting.."
	exit
fi

# Set-up Temp directory holding symlinks to just this sample's FASTQ file(s)
TEMP=$WRK/temp$PBS_ARRAYID
[ -d "$TEMP" ] || mkdir "$TEMP"
cd "$TEMP" || exit 1
ln -s "$FASTQ/$R1"
[ -f "$FASTQ/$R2" ] && ln -s "$FASTQ/$R2"

## Execute Single EpitopeID and record time
# NOTE(review): job requests ppn=4 but EpitopeID is launched with -t 6
# threads, oversubscribing the allocation — confirm intended thread count.
cd "$GENOPIPE/EpitopeID" || exit 1
echo "(2) Begin executing EpitopeID for $R1..."
start=$(date +%s)
bash identify-Epitope.sh -i "$TEMP" -o "$ID" -d "$DATABASE" -t 6
end=$(date +%s)
runtime=$((end-start))
echo "...single EpitopeID for ($PBS_ARRAYID) $R1 finished in ${runtime}"

# Clean-up
rm -r "$TEMP"
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Summarize EpitopeID performance on the ENCODE data: feed the sample metadata
# and the per-sample ID calls to scripts/analyze_encode_results.py, writing the
# summary to results/eGFPandFLAG_results_analyzed.txt (stderr captured alongside).

WRK=/path/to/GenoPipe/paper/ENCODEdata-eGFP
cd "$WRK" || exit 1

module load anaconda3
source activate genopipe
#module load python/3.6.8
python -V

# Get success rate based on metadata file and results of EpitopeID
METADATA=210227_sample_metadata.txt
CALCULATE=scripts/analyze_encode_results.py
python "$CALCULATE" -m "$METADATA" -i results/ID \
	> results/eGFPandFLAG_results_analyzed.txt \
	2> results/eGFPandFLAG_results_analyzed.err
15+

paper/ENCODEdata-eGFP/logs/README

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# logfiles from STDERR and STDOUT of running job files go here
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# Downloaded FASTQ files and EpitopeID results go here

0 commit comments

Comments
 (0)