Skip to content

Commit d0149a0

Browse files
authored
Merge pull request #11 from CEGRcode/encode
Scripts to process ENCODE data and run StrainID. The bulk of the ENCODE analysis of StrainID is included in these commits.
2 parents 25d300c + 74fcbf2 commit d0149a0

14 files changed

Lines changed: 14704 additions & 8 deletions

paper/.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,6 @@ SyntheticStrain/logs/*.err-*
2626
SyntheticStrain/logs/*.out-*
2727
SyntheticStrain/results/sacCer3*
2828
SyntheticStrain/results/hg19*
29+
ENCODE_CellLines/results/BAM
30+
ENCODE_CellLines/results/BAM-nospike
31+
ENCODE_CellLines/results/ID

paper/ENCODEdata-CellLines/210512_sample_metadata.txt

Lines changed: 14260 additions & 0 deletions
Large diffs are not rendered by default.

paper/ENCODEdata-CellLines/README

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Run StrainID on ENCODE data and evaluate StrainID's performance
2+
3+
# Reference files
4+
ENCODE metadata was pulled on May 12, 2021 using the `scripts/get_metadata.py`
5+
script that pulls all Biosample accessions with classification="cell_line" and
6+
whose string matches one of the cell lines we have in our hg19_VCF database.
7+
These are used to pull File accessions with type=BAM and assembly=hg19. We did
8+
not filter by assay for this analysis. They are saved with all relevant metadata
9+
to the `210512_sample_metadata.txt` file.
10+
11+
Command used: python scripts/get_metadata.py > 210512_sample_metadata.txt
12+
13+
# Download ENCODE CellLine data and run through StrainID
14+
Use the 210512_sample_metadata.txt file with ENCODE accessions to download the
15+
data to the data directory.
16+
Then run the data through StrainID.
17+
18+
# Compare the results to the metadata information
19+
Evaluate the accuracy of StrainID on real data.
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
#!/bin/bash
#PBS -l nodes=1:ppn=6
#PBS -l pmem=24gb
#PBS -l walltime=05:00:00
#PBS -A open
#PBS -o logs/download.data.log.out
#PBS -e logs/download.data.log.err
#PBS -t 1-14260

# Download one ENCODE BAM per array task, verify its md5 checksum, and
# index it with samtools. The PBS array index selects which line of the
# metadata file describes the file to fetch.

module load gcc/9.3.1
module load samtools
module load anaconda3
source activate genopipe

WRK=/path/to/GenoPipe/paper/ENCODE-CellLines
cd "$WRK" || { echo "Cannot cd to $WRK" >&2; exit 1; }

[ -d logs ] || mkdir logs
[ -d results/BAM ] || mkdir -p results/BAM

# Pull the metadata line for this array task:
#   column 1 = ENCFF accession, column 2 = download href, column 3 = md5sum
METADATA=210512_sample_metadata.txt
INFO=$(sed "${PBS_ARRAYID}q;d" "$METADATA")
ENCFF=$(echo "$INFO" | awk '{print $1}')
#echo "$INFO"

cd results/BAM || exit 1
BAM=$ENCFF.bam

# ENCODE data download
HREF=$(echo "$INFO" | awk '{print $2}')
echo "Fetching from https://www.encodeproject.org$HREF"
wget "https://www.encodeproject.org$HREF" || { echo "($PBS_ARRAYID) download failed for $ENCFF" >&2; exit 1; }

# Checksum of resulting BAM. Compare the hash field exactly (the old
# regex substring match would also accept a partial hash), and exit
# non-zero on mismatch so the scheduler records the task as failed.
MD5SUM=$(echo "$INFO" | awk '{print $3}')
if [[ $(md5sum "$BAM" | awk '{print $1}') == "$MD5SUM" ]]; then
	echo "($PBS_ARRAYID) $BAM passed."
else
	echo "($PBS_ARRAYID) $BAM md5checksum failed!" >&2
	rm -f -- "$BAM"
	exit 1
fi

# Index BAM file
samtools index "$BAM"
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#!/bin/bash
#PBS -l nodes=1:ppn=6
#PBS -l pmem=24gb
#PBS -l walltime=01:00:00
#PBS -A open
#PBS -o logs/filter.count.log.out
#PBS -e logs/filter.count.log.err
#PBS -t 1-14260

# For one ENCODE BAM per array task: strip spike-in reads, index the
# filtered BAM, and estimate bases sequenced for both the original and
# the filtered alignments.

module load gcc/9.3.1
module load samtools
module load anaconda3
source activate genopipe

WRK=/path/to/GenoPipe/paper/ENCODE-CellLines
cd "$WRK" || { echo "Cannot cd to $WRK" >&2; exit 1; }

[ -d logs ] || mkdir logs
[ -d results/BAM-nospike ] || mkdir -p results/BAM-nospike

# Pull the ENCFF accession for this array task from the metadata file.
METADATA=210512_sample_metadata.txt
INFO=$(sed "${PBS_ARRAYID}q;d" "$METADATA")
ENCFF=$(echo "$INFO" | awk '{print $1}')
#echo "$INFO"

BAM=results/BAM/$ENCFF
NOSPIKE=results/BAM-nospike/$ENCFF

# Guard: the download job must have produced the input BAM first;
# previously a missing input silently produced empty/partial outputs.
if [ ! -f "$BAM.bam" ]; then
	echo "($PBS_ARRAYID) $BAM.bam does not exist. Exiting." >&2
	exit 1
fi

# Strip Spike-in
python scripts/filter_spikein.py -b "$BAM.bam" -g ../input/hg19.fa -o "$NOSPIKE.bam"
samtools index "$NOSPIKE.bam"

# Count Bases (only for BAMs that were indexed successfully).
# ${VAR}_ replaces the old "\_" escaping trick for the same filenames.
[ -f "$BAM.bam.bai" ] && python scripts/estimate_bp_sequenced.py -b "$BAM.bam" > "${BAM}_bpcount.txt"
[ -f "$NOSPIKE.bam.bai" ] && python scripts/estimate_bp_sequenced.py -b "$NOSPIKE.bam" > "${NOSPIKE}_bpcount.txt"
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
#!/bin/bash
#PBS -l nodes=1:ppn=6
#PBS -l pmem=24gb
#PBS -l walltime=01:00:00
#PBS -A open
#PBS -o logs/sid.log.out
#PBS -e logs/sid.log.err
#PBS -t 1-14260

# Run StrainID on one spike-in-filtered ENCODE BAM per array task and
# record both the StrainID output (.std) and its wall-clock time (.time).

module load gcc/8.3.1
module load bedtools/2.27.1
module load bwa/0.7.15
module load samtools/1.5
module load anaconda3
source activate genopipe

WRK=/path/to/GenoPipe/paper/ENCODE-CellLines
cd "$WRK" || { echo "Cannot cd to $WRK" >&2; exit 1; }

# Store directory paths
DATABASE=$WRK/../db/hg19_VCF
GENOME=$WRK/../input/hg19.fa
SEED=$PBS_ARRAYID
GENOPIPE=$WRK/../..
BAM=$WRK/results/BAM-nospike
ID=$WRK/results/ID

[ -d logs ] || mkdir logs
[ -d "$ID" ] || mkdir -p "$ID"

# Parse metadata: column 1 of the selected line is the ENCFF accession.
METADATA=210512_sample_metadata.txt
INFO=$(sed "${PBS_ARRAYID}q;d" "$METADATA")
ENCFF=$(echo "$INFO" | awk '{print $1}')
#echo "$INFO"

# Check that BAM file was generated first; exit non-zero so the
# scheduler flags the task instead of reporting success.
if [ ! -f "$BAM/$ENCFF.bam" ]; then
	echo "BAM input for $BAM/$ENCFF does not exist. Exiting." >&2
	exit 1
fi

# Check that BAM Index file exists
if [ ! -f "$BAM/$ENCFF.bam.bai" ]; then
	echo "BAI missing for $ENCFF. Exiting." >&2
	exit 1
fi

# Set-up Temp directory holding symlinks to only this task's BAM so
# StrainID sees a single input file. -f lets a re-run replace stale links.
TEMP=$WRK/temp-$PBS_ARRAYID
[ -d "$TEMP" ] || mkdir "$TEMP"
cd "$TEMP" || exit 1
echo "$BAM"
ln -sf "$BAM/$ENCFF.bam"
ln -sf "$BAM/$ENCFF.bam.bai"

## Execute Single StrainID and record time
cd "$GENOPIPE/StrainID" || exit 1
echo "**Begin executing StrainID for ${ENCFF}..."
{ time bash identify-Strain.sh -i "$TEMP" -g "$GENOME" -v "$DATABASE" -s "$SEED" -o "$ID" > "$ID/$ENCFF.std" ; } 2> "$ID/$ENCFF.time"
echo "...single StrainID for ($PBS_ARRAYID) ${ENCFF} finished."
cd "$WRK" || exit 1

## Clean-up
rm -r -- "$TEMP"
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#!/bin/bash
#PBS -l nodes=1:ppn=6
#PBS -l pmem=24gb
#PBS -l walltime=03:00:00
#PBS -A open
#PBS -o logs/tally.log.out
#PBS -e logs/tally.log.err

# Aggregate per-file StrainID calls against the ENCODE metadata and
# write one summary table for the whole cell-line analysis.

module load anaconda3
source activate genopipe

WRK=/path/to/GenoPipe/paper/ENCODE-CellLines
# Guard the cd: previously a failed cd would silently run the analysis
# from the submission directory and write output to the wrong place.
cd "$WRK" || { echo "Cannot cd to $WRK" >&2; exit 1; }

# Compile StrainID results
python scripts/analyze_encode_results.py -i results/ID/ -m 210512_sample_metadata.txt > results/encode_cell_line_results.txt
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# logfiles from STDERR and STDOUT of running job files go here
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# Downloaded BAM files and StrainID results go here
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
from os import listdir
2+
from os.path import isfile, join
3+
import sys
4+
import argparse
5+
6+
# Python 3 needed for encoding feature for UTF-8
7+
# (ENCODE uses some capital delta chars in summary descriptions of GeneticModifications)
8+
9+
def getParams():
    '''Build the argument parser and return the parsed command line.'''
    parser = argparse.ArgumentParser(
        description='Parse metadata file and GenoPipe output to check detection rates of the GenoPipe tool.')
    parser.add_argument(
        '-m', '--metadata', metavar='metadata_fn', required=True,
        help='the metadata file downloaded with ENCODE dataset that includes info like PE/SE, cell line, assay type, and read lengths/SE-PE')
    parser.add_argument(
        '-i', '--input-dir', metavar='input_dir', required=True,
        help='the directory where all the EpitopeID output files were saved (*strain.tab)')
    return parser.parse_args()
17+
18+
# ENCFF000DZC.bam
19+
#LnCap.vcf -5.082117812158647
20+
#MCF7.vcf -6.1143012059424935
21+
#SKnSH.vcf -5.7641601741217645
22+
#HepG2.vcf -5.595833186705702
23+
#K562.vcf 1.8812984639660986
24+
#A549.vcf -6.059318584695944
25+
#HCT116.vcf -4.847450343904915
26+
#HELA.vcf -4.906670711358038
27+
def parse_file(var_file):
    '''Parse a StrainID *_strain.tab output file into a dict mapping
    strain name (VCF basename without extension) to its score.

    "inf" scores are mapped to +500000 and "nan" scores to -500000 so
    strains sort numerically with inf always winning and nan always losing.

    Fixes vs. original: lines without a tab-separated score column (e.g.
    the "ENCFF000DZC.bam" header line and blank lines) previously raised
    IndexError; non-numeric scores raised ValueError. Both are now
    skipped. Also avoids shadowing the builtin `dict` and closes the
    file via a context manager.
    '''
    scores = {}
    with open(var_file, 'r') as reader:
        for line in reader:
            tokens = line.split("\t")
            # Skip header/blank/malformed lines lacking a score column.
            if tokens[0] == "" or len(tokens) < 2:
                continue
            raw = tokens[1].strip()
            if raw.lower() == "inf":
                score = 500000
            elif raw.lower() == "nan":
                score = -500000
            else:
                try:
                    score = float(raw)
                except ValueError:
                    # Non-numeric score field: skip rather than crash.
                    continue
            # Key is the strain name without the ".vcf" extension.
            scores[tokens[0].split(".")[0]] = score
    return scores
44+
45+
#ENCFF364CPX /files/ENCFF364CPX/@@download/ENCFF364CPX.bam 79651f67b1c4d564395c18be9cdff62f HeLa-S3 ChIP-seq single-ended 36 /files/ENCFF807MUK/|/files/ENCFF000BAO/ unfiltered alignments /experiments/ENCSR000AOB/ /biosample-types/cell_line_EFO_0002791/ 1884977463 released 2020-02-18T20:47:36.519163+00:00
46+
#ENCFF325UJS /files/ENCFF325UJS/@@download/ENCFF325UJS.bam 042b20b3e149df6c1f4e5c95f83653ee HepG2 ChIP-seq single-ended 36 /files/ENCFF807MUK/|/files/ENCFF000BGR/ alignments /experiments/ENCSR000AOM/ /biosample-types/cell_line_EFO_0001187/ 978259147 released 2020-02-18T09:15:21.891603+00:00
47+
#ENCFF821WQW /files/ENCFF821WQW/@@download/ENCFF821WQW.bam e9c6eeedee7dc41d6e19ca6f7a6777f3 HepG2 ChIP-seq paired-ended 100 /files/ENCFF195VBJ/|/files/ENCFF807MUK/|/files/ENCFF594PZU/ alignments /experiments/ENCSR730TBC/ /biosample-types/cell_line_EFO_0001187/ 4831825733 released 2017-06-06T18:20:14.545786+00:00
48+
if __name__ == "__main__":
    '''Collect metadata and StrainID results to get detection stats on the cell line ENCODE data'''
    args = getParams()

    # Parse metadata: one row per ENCODE file; column 1 is the ENCFF
    # accession that names the matching StrainID output file.
    with open(args.metadata, 'r', encoding='utf-8') as reader:
        for mline in reader:
            mtokens = mline.strip().split('\t')
            # Skip blank/malformed rows instead of looking up an empty accession.
            if not mtokens or mtokens[0] == "":
                continue
            encff = mtokens[0]

            # Check a StrainID result exists for this accession
            id_file = join(args.input_dir, "%s_strain.tab" % encff)
            if not isfile(id_file):
                sys.stderr.write("%s: no results generated.\n" % (id_file))
                continue

            # Parse id file for cell line score info and sort strains by score
            called_strain = ""
            strain_info = parse_file(id_file)
            strain_sortbyscore = sorted(strain_info.keys(), key=lambda x: strain_info[x], reverse=True)
            # Assign the best-scoring strain as the call.
            # NOTE(review): original condition was len(...) > 1, which left
            # a file with exactly one scored strain uncalled; > 0 matches
            # the stated intent of "assign strain with best score".
            if len(strain_info) > 0:
                called_strain = strain_sortbyscore[0]

            # Write called strain with metadata (score defaults to "NaN"
            # when no strain was called).
            sys.stdout.write("%s\t%s\t%s\t%s\n" % (
                encff, called_strain,
                strain_info.get(called_strain, "NaN"),
                "\t".join(mtokens[3:])))

0 commit comments

Comments
 (0)