Skip to content

Commit d0149a0

Browse files
authored
Merge pull request #11 from CEGRcode/encode
Scripts to process ENCODE data and run StrainID. The bulk of the ENCODE analysis of StrainID is included in these commits.
2 parents 25d300c + 74fcbf2 commit d0149a0

14 files changed

Lines changed: 14704 additions & 8 deletions

paper/.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,6 @@ SyntheticStrain/logs/*.err-*
2626
SyntheticStrain/logs/*.out-*
2727
SyntheticStrain/results/sacCer3*
2828
SyntheticStrain/results/hg19*
29+
ENCODE_CellLines/results/BAM
30+
ENCODE_CellLines/results/BAM-nospike
31+
ENCODE_CellLines/results/ID

paper/ENCODEdata-CellLines/210512_sample_metadata.txt

Lines changed: 14260 additions & 0 deletions
Large diffs are not rendered by default.

paper/ENCODEdata-CellLines/README

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Run StrainID on ENCODE data and evaluate StrainID's performance
2+
3+
# Reference files
4+
ENCODE metadata was pulled on May 12, 2021 using the `scripts/get_metadata.py`
5+
script that pulls all Biosample accessions with classification="cell_line" and
6+
whose string matches one of the cell lines we have in our hg19_VCF database.
7+
These are used to pull File accessions with type=BAM and assembly=hg19. We did
8+
not filter by assay for this analysis. They are saved with all relevant metadata
9+
to the `210512_sample_metadata.txt` file.
10+
11+
Command used: python scripts/get_metadata.py > 210512_sample_metadata.txt
12+
13+
# Download ENCODE CellLine data and run through StrainID
14+
Use the 210512_sample_metadata.txt file with ENCODE accessions to download the
15+
data to the data directory.
16+
Then run the data through StrainID.
17+
18+
# Compare the results to the metadata information
19+
Evaluate the accuracy of StrainID on real data.
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
#!/bin/bash
#PBS -l nodes=1:ppn=6
#PBS -l pmem=24gb
#PBS -l walltime=05:00:00
#PBS -A open
#PBS -o logs/download.data.log.out
#PBS -e logs/download.data.log.err
#PBS -t 1-14260

# Download one ENCODE BAM per array task, verify its md5 checksum, and
# index it with samtools. The PBS array index selects which line of the
# metadata file describes the file to fetch.

module load gcc/9.3.1
module load samtools
module load anaconda3
source activate genopipe

WRK=/path/to/GenoPipe/paper/ENCODE-CellLines
cd "$WRK" || { echo "Cannot cd to $WRK" >&2; exit 1; }

[ -d logs ] || mkdir logs
[ -d results/BAM ] || mkdir -p results/BAM

# Pull the metadata line for this array task:
#   column 1 = ENCFF accession, column 2 = download href, column 3 = md5sum
METADATA=210512_sample_metadata.txt
INFO=$(sed "${PBS_ARRAYID}q;d" "$METADATA")
ENCFF=$(echo "$INFO" | awk '{print $1}')
#echo "$INFO"

cd results/BAM || exit 1
BAM=$ENCFF.bam

# ENCODE data download
HREF=$(echo "$INFO" | awk '{print $2}')
echo "Fetching from https://www.encodeproject.org$HREF"
wget "https://www.encodeproject.org$HREF" || { echo "($PBS_ARRAYID) download failed for $ENCFF" >&2; exit 1; }

# Checksum of resulting BAM. Compare the hash field exactly (the old
# regex substring match would also accept a partial hash), and exit
# non-zero on mismatch so the scheduler records the task as failed.
MD5SUM=$(echo "$INFO" | awk '{print $3}')
if [[ $(md5sum "$BAM" | awk '{print $1}') == "$MD5SUM" ]]; then
	echo "($PBS_ARRAYID) $BAM passed."
else
	echo "($PBS_ARRAYID) $BAM md5checksum failed!" >&2
	rm -f -- "$BAM"
	exit 1
fi

# Index BAM file
samtools index "$BAM"
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#!/bin/bash
#PBS -l nodes=1:ppn=6
#PBS -l pmem=24gb
#PBS -l walltime=01:00:00
#PBS -A open
#PBS -o logs/filter.count.log.out
#PBS -e logs/filter.count.log.err
#PBS -t 1-14260

# For one ENCODE BAM per array task: strip spike-in reads, index the
# filtered BAM, and estimate bases sequenced for both the original and
# the filtered alignments.

module load gcc/9.3.1
module load samtools
module load anaconda3
source activate genopipe

WRK=/path/to/GenoPipe/paper/ENCODE-CellLines
cd "$WRK" || { echo "Cannot cd to $WRK" >&2; exit 1; }

[ -d logs ] || mkdir logs
[ -d results/BAM-nospike ] || mkdir -p results/BAM-nospike

# Pull the ENCFF accession for this array task from the metadata file.
METADATA=210512_sample_metadata.txt
INFO=$(sed "${PBS_ARRAYID}q;d" "$METADATA")
ENCFF=$(echo "$INFO" | awk '{print $1}')
#echo "$INFO"

BAM=results/BAM/$ENCFF
NOSPIKE=results/BAM-nospike/$ENCFF

# Guard: the download job must have produced the input BAM first;
# previously a missing input silently produced empty/partial outputs.
if [ ! -f "$BAM.bam" ]; then
	echo "($PBS_ARRAYID) $BAM.bam does not exist. Exiting." >&2
	exit 1
fi

# Strip Spike-in
python scripts/filter_spikein.py -b "$BAM.bam" -g ../input/hg19.fa -o "$NOSPIKE.bam"
samtools index "$NOSPIKE.bam"

# Count Bases (only for BAMs that were indexed successfully).
# ${VAR}_ replaces the old "\_" escaping trick for the same filenames.
[ -f "$BAM.bam.bai" ] && python scripts/estimate_bp_sequenced.py -b "$BAM.bam" > "${BAM}_bpcount.txt"
[ -f "$NOSPIKE.bam.bai" ] && python scripts/estimate_bp_sequenced.py -b "$NOSPIKE.bam" > "${NOSPIKE}_bpcount.txt"
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
#!/bin/bash
#PBS -l nodes=1:ppn=6
#PBS -l pmem=24gb
#PBS -l walltime=01:00:00
#PBS -A open
#PBS -o logs/sid.log.out
#PBS -e logs/sid.log.err
#PBS -t 1-14260

# Run StrainID on one spike-in-filtered ENCODE BAM per array task and
# record both the StrainID output (.std) and its wall-clock time (.time).

module load gcc/8.3.1
module load bedtools/2.27.1
module load bwa/0.7.15
module load samtools/1.5
module load anaconda3
source activate genopipe

WRK=/path/to/GenoPipe/paper/ENCODE-CellLines
cd "$WRK" || { echo "Cannot cd to $WRK" >&2; exit 1; }

# Store directory paths
DATABASE=$WRK/../db/hg19_VCF
GENOME=$WRK/../input/hg19.fa
SEED=$PBS_ARRAYID
GENOPIPE=$WRK/../..
BAM=$WRK/results/BAM-nospike
ID=$WRK/results/ID

[ -d logs ] || mkdir logs
[ -d "$ID" ] || mkdir -p "$ID"

# Parse metadata: column 1 of the selected line is the ENCFF accession.
METADATA=210512_sample_metadata.txt
INFO=$(sed "${PBS_ARRAYID}q;d" "$METADATA")
ENCFF=$(echo "$INFO" | awk '{print $1}')
#echo "$INFO"

# Check that BAM file was generated first; exit non-zero so the
# scheduler flags the task instead of reporting success.
if [ ! -f "$BAM/$ENCFF.bam" ]; then
	echo "BAM input for $BAM/$ENCFF does not exist. Exiting." >&2
	exit 1
fi

# Check that BAM Index file exists
if [ ! -f "$BAM/$ENCFF.bam.bai" ]; then
	echo "BAI missing for $ENCFF. Exiting." >&2
	exit 1
fi

# Set-up Temp directory holding symlinks to only this task's BAM so
# StrainID sees a single input file. -f lets a re-run replace stale links.
TEMP=$WRK/temp-$PBS_ARRAYID
[ -d "$TEMP" ] || mkdir "$TEMP"
cd "$TEMP" || exit 1
echo "$BAM"
ln -sf "$BAM/$ENCFF.bam"
ln -sf "$BAM/$ENCFF.bam.bai"

## Execute Single StrainID and record time
cd "$GENOPIPE/StrainID" || exit 1
echo "**Begin executing StrainID for ${ENCFF}..."
{ time bash identify-Strain.sh -i "$TEMP" -g "$GENOME" -v "$DATABASE" -s "$SEED" -o "$ID" > "$ID/$ENCFF.std" ; } 2> "$ID/$ENCFF.time"
echo "...single StrainID for ($PBS_ARRAYID) ${ENCFF} finished."
cd "$WRK" || exit 1

## Clean-up
rm -r -- "$TEMP"
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#!/bin/bash
#PBS -l nodes=1:ppn=6
#PBS -l pmem=24gb
#PBS -l walltime=03:00:00
#PBS -A open
#PBS -o logs/tally.log.out
#PBS -e logs/tally.log.err

# Aggregate per-file StrainID calls against the ENCODE metadata and
# write one summary table for the whole cell-line analysis.

module load anaconda3
source activate genopipe

WRK=/path/to/GenoPipe/paper/ENCODE-CellLines
# Guard the cd: previously a failed cd would silently run the analysis
# from the submission directory and write output to the wrong place.
cd "$WRK" || { echo "Cannot cd to $WRK" >&2; exit 1; }

# Compile StrainID results
python scripts/analyze_encode_results.py -i results/ID/ -m 210512_sample_metadata.txt > results/encode_cell_line_results.txt
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# logfiles from STDERR and STDOUT of running job files go here
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# Downloaded BAM files and StrainID results go here
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
from os import listdir
2+
from os.path import isfile, join
3+
import sys
4+
import argparse
5+
6+
# Python 3 needed for encoding feature for UTF-8
7+
# (ENCODE uses some capital delta chars in summary descriptions of GeneticModifications)
8+
9+
def getParams():
    '''Build the argument parser and return the parsed command line.'''
    parser = argparse.ArgumentParser(
        description='Parse metadata file and GenoPipe output to check detection rates of the GenoPipe tool.')
    parser.add_argument(
        '-m', '--metadata', metavar='metadata_fn', required=True,
        help='the metadata file downloaded with ENCODE dataset that includes info like PE/SE, cell line, assay type, and read lengths/SE-PE')
    parser.add_argument(
        '-i', '--input-dir', metavar='input_dir', required=True,
        help='the directory where all the EpitopeID output files were saved (*strain.tab)')
    return parser.parse_args()
17+
18+
# ENCFF000DZC.bam
19+
#LnCap.vcf -5.082117812158647
20+
#MCF7.vcf -6.1143012059424935
21+
#SKnSH.vcf -5.7641601741217645
22+
#HepG2.vcf -5.595833186705702
23+
#K562.vcf 1.8812984639660986
24+
#A549.vcf -6.059318584695944
25+
#HCT116.vcf -4.847450343904915
26+
#HELA.vcf -4.906670711358038
27+
def parse_file(var_file):
    '''Parse a StrainID *_strain.tab output file into a dict mapping
    strain name (VCF basename without extension) to its score.

    "inf" scores are mapped to +500000 and "nan" scores to -500000 so
    strains sort numerically with inf always winning and nan always losing.

    Fixes vs. original: lines without a tab-separated score column (e.g.
    the "ENCFF000DZC.bam" header line and blank lines) previously raised
    IndexError; non-numeric scores raised ValueError. Both are now
    skipped. Also avoids shadowing the builtin `dict` and closes the
    file via a context manager.
    '''
    scores = {}
    with open(var_file, 'r') as reader:
        for line in reader:
            tokens = line.split("\t")
            # Skip header/blank/malformed lines lacking a score column.
            if tokens[0] == "" or len(tokens) < 2:
                continue
            raw = tokens[1].strip()
            if raw.lower() == "inf":
                score = 500000
            elif raw.lower() == "nan":
                score = -500000
            else:
                try:
                    score = float(raw)
                except ValueError:
                    # Non-numeric score field: skip rather than crash.
                    continue
            # Key is the strain name without the ".vcf" extension.
            scores[tokens[0].split(".")[0]] = score
    return scores
44+
45+
#ENCFF364CPX /files/ENCFF364CPX/@@download/ENCFF364CPX.bam 79651f67b1c4d564395c18be9cdff62f HeLa-S3 ChIP-seq single-ended 36 /files/ENCFF807MUK/|/files/ENCFF000BAO/ unfiltered alignments /experiments/ENCSR000AOB/ /biosample-types/cell_line_EFO_0002791/ 1884977463 released 2020-02-18T20:47:36.519163+00:00
46+
#ENCFF325UJS /files/ENCFF325UJS/@@download/ENCFF325UJS.bam 042b20b3e149df6c1f4e5c95f83653ee HepG2 ChIP-seq single-ended 36 /files/ENCFF807MUK/|/files/ENCFF000BGR/ alignments /experiments/ENCSR000AOM/ /biosample-types/cell_line_EFO_0001187/ 978259147 released 2020-02-18T09:15:21.891603+00:00
47+
#ENCFF821WQW /files/ENCFF821WQW/@@download/ENCFF821WQW.bam e9c6eeedee7dc41d6e19ca6f7a6777f3 HepG2 ChIP-seq paired-ended 100 /files/ENCFF195VBJ/|/files/ENCFF807MUK/|/files/ENCFF594PZU/ alignments /experiments/ENCSR730TBC/ /biosample-types/cell_line_EFO_0001187/ 4831825733 released 2017-06-06T18:20:14.545786+00:00
48+
if __name__ == "__main__":
    '''Collect metadata and StrainID results to get detection stats on the cell line ENCODE data'''
    args = getParams()

    # Parse metadata: one row per ENCODE file; column 1 is the ENCFF
    # accession that names the matching StrainID output file.
    with open(args.metadata, 'r', encoding='utf-8') as reader:
        for mline in reader:
            mtokens = mline.strip().split('\t')
            # Skip blank/malformed rows instead of looking up an empty accession.
            if not mtokens or mtokens[0] == "":
                continue
            encff = mtokens[0]

            # Check a StrainID result exists for this accession
            id_file = join(args.input_dir, "%s_strain.tab" % encff)
            if not isfile(id_file):
                sys.stderr.write("%s: no results generated.\n" % (id_file))
                continue

            # Parse id file for cell line score info and sort strains by score
            called_strain = ""
            strain_info = parse_file(id_file)
            strain_sortbyscore = sorted(strain_info.keys(), key=lambda x: strain_info[x], reverse=True)
            # Assign the best-scoring strain as the call.
            # NOTE(review): original condition was len(...) > 1, which left
            # a file with exactly one scored strain uncalled; > 0 matches
            # the stated intent of "assign strain with best score".
            if len(strain_info) > 0:
                called_strain = strain_sortbyscore[0]

            # Write called strain with metadata (score defaults to "NaN"
            # when no strain was called).
            sys.stdout.write("%s\t%s\t%s\t%s\n" % (
                encff, called_strain,
                strain_info.get(called_strain, "NaN"),
                "\t".join(mtokens[3:])))

0 commit comments

Comments
 (0)