Skip to content

Commit 74fcbf2

Browse files
committed
add scripts to tally results
This commit includes both a python script for parsing StrainID results and a PBS script for calling the script on the `results/ID` directory and metadata file. The results are tallied such that the best score and strain associated for that score are reported for each sample.
1 parent df17e26 commit 74fcbf2

2 files changed

Lines changed: 92 additions & 0 deletions

File tree

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#!/bin/bash
2+
#PBS -l nodes=1:ppn=6
3+
#PBS -l pmem=24gb
4+
#PBS -l walltime=03:00:00
5+
#PBS -A open
6+
#PBS -o logs/tally.log.out
7+
#PBS -e logs/tally.log.err
8+
9+
module load anaconda3
10+
source activate genopipe
11+
12+
WRK=/path/to/GenoPipe/paper/ENCODE-CellLines
13+
cd $WRK
14+
15+
# Compile StrainID results
16+
python scripts/analyze_encode_results.py -i results/ID/ -m 210512_sample_metadata.txt > results/encode_cell_line_results.txt
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
from os import listdir
2+
from os.path import isfile, join
3+
import sys
4+
import argparse
5+
6+
# Python 3 needed for encoding feature for UTF-8
7+
# (ENCODE uses some capital delta chars in summary descriptions of GeneticModifications)
8+
9+
def getParams():
10+
'''Parse parameters from the command line'''
11+
parser = argparse.ArgumentParser(description='Parse metadata file and GenoPipe output to check detection rates of the GenoPipe tool.')
12+
parser.add_argument('-m','--metadata', metavar='metadata_fn', required=True, help='the metadata file downloaded with ENCODE dataset that includes info like PE/SE, cell line, assay type, and read lengths/SE-PE')
13+
parser.add_argument('-i','--input-dir', metavar='input_dir', required=True, help='the directory where all the EpitopeID output files were saved (*strain.tab)')
14+
15+
args = parser.parse_args()
16+
return(args)
17+
18+
# ENCFF000DZC.bam
19+
#LnCap.vcf -5.082117812158647
20+
#MCF7.vcf -6.1143012059424935
21+
#SKnSH.vcf -5.7641601741217645
22+
#HepG2.vcf -5.595833186705702
23+
#K562.vcf 1.8812984639660986
24+
#A549.vcf -6.059318584695944
25+
#HCT116.vcf -4.847450343904915
26+
#HELA.vcf -4.906670711358038
27+
def parse_file(var_file):
28+
dict = {}
29+
reader = open(var_file,'r')
30+
for line in reader:
31+
tokens = line.split("\t")
32+
if(tokens[0]==""):
33+
continue
34+
score = float(tokens[1].strip())
35+
if(tokens[1].strip().lower()=="inf"):
36+
score = 500000
37+
elif(tokens[1].strip().lower()=="nan"):
38+
score = -500000
39+
40+
# update dict
41+
dict[tokens[0].split(".")[0]] = score
42+
reader.close()
43+
return(dict)
44+
45+
#ENCFF364CPX /files/ENCFF364CPX/@@download/ENCFF364CPX.bam 79651f67b1c4d564395c18be9cdff62f HeLa-S3 ChIP-seq single-ended 36 /files/ENCFF807MUK/|/files/ENCFF000BAO/ unfiltered alignments /experiments/ENCSR000AOB/ /biosample-types/cell_line_EFO_0002791/ 1884977463 released 2020-02-18T20:47:36.519163+00:00
46+
#ENCFF325UJS /files/ENCFF325UJS/@@download/ENCFF325UJS.bam 042b20b3e149df6c1f4e5c95f83653ee HepG2 ChIP-seq single-ended 36 /files/ENCFF807MUK/|/files/ENCFF000BGR/ alignments /experiments/ENCSR000AOM/ /biosample-types/cell_line_EFO_0001187/ 978259147 released 2020-02-18T09:15:21.891603+00:00
47+
#ENCFF821WQW /files/ENCFF821WQW/@@download/ENCFF821WQW.bam e9c6eeedee7dc41d6e19ca6f7a6777f3 HepG2 ChIP-seq paired-ended 100 /files/ENCFF195VBJ/|/files/ENCFF807MUK/|/files/ENCFF594PZU/ alignments /experiments/ENCSR730TBC/ /biosample-types/cell_line_EFO_0001187/ 4831825733 released 2017-06-06T18:20:14.545786+00:00
48+
if __name__ == "__main__":
49+
'''Collect metadata and StrainID results to get detection stats on the cell line ENCODE data'''
50+
args = getParams()
51+
52+
# Parse metadata
53+
reader = open(args.metadata, 'r', encoding='utf-8')
54+
for mline in reader:
55+
# Pull relevant info from metadata tokens
56+
mtokens = mline.strip().split('\t')
57+
encff = mtokens[0]
58+
59+
# Check file exists
60+
id_file = join(args.input_dir,"%s_strain.tab" % encff)
61+
if(not isfile(id_file)):
62+
sys.stderr.write("%s: no results generated.\n" % (id_file))
63+
continue
64+
65+
# Initialize id file variables to save
66+
called_strain = ""
67+
# Parse id file for cell line score info and sort strains by score
68+
strain_info = parse_file(id_file)
69+
strain_sortbyscore = sorted( strain_info.keys(), key=lambda x: (strain_info[x]), reverse=True)
70+
# Assign strain with best score to called_strain
71+
if(len(strain_info.keys())>1):
72+
called_strain = strain_sortbyscore[0]
73+
74+
# Write called strain with metadata
75+
sys.stdout.write( "%s\t%s\t%s\t%s\n" % (encff, called_strain, strain_info.get(called_strain,"NaN"), "\t".join(mtokens[3:])) )
76+
reader.close()

0 commit comments

Comments
 (0)