Skip to content

Commit 3677949

Browse files
committed
add scripts to tally simulation results
For each experiment (strain x depth), parse out the StrainID scores and runtimes using the tally PBS script and the two helper python scripts for parsing the results.
1 parent ed27182 commit 3677949

3 files changed

Lines changed: 131 additions & 0 deletions

File tree

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
#!/bin/bash
#PBS -l nodes=1:ppn=4
#PBS -l pmem=16gb
#PBS -l walltime=02:00:00
#PBS -A open
#PBS -o logs/depth.tally.log.out
#PBS -e logs/depth.tally.log.err

# Tally StrainID scores and runtimes for every simulated experiment
# (strain x sequencing depth), for both the hg19 and sacCer3 result sets.

module load anaconda3
source activate genopipe

WRK=/path/to/GenoPipe/paper/SyntheticStrain
cd ${WRK}

TALLY=scripts/parse_simulation_results.py
RUNTIME=scripts/parse_runtimes.py

# tally_dir <results-dir> <vcf-db-dir>
# Writes <results-dir>_scores.txt and <results-dir>_runtimes.txt next to
# the results directory.
tally_dir () {
	local DIR=$1
	local VCF=$2
	echo "Tally for $DIR..."
	python ${TALLY} -v ${VCF} -i ${DIR}/ID > ${DIR}_scores.txt
	# Process substitution feeds the grep'd `time` output to the parser
	# as a readable pseudo-file.
	python ${RUNTIME} -i <(grep 'real' ${DIR}/ID/*.time) > ${DIR}_runtimes.txt
}

# Parse hg19 results
for STRAIN in HELA K562; do
	for DEPTH in 1M 2M 5M 10M 20M; do
		tally_dir results/hg19_${STRAIN}_${DEPTH} ../db/hg19_VCF/
	done
done

# Parse sacCer3 results
for STRAIN in CEN.PK2-1Ca RM11-1A; do
	for DEPTH in 10K 50K 100K 500K 1M 2M; do
		tally_dir results/sacCer3_${STRAIN}_${DEPTH} ../db/sacCer3_VCF/
	done
done
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
import sys
2+
import re
3+
import argparse
4+
5+
def getParams():
    '''Build the CLI and return the parsed arguments namespace.'''
    cli = argparse.ArgumentParser(
        description='Parse runtimes from system and convert to seconds for plotting purposes.')
    cli.add_argument(
        '-i', '--input-times', metavar='grep_time', required=True,
        help='the file "grepping" out the runtimes, `grep "real" input_dir/*.time`')
    return cli.parse_args()
11+
12+
def parse_time(time_string):
    '''Convert a shell `time` duration token such as "3m12.345s" to seconds.

    Writes the total seconds to stdout (one "%f" value per line, for
    downstream plotting) and also returns it as a float.

    Raises ValueError if no "<minutes>m<seconds>s" token is present
    (previously this surfaced as an opaque IndexError).
    '''
    # Raw string for the regex; '.' inside a character class needs no escape.
    match = re.search(r"([0-9]+)m([.0-9]+)s", time_string)
    if match is None:
        raise ValueError("unrecognized time string: %s" % time_string)
    minutes = int(match.group(1))
    seconds = float(match.group(2))
    total = seconds + 60.0 * minutes
    sys.stdout.write("%f\n" % total)
    return total
17+
18+
if __name__ == "__main__":
    # Entry point: read grep'd `time` output and emit one seconds value
    # per line for plotting.
    args = getParams()
    # `with` guarantees the handle is closed even if a line fails to parse
    # (the original open()/close() pair leaked the handle on exceptions).
    with open(args.input_times) as reader:
        for line in reader:
            # Each grep line looks like "<file>:real<TAB><min>m<sec>s";
            # the duration token is the second tab-separated field.
            tokens = line.strip().split("\t")
            parse_time(tokens[1])
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
from os import listdir
2+
from os.path import isfile, join
3+
import sys
4+
import argparse
5+
6+
# Python 3 needed for encoding feature for UTF-8
7+
# (ENCODE uses some capital delta chars in summary descriptions of GeneticModifications)
8+
9+
def getParams():
    '''Build the CLI and return the parsed arguments namespace.'''
    cli = argparse.ArgumentParser(
        description='Parse metadata file and GenoPipe output to check detection rates of the GenoPipe tool.')
    cli.add_argument(
        '-i', '--input-dir', metavar='input_dir', required=True,
        help='the directory where all the StrainID output files were saved (*strain.tab)')
    cli.add_argument(
        '-v', '--vcf-dir', metavar='vcf_dir', required=True,
        help='the directory where all the StrainID VCF db files are housed (for header formatting purposes)')
    return cli.parse_args()
16+
17+
# Simulation_11.bam
18+
# Y55.gatk.vcf 5.893664745278011
19+
# BY4741.gatk.vcf 4.3921009156813495
20+
# SEY6210.gatk.vcf 6.799588141350312
21+
# Sigma1278b-10560-6B.gatk.vcf 6.236837493072714
22+
# CEN.PK2-1Ca.gatk.vcf 8.360122378850173
23+
# D273-10B.gatk.vcf 6.24252455963084
24+
# RM11-1A.gatk.vcf 5.897733174105499
25+
# BY4742.gatk.vcf 4.57267316132317
26+
# FL100.gatk.vcf 6.480670542594632
27+
# X2180-1A.gatk.vcf 4.525367446544814
28+
# JK9-3d.gatk.vcf 6.523429642303322
29+
# W303.gatk.vcf 6.593092702562104
30+
def parse_file(var_file):
    '''Parse one StrainID *_strain.tab output file into a score lookup.

    Data lines look like "<strain>.gatk.vcf<TAB><score>"; lines with no
    tab (e.g. the leading BAM filename) are skipped.

    Returns a dict mapping strain name (VCF filename up to the first ".")
    to its score string.

    NOTE(review): strain names that themselves contain "." (e.g.
    CEN.PK2-1Ca) are truncated at the first dot. This matches how
    strain_keys are derived in __main__, so lookups still align, but it
    could collide if two strains share a prefix — confirm intended.
    '''
    scores = {}  # renamed from `dict`, which shadowed the builtin
    with open(var_file, 'r') as reader:
        for line in reader:
            tokens = line.strip().split("\t")
            if len(tokens) == 1:
                # Header/filename lines carry no tab-separated score.
                continue
            scores[tokens[0].split(".")[0]] = tokens[1]
    return scores
40+
41+
42+
if __name__ == "__main__":
    # Tally StrainID scores across all simulation replicates into one
    # tab-delimited table (rows: simulations, columns: database strains).
    args = getParams()

    # Column order: one column per strain VCF in the database, sorted by
    # name. The suffix test mirrors the original split(".")[-1] check.
    strain_keys = sorted(
        fn.split(".")[0]
        for fn in listdir(args.vcf_dir)
        if fn.split(".")[-1] == "vcf"
    )

    # Header row
    sys.stdout.write("#\t%s\n" % "\t".join(strain_keys))

    # One row per simulation; missing result files are reported on stderr
    # and skipped.
    for sindex in range(1, 1001):
        id_file = join(args.input_dir, "Simulation_%i_strain.tab" % sindex)
        if not isfile(id_file):
            sys.stderr.write("%s: no results generated.\n" % (id_file))
            continue
        # Strain -> score strings for this replicate; "-" marks a strain
        # absent from the output.
        strain_info = parse_file(id_file)
        row = [strain_info.get(s, "-") for s in strain_keys]
        sys.stdout.write("Simulation_%i\t%s\n" % (sindex, "\t".join(row)))

0 commit comments

Comments
 (0)