add scripts to tally simulation results

owlang · owlang · commit 367794986bb0 · 2021-10-21T12:37:32.000-04:00
For each experiment (strain x depth), parse out the StrainID scores and runtimes using the tally PBS script and the two helper Python scripts that parse the results.
diff --git a/paper/SyntheticStrain/job/tally_results.sh b/paper/SyntheticStrain/job/tally_results.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+#PBS -l nodes=1:ppn=4
+#PBS -l pmem=16gb
+#PBS -l walltime=02:00:00
+#PBS -A open
+#PBS -o logs/depth.tally.log.out
+#PBS -e logs/depth.tally.log.err
+
+# Tally StrainID scores and runtimes for every simulated (strain x depth) experiment.
+module load anaconda3
+source activate genopipe
+
+WRK=/path/to/GenoPipe/paper/SyntheticStrain
+# All paths below are relative to $WRK, so stop if it cannot be entered
+cd "$WRK" || exit 1
+
+TALLY=scripts/parse_simulation_results.py
+RUNTIME=scripts/parse_runtimes.py
+# tally_experiment GENOME STRAIN DEPTH: write <dir>_scores.txt and <dir>_runtimes.txt
+tally_experiment() {
+	local DIR="results/${1}_${2}_${3}"
+	echo "Tally for $DIR..."
+	python "$TALLY" -v "../db/${1}_VCF/" -i "$DIR/ID" > "${DIR}_scores.txt"
+	python "$RUNTIME" -i <(grep 'real' "$DIR"/ID/*.time) > "${DIR}_runtimes.txt"
+}
+
+# Parse hg19 results
+for STRAIN in "HELA" "K562"; do
+	for DEPTH in "1M" "2M" "5M" "10M" "20M"; do
+		tally_experiment hg19 "$STRAIN" "$DEPTH"
+	done
+done
+# Parse sacCer3 results
+for STRAIN in "CEN.PK2-1Ca" "RM11-1A"; do
+	for DEPTH in "10K" "50K" "100K" "500K" "1M" "2M"; do
+		tally_experiment sacCer3 "$STRAIN" "$DEPTH"
+	done
+done
diff --git a/paper/SyntheticStrain/scripts/parse_runtimes.py b/paper/SyntheticStrain/scripts/parse_runtimes.py
@@ -0,0 +1,24 @@
+import sys
+import re
+import argparse
+
+def getParams():
+	'''Build the CLI parser and return the parsed command-line arguments'''
+	cli = argparse.ArgumentParser(description='Parse runtimes from system and convert to seconds for plotting purposes.')
+	# Single required option: path to the file holding the grepped runtime lines
+	cli.add_argument('-i','--input-times', metavar='grep_time', required=True, help='the file "grepping" out the runtimes, `grep "real" input_dir/*.time`')
+	return cli.parse_args()
+
+def parse_time(time_string):
+	'''Write the first "<min>m<sec>s" runtime found in time_string to stdout as total seconds.'''
+	# Raw string: a plain literal containing "\." is an invalid escape sequence and warns on newer Pythons
+	minutes, seconds = re.findall(r"([0-9]+)m([\.0-9]+)s", time_string)[0]
+	sys.stdout.write("%f\n" % (float(seconds) + 60.0*int(minutes)))
+
+if __name__ == "__main__":
+	# Convert the second tab-separated field of every input line to seconds on stdout
+	args = getParams()
+	with open(args.input_times) as reader:  # handle is closed even if a line fails to parse
+		for line in reader:
+			tokens = line.strip().split("\t")
+			parse_time(tokens[1])
diff --git a/paper/SyntheticStrain/scripts/parse_simulation_results.py b/paper/SyntheticStrain/scripts/parse_simulation_results.py
@@ -0,0 +1,69 @@
+from os import listdir
+from os.path import isfile, join
+import sys
+import argparse
+
+# Python 3 needed for encoding feature for UTF-8
+# (ENCODE uses some capital delta chars in summary descriptions of GeneticModifications)
+
+def getParams():
+	'''Build the CLI parser and return the parsed command-line arguments'''
+	cli = argparse.ArgumentParser(description='Parse metadata file and GenoPipe output to check detection rates of the GenoPipe tool.')
+	# Both options are required: the StrainID output directory and the VCF db directory
+	cli.add_argument('-i','--input-dir', metavar='input_dir', required=True, help='the directory where all the StrainID output files were saved (*strain.tab)')
+	cli.add_argument('-v','--vcf-dir', metavar='vcf_dir', required=True, help='the directory where all the StrainID VCF db files are housed (for header formatting purposes)')
+	return cli.parse_args()
+
+# 	Simulation_11.bam
+# Y55.gatk.vcf	5.893664745278011
+# BY4741.gatk.vcf	4.3921009156813495
+# SEY6210.gatk.vcf	6.799588141350312
+# Sigma1278b-10560-6B.gatk.vcf	6.236837493072714
+# CEN.PK2-1Ca.gatk.vcf	8.360122378850173
+# D273-10B.gatk.vcf	6.24252455963084
+# RM11-1A.gatk.vcf	5.897733174105499
+# BY4742.gatk.vcf	4.57267316132317
+# FL100.gatk.vcf	6.480670542594632
+# X2180-1A.gatk.vcf	4.525367446544814
+# JK9-3d.gatk.vcf	6.523429642303322
+# W303.gatk.vcf	6.593092702562104
+def parse_file(var_file):
+	'''Return {strain: score string} parsed from one StrainID *_strain.tab file.'''
+	scores = {}  # renamed from `dict` so the builtin is no longer shadowed
+	with open(var_file,'r') as reader:  # handle is closed even if parsing raises
+		for line in reader:
+			tokens = line.strip().split("\t")
+			if len(tokens) == 1:  # header or blank line: no score column
+				continue
+			scores[tokens[0].split(".")[0]] = tokens[1]  # key = name up to the first "."
+	return scores
+
+
+if __name__ == "__main__":
+	# Tabulate per-simulation StrainID scores, one column per strain VCF in the db
+	args = getParams()
+
+	# Column order: sorted first "."-delimited token of every *.vcf filename
+	strain_keys = sorted(
+		fname.split(".")[0]
+		for fname in listdir(args.vcf_dir)
+		if fname.split(".")[-1] == "vcf"
+	)
+
+	# Header row
+	sys.stdout.write("#\t%s\n" % "\t".join(strain_keys))
+
+	# One row per simulation index 1..1000
+	for sindex in range(1,1001):
+		id_file = join(args.input_dir,"Simulation_%i_strain.tab" % sindex)
+		# Missing output is reported on stderr and the row is left out
+		if(not isfile(id_file)):
+			sys.stderr.write("%s: no results generated.\n" % (id_file))
+			continue
+
+		# Strain -> score lookup for this simulation
+		strain_info = parse_file(id_file)
+
+		# Strains absent from the file are shown as "-"
+		row = [strain_info.get(s,"-") for s in strain_keys]
+		sys.stdout.write( "Simulation_%i\t%s\n" % (sindex, "\t".join(row) ))