add scripts to create final figure panels

OLIVIA LANG · OLIVIA LANG · commit e13d7b67b26d · 2021-05-20T17:57:48.000-04:00
A separate heatmap is generated for each of four groups of samples: those identified by both the Puddu and DeletionID methods, by neither, by only DeletionID, or by only Puddu. These heatmaps are made from CDT files that concatenate the per-sample CDT rows generated by the "04" script. Traces marking the ORF boundaries are also generated, using a version of ScriptManager that supports colors with an alpha channel.

Figures 3E and 3F are genome browser shots that zoom in on some of the coverage rows from the heatmap, highlighting various cases where DeletionID did not identify the expected gene knockout.
diff --git a/paper/YKOC-wgs/fig3e_config.txt b/paper/YKOC-wgs/fig3e_config.txt
@@ -0,0 +1,6 @@
+FLANKING	200
+LOCI	APE3	VAC17	PIR3
+SAMPLES	ERS838258	ERS903113	ERS969760	ERS1076728
+S_LABEL	APE3	VAC17	PIR3	WT-1
+S_COLOR	forestgreen	firebrick	salmon	gray
+S_MAX_Y	200	150	10	150
diff --git a/paper/YKOC-wgs/fig3f_config.txt b/paper/YKOC-wgs/fig3f_config.txt
@@ -0,0 +1,6 @@
+FLANKING	200
+LOCI	SWT1	PNS1
+SAMPLES	ERS1076684	ERS1076685	ERS1029415	ERS1029416	ERS1076728
+S_LABEL	del_SWT1_rep1	del_SWT1_rep2	del_PNS1_rep1	del_PNS1_rep2	WT-1
+S_COLOR	royalblue	cornflowerblue	goldenrod	gold	gray
+S_MAX_Y	150	150	150	150	150
diff --git a/paper/YKOC-wgs/job/05_make_figs.pbs b/paper/YKOC-wgs/job/05_make_figs.pbs
@@ -0,0 +1,71 @@
+#!/bin/bash
+#PBS -l nodes=1:ppn=4
+#PBS -l pmem=16gb
+#PBS -l walltime=05:00:00
+#PBS -A open
+#PBS -o logs/make.figs.log.out
+#PBS -e logs/make.figs.log.err
+
+# For each sample partition, concatenate the per-sample CDT rows into one sorted
+# CDT and write the matching trace CDT; then draw the Figure 3E/3F subplot SVGs.
+
+#1-9010
+#bfp2_h_g_sc_default
+#bfp2_j_g_bc_default
+
+module load gcc/8.3.1
+module load samtools/1.5
+module load bedtools/2.27.1
+module load anaconda3
+source activate genopipe
+
+# Placeholder path; the second assignment (the author's cluster path) is the one used
+WRK=/path/to/GenoPipe/paper/YKOC-wgs
+WRK=/storage/home/owl5022/scratch/GenotypingProject/GenoPipe/paper/YKOC-wgs
+# Abort if the directory is missing so the relative rm/cat/sort below never run elsewhere
+cd "$WRK" || exit 1
+mkdir -p logs results/figs
+
+for PARTITION in "TentativePass" "TrueFails" "TentativeFails" "ConfidentPass";
+do
+	BASE=results/$PARTITION
+	# -f: no error when there is no leftover temp file from an earlier run
+	rm -f $BASE.temp
+	METADATA=results/$PARTITION.txt
+	LNCT=`wc -l $METADATA | awk '{print $1}'`
+	for ((INDEX=1; INDEX<=$LNCT; INDEX++));
+	do
+		# Fields 10 and 7 of metadata line INDEX give the ERS and ORF used in the row filename
+		ERS=`cut -d $'\t' -f10 $METADATA | sed "${INDEX}q;d"`
+		ORF=`cut -d $'\t' -f7 $METADATA | sed "${INDEX}q;d"`
+		echo "($INDEX) Parsed out ERS=$ERS and ORF=$ORF"
+		ROW=results/BedGraphs/$ERS\_$ORF\_6000bp.cdt
+		cat $ROW >> $BASE.temp
+	done
+
+	sort -nk3,3 $BASE.temp > $BASE.cdt
+
+	# Make Heatmap PNG
+	##ScriptManager Two-color heatmap
+	# Black, .95 threshold
+	# $BASE.cdt > $BASE.png
+
+	# Make Trace CDT
+	TRACE=scripts/make_trace.py
+	python $TRACE -c $BASE.cdt -g saccharomyces_cerevisiae.gff -w 6000 > $BASE\_trace.cdt
+
+	rm -f $BASE.temp
+done
+
+# Make trace (x4 lines)
+##Script to make trace cdt
+##ScriptManager Three-color heatmap
+# diff colors, set alpha channel, threshold 0<0<.95
+
+SUBPLOTS=scripts/make_subplots.py
+
+# Figure 3E SVG image
+python $SUBPLOTS -t Figure3E -c fig3e_config.txt -g saccharomyces_cerevisiae.gff
+
+# Figure 3F SVG image
+python $SUBPLOTS -t Figure3F -c fig3f_config.txt -g saccharomyces_cerevisiae.gff
diff --git a/paper/YKOC-wgs/scripts/make_subplots.py b/paper/YKOC-wgs/scripts/make_subplots.py
@@ -0,0 +1,135 @@
+from os import listdir
+from os.path import isfile, join
+import sys
+import re
+import argparse
+import matplotlib.pyplot as plt
+import numpy as np
+
+# Python 3.6+
+# relies on dict insertion order
+
+# Check Matplotlib colors when building your config files: https://matplotlib.org/stable/gallery/color/named_colors.html
+
+# Roman-numeral chromosome names (as read from the GFF) -> arabic-numbered names (as matched in BedGraphs)
+_ROMAN_NUMERALS = ["I","II","III","IV","V","VI","VII","VIII","IX","X","XI","XII","XIII","XIV","XV","XVI"]
+roman2arabic = {"chr"+numeral:"chr%i" % (rank)
+			for rank, numeral in enumerate(_ROMAN_NUMERALS, start=1)}
+
+def getParams():
+	'''Define the command-line options of the subplot figure script and return the parsed arguments.'''
+	cli = argparse.ArgumentParser(description='')
+	# Required: figure title, subplot config file, and feature GFF
+	cli.add_argument('-t','--title', metavar='figure_title', dest='title', required=True, help='')
+	cli.add_argument('-c','--config', metavar='config_fn', dest='config_fn', required=True, help='the config file for a grid-organized subplot')
+	cli.add_argument('-g','--features-gff', metavar='features_gff', dest='features_gff', required=True, help='the featuer GFF file from SGD to get the gene coordinates')
+	# Optional; no action is given, so argparse expects a value after -d/--header
+	cli.add_argument('-d','--header', dest='header', default=False, required=False, help='skip first line as column header')
+	parsed = cli.parse_args()
+	return(parsed)
+
+def get_bedgraph_info(bedgraph_file, locus_coord, flanking):
+	'''Collect per-base values over a locus plus flanking bases from a BedGraph file.
+
+	bedgraph_file -- path to a tab-delimited file of chrom, start, end, integer value
+	locus_coord   -- (chrom, start, end); chrom must be a key of roman2arabic
+	flanking      -- number of bases added on each side of the locus
+	Returns {"X": positions, "Y": values (0 where no record overlaps), "chrom": locus_coord[0]}.
+	'''
+	# BedGraph rows are matched against the arabic-numbered chromosome name
+	chrom = roman2arabic[locus_coord[0]]
+	lo, hi = locus_coord[1]-flanking, locus_coord[2]+flanking
+	x_vector = list(range(lo,hi))
+	y_vector = [0] * len(x_vector)
+	with open(bedgraph_file,'r') as reader:
+		for line in reader:
+			tokens = line.strip().split('\t')
+			if(tokens[0]!=chrom):
+				continue
+			# Clamp the record to the window and fill by offset instead of searching x_vector
+			first, last = max(int(tokens[1]),lo), min(int(tokens[2]),hi)
+			if(first<last):
+				y_vector[first-lo:last-lo] = [int(tokens[3])] * (last-first)
+	return({"X":x_vector,"Y":y_vector,"chrom":locus_coord[0]})
+
+def parse_configs(configs_fn):
+	'''Parse a tab-delimited subplot config file into a dict keyed by each line's first token.
+	FLANKING -> int, S_MAX_Y -> list of int, other keys -> list of str. N_SAMPLES and N_LOCI are
+	added from the SAMPLES and LOCI lengths. Exits if a per-sample field has the wrong length.'''
+	subplot_configs = {}
+	with open(configs_fn,'r') as reader:
+		for line in reader:
+			tokens = line.strip().split('\t')
+			if(tokens[0]=="FLANKING"):
+				subplot_configs[tokens[0]] = int(tokens[1])
+			elif(tokens[0]=="S_MAX_Y"):
+				subplot_configs[tokens[0]] = [ int(i) for i in tokens[1:] ]
+			else:
+				subplot_configs[tokens[0]] = tokens[1:]
+	# Count subplot dimensions
+	subplot_configs.update({"N_SAMPLES":len(subplot_configs["SAMPLES"]), "N_LOCI":len(subplot_configs["LOCI"])})
+	# Validate configs; key is a string, so it is formatted with %s (%i raised TypeError here)
+	for key in ["S_LABEL","S_COLOR","S_MAX_Y"]:
+		if(len(subplot_configs[key])!=subplot_configs["N_SAMPLES"]):
+			sys.stderr.write("Mismatch in number of samples with %s field. Exiting...\n" % (key))
+			sys.exit(1)
+	return(subplot_configs)
+
+def parse_gff(gff_fn, loci_list):
+	'''Return {gene_name: (chrom, start, end)} for the genes in loci_list found in a GFF file.
+	Coordinates come from GFF columns 1, 4, and 5, with the start shifted down by one. Lines
+	starting with "#" are skipped and parsing stops at the first line starting with ">".'''
+	locus2coord = {}
+	with open(gff_fn,'r') as reader:
+		for line in reader:
+			if(line.startswith("#")):
+				continue
+			if(line.startswith(">")):
+				break
+			tokens = line.strip().split('\t')
+			# Skip truncated lines, and mRNA/CDS records as make_trace.py's parse_gff does
+			if(len(tokens)<9 or tokens[2] in ["mRNA","CDS"]):
+				continue
+			gene_name = next((f.split('=')[1] for f in tokens[8].split(';') if f.startswith("gene=")), "")
+			if(gene_name in loci_list):
+				locus2coord[gene_name] = (tokens[0],int(tokens[3])-1,int(tokens[4]))
+	return(locus2coord)
+
+if __name__ == "__main__":
+	# Draw a samples-by-loci grid of filled coverage plots and save it as <title>.svg
+	args = getParams()
+	CONFIGS = parse_configs(args.config_fn)
+	LOCUS2COORD = parse_gff(args.features_gff, CONFIGS["LOCI"])
+	FLANK = CONFIGS["FLANKING"]
+
+	# squeeze=False keeps asx two-dimensional even with a single sample or a single locus
+	fig, asx = plt.subplots(CONFIGS["N_SAMPLES"],CONFIGS["N_LOCI"],squeeze=False)
+	fig.suptitle(args.title)
+	plt.tight_layout()
+	for s in range(CONFIGS["N_SAMPLES"]):
+		# Each sample's coverage is read from results/BedGraphs relative to the working directory
+		bedgraph_fn = "results/BedGraphs/%s.raw.bedgraph" % CONFIGS["SAMPLES"][s]
+		for l in range(CONFIGS["N_LOCI"]):
+			locus = CONFIGS["LOCI"][l]
+			sys.stderr.write("Processing sample %s by locus %s...\n" % (bedgraph_fn, locus))
+			data = get_bedgraph_info(bedgraph_fn, LOCUS2COORD[locus], FLANK)
+			# Plot data
+			asx[s,l].fill_between(data["X"], data["Y"], color=CONFIGS["S_COLOR"][s])
+			asx[s,l].set_ylim(bottom=0,top=CONFIGS["S_MAX_Y"][s])
+			asx[s,l].label_outer()
+			# Ticks at both window edges and FLANK bases inside them; labels follow the configured FLANK
+			x0, xend = data["X"][0], data["X"][-1]
+			asx[s,l].set_xlim([x0,xend])
+			plt.sca(asx[s,l])
+			plt.xticks([x0,x0+FLANK,xend-FLANK,xend],["-%i" % FLANK,"start","stop","+%i" % FLANK])
+		# Label Samples
+		asx[s,0].set_ylabel(CONFIGS["S_LABEL"][s])
+
+	# Label the bottom row of the grid with each locus's chrom:start-end
+	for l in range(CONFIGS["N_LOCI"]):
+		coord = LOCUS2COORD[CONFIGS["LOCI"][l]]
+		asx[CONFIGS["N_SAMPLES"]-1,l].set_xlabel("%s:%i-%i" % (coord[0],coord[1],coord[2]))
+	fig.set_size_inches(14,8)
+	out_pic_fn = args.title.replace(" ","_")+".svg"
+	plt.savefig(out_pic_fn)
+	print(out_pic_fn)
diff --git a/paper/YKOC-wgs/scripts/make_trace.py b/paper/YKOC-wgs/scripts/make_trace.py
@@ -0,0 +1,140 @@
+from os import listdir
+from os.path import isfile, join
+import sys
+import re
+import argparse
+
+# Python 3.6+
+# relies on dict insertion order
+# Roman-numeral chromosome names (as read from the GFF) -> arabic-numbered names
+_ROMAN_NUMERALS = ["I","II","III","IV","V","VI","VII","VIII","IX","X","XI","XII","XIII","XIV","XV","XVI"]
+roman2arabic = {"chr"+numeral:"chr%i" % (rank)
+			for rank, numeral in enumerate(_ROMAN_NUMERALS, start=1)}
+
+
+
+def getParams():
+	'''Define the command-line options of the trace script and return the parsed arguments.'''
+	cli = argparse.ArgumentParser(description='Use pileup information to get a heatmap of each sample\'s coverage at the expected KO site.')
+	# Required: the CDT whose rows set the output order, and the feature GFF
+	cli.add_argument('-c','--cdt', metavar='cdt_fn', dest='cdt_fn', required=True, help='the cdt of bellplot (to determine sort order)')
+	cli.add_argument('-g','--features-gff', metavar='features_gff', dest='features_gff', required=True, help='the featuer GFF file from SGD to get the gene coordinates')
+	# Optional: window width, parsed as int, default 6000
+	cli.add_argument('-w','--window', metavar='win_size', dest='window', default=6000, type=int, help='the window size to center around feature range')
+	parsed = cli.parse_args()
+	return(parsed)
+
+
+
+# chr1	0	1	14
+# chr1	1	2	20
+# chr1	2	3	25
+# chr1	3	5	27
+def parse_bedgraph(bg_fn, bed_coord, window=2000):
+	'''Return the per-base BedGraph values in a window centered on a BED coordinate.
+	bg_fn     -- path to a tab-delimited BedGraph (chrom, start, end, value); "#" lines are skipped
+	bed_coord -- (chrom, start, end); chrom is compared directly with the BedGraph chrom column
+	window    -- window width handed to expand_coord
+	Returns a list with one entry per window position: the float value, or "NaN" if uncovered.
+	'''
+	# expand_coord returns only (start, end), so the chromosome is taken from bed_coord
+	win_start, win_end = expand_coord(bed_coord,window)
+	pileup = ["NaN"] * (win_end-win_start)
+	with open(bg_fn, 'r') as reader:
+		for line in reader:
+			if(line.startswith('#')):
+				continue
+			tokens = line.strip().split('\t')
+			if(tokens[0]!=bed_coord[0]):
+				continue
+			# Clamp the record to the window; records outside it leave an empty range
+			first, last = max(int(tokens[1]),win_start), min(int(tokens[2]),win_end)
+			if(first<last):
+				pileup[first-win_start:last-win_start] = [float(tokens[3])] * (last-first)
+	return(pileup)
+
+
+def parse_gff(gff_fn):
+	'''Return {gene_name: (chrom, start, end)} for the gene-named features of a GFF file.
+	Chromosome names are converted with roman2arabic ("chrZ" when unknown) and the start is
+	shifted down by one. mRNA and CDS records, "#" lines, and records without a gene=
+	attribute are skipped; parsing stops at the first line starting with ">".'''
+	locus2coord = {}
+	with open(gff_fn,'r') as reader:
+		for line in reader:
+			if(line.startswith("#")):
+				continue
+			if(line.startswith(">")):
+				break
+			tokens = line.strip().split('\t')
+			# Skip truncated lines and mRNA/CDS records (presumably sub-features of a gene entry)
+			if(len(tokens)<9 or tokens[2] in ["mRNA","CDS"]):
+				continue
+			gene_name = next((f.split('=')[1] for f in tokens[8].split(';') if f.startswith("gene=")), "")
+			if(gene_name):
+				locus2coord[gene_name] = (roman2arabic.get(tokens[0],"chrZ"),int(tokens[3])-1,int(tokens[4]))
+	return(locus2coord)
+
+def expand_coord(bed_coord, window):
+	'''Return (start, end) spanning window//2 bases on each side of the midpoint of bed_coord's (chrom, start, end) interval.'''
+	center, half = bed_coord[1] + (bed_coord[2] - bed_coord[1])//2, window//2
+	return(center-half,center+half)
+
+#STATUS	feature_type	NOTES	KO_SCORE	SYS	STD	TableS1_Deletion	TableS1_replicate_id	ERS_accession	n_hits	hit_list	hit_scores	LEU2_SCORE	URA3_SCORE	experiment_accession	run_accession	submission_accession	nominal_length	read_count	base_count	first_public	nominal_sdev
+#PASS	ORF-Uncharacterized		-6.109952403138639	YAL064C-A	TDA8	Del1_TDA8	SD0863b	ERS838232	3	LEU2|URA3|TDA8	ND|ND|-6.109952403138639	ND	ND	ERX1406336	ERR1334744	ERA587837	484	8807338	1329908038	2016-03-22	81
+#PASS	ORF-Uncharacterized		-5.807910468072448	YAL064C-A	TDA8	Del1_TDA8	SD0863b2	ERS838233	3	LEU2|URA3|TDA8	ND|ND|-5.807910468072448	ND	ND	ERX1406337	ERR1334745	ERA587837	484	8996386	1358454286	2016-03-22	81
+#FAIL	ORF-Verified		-	YBL091C-A	SCS22	Del2_SCS22	SD0864b	ERS838234	2	LEU2|URA3	ND|ND	ND	ND	ERX1406338	ERR1334746	ERA587837	484	8710346	1315262246	2016-03-22	81
+#FAIL	ORF-Verified		-	YBL091C-A	SCS22	Del2_SCS22	SD0864b2	ERS838235	2	LEU2|URA3	ND|ND	ND	ND	ERX1406339	ERR1334747	ERA587837	484	8579514	1295506614	2016-03-22	81
+if __name__ == "__main__":
+	# Write a trace CDT to stdout: one row per input CDT row, holding 1 at the feature start
+	# and end coordinates of that row's gene inside the window and 0 everywhere else
+
+	# NOTE(review): this name remap table is never referenced below -- confirm whether STD
+	# was meant to be remapped through it before the orf2bed lookup
+	hardcode_name_remap = {
+		"YCR061W":"TVS1",
+		"YCR100C":"EMA35",
+		"YFR045W":"MRX20",
+		"YIR035C":"NRE1",
+		"YNR062C":"PUL3",
+		"YNR063W":"PUL4",
+		"YER156C":"MYG1",
+		"YMR087W":"PDL32",
+		"YLR050C":"EMA19",
+		"YMR279C":"ATR2",
+		"YMR102C":"LAF1",
+		"YMR111C":"EUC1",
+		"YMR130W":"DPI35",
+		"YJR039W":"MLO127",
+		"YJR061W":"MNN14",
+		"YGR053C":"MCO32",
+		"YKR023W":"RQT4",
+		"PET10":"PLN1"
+	}
+
+	# Get params
+	args = getParams()
+	WINDOW = args.window
+	orf2bed = parse_gff(args.features_gff)
+	OLINES = []
+
+	# Read the CDT rows (column 1 -> ERS, column 2 -> STD), skipping the "YORF" header line
+	with open(args.cdt_fn, 'r') as reader:
+		for line in reader:
+			if(line.find("YORF")==0):
+				continue
+			tokens = line.strip().split('\t')
+			ERS = tokens[0]
+			STD = tokens[1]
+			# Names missing from the GFF fall back to a placeholder coordinate
+			COORD = orf2bed.get(STD,("chrZ",0,1))
+			# Mark the feature start and end within the window centered on the feature
+			EXPAND = expand_coord(COORD,WINDOW)
+			VALUES = [ 1 if(i==COORD[1] or i==COORD[2]) else 0 for i in range(EXPAND[0],EXPAND[1])]
+			OLINES.append( "%s\t%s\t%s" % (ERS, STD, '\t'.join([str(i) for i in VALUES])  ))
+
+	# Write CDT header
+	sys.stdout.write("\t".join([ "YORF", "NAME"]) + "\t" + \
+						"\t".join([ str(i) for i in range(WINDOW)]) + "\n")
+	# Write the rows in input order, each newline-terminated (the last row used to lack one)
+	sys.stdout.write("".join([ row + "\n" for row in OLINES ]))