Skip to content

Commit e13d7b6

Browse files
author
OLIVIA LANG
committed
add scripts to create final figure panels
A heatmap is generated for samples identified by both Puddu and DeletionID methods, neither, only DeletionID, or only Puddu. These heatmaps are made from CDT files that concatenate the CDT row samples generated by the "04" script. Traces are also generated for the ORF boundaries using a version of ScriptManager that supports colors that use the alpha channel. Figures 3E and 3F are genome browser shots zooming in on some of the coverage rows from the heatmap highlighting various cases DeletionID did not identify the expected gene knockout.
1 parent 78c5a8f commit e13d7b6

5 files changed

Lines changed: 358 additions & 0 deletions

File tree

paper/YKOC-wgs/fig3e_config.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
FLANKING 200
2+
LOCI APE3 VAC17 PIR3
3+
SAMPLES ERS838258 ERS903113 ERS969760 ERS1076728
4+
S_LABEL APE3 VAC17 PIR3 WT-1
5+
S_COLOR forestgreen firebrick salmon gray
6+
S_MAX_Y 200 150 10 150

paper/YKOC-wgs/fig3f_config.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
FLANKING 200
2+
LOCI SWT1 PNS1
3+
SAMPLES ERS1076684 ERS1076685 ERS1029415 ERS1029416 ERS1076728
4+
S_LABEL del_SWT1_rep1 del_SWT1_rep2 del_PNS1_rep1 del_PNS1_rep2 WT-1
5+
S_COLOR royalblue cornflowerblue goldenrod gold gray
6+
S_MAX_Y 150 150 150 150 150
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
#!/bin/bash
2+
#PBS -l nodes=1:ppn=4
3+
#PBS -l pmem=16gb
4+
#PBS -l walltime=05:00:00
5+
#PBS -A open
6+
#PBS -o logs/make.figs.log.out
7+
#PBS -e logs/make.figs.log.err
8+
9+
#1-9010
10+
#bfp2_h_g_sc_default
11+
#bfp2_j_g_bc_default
12+
13+
module load gcc/8.3.1
14+
module load samtools/1.5
15+
module load bedtools/2.27.1
16+
module load anaconda3
17+
source activate genopipe
18+
19+
WRK=/path/to/GenoPipe/paper/YKOC-wgs
20+
WRK=/storage/home/owl5022/scratch/GenotypingProject/GenoPipe/paper/YKOC-wgs
21+
cd $WRK
22+
23+
[ -d logs ] || mkdir logs
24+
[ -d results/figs ] || mkdir results/figs
25+
26+
for PARTITION in "TentativePass" "TrueFails" "TentativeFails" "ConfidentPass";
27+
do
28+
BASE=results/$PARTITION
29+
rm $BASE.temp
30+
31+
METADATA=results/$PARTITION.txt
32+
LNCT=`wc -l $METADATA | awk '{print $1}'`
33+
for ((INDEX=1; INDEX<=$LNCT; INDEX++));
34+
do
35+
#echo $INDEX
36+
ERS=`cut -d $'\t' -f10 $METADATA | sed "${INDEX}q;d"`
37+
ORF=`cut -d $'\t' -f7 $METADATA | sed "${INDEX}q;d"`
38+
echo "($INDEX) Parsed out ERS=$ERS and ORF=$ORF"
39+
ROW=results/BedGraphs/$ERS\_$ORF\_6000bp.cdt
40+
cat $ROW >> $BASE.temp
41+
done
42+
43+
sort -nk3,3 $BASE.temp > $BASE.cdt
44+
45+
# Make Heatmap PNG
46+
##ScriptManager Two-color heatmap
47+
# Black, .95 threshold
48+
# $BASE.cdt > $BASE.png
49+
50+
# Make Trace CDT
51+
TRACE=scripts/make_trace.py
52+
python $TRACE -c $BASE.cdt -g saccharomyces_cerevisiae.gff -w 6000 > $BASE\_trace.cdt
53+
54+
rm $BASE.temp
55+
done
56+
57+
# Make trace (x4 lines)
58+
##Script to make trace cdt
59+
##ScriptManager Three-color heatmap
60+
# diff colors, set alpha channel, threshold 0<0<.95
61+
62+
SUBPLOTS=scripts/make_subplots.py
63+
64+
# Figure 3E SVG image
65+
python $SUBPLOTS -t Figure3E -c fig3e_config.txt -g saccharomyces_cerevisiae.gff
66+
67+
# Figure 3F SVG image
68+
python $SUBPLOTS -t Figure3F -c fig3f_config.txt -g saccharomyces_cerevisiae.gff
69+
70+
71+
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
from os import listdir
2+
from os.path import isfile, join
3+
import sys
4+
import re
5+
import argparse
6+
import matplotlib.pyplot as plt
7+
import numpy as np
8+
9+
# Python 3.6+
10+
# relies on dict insertion order
11+
12+
# Check Matplotlib colors when building your config files: https://matplotlib.org/stable/gallery/color/named_colors.html
13+
14+
roman2arabic = {"chrI":"chr1","chrII":"chr2","chrIII":"chr3","chrIV":"chr4","chrV":"chr5",
15+
"chrVI":"chr6","chrVII":"chr7","chrVIII":"chr8","chrIX":"chr9","chrX":"chr10",
16+
"chrXI":"chr11","chrXII":"chr12","chrXIII":"chr13","chrXIV":"chr14","chrXV":"chr15",
17+
"chrXVI":"chr16",}
18+
19+
def getParams():
20+
'''Parse parameters from the command line'''
21+
parser = argparse.ArgumentParser(description='')
22+
23+
parser.add_argument('-t','--title', metavar='figure_title', dest='title', required=True, help='')
24+
parser.add_argument('-c','--config', metavar='config_fn', dest='config_fn', required=True, help='the config file for a grid-organized subplot')
25+
parser.add_argument('-g','--features-gff', metavar='features_gff', dest='features_gff', required=True, help='the featuer GFF file from SGD to get the gene coordinates')
26+
parser.add_argument('-d','--header', dest='header', default=False, required=False, help='skip first line as column header')
27+
28+
args = parser.parse_args()
29+
return(args)
30+
31+
def get_bedgraph_info(bedgraph_file, locus_coord, flanking):
32+
flanking_range = (roman2arabic[locus_coord[0]], locus_coord[1]-flanking, locus_coord[2]+flanking)
33+
x_vector = list(range(flanking_range[1],flanking_range[2]))
34+
y_vector = [0] * len(x_vector)
35+
reader = open(bedgraph_file,'r')
36+
for line in reader:
37+
tokens = line.strip().split('\t')
38+
# Skip if chromosome doesn't match
39+
if(tokens[0]!=flanking_range[0]):
40+
continue
41+
# Skip if interval before interval of interest
42+
elif(int(tokens[1])<flanking_range[1] and int(tokens[2])<flanking_range[1]):
43+
continue
44+
# Skip if interval after interval of interest
45+
elif(int(tokens[1])>flanking_range[2] and int(tokens[2])>flanking_range[2]):
46+
continue
47+
value = int(tokens[3])
48+
for local_x in range(int(tokens[1]),int(tokens[2])):
49+
if(local_x in x_vector):
50+
y_vector[x_vector.index(local_x)] = value
51+
reader.close()
52+
plot_info = {"X":x_vector,"Y":y_vector,"chrom":locus_coord[0]}
53+
return(plot_info)
54+
55+
def parse_configs(configs_fn):
56+
subplot_configs = {}
57+
reader = open(configs_fn,'r')
58+
for line in reader:
59+
tokens = line.strip().split('\t')
60+
if(tokens[0]=="FLANKING"):
61+
subplot_configs.update({tokens[0]:int(tokens[1])})
62+
continue
63+
elif(tokens[0]=="S_MAX_Y"):
64+
subplot_configs.update({tokens[0]:[ int(i) for i in tokens[1:]]})
65+
continue
66+
subplot_configs.update({tokens[0]:tokens[1:]})
67+
reader.close()
68+
# Count subplot dimensions
69+
subplot_configs.update({"N_SAMPLES":len(subplot_configs["SAMPLES"])})
70+
subplot_configs.update({"N_LOCI":len(subplot_configs["LOCI"])})
71+
# Validate configs
72+
for key in ["S_LABEL","S_COLOR","S_MAX_Y"]:
73+
if(len(subplot_configs[key])!=subplot_configs["N_SAMPLES"]):
74+
sys.stderr.write("Mismatch in number of samples with %i field. Exiting...\n" % (key))
75+
quit()
76+
return(subplot_configs)
77+
78+
def parse_gff(gff_fn, loci_list):
79+
locus2coord = {}
80+
reader = open(gff_fn,'r')
81+
for line in reader:
82+
if(line.find("#")==0):
83+
continue
84+
if(line.find(">")==0):
85+
break
86+
tokens = line.strip().split('\t')
87+
gene_name = ""
88+
for feature in tokens[8].split(';'):
89+
if(feature.find("gene=")!=0):
90+
continue
91+
gene_name = feature.split('=')[1]
92+
break
93+
if(gene_name in loci_list):
94+
locus2coord.update({gene_name:(tokens[0],int(tokens[3])-1,int(tokens[4]))})
95+
reader.close()
96+
return(locus2coord)
97+
98+
if __name__ == "__main__":
99+
'''Plot scatter'''
100+
args = getParams()
101+
102+
CONFIGS = parse_configs(args.config_fn)
103+
LOCUS2COORD = parse_gff(args.features_gff, CONFIGS["LOCI"])
104+
105+
fig, asx = plt.subplots(CONFIGS["N_SAMPLES"],CONFIGS["N_LOCI"])
106+
fig.suptitle(args.title)
107+
plt.tight_layout()
108+
for s in range(CONFIGS["N_SAMPLES"]):
109+
bedgraph_fn = "results/BedGraphs/%s.raw.bedgraph" % CONFIGS["SAMPLES"][s]
110+
for l in range(CONFIGS["N_LOCI"]):
111+
locus = CONFIGS["LOCI"][l]
112+
sys.stderr.write("Processing sample %s by locus %s...\n" % (bedgraph_fn, locus))
113+
data = get_bedgraph_info(bedgraph_fn, LOCUS2COORD[locus], CONFIGS["FLANKING"])
114+
# Plot data
115+
asx[s,l].fill_between(data["X"], data["Y"], color=CONFIGS["S_COLOR"][s])
116+
asx[s,l].set_ylim(bottom=0,top=CONFIGS["S_MAX_Y"][s])
117+
asx[s,l].label_outer()
118+
x0 = data["X"][0]
119+
xend = data["X"][-1]
120+
xstart = data["X"][0] + CONFIGS["FLANKING"]
121+
xstop = data["X"][-1] - CONFIGS["FLANKING"]
122+
asx[s,l].set_xlim([x0,xend])
123+
plt.sca(asx[s,l])
124+
plt.xticks([x0,xstart,xstop,xend],["-200","start","stop","+200"])
125+
# Label Samples
126+
asx[s,0].set_ylabel(CONFIGS["S_LABEL"][s])
127+
128+
for l in range(CONFIGS["N_LOCI"]):
129+
coord = LOCUS2COORD[CONFIGS["LOCI"][l]]
130+
asx[CONFIGS["N_SAMPLES"]-1,l].set_xlabel("%s:%i-%i" % (coord[0],coord[1],coord[2]))
131+
fig.set_size_inches(14,8)
132+
#plt.show()
133+
out_pic_fn = args.title.replace(" ","_")+".svg"
134+
plt.savefig(out_pic_fn)
135+
print(out_pic_fn)
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
from os import listdir
2+
from os.path import isfile, join
3+
import sys
4+
import re
5+
import argparse
6+
7+
# Python 3.6+
8+
# relies on dict insertion order
9+
roman2arabic = {"chrI":"chr1","chrII":"chr2","chrIII":"chr3","chrIV":"chr4","chrV":"chr5",
10+
"chrVI":"chr6","chrVII":"chr7","chrVIII":"chr8","chrIX":"chr9","chrX":"chr10",
11+
"chrXI":"chr11","chrXII":"chr12","chrXIII":"chr13","chrXIV":"chr14","chrXV":"chr15",
12+
"chrXVI":"chr16",}
13+
14+
15+
16+
def getParams():
17+
'''Parse parameters from the command line'''
18+
parser = argparse.ArgumentParser(description='Use pileup information to get a heatmap of each sample\'s coverage at the expected KO site.')
19+
20+
parser.add_argument('-c','--cdt', metavar='cdt_fn', dest='cdt_fn', required=True, help='the cdt of bellplot (to determine sort order)')
21+
parser.add_argument('-g','--features-gff', metavar='features_gff', dest='features_gff', required=True, help='the featuer GFF file from SGD to get the gene coordinates')
22+
parser.add_argument('-w','--window', metavar='win_size', dest='window', default=6000, type=int, help='the window size to center around feature range')
23+
24+
args = parser.parse_args()
25+
return(args)
26+
27+
28+
29+
# chr1 0 1 14
30+
# chr1 1 2 20
31+
# chr1 2 3 25
32+
# chr1 3 5 27
33+
def parse_bedgraph(bg_fn, bed_coord, window=2000):
34+
window_coord = expand_coord(bed_coord,window)
35+
pileup = ["NaN"] * (window_coord[2]-window_coord[1])
36+
reader = open(bg_fn, 'r')
37+
for line in reader.readlines():
38+
if(line.find('#')==0):
39+
continue
40+
tokens = line.strip().split('\t')
41+
if(tokens[0]!=window_coord[0]):
42+
continue
43+
# Skip if interval before interval of interest
44+
elif(int(tokens[1])<window_coord[1] and int(tokens[2])<window_coord[1]):
45+
continue
46+
# Skip if interval after interval of interest
47+
elif(int(tokens[1])>window_coord[2] and int(tokens[2])>window_coord[2]):
48+
continue
49+
value = float(tokens[3])
50+
for local_x in range(int(tokens[1]),int(tokens[2])):
51+
if(local_x >= window_coord[1] and local_x<window_coord[2]):
52+
pileup[local_x-window_coord[1]] = value
53+
reader.close()
54+
return(pileup)
55+
56+
57+
def parse_gff(gff_fn):
58+
locus2coord = {}
59+
reader = open(gff_fn,'r')
60+
for line in reader:
61+
if(line.find("#")==0):
62+
continue
63+
if(line.find(">")==0):
64+
break
65+
tokens = line.strip().split('\t')
66+
if(tokens[2] in ["mRNA","CDS"]):
67+
continue
68+
gene_name = ""
69+
for feature in tokens[8].split(';'):
70+
if(feature.find("gene=")!=0):
71+
continue
72+
gene_name = feature.split('=')[1]
73+
break
74+
locus2coord.update({gene_name:(roman2arabic.get(tokens[0],"chrZ"),int(tokens[3])-1,int(tokens[4]))})
75+
reader.close()
76+
return(locus2coord)
77+
78+
def expand_coord(bed_coord, window):
79+
midpoint = bed_coord[1] + (bed_coord[2] - bed_coord[1])//2
80+
flank = window//2
81+
return(midpoint-flank,midpoint+flank)
82+
83+
#STATUS feature_type NOTES KO_SCORE SYS STD TableS1_Deletion TableS1_replicate_id ERS_accession n_hits hit_list hit_scores LEU2_SCORE URA3_SCORE experiment_accession run_accession submission_accession nominal_length read_count base_count first_public nominal_sdev
84+
#PASS ORF-Uncharacterized -6.109952403138639 YAL064C-A TDA8 Del1_TDA8 SD0863b ERS838232 3 LEU2|URA3|TDA8 ND|ND|-6.109952403138639 ND ND ERX1406336 ERR1334744 ERA587837 484 8807338 1329908038 2016-03-22 81
85+
#PASS ORF-Uncharacterized -5.807910468072448 YAL064C-A TDA8 Del1_TDA8 SD0863b2 ERS838233 3 LEU2|URA3|TDA8 ND|ND|-5.807910468072448 ND ND ERX1406337 ERR1334745 ERA587837 484 8996386 1358454286 2016-03-22 81
86+
#FAIL ORF-Verified - YBL091C-A SCS22 Del2_SCS22 SD0864b ERS838234 2 LEU2|URA3 ND|ND ND ND ERX1406338 ERR1334746 ERA587837 484 8710346 1315262246 2016-03-22 81
87+
#FAIL ORF-Verified - YBL091C-A SCS22 Del2_SCS22 SD0864b2 ERS838235 2 LEU2|URA3 ND|ND ND ND ERX1406339 ERR1334747 ERA587837 484 8579514 1295506614 2016-03-22 81
88+
if __name__ == "__main__":
89+
'''Collect metadata and DeletionID results to get detection stats on the YKOC data'''
90+
91+
hardcode_name_remap = {
92+
"YCR061W":"TVS1",
93+
"YCR100C":"EMA35",
94+
"YFR045W":"MRX20",
95+
"YIR035C":"NRE1",
96+
"YNR062C":"PUL3",
97+
"YNR063W":"PUL4",
98+
"YER156C":"MYG1",
99+
"YMR087W":"PDL32",
100+
"YLR050C":"EMA19",
101+
"YMR279C":"ATR2",
102+
"YMR102C":"LAF1",
103+
"YMR111C":"EUC1",
104+
"YMR130W":"DPI35",
105+
"YJR039W":"MLO127",
106+
"YJR061W":"MNN14",
107+
"YGR053C":"MCO32",
108+
"YKR023W":"RQT4",
109+
"PET10":"PLN1"
110+
}
111+
112+
# Get params
113+
args = getParams()
114+
WINDOW = args.window
115+
orf2bed = parse_gff(args.features_gff)
116+
OLINES = []
117+
118+
# Parse metadata
119+
reader = open(args.cdt_fn, 'r')#, encoding='utf-8')
120+
for line in reader:
121+
if(line.find("YORF")==0):
122+
continue
123+
tokens = line.strip().split('\t')
124+
ERS = tokens[0]
125+
STD = tokens[1]
126+
COORD = orf2bed.get(STD,("chrZ",0,1))
127+
128+
# Build BedGraph filename
129+
# Pileup BedGraph in CDT interval, then normalize
130+
EXPAND = expand_coord(COORD,WINDOW)
131+
VALUES = [ 1 if(i==COORD[1] or i==COORD[2]) else 0 for i in range(EXPAND[0],EXPAND[1])]
132+
OLINES.append( "%s\t%s\t%s" % (ERS, STD, '\t'.join([str(i) for i in VALUES]) ))
133+
134+
reader.close()
135+
136+
# Write CDT header
137+
sys.stdout.write("\t".join([ "YORF", "NAME"]) + "\t" + \
138+
"\t".join([ str(i) for i in range(WINDOW)]) + "\n")
139+
# Write Output by gene length
140+
sys.stdout.write("\n".join(OLINES))

0 commit comments

Comments
 (0)