Skip to content

Commit 78c5a8f

Browse files
author
OLIVIA LANG
committed
Add PBS scripts to pileup coverage every sample
The aligned reads are piled up into a bedgraph-formatted file and the values are normalized by a custom python script (`scripts/normalize_BedGraph.py`). Then another script identifies the expected knockout interval and pulls out the coverage within a 6000bp window centered on the expected knockout gene, formatted as a CDT file. The new results directory `BedGraphs`, which holds the raw and normalized pileup bedgraphs and the single-interval CDTs for each sample, is added to the .gitignore. A note explaining the difference in sample counts between the EBI sample set and the publication's (Puddu et al 2019) sample accessions is added to the README.
1 parent 37b802d commit 78c5a8f

5 files changed

Lines changed: 283 additions & 0 deletions

File tree

paper/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ YKOC-wgs/logs/*.out-*
1818
YKOC-wgs/results/FASTQ
1919
YKOC-wgs/results/BAM
2020
YKOC-wgs/results/ID
21+
YKOC-wgs/results/BedGraphs
2122
ENCODEdata-eGFP/logs/*.out-*
2223
ENCODEdata-eGFP/logs/*.err-*
2324
ENCODEdata-eGFP/results/FASTQ

paper/YKOC-wgs/README

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,14 @@ April 3, 2021 to download this metadata.
1818

1919
https://www.ebi.ac.uk/ena/browser/view/PRJEB27160
2020

21+
Note: Four ERS sample accessions referenced in the paper are missing from the EBI
22+
metadata download:
23+
24+
ERS1041599
25+
ERS969339
26+
ERS969700
27+
ERS969701
28+
2129
# Download & Align YKOC data
2230
Use the EBI accessions to download the data to the `results/FASTQ` directory. The
2331
`job/00_download_data.pbs` PBS script uses the `wget` command but for the paper, these
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
#!/bin/bash
#PBS -l nodes=1:ppn=4
#PBS -l pmem=16gb
#PBS -l walltime=03:00:00
#PBS -A open
#PBS -o logs/pileup.samples.log.out
#PBS -e logs/pileup.samples.log.err
#PBS -t 1-9010

# Pile up the aligned reads of one YKOC sample (selected by the PBS array id)
# into a BedGraph, normalize it, and slice out a single CDT row centered on
# the sample's expected knockout gene.

module load gcc/8.3.1
module load samtools/1.5
module load bedtools/2.27.1
module load anaconda3
source activate genopipe

WRK=/path/to/GenoPipe/paper/YKOC-wgs
cd "$WRK" || exit 1

mkdir -p logs
mkdir -p results/BedGraphs

METADATA=results/ykoc_output_summary.txt
# Skip the metadata header row: array id 1 maps to metadata line 2.
INDEX=$((PBS_ARRAYID + 1))

# Pull this task's metadata line once and reuse it below instead of rescanning
# the metadata file; field 10 is the ERS sample accession.
INFO=$(sed "${INDEX}q;d" "$METADATA")
ERS=$(printf '%s\n' "$INFO" | cut -f10)
BAM=results/BAM/$ERS.bam
BG=results/BedGraphs/$ERS

# Pileup to BedGraph, sorted by chromosome then start coordinate
echo "($PBS_ARRAYID) Make raw bedgraph pileups for $ERS"
bedtools genomecov -bga -scale 1.0 -ibam "$BAM" \
	| sort -k1,1 -k2,2n \
	> "$BG.raw.bedgraph"

# Normalize BedGraph (total-tag normalization; see the script for details)
echo "($PBS_ARRAYID) Normalize bedgraph for $ERS"
NORMALIZE=scripts/normalize_BedGraph.py
python "$NORMALIZE" -i "$BG.raw.bedgraph" > "$BG.bedgraph"

# Make CDT row centered on expected KO
echo "($PBS_ARRAYID) Generate CDT slice for $ERS"
ROWCDT=scripts/make_cdt_row.py
echo "$INFO"
python "$ROWCDT" -s <(printf '%s\n' "$INFO") -i results/BedGraphs/ -g saccharomyces_cerevisiae.gff
Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
from os import listdir
2+
from os.path import isfile, join
3+
import sys
4+
import re
5+
import argparse
6+
7+
# Map yeast roman-numeral chromosome names (as used in the SGD GFF) to the
# arabic-numbered chromosome names used by the BedGraph pileups.
# Python 3.6+: relies on dict insertion order.
_ROMAN_NUMERALS = ["I", "II", "III", "IV", "V", "VI", "VII", "VIII",
                   "IX", "X", "XI", "XII", "XIII", "XIV", "XV", "XVI"]
roman2arabic = {"chr" + numeral: "chr%d" % (position + 1)
                for position, numeral in enumerate(_ROMAN_NUMERALS)}
13+
14+
15+
16+
def getParams():
    '''Parse parameters from the command line.

    Returns an argparse.Namespace with summary_fn, input_dir, features_gff,
    and window (CDT window width in bp, default 6000).
    '''
    parser = argparse.ArgumentParser(description='Use pileup information to get a heatmap of each sample\'s coverage at the expected KO site.')

    parser.add_argument('-s','--summary_fn', metavar='summary_fn', dest='summary_fn', required=True, help='the output_summary file failed entries')
    parser.add_argument('-i','--input-dir', metavar='input_dir', dest='input_dir', required=True, help='the directory where all the BedGraph pileup files are saved')
    # fixed typo in help text: "featuer" -> "feature"
    parser.add_argument('-g','--features-gff', metavar='features_gff', dest='features_gff', required=True, help='the feature GFF file from SGD to get the gene coordinates')
    parser.add_argument('-w','--window', metavar='win_size', dest='window', default=6000, type=int, help='the window size to center around feature range')

    args = parser.parse_args()
    return(args)
27+
28+
#chr2	334391	336812	REB1	.	-
def parse_bedfile(bed_fn):
    '''Parse a BED file into {name: (chrom, start, stop)}.

    Keys come from BED column 4 (feature name); coordinates are kept as
    0-based half-open ints. Later duplicate names overwrite earlier ones.
    '''
    orf2coord = {}
    # "with" ensures the handle is closed even if a line fails to parse
    with open(bed_fn, 'r') as reader:
        for line in reader:
            tokens = line.strip().split('\t')
            orf2coord[tokens[3]] = (tokens[0], int(tokens[1]), int(tokens[2]))
    return orf2coord
37+
38+
def expand_coord(bed_coord, window):
    '''Re-center a (chrom, start, stop) interval on its midpoint and widen
    it to `window` bp, returning the new (chrom, start, stop) tuple.'''
    chrom, start, stop = bed_coord
    center = (start + stop) // 2
    half = window // 2
    return (chrom, center - half, center + half)
42+
43+
# chr1	0	1	14
# chr1	1	2	20
# chr1	2	3	25
# chr1	3	5	27
def parse_bedgraph(bg_fn, bed_coord, window=2000):
    '''Extract per-bp coverage across a window centered on bed_coord.

    Returns a list of length `window`: a float coverage value where the
    BedGraph has an overlapping interval, and the string "NaN" at positions
    with no data. Lines starting with '#' are skipped.
    '''
    window_coord = expand_coord(bed_coord, window)
    win_chrom, win_start, win_stop = window_coord
    pileup = ["NaN"] * (win_stop - win_start)
    # Iterate the file lazily instead of readlines() (whole-genome BedGraphs
    # are large); "with" guarantees the handle is closed.
    with open(bg_fn, 'r') as reader:
        for line in reader:
            if line.startswith('#'):
                continue
            tokens = line.strip().split('\t')
            if tokens[0] != win_chrom:
                continue
            start = int(tokens[1])
            stop = int(tokens[2])
            # Skip if interval entirely before the interval of interest
            if start < win_start and stop < win_start:
                continue
            # Skip if interval entirely after the interval of interest
            if start > win_stop and stop > win_stop:
                continue
            value = float(tokens[3])
            # Clip the interval to the window once, rather than testing the
            # bounds for every base position
            for local_x in range(max(start, win_start), min(stop, win_stop)):
                pileup[local_x - win_start] = value
    return pileup
69+
70+
71+
def parse_gff(gff_fn):
    '''Parse an SGD GFF file into {ID: (chrom, start, stop)}.

    Chromosome names are remapped to arabic numbering via roman2arabic
    (unknown chromosomes, e.g. mitochondrial, are binned under "chrZ") and
    1-based inclusive GFF coordinates are converted to 0-based half-open.
    mRNA/CDS records are skipped so one record per locus is kept; parsing
    stops at the first '>' line (the FASTA trailer of SGD GFF dumps).
    '''
    locus2coord = {}
    with open(gff_fn, 'r') as reader:
        for line in reader:
            if line.startswith("#"):
                continue
            if line.startswith(">"):
                # FASTA sequence section follows; nothing left to parse
                break
            tokens = line.strip().split('\t')
            if tokens[2] in ["mRNA", "CDS"]:
                continue
            # Pull the ID=<name> key out of the GFF attributes column
            gene_name = ""
            for feature in tokens[8].split(';'):
                if feature.startswith("ID="):
                    gene_name = feature.split('=')[1]
                    break
            locus2coord[gene_name] = (roman2arabic.get(tokens[0], "chrZ"),
                                      int(tokens[3]) - 1, int(tokens[4]))
    return locus2coord
91+
92+
93+
# Expected summary line format (tab-separated), e.g.:
#STATUS	feature_type	NOTES	KO_SCORE	SYS	STD	TableS1_Deletion	TableS1_replicate_id	ERS_accession	n_hits	hit_list	hit_scores	LEU2_SCORE	URA3_SCORE	experiment_accession	run_accession	submission_accession	nominal_length	read_count	base_count	first_public	nominal_sdev
#PASS	ORF-Uncharacterized	-6.109952403138639	YAL064C-A	TDA8	Del1_TDA8	SD0863b	ERS838232	3	LEU2|URA3|TDA8	ND|ND|-6.109952403138639	ND	ND	ERX1406336	ERR1334744	ERA587837	484	8807338	1329908038	2016-03-22	81
#FAIL	ORF-Verified	-	YBL091C-A	SCS22	Del2_SCS22	SD0864b	ERS838234	2	LEU2|URA3	ND|ND	ND	ND	ERX1406338	ERR1334746	ERA587837	484	8710346	1315262246	2016-03-22	81
if __name__ == "__main__":
    '''Slice each sample's normalized pileup around its expected KO gene and
    write one single-row CDT file per sample into the input directory.'''

    # Systematic-name -> standard-name remaps for loci renamed since the
    # metadata was generated. Kept for reference: the remap is currently
    # disabled and coordinates are looked up by systematic name below.
    hardcode_name_remap = {
        "YCR061W": "TVS1",
        "YCR100C": "EMA35",
        "YFR045W": "MRX20",
        "YIR035C": "NRE1",
        "YNR062C": "PUL3",
        "YNR063W": "PUL4",
        "YER156C": "MYG1",
        "YMR087W": "PDL32",
        "YLR050C": "EMA19",
        "YMR279C": "ATR2",
        "YMR102C": "LAF1",
        "YMR111C": "EUC1",
        "YMR130W": "DPI35",
        "YJR039W": "MLO127",
        "YJR061W": "MNN14",
        "YGR053C": "MCO32",
        "YKR023W": "RQT4",
        "PET10": "PLN1"
    }

    # Get params
    args = getParams()
    WINDOW = args.window
    orf2bed = parse_gff(args.features_gff)

    # Parse metadata: one sample per line
    with open(args.summary_fn, 'r') as reader:
        for mline in reader:
            mtokens = mline.strip().split('\t')
            ERS = mtokens[9]   # ERS sample accession (field 10, matching the PBS script's `cut -f10`)
            #ORF = hardcode_name_remap.get(mtokens[5],mtokens[5])
            SYS = mtokens[5]   # systematic gene name -- key into the GFF coords
            STD = mtokens[6]   # standard gene name -- used in the CDT filename/row
            # Unknown loci fall back to a dummy coordinate on "chrZ" so the
            # CDT row is still emitted (all "NaN" values)
            COORD = orf2bed.get(SYS, ("chrZ", 0, 1))

            sys.stderr.write("Parsing for sample %s\n" % ERS)

            # Build BedGraph filename; skip samples whose pileup is missing
            bedgraph_fn = join(args.input_dir, "%s.bedgraph" % ERS)
            if not isfile(bedgraph_fn):
                sys.stderr.write("%s BedGraph pileup does not exist\n" % bedgraph_fn)
                continue
            pileup = parse_bedgraph(bedgraph_fn, COORD, WINDOW)

            # Write a one-row CDT: header line, then <ERS, STD, KO length, values>
            cdt_row_fn = join(args.input_dir, "%s_%s_%ibp.cdt" % (ERS, STD, WINDOW))
            with open(cdt_row_fn, 'w') as writer:
                writer.write("\t".join(["YORF", "NAME", "LENGTH"]) + "\t" +
                             "\t".join([str(i) for i in range(WINDOW)]) + "\n")
                writer.write("\t".join([ERS, STD, str(COORD[2] - COORD[1])]) + "\t" +
                             "\t".join([str(i) for i in pileup]) + "\n")

            sys.stderr.write("%s pileup complete\n" % ERS)
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
from os import listdir
2+
from os.path import isfile, join
3+
import sys
4+
import re
5+
import argparse
6+
7+
# Python 3.6+
8+
# relies on dict insertion order
9+
10+
def getParams():
    '''Build the command-line parser and return the parsed argument namespace.'''
    cli = argparse.ArgumentParser(description='Parse bedgraph file to total tag normalize.')

    cli.add_argument('-i','--bedgraph', metavar='bedgraph_fn', dest='bedgraph_fn', required=True, help='the BedGraph file')

    return cli.parse_args()
18+
19+
# chr1	0	1	14
# chr1	1	2	20
# chr1	2	3	25
# chr1	3	5	27
def parse_bedgraph(bg_fn):
    '''Return the total tag count of a BedGraph file.

    The total is the sum over all intervals of score * interval width
    (intervals are 0-based half-open). Lines starting with '#' are skipped.
    '''
    normalize_factor = 0.0
    # "with" ensures the handle is closed even if a line fails to parse
    with open(bg_fn, 'r') as reader:
        for line in reader:
            if line.startswith('#'):
                continue
            tokens = line.strip().split('\t')
            start = int(tokens[1])
            stop = int(tokens[2])
            score = float(tokens[3])
            # Weight each interval's score by its width
            normalize_factor += score * (stop - start)
    return normalize_factor
36+
37+
if __name__ == "__main__":
    '''Total-tag normalize a BedGraph: rescale every score so the whole file
    sums to one million, writing the result to stdout.'''
    # Get params
    args = getParams()
    BG_FILE = args.bedgraph_fn

    if not isfile(BG_FILE):
        sys.stderr.write("File does not exist: %s\n" % (BG_FILE))
    else:
        # First pass: total tag count across the whole file
        # (a zero factor -- empty/all-zero BedGraph -- raises ZeroDivisionError below)
        normalize_factor = parse_bedgraph(BG_FILE)
        # %i truncates the float factor; display only
        sys.stderr.write("Normalization Factor: %i\n" % (normalize_factor))
        # Second pass: rescale each score to tags-per-million
        with open(BG_FILE, 'r') as reader:
            for line in reader:
                if line.startswith('#'):
                    # Pass comment lines through untouched -- they were also
                    # excluded from the normalization factor
                    sys.stdout.write(line)
                    continue
                tokens = line.strip().split("\t")
                tokens[3] = str(float(tokens[3]) / normalize_factor * 1000000)
                sys.stdout.write("\t".join(tokens) + "\n")
        sys.stderr.write("Normalization Complete for %s\n" % (BG_FILE))

0 commit comments

Comments
 (0)