|
| 1 | +from os import listdir |
| 2 | +from os.path import isfile, join |
| 3 | +import sys |
| 4 | +import re |
| 5 | +import argparse |
| 6 | + |
| 7 | +# Python 3.6+ |
| 8 | +# relies on dict insertion order |
# Map S. cerevisiae roman-numeral chromosome names (SGD convention) to
# arabic-numeral names: chrI -> chr1, ..., chrXVI -> chr16.
_ROMAN_NUMERALS = ["I", "II", "III", "IV", "V", "VI", "VII", "VIII",
                   "IX", "X", "XI", "XII", "XIII", "XIV", "XV", "XVI"]
roman2arabic = {"chr%s" % numeral: "chr%d" % (idx + 1)
                for idx, numeral in enumerate(_ROMAN_NUMERALS)}
| 13 | + |
| 14 | + |
| 15 | + |
def getParams():
    '''Parse parameters from the command line.

    Returns:
        argparse.Namespace with summary_fn, input_dir, features_gff, and window.
    '''
    parser = argparse.ArgumentParser(description='Use pileup information to get a heatmap of each sample\'s coverage at the expected KO site.')

    # Help text clarified: "output_summary file failed entries" was garbled
    parser.add_argument('-s','--summary_fn', metavar='summary_fn', dest='summary_fn', required=True, help='the output summary file of failed entries')
    parser.add_argument('-i','--input-dir', metavar='input_dir', dest='input_dir', required=True, help='the directory where all the BedGraph pileup files are saved')
    # Typo fix: "featuer" -> "feature"
    parser.add_argument('-g','--features-gff', metavar='features_gff', dest='features_gff', required=True, help='the feature GFF file from SGD to get the gene coordinates')
    parser.add_argument('-w','--window', metavar='win_size', dest='window', default=6000, type=int, help='the window size to center around feature range')

    args = parser.parse_args()
    return(args)
| 27 | + |
| 28 | +#chr2 334391 336812 REB1 . - |
#chr2 334391 336812 REB1 . -
def parse_bedfile(bed_fn):
    '''Parse a BED file into a name -> coordinate lookup.

    Args:
        bed_fn: path to a BED6 file (chrom, start, stop, name, score, strand).

    Returns:
        dict mapping the BED "name" column (column 4) to a
        (chrom, start, stop) tuple with integer coordinates.
    '''
    orf2coord = {}
    # "with" guarantees the handle is closed even if a parse error is raised
    with open(bed_fn, 'r') as reader:
        for line in reader:
            # Tolerate blank lines (e.g. trailing newline at EOF)
            if not line.strip():
                continue
            tokens = line.strip().split('\t')
            orf2coord[tokens[3]] = (tokens[0], int(tokens[1]), int(tokens[2]))
    return(orf2coord)
| 37 | + |
def expand_coord(bed_coord, window):
    '''Center a fixed-size window on the midpoint of a BED interval.

    Args:
        bed_coord: (chrom, start, stop) tuple.
        window: total window size in bp (integer-halved on each side).

    Returns:
        (chrom, window_start, window_stop) centered on the interval midpoint.
    '''
    chrom, start, stop = bed_coord
    center = start + (stop - start) // 2
    half = window // 2
    return (chrom, center - half, center + half)
| 42 | + |
| 43 | +# chr1 0 1 14 |
| 44 | +# chr1 1 2 20 |
| 45 | +# chr1 2 3 25 |
| 46 | +# chr1 3 5 27 |
# chr1 0 1 14
# chr1 1 2 20
# chr1 2 3 25
# chr1 3 5 27
def parse_bedgraph(bg_fn, bed_coord, window=2000):
    '''Pile up per-bp BedGraph coverage across a window centered on bed_coord.

    Args:
        bg_fn: path to a BedGraph file (chrom, start, stop, value).
        bed_coord: (chrom, start, stop) interval to center the window on.
        window: total window size in bp.

    Returns:
        list of length ~window with one float per bp; positions not covered
        by any BedGraph interval keep the string "NaN".
    '''
    win_chrom, win_start, win_stop = expand_coord(bed_coord, window)
    pileup = ["NaN"] * (win_stop - win_start)
    with open(bg_fn, 'r') as reader:
        # Stream line-by-line instead of readlines(): avoids loading the
        # whole genome-wide pileup into memory at once.
        for line in reader:
            if line.startswith('#'):
                continue
            tokens = line.strip().split('\t')
            if tokens[0] != win_chrom:
                continue
            start = int(tokens[1])
            stop = int(tokens[2])
            # Skip intervals entirely outside the window of interest
            if stop <= win_start or start >= win_stop:
                continue
            value = float(tokens[3])
            # Clip the interval to the window bounds and fill per-bp values
            for local_x in range(max(start, win_start), min(stop, win_stop)):
                pileup[local_x - win_start] = value
    return(pileup)
| 69 | + |
| 70 | + |
def parse_gff(gff_fn):
    '''Parse an SGD GFF file into a feature-ID -> coordinate lookup.

    Args:
        gff_fn: path to the SGD features GFF (may end in an embedded FASTA).

    Returns:
        dict mapping the ID= attribute of each kept feature row to a
        (chrom, start, stop) tuple; start is converted from GFF 1-based to
        0-based (BED-style). Chromosome names are translated from roman to
        arabic, falling back to "chrZ" for names not in roman2arabic.
    '''
    locus2coord = {}
    with open(gff_fn, 'r') as reader:
        for line in reader:
            if line.startswith("#"):
                # Header / comment line
                continue
            if line.startswith(">"):
                # Start of the embedded FASTA section -- no more features
                break
            tokens = line.strip().split('\t')
            # Skip mRNA/CDS rows; they duplicate the span of the parent gene
            if tokens[2] in ("mRNA", "CDS"):
                continue
            gene_name = ""
            for feature in tokens[8].split(';'):
                if feature.startswith("ID="):
                    gene_name = feature.split('=')[1]
                    break
            locus2coord[gene_name] = (roman2arabic.get(tokens[0], "chrZ"),
                                      int(tokens[3]) - 1, int(tokens[4]))
    return(locus2coord)
| 91 | + |
| 92 | + |
| 93 | +#STATUS feature_type NOTES KO_SCORE SYS STD TableS1_Deletion TableS1_replicate_id ERS_accession n_hits hit_list hit_scores LEU2_SCORE URA3_SCORE experiment_accession run_accession submission_accession nominal_length read_count base_count first_public nominal_sdev |
| 94 | +#PASS ORF-Uncharacterized -6.109952403138639 YAL064C-A TDA8 Del1_TDA8 SD0863b ERS838232 3 LEU2|URA3|TDA8 ND|ND|-6.109952403138639 ND ND ERX1406336 ERR1334744 ERA587837 484 8807338 1329908038 2016-03-22 81 |
| 95 | +#PASS ORF-Uncharacterized -5.807910468072448 YAL064C-A TDA8 Del1_TDA8 SD0863b2 ERS838233 3 LEU2|URA3|TDA8 ND|ND|-5.807910468072448 ND ND ERX1406337 ERR1334745 ERA587837 484 8996386 1358454286 2016-03-22 81 |
| 96 | +#FAIL ORF-Verified - YBL091C-A SCS22 Del2_SCS22 SD0864b ERS838234 2 LEU2|URA3 ND|ND ND ND ERX1406338 ERR1334746 ERA587837 484 8710346 1315262246 2016-03-22 81 |
| 97 | +#FAIL ORF-Verified - YBL091C-A SCS22 Del2_SCS22 SD0864b2 ERS838235 2 LEU2|URA3 ND|ND ND ND ERX1406339 ERR1334747 ERA587837 484 8579514 1295506614 2016-03-22 81 |
if __name__ == "__main__":
    '''Collect metadata and DeletionID results to get detection stats on the YKOC data'''

    # NOTE: currently unused -- kept for reference; the remap lookup in the
    # metadata loop below is commented out.
    hardcode_name_remap = {
        "YCR061W":"TVS1",
        "YCR100C":"EMA35",
        "YFR045W":"MRX20",
        "YIR035C":"NRE1",
        "YNR062C":"PUL3",
        "YNR063W":"PUL4",
        "YER156C":"MYG1",
        "YMR087W":"PDL32",
        "YLR050C":"EMA19",
        "YMR279C":"ATR2",
        "YMR102C":"LAF1",
        "YMR111C":"EUC1",
        "YMR130W":"DPI35",
        "YJR039W":"MLO127",
        "YJR061W":"MNN14",
        "YGR053C":"MCO32",
        "YKR023W":"RQT4",
        "PET10":"PLN1"
    }

    # Get params
    args = getParams()
    WINDOW = args.window
    orf2bed = parse_gff(args.features_gff)

    # Parse metadata summary: one sample (ERS accession) per line
    with open(args.summary_fn, 'r') as reader:
        for mline in reader:
            mtokens = mline.strip().split('\t')
            ERS = mtokens[9]      # sample (ERS) accession
            #ORF = hardcode_name_remap.get(mtokens[5],mtokens[5])
            SYS = mtokens[5]      # systematic gene name
            STD = mtokens[6]      # standard gene name
            # Fall back to a dummy coordinate when the gene is not in the GFF
            COORD = orf2bed.get(SYS, ("chrZ", 0, 1))

            sys.stderr.write("Parsing for sample %s\n" % ERS)

            # Build BedGraph filename; skip samples with no pileup file,
            # otherwise pile up coverage within the window around the KO site
            bedgraph_fn = join(args.input_dir, "%s.bedgraph" % ERS)
            if not isfile(bedgraph_fn):
                sys.stderr.write("%s BedGraph pileup does not exist\n" % bedgraph_fn)
                continue
            pileup = parse_bedgraph(bedgraph_fn, COORD, WINDOW)

            # Write a one-row CDT file for this sample
            cdt_row_fn = join(args.input_dir, "%s_%s_%ibp.cdt" % (ERS, STD, WINDOW))
            with open(cdt_row_fn, 'w') as writer:
                # Write CDT header
                writer.write("\t".join([ "YORF", "NAME", "LENGTH" ]) + "\t" + \
                    "\t".join([ str(i) for i in range(WINDOW)]) + "\n")
                # Write CDT row vals
                writer.write("\t".join([ ERS, STD, str(COORD[2]-COORD[1]) ]) + "\t" + \
                    "\t".join([ str(i) for i in pileup]) + "\n")

            sys.stderr.write("%s pileup complete\n" % ERS)