-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathExtract_phasiRNA.py
More file actions
145 lines (104 loc) · 5.52 KB
/
Extract_phasiRNA.py
File metadata and controls
145 lines (104 loc) · 5.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/usr/bin/python
# coding=utf-8
#Created by Parth Patel, DBI @ University of Delaware, Newark, Delaware 19717
#Date created: 3/30/2015 | Date updated:
##This script calculates number of phasiRNAs per phaseLoci with register length of (by default 10 on both strands=20, so reg(20)x21=442)and extract them.
import MySQLdb
import numpy as np
#import scipy as sp
import matplotlib as mpl
import sys
## agg backend is used to create plot as a .png file
mpl.use('agg')
from pylab import *
import matplotlib.pyplot as plt
import random
from operator import itemgetter
from collections import defaultdict
# - - - - - G L O B A L V A R I A B L E S - - - - - -
database_name="XXXXXXX"
user_name="XXXXX"
user_password="XXXXXX"
server="raichu.ddpsc.org" #"localhost"
nthPhas_length= 24 # or 21
ntheREGISTER=10
nthshift=0 # deafualt 0 is recommended. mismatched reads (-1/+1), you should use (start+/- 1) to query against our dbs.
nthLIMIT= 50# num. of phasiRNAs to be extracted
def main(inputfile,out1):
#connect to MySQL database
db = MySQLdb.connect(host= server, user= user_name, passwd=user_password,db=database_name )
cursor = db.cursor()
inp_file_name = inputfile#str(sys.argv[1]) #user defined name for the input file
out_file_name=out1
out= open(out_file_name,'w');
out.write('tag \t abundance \t strand \t cycle \t PHAS_ID \n')
# phase_ID=""
chr_ID=0
start=0
end=0
INPUT= open(inp_file_name).readlines()
#firstLine = INPUT.pop(0) #removes the first line
num_lines = sum(1 for line in open(inp_file_name))
print "num_of_PHASloci = "+ str(num_lines)
phasiRNA=[]
sorted_phasiRNA=[]
line_index=0
for lines in INPUT: #reading each line from the file - reading each PHAS locus
block = lines.split('\n')
print block[0]
phase_ID = block[0].split('\t')[0]
chr_ID= int(block[0].split('\t')[1])
start= int(block[0].split('\t')[2])+nthshift
end= int(block[0].split('\t')[3])
# PHASLoci_length=end-start
index = ntheREGISTER
register_increment=nthPhas_length
watson_reg_beg=start
watson_reg_end=0
creek_reg_beg=0
creek_reg_end=0
for i in range (0,index):
#print i
# extract phasiRNAs per register per PHASlocus on WATSON STRAND (UPPER)
watson_reg_beg=start+(register_increment*i)
watson_reg_end=watson_reg_beg+register_increment-1
# cursor.execute("select tag,norm_sum,strand FROM tag_position WHERE chr_id=%i and strand ='w' and (position between %i and %i) and length(tag)=%i order by norm_sum desc limit %i" % (chr_ID,watson_reg_beg,watson_reg_end,nthPhas_length,nthLIMIT))
cursor.execute("select tag,norm_sum,strand FROM tag_position WHERE chr_id=%i and strand ='w' and (position between %i and %i) and length(tag)=%i order by norm_sum desc" % (chr_ID,watson_reg_beg,watson_reg_end,nthPhas_length))
rows=cursor.fetchall()
#print rows
for row in rows:
tag,abundance,strand=row
abundance=int(abundance)
#print "w",tag,abundance
phasiRNA.append([tag,abundance,strand,i,phase_ID])
# extract phasiRNAs per register per PHASlocus on CREEK STRAND (LOWER)
creek_reg_beg=(start+register_increment*i)+(register_increment-3) # -3 is for the 2-nt overhang
creek_reg_end=creek_reg_beg-register_increment+1
# cursor.execute("select tag,norm_sum,strand FROM tag_position WHERE chr_id=%i and strand ='c' and (position between %i and %i) and length(tag)=%i order by norm_sum desc limit %i" % (chr_ID,creek_reg_end,creek_reg_beg,nthPhas_length,nthLIMIT))
cursor.execute("select tag,norm_sum,strand FROM tag_position WHERE chr_id=%i and strand ='c' and (position between %i and %i) and length(tag)=%i order by norm_sum desc" % (chr_ID,creek_reg_end,creek_reg_beg,nthPhas_length))
rows_1=cursor.fetchall()
for row_1 in rows_1:
tag_1,abundance_1,strand=row_1
#print "c",tag_1,abundance_1
abundance_1=int(abundance_1)
phasiRNA.append([tag_1,abundance_1,strand,i,phase_ID])
#for i in range(0,len(phasiRNA)):
#print phasiRNA[i]
#print "\n"
line_index+=1
temp={}
for v in sorted(phasiRNA, key=lambda phasiRNA: phasiRNA[1]): # where L is your list
temp[v[0]] = v
print temp[v[0]]
sorted_phasiRNA = temp.values()
sorted_phasiRNA=sorted(sorted_phasiRNA,key=itemgetter(1),reverse=True)
for i in range(0,nthLIMIT*num_lines):
tag,abun,strand,register,phase_ID=sorted_phasiRNA[i]
out.write('%s \t %d \t %s \t %d \t %s \n' % (tag,abun,strand,register,phase_ID))
out.close()
if __name__ == '__main__':#calls the main function
inp_file_name = "21_phaseLoci_Coordinates.txt" #user defined name for the input file with name,chromosome,start,end TAB seperated. (e.g. 21PHAS_NO8 1 159980586 159980986)
out_file_name_1="24_Extracted_Rui.out" # Output file name
main(inp_file_name,out_file_name_1)
print ("Done!\n")
sys.exit()