99from pybedtools import Interval , BedTool
1010from pybedtools .helpers import chromsizes
1111
12+ def get_genome_sizes (gs = None , genome = None , to_filter = None , to_keep = None ):
13+ """
14+ Loads the genome sizes file, filter or keep chromosomes
15+
16+ :param gs: genome size file path, only one of `gs` and `genome` is required
17+ :type gs: string
18+ :param genome: genome build name, only one of `gs` and `genome` is required
19+ :type genome: string
20+ :param to_filter: list of chrmosomes to filter, mutually exclusive with `to_keep`
21+ :type to_filter: list
22+ :param to_keep: list of chromosomes to keep, mutually exclusive with `to_filter`
23+ :type to_keep: list
24+ """
25+
26+ if gs :
27+ genome_sizes = pd .read_table (gs , header = None , usecols = [0 ,1 ], names = ['chrom' , 'length' ])
28+ elif genome :
29+ genome_sizes = (pd .DataFrame (chromsizes (genome ))
30+ .T
31+ .rename (columns = {0 :"chrom" , 1 :"len" }))
32+ else :
33+ raise Exception ("Either gs or genome should be provided!" )
34+
35+ genome_sizes_filt = filter_chromosomes (genome_sizes , to_filter = to_filter , to_keep = to_keep )
36+
37+ return genome_sizes_filt
38+
1239def filter_chromosomes (coords , to_filter = None , to_keep = None ):
1340 """
1441 Filter or keep the specified chromosomes
1542
16- :param coords: input coordinate dataframe, first 3 columns are: [ chrom, start, end]
43+ :param coords: input coordinate dataframe, first column should be ` chrom`
1744 :type coords: pandas.DataFrame
1845 :param to_filter: list of chrmosomes to filter, mutually exclusive with `to_keep`
1946 :type to_filter: list
@@ -68,7 +95,7 @@ def make_random_shift(coords, L, buffer=25):
6895
6996 """
7097 low = int (- L / 2 + buffer )
71- high = int (L / 2 - buffer )
98+ high = int (L / 2 - buffer + 1 )
7299
73100 return (coords .assign (midpoint = lambda x : (x ["start" ]+ x ["end" ])/ 2 )
74101 .astype ({"midpoint" : int })
@@ -96,16 +123,54 @@ def make_flank(coords, L, d):
96123 .apply (lambda s : pd .Series ([s ["chrom" ], int (s ["midpoint" ]- L / 2 ), int (s ["midpoint" ]+ L / 2 )],
97124 index = ["chrom" , "start" , "end" ]), axis = 1 ))
98125
126+
127+ def random_coords (gs :str = None , genome :str = None , incl :BedTool = None , excl :BedTool = None , l = 500 , n = 1000 ):
128+ """
129+ Randomly sample n intervals of length l from the genome,
130+ shuffle to make all intervals inside the desired regions
131+ and outside exclusion regions
132+
133+ :param gs: genome size file path, only one of `gs` and `genome` is required
134+ :type gs: string
135+ :param genome: genome build name, only one of `gs` and `genome` is required
136+ :type genome: string
137+ :param excl: regions that chopped regions should overlap with
138+ :type incl: BedTool object
139+ :param incl: regions that chopped regions shouldn't overlap with
140+ :type excl: BedTool object
141+ :param l: random interval size
142+ :type l: integer
143+ :param n: number of random intervals generated
144+ :type n: integer
145+ """
146+ random_kwargs = {}
147+ shuffle_kwargs = {}
148+ if genome :
149+ random_kwargs .update ({"genome" : genome }); shuffle_kwargs .update ({"genome" : genome })
150+ elif gs :
151+ random_kwargs .update ({"g" : gs }); shuffle_kwargs .update ({"g" : gs })
152+ else :
153+ raise Exception ("Either gs or genome should be provided!" )
154+
155+ if incl : shuffle_kwargs .update ({"incl" : incl })
156+ if excl : shuffle_kwargs .update ({"excl" : excl })
157+
158+ return (BedTool ()
159+ .random (l = l , n = n , ** random_kwargs )
160+ .shuffle (** shuffle_kwargs )
161+ .to_dataframe ()[["chrom" , "start" , "end" ]])
99162
100- def chop_genome (chroms :list = None , excl :BedTool = None , gs = None , genome = None , stride = 500 , l = 500 ):
163+ def chop_genome (chroms :list = None , incl : BedTool = None , excl :BedTool = None , gs = None , genome = None , stride = 500 , l = 500 ):
101164 """
102165 Given a genome size file and chromosome list,
103166 chop these chromosomes into intervals of length l,
104167 with include/exclude regions specified
105168
106169 :param chroms: list of chromosomes to be chopped
107170 :type chroms: list of strings
108- :param excl: regions that chopped regions shouldn't overlap with
171+ :param excl: regions that chopped regions should overlap with
172+ :type incl: BedTool object
173+ :param incl: regions that chopped regions shouldn't overlap with
109174 :type excl: BedTool object
110175 :param gs: genome size file path, only one of `gs` and `genome` is required
111176 :type gs: string
@@ -127,22 +192,16 @@ def intervals_loop(chrom, start, stride, l, size):
127192 start += stride
128193 return pd .DataFrame (intervals , columns = ["chrom" , "start" , "end" ])
129194
130- if genome :
131- genome_sizes = (pd .DataFrame (chromsizes (genome ))
132- .T
133- .rename (columns = {0 :"chrom" , 1 :"len" })
134- .set_index ("chrom" )
135- .loc [chroms ])
136- elif gs :
137- genome_sizes = (pd .read_csv (gs , sep = "\t " , usecols = [0 ,1 ], names = ["chrom" , "len" ])
138- .set_index ("chrom" )
139- .loc [chroms ])
195+ genome_sizes = get_genome_sizes (gs = gs , genome = genome , to_keep = chroms )
196+
140197 genome_chops = pd .concat ([intervals_loop (i .Index , 0 , stride , l , i .len )
141198 for i in genome_sizes .itertuples ()])
142199 genome_chops_bdt = BedTool .from_dataframe (genome_chops )
200+
201+ if incl : genome_chops_bdt = genome_chops_bdt .intersect (incl , wa = True )
202+ if excl : genome_chops_bdt = genome_chops_bdt .intersect (excl , v = True )
143203
144- return (genome_chops_bdt .intersect (excl , v = True )
145- .to_dataframe ()[["chrom" , "start" , "end" ]])
204+ return genome_chops_bdt .to_dataframe ()[["chrom" , "start" , "end" ]]
146205
147206class DNA2OneHot (object ):
148207 def __init__ (self ):
0 commit comments