Skip to content

Commit ebaa070

Browse files
committed
add get_genome_size and random_coords utility functions
1 parent edf069f commit ebaa070

1 file changed

Lines changed: 75 additions & 16 deletions

File tree

seqchromloader/utils.py

Lines changed: 75 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,38 @@
99
from pybedtools import Interval, BedTool
1010
from pybedtools.helpers import chromsizes
1111

12+
def get_genome_sizes(gs=None, genome=None, to_filter=None, to_keep=None):
13+
"""
14+
Loads the genome sizes file, filter or keep chromosomes
15+
16+
:param gs: genome size file path, only one of `gs` and `genome` is required
17+
:type gs: string
18+
:param genome: genome build name, only one of `gs` and `genome` is required
19+
:type genome: string
20+
:param to_filter: list of chrmosomes to filter, mutually exclusive with `to_keep`
21+
:type to_filter: list
22+
:param to_keep: list of chromosomes to keep, mutually exclusive with `to_filter`
23+
:type to_keep: list
24+
"""
25+
26+
if gs:
27+
genome_sizes = pd.read_table(gs, header=None, usecols=[0,1], names=['chrom', 'length'])
28+
elif genome:
29+
genome_sizes = (pd.DataFrame(chromsizes(genome))
30+
.T
31+
.rename(columns={0:"chrom", 1:"len"}))
32+
else:
33+
raise Exception("Either gs or genome should be provided!")
34+
35+
genome_sizes_filt = filter_chromosomes(genome_sizes, to_filter=to_filter, to_keep=to_keep)
36+
37+
return genome_sizes_filt
38+
1239
def filter_chromosomes(coords, to_filter=None, to_keep=None):
1340
"""
1441
Filter or keep the specified chromosomes
1542
16-
:param coords: input coordinate dataframe, first 3 columns are: [chrom, start, end]
43+
:param coords: input coordinate dataframe, first column should be `chrom`
1744
:type coords: pandas.DataFrame
1845
:param to_filter: list of chrmosomes to filter, mutually exclusive with `to_keep`
1946
:type to_filter: list
@@ -68,7 +95,7 @@ def make_random_shift(coords, L, buffer=25):
6895
6996
"""
7097
low = int(-L/2 + buffer)
71-
high = int(L/2 - buffer)
98+
high = int(L/2 - buffer + 1)
7299

73100
return (coords.assign(midpoint=lambda x: (x["start"]+x["end"])/2)
74101
.astype({"midpoint": int})
@@ -96,16 +123,54 @@ def make_flank(coords, L, d):
96123
.apply(lambda s: pd.Series([s["chrom"], int(s["midpoint"]-L/2), int(s["midpoint"]+L/2)],
97124
index=["chrom", "start", "end"]), axis=1))
98125

126+
127+
def random_coords(gs:str=None, genome:str=None, incl:BedTool=None, excl:BedTool=None, l=500, n=1000):
128+
"""
129+
Randomly sample n intervals of length l from the genome,
130+
shuffle to make all intervals inside the desired regions
131+
and outside exclusion regions
132+
133+
:param gs: genome size file path, only one of `gs` and `genome` is required
134+
:type gs: string
135+
:param genome: genome build name, only one of `gs` and `genome` is required
136+
:type genome: string
137+
:param excl: regions that chopped regions should overlap with
138+
:type incl: BedTool object
139+
:param incl: regions that chopped regions shouldn't overlap with
140+
:type excl: BedTool object
141+
:param l: random interval size
142+
:type l: integer
143+
:param n: number of random intervals generated
144+
:type n: integer
145+
"""
146+
random_kwargs = {}
147+
shuffle_kwargs = {}
148+
if genome:
149+
random_kwargs.update({"genome": genome}); shuffle_kwargs.update({"genome": genome})
150+
elif gs:
151+
random_kwargs.update({"g": gs}); shuffle_kwargs.update({"g": gs})
152+
else:
153+
raise Exception("Either gs or genome should be provided!")
154+
155+
if incl: shuffle_kwargs.update({"incl": incl})
156+
if excl: shuffle_kwargs.update({"excl": excl})
157+
158+
return (BedTool()
159+
.random(l=l, n=n, **random_kwargs)
160+
.shuffle(**shuffle_kwargs)
161+
.to_dataframe()[["chrom", "start", "end"]])
99162

100-
def chop_genome(chroms:list=None, excl:BedTool=None, gs=None, genome=None, stride=500, l=500):
163+
def chop_genome(chroms:list=None, incl:BedTool=None, excl:BedTool=None, gs=None, genome=None, stride=500, l=500):
101164
"""
102165
Given a genome size file and chromosome list,
103166
chop these chromosomes into intervals of length l,
104167
with include/exclude regions specified
105168
106169
:param chroms: list of chromosomes to be chopped
107170
:type chroms: list of strings
108-
:param excl: regions that chopped regions shouldn't overlap with
171+
:param excl: regions that chopped regions should overlap with
172+
:type incl: BedTool object
173+
:param incl: regions that chopped regions shouldn't overlap with
109174
:type excl: BedTool object
110175
:param gs: genome size file path, only one of `gs` and `genome` is required
111176
:type gs: string
@@ -127,22 +192,16 @@ def intervals_loop(chrom, start, stride, l, size):
127192
start += stride
128193
return pd.DataFrame(intervals, columns=["chrom", "start", "end"])
129194

130-
if genome:
131-
genome_sizes = (pd.DataFrame(chromsizes(genome))
132-
.T
133-
.rename(columns={0:"chrom", 1:"len"})
134-
.set_index("chrom")
135-
.loc[chroms])
136-
elif gs:
137-
genome_sizes = (pd.read_csv(gs, sep="\t", usecols=[0,1], names=["chrom", "len"])
138-
.set_index("chrom")
139-
.loc[chroms])
195+
genome_sizes = get_genome_sizes(gs=gs, genome=genome, to_keep=chroms)
196+
140197
genome_chops = pd.concat([intervals_loop(i.Index, 0, stride, l, i.len)
141198
for i in genome_sizes.itertuples()])
142199
genome_chops_bdt = BedTool.from_dataframe(genome_chops)
200+
201+
if incl: genome_chops_bdt = genome_chops_bdt.intersect(incl, wa=True)
202+
if excl: genome_chops_bdt = genome_chops_bdt.intersect(excl, v=True)
143203

144-
return (genome_chops_bdt.intersect(excl, v=True)
145-
.to_dataframe()[["chrom", "start", "end"]])
204+
return genome_chops_bdt.to_dataframe()[["chrom", "start", "end"]]
146205

147206
class DNA2OneHot(object):
148207
def __init__(self):

0 commit comments

Comments
 (0)