Skip to content

Commit 94024bf

Browse files
authored
Merge pull request #3 from sreemol-gokuladhas/V3.1
Added new feature: PCHi-C data analysis
2 parents 94e857b + cbdd3eb commit 94024bf

12 files changed

Lines changed: 1335 additions & 435 deletions

codes3d/codes3d.py

Lines changed: 175 additions & 97 deletions
Large diffs are not rendered by default.

codes3d/genes.py

Lines changed: 176 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -12,28 +12,50 @@
1212
from itertools import repeat
1313

1414

def get_gene_fragments(gene_df, restriction_enzymes, db, pchic=False):
    """Attach restriction-fragment rows to each gene, for every enzyme.

    For each enzyme the matching lookup table (``gene_lookup_<enzyme>`` or,
    when ``pchic`` is true, ``gene_lookup_pchic_<enzyme>``) is queried and the
    rows are inner-joined onto ``gene_df``.

    Args:
        gene_df: DataFrame of genes. Must contain ``id``; the Hi-C path also
            joins on ``chrom``, the PCHi-C path on ``chrom``, ``start``,
            ``end``, ``name`` and ``gencode_id``.
        restriction_enzymes: Iterable of enzyme names (e.g. ``'MboI'``).
        db: Engine-like object exposing ``dispose()`` and ``connect()``.
        pchic: Use the promoter-capture Hi-C lookup tables when true.

    Returns:
        ``gene_df`` (sorted by ``id``) inner-joined with the lookup rows plus
        an ``enzyme`` column. When nothing could be queried (no genes or no
        enzymes) an empty frame is returned instead of raising.
    """
    db.dispose()
    gene_df = gene_df.sort_values(by=['id'])
    chunksize = 1000
    chunks = [gene_df[i:i + chunksize]
              for i in range(0, gene_df.shape[0], chunksize)]
    fragment_frames = []

    for enzyme in restriction_enzymes:
        table = 'gene_lookup_pchic_{}' if pchic else 'gene_lookup_{}'
        if enzyme in ['MboI', 'DpnII']:  # MboI and DpnII have the same restriction sites
            table = table.format('mboi')
        else:
            table = table.format(enzyme.lower())

        # NOTE(review): queries are assembled with str.format; the values come
        # from the gene table, but confirm they can never carry user input.
        with db.connect() as con:
            if pchic:
                sql = '''SELECT * FROM {} WHERE gencode_id = '{}' '''
                # Query each distinct gencode_id once. Repeated ids would only
                # return rows that drop_duplicates() discards again below.
                for gencode_id in gene_df['gencode_id'].drop_duplicates():
                    df = pd.read_sql(sql.format(table, gencode_id), con=con)
                    df['enzyme'] = enzyme
                    fragment_frames.append(df)
            else:
                sql = '''SELECT * FROM {} WHERE id >= {} AND id <= {}'''
                # gene_df is sorted by id, so each chunk is one id range.
                for chunk in chunks:
                    df = pd.read_sql(sql.format(
                        table, chunk['id'].min(), chunk['id'].max()), con=con)
                    df['enzyme'] = enzyme
                    fragment_frames.append(df)

    if not fragment_frames:
        # pd.concat([]) raises ValueError; hand back an empty result that still
        # carries the lookup columns the visible caller renames/drops.
        expected = ['frag_id', 'enzyme']
        if pchic:
            expected += ['chr', 'gene']
        empty = gene_df.iloc[0:0].copy()
        for col in expected:
            if col not in empty.columns:
                empty[col] = pd.Series(dtype='object')
        return empty

    fragment_df = pd.concat(fragment_frames).drop_duplicates()
    if pchic:
        gene_df = pd.merge(
            gene_df, fragment_df, how='inner',
            left_on=['chrom', 'start', 'end', 'name', 'gencode_id'],
            right_on=['chr', 'start', 'end', 'gene', 'gencode_id'])
    else:
        gene_df = pd.merge(gene_df, fragment_df, how='inner', on=['id', 'chrom'])
    return gene_df
3860

3961

@@ -70,139 +92,201 @@ def process_snp_genes(
7092
def find_snp_genes(
7193
chunk_df,
7294
enzyme,
73-
enzyme_genes):
95+
enzyme_genes,
96+
pchic=False):
7497
db.dispose()
75-
chunk_df = chunk_df.sort_values(by=['fragment'])
76-
chrom = chunk_df['fragment_chr'].drop_duplicates().to_list()[0]
77-
table = 'gene_lookup_{}'
98+
if pchic:
99+
#celline = chunk_df['cell_line'].drop_duplicates().to_list()
100+
table = 'gene_lookup_pchic_{}'
101+
else:
102+
chunk_df = chunk_df.sort_values(by=['fragment'])
103+
chrom = chunk_df['fragment_chr'].drop_duplicates().to_list()[0]
104+
table = 'gene_lookup_{}'
105+
78106
if enzyme in ['MboI', 'DpnII']: # MboI and DpnII have the same restriction sites
79107
table = table.format('mboi')
80108
else:
81109
table = table.format(enzyme.lower())
82-
chunk_df['fragment'] = chunk_df['fragment'].astype(int)
83-
sql = ''' SELECT * FROM {0}
84-
JOIN genes on {0}.id=genes.id
85-
WHERE {0}.chrom = '{1}' AND {0}.frag_id >= {2} AND {0}.frag_id <= {3}'''
86-
sql = sql.format(table, chrom,
110+
111+
if pchic:
112+
inter_df_ls = chunk_df['inter_fid'].unique().tolist()
113+
if len(inter_df_ls) > 1:
114+
inter_df_ls = tuple(inter_df_ls)
115+
sql = '''SELECT * FROM {} WHERE frag_id IN {}'''.format(table, inter_df_ls)
116+
else:
117+
inter_df_ls = (inter_df_ls[0])
118+
sql = '''SELECT * FROM {} WHERE frag_id = {}'''.format(table, inter_df_ls)
119+
else:
120+
chunk_df['fragment'] = chunk_df['fragment'].astype(int)
121+
sql = ''' SELECT * FROM {0}
122+
JOIN genes on {0}.id=genes.id
123+
WHERE {0}.chrom = '{1}' AND {0}.frag_id >= {2} AND {0}.frag_id <= {3}'''
124+
sql = sql.format(table, chrom,
87125
chunk_df['fragment'].min(), chunk_df['fragment'].max())
88126
df = pd.DataFrame()
89127
with db.connect() as con:
90128
df = pd.read_sql_query(sql, con)
91129
if df.empty:
92130
return
93-
df = df.loc[:, ~df.columns.duplicated()]
94-
df = df.rename(
95-
columns={'id': 'gene_id', 'name': 'gene', 'chrom': 'gene_chr',
96-
'start': 'gene_start', 'end': 'gene_end'})
97-
chunk_df = pd.merge(chunk_df, df, how='inner',
131+
132+
if pchic:
133+
df = df.rename(
134+
columns={'chr': 'gene_chr', 'start': 'gene_start', 'end': 'gene_end'})
135+
chunk_df = pd.merge(chunk_df, df, how= 'inner', sort=False, left_on='inter_fid',
136+
right_on='frag_id')
137+
else:
138+
df = df.loc[:, ~df.columns.duplicated()]
139+
df = df.rename(
140+
columns={'id': 'gene_id', 'name': 'gene', 'chrom': 'gene_chr',
141+
'start': 'gene_start', 'end': 'gene_end'})
142+
chunk_df = pd.merge(chunk_df, df, how='inner',
98143
left_on=['fragment_chr', 'fragment'], right_on=['gene_chr', 'frag_id'])
99144

100145
enzyme_genes.append(chunk_df)
101146

102147

def fetch_3dgi_libs(db, pchic=False):
    """Return the distinct (library, enzyme, rep_count) rows of the library
    metadata table: ``meta_pchic`` when ``pchic`` is true, else ``meta_hic``.

    ``db`` only needs a ``connect()`` method usable as a context manager.
    """
    queries = {
        True: 'SELECT library, enzyme, rep_count FROM meta_pchic',
        False: 'SELECT library, enzyme, rep_count FROM meta_hic',
    }
    with db.connect() as con:
        libraries = pd.read_sql_query(queries[bool(pchic)], con=con)
    return libraries.drop_duplicates()
110158

def get_gene_by_id(
        snp_df,
        inter_df,
        _db,
        logger,
        pchic=False):
    """Map SNP-containing fragments to the genes they interact with.

    For every enzyme in ``inter_df`` the interactions are split into batches
    (per chromosome for Hi-C, per cell line for PCHi-C), resolved to genes in a
    process pool via ``find_snp_genes``, annotated with library metadata and
    finally joined back onto ``snp_df``.

    Args:
        snp_df: SNP table with at least ``fragment`` and ``enzyme`` (plus
            ``chrom`` for Hi-C) and the SNP columns selected for the output.
        inter_df: Interaction table with an ``enzyme`` column and the Hi-C or
            PCHi-C columns read below.
        _db: Database engine; stored in the module-level ``db`` global that
            ``find_snp_genes`` reads.
        logger: Object with a ``write(str)`` method.
        pchic: Process promoter-capture Hi-C data when true.

    Returns:
        DataFrame of SNP-gene pairs. Hi-C output carries interaction and
        replicate counts; PCHi-C output carries ``N_reads`` and ``Score``.
        An empty frame with the output columns is returned when no enzyme
        produced any gene.
    """
    global db
    db = _db
    if pchic:
        logger.write('Identifying gene promoters interacting with SNPs in...')
    else:
        logger.write('Identifying genes interacting with SNPs in...')
    start_time = time.time()

    hic_key = ['query_chr', 'query_fragment', 'gencode_id', 'cell_line']
    pchic_key = ['query_fragment', 'gencode_id', 'cell_line']
    hic_cols = ['snp', 'chrom', 'locus', 'variant_id',
                'gene', 'gencode_id', 'gene_chr', 'gene_start', 'gene_end',
                'interactions', 'replicates', 'cell_line', 'cell_line_replicates',
                'sum_interactions', 'sum_replicates', 'sum_cell_lines']
    pchic_cols = ['snp', 'chrom', 'locus', 'variant_id',
                  'gene', 'gencode_id', 'gene_chr', 'gene_start', 'gene_end',
                  'N_reads', 'Score', 'cell_line']

    enzymes = inter_df['enzyme'].drop_duplicates().tolist()
    all_genes_df = []
    _3dgi_libs = fetch_3dgi_libs(db, pchic)
    _3dgi_libs = _3dgi_libs.rename(columns={'rep_count': 'cell_line_replicates'})
    # cpu_count() // 2 is 0 on a single-core host and Pool(processes=0) raises.
    num_processes = max(1, min(16, multiprocessing.cpu_count() // 2))
    bar_format = '{desc}: {percentage:3.0f}% |{bar}| {n_fmt}/{total_fmt} {unit}'

    for enzyme in enzymes:
        df = inter_df[inter_df['enzyme'] == enzyme]
        if pchic:
            # Copy so the new column is not written onto a slice of inter_df.
            df_subset = df[['p_fid', 'oe_fid', 'n_reads', 'score',
                            'query_type', 'query_fragment', 'replicate',
                            'cell_line', 'enzyme']].copy()
            # The interacting fragment is whichever end is not the query.
            df_subset['inter_fid'] = df_subset['oe_fid'].where(
                df_subset['query_fragment'] == df_subset['p_fid'],
                df_subset['p_fid'])
            # One batch per distinct cell line. Without de-duplication every
            # row spawned its own (identical) batch of the whole cell line.
            snp_interactions = [
                df_subset[df_subset['cell_line'] == celline]
                for celline in df_subset['cell_line'].drop_duplicates().to_list()
            ]
            desc = ' * PCHi-C libraries restricted with {}'.format(enzyme)
        else:
            snp_interactions = [
                df[df['fragment_chr'] == chrom]
                for chrom in df['fragment_chr'].drop_duplicates().to_list()
            ]
            desc = ' * Hi-C libraries restricted with {}'.format(enzyme)

        # The context manager shuts the manager process down once the shared
        # list has been copied out.
        with multiprocessing.Manager() as manager:
            enzyme_genes = manager.list()
            with multiprocessing.Pool(processes=num_processes) as pool:
                for _ in tqdm.tqdm(pool.istarmap(
                        find_snp_genes,
                        zip(snp_interactions,
                            repeat(enzyme),
                            repeat(enzyme_genes),
                            repeat(pchic))),
                        total=len(snp_interactions), desc=desc, unit='batches',
                        ncols=80, bar_format=bar_format):
                    pass
            frames = list(enzyme_genes)

        if not frames:
            # No gene found for this enzyme; pd.concat([]) would raise and
            # discard the results of the other enzymes.
            continue

        enzyme_df = pd.concat(frames)
        enzyme_df = enzyme_df.merge(
            _3dgi_libs, how='left',
            left_on=['cell_line', 'enzyme'], right_on=['library', 'enzyme'])
        if pchic:
            enzyme_df = enzyme_df.drop(
                columns=['p_fid', 'oe_fid', 'library', 'frag_id',
                         'cell_line_replicates'])
        else:
            enzyme_df['interactions'] = enzyme_df.groupby(
                hic_key)['fragment'].transform('count')
            reps = enzyme_df[hic_key + ['replicate']].drop_duplicates()
            reps['replicates'] = reps.groupby(
                hic_key)['replicate'].transform('count')
            enzyme_df = enzyme_df.merge(
                reps, how='left', on=hic_key + ['replicate'])
            enzyme_df = enzyme_df.drop(
                columns=['fragment_chr', 'fragment',
                         'frag_id', 'library', 'replicate'])
        all_genes_df.append(enzyme_df.drop_duplicates())

    if not pchic:
        logger.write(' * Filtering gene interactions...')

    if not all_genes_df:
        logger.write(' Time elasped: {:.2f} mins.'.format(
            (time.time() - start_time)/60))
        return pd.DataFrame(columns=pchic_cols if pchic else hic_cols)

    all_genes_df = pd.concat(all_genes_df)

    if pchic:
        all_genes_df = all_genes_df.drop_duplicates()
        # Collapse to one row per (fragment, gene, cell line): summed reads
        # and mean score.
        stats = all_genes_df[pchic_key + ['n_reads', 'score']].drop_duplicates()
        stats['N_reads'] = stats.groupby(pchic_key)['n_reads'].transform('sum')
        stats['Score'] = stats.groupby(
            pchic_key + ['N_reads'])['score'].transform('mean').round(2)
        stats = stats[pchic_key + ['N_reads', 'Score']].drop_duplicates()
        all_genes_df = all_genes_df.merge(
            stats, on=pchic_key, how='left'
        ).drop(['n_reads', 'score', 'inter_fid'], axis=1).drop_duplicates()
        all_genes_df = all_genes_df.merge(
            snp_df, left_on=['query_fragment', 'enzyme'],
            right_on=['fragment', 'enzyme'],
            how='inner'
        ).drop_duplicates()
        gene_df = all_genes_df[pchic_cols].drop_duplicates()
    else:
        all_genes_df = all_genes_df.merge(
            snp_df, left_on=['query_fragment', 'query_chr', 'enzyme'],
            right_on=['fragment', 'chrom', 'enzyme'], how='inner')
        all_genes_df['sum_cell_lines'] = all_genes_df.groupby(
            ['snp', 'gene'])['cell_line'].transform('count')
        all_genes_df['sum_interactions'] = all_genes_df.groupby(
            ['snp', 'gene'])['interactions'].transform('sum')
        all_genes_df['sum_replicates'] = all_genes_df.groupby(
            ['snp', 'gene'])['replicates'].transform('sum')
        # Drop pairs whose only support is at most one interaction per
        # replicate, seen in fewer than two replicates and cell lines.
        condition = (
            (all_genes_df['interactions'] / all_genes_df['cell_line_replicates'] <= 1) &
            (all_genes_df['sum_replicates'] < 2) &
            (all_genes_df['sum_cell_lines'] < 2))
        gene_df = all_genes_df[~condition].drop_duplicates()

    logger.write(' Time elasped: {:.2f} mins.'.format(
        (time.time() - start_time)/60))
    if pchic:
        return gene_df
    return gene_df[hic_cols].drop_duplicates()
206290

207291
def get_gene_by_position(df, db):
208292
gene_df = []
@@ -288,6 +372,7 @@ def get_gene_info(
288372
output_dir,
289373
db,
290374
logger,
375+
pchic = False,
291376
suppress_intermediate_files=False):
292377
enzymes = hic_df['enzyme'].drop_duplicates().tolist()
293378
gene_df = []
@@ -340,7 +425,10 @@ def get_gene_info(
340425
len(omitted_genes))
341426
msg = msg + ':\n\t{}'.format(', '.join(omitted_genes))
342427
logger.write(msg)
343-
344-
gene_df = get_gene_fragments(gene_df, enzymes, db)
428+
gene_df = get_gene_fragments(gene_df, enzymes, db, pchic)
345429
gene_df = gene_df.rename(columns={'frag_id': 'fragment'})
430+
if pchic:
431+
gene_df = gene_df.drop(['chr','gene'], axis=1)
432+
else:
433+
pass
346434
return(gene_df)

0 commit comments

Comments
 (0)