|
12 | 12 | from itertools import repeat |
13 | 13 |
|
14 | 14 |
|
def get_gene_fragments(gene_df, restriction_enzymes, db, pchic=False):
    """Attach restriction-fragment lookup rows to each gene.

    For every enzyme the matching lookup table is queried
    (``gene_lookup_<enzyme>``, or ``gene_lookup_pchic_<enzyme>`` when
    ``pchic`` is true) and the collected rows are inner-joined onto
    ``gene_df``.

    Args:
        gene_df: DataFrame of genes. An ``id`` column is required (used for
            sorting and, in Hi-C mode, for the range query). Hi-C mode joins
            on ``id`` and ``chrom``; PCHi-C mode joins on ``chrom``,
            ``start``, ``end``, ``name`` and ``gencode_id``.
        restriction_enzymes: Iterable of enzyme names such as ``'MboI'``.
        db: Engine-like object exposing ``dispose()`` and ``connect()``.
        pchic: Query the promoter-capture Hi-C lookup tables when true.

    Returns:
        ``gene_df`` sorted by ``id`` and inner-merged with the de-duplicated
        lookup rows, each of which carries an added ``enzyme`` column.

    Raises:
        ValueError: Propagated from ``pd.concat`` when no lookup frame was
            collected (empty ``gene_df`` or no enzymes).
    """
    db.dispose()
    gene_df = gene_df.sort_values(by=['id'])
    chunksize = 1000
    chunks = [gene_df[i:i + chunksize]
              for i in range(0, gene_df.shape[0], chunksize)]
    table_template = 'gene_lookup_pchic_{}' if pchic else 'gene_lookup_{}'
    frames = []
    for enzyme in restriction_enzymes:
        # MboI and DpnII have the same restriction sites, so both names map
        # to the single 'mboi' lookup table.
        if enzyme in ['MboI', 'DpnII']:
            table = table_template.format('mboi')
        else:
            table = table_template.format(enzyme.lower())

        with db.connect() as con:
            for chunk in chunks:
                if pchic:
                    # One query per distinct id: repeated ids would only
                    # return rows that drop_duplicates() removes again.
                    for gencode_id in chunk['gencode_id'].unique():
                        # Double embedded quotes so the id cannot terminate
                        # the SQL string literal.
                        literal = str(gencode_id).replace("'", "''")
                        sql = "SELECT * FROM {} WHERE gencode_id = '{}' ".format(
                            table, literal)
                        df = pd.read_sql(sql, con=con)
                        df['enzyme'] = enzyme
                        frames.append(df)
                else:
                    # The chunk is sorted by id, so a min/max range query
                    # fetches every candidate row; the merge below discards
                    # ids in the range that are not in gene_df.
                    sql = 'SELECT * FROM {} WHERE id >= {} AND id <= {}'.format(
                        table, chunk['id'].min(), chunk['id'].max())
                    df = pd.read_sql(sql, con=con)
                    df['enzyme'] = enzyme
                    frames.append(df)

    fragment_df = pd.concat(frames).drop_duplicates()
    if pchic:
        gene_df = pd.merge(
            gene_df, fragment_df, how='inner',
            left_on=['chrom', 'start', 'end', 'name', 'gencode_id'],
            right_on=['chr', 'start', 'end', 'gene', 'gencode_id'])
    else:
        gene_df = pd.merge(gene_df, fragment_df, how='inner',
                           on=['id', 'chrom'])
    return gene_df
38 | 60 |
|
39 | 61 |
|
@@ -70,139 +92,201 @@ def process_snp_genes( |
def find_snp_genes(
        chunk_df,
        enzyme,
        enzyme_genes,
        pchic=False):
    """Find the genes on the fragments of ``chunk_df`` and collect the result.

    The lookup table for ``enzyme`` is queried through the module-level
    ``db`` handle, the rows are inner-merged onto ``chunk_df`` and the merged
    frame is appended to ``enzyme_genes``. Nothing is appended when the chunk
    is empty or the query returns no rows.

    Args:
        chunk_df: Interactions to annotate. Hi-C mode reads ``fragment`` and
            ``fragment_chr`` (only the first distinct chromosome is used in
            the query); PCHi-C mode reads ``inter_fid``.
        enzyme: Restriction enzyme name selecting the lookup table.
        enzyme_genes: List-like collector receiving the merged frame.
        pchic: Use the ``gene_lookup_pchic_<enzyme>`` tables when true.

    Returns:
        None. Results are delivered through ``enzyme_genes``.
    """
    # An empty chunk has neither a chromosome nor fragment ids to query for.
    if chunk_df.empty:
        return
    db.dispose()

    table = 'gene_lookup_pchic_{}' if pchic else 'gene_lookup_{}'
    # MboI and DpnII have the same restriction sites
    if enzyme in ['MboI', 'DpnII']:
        table = table.format('mboi')
    else:
        table = table.format(enzyme.lower())

    if pchic:
        # tolist() yields plain Python scalars; joining their repr() builds a
        # valid IN list for one or many ids alike.
        frag_ids = chunk_df['inter_fid'].unique().tolist()
        sql = 'SELECT * FROM {} WHERE frag_id IN ({})'.format(
            table, ', '.join(repr(frag_id) for frag_id in frag_ids))
    else:
        chunk_df = chunk_df.sort_values(by=['fragment'])
        chrom = chunk_df['fragment_chr'].drop_duplicates().to_list()[0]
        chunk_df['fragment'] = chunk_df['fragment'].astype(int)
        sql = ''' SELECT * FROM {0}
        JOIN genes on {0}.id=genes.id
        WHERE {0}.chrom = '{1}' AND {0}.frag_id >= {2} AND {0}.frag_id <= {3}'''
        sql = sql.format(table, chrom,
                         chunk_df['fragment'].min(), chunk_df['fragment'].max())

    with db.connect() as con:
        df = pd.read_sql_query(sql, con)
    if df.empty:
        return

    if pchic:
        df = df.rename(
            columns={'chr': 'gene_chr', 'start': 'gene_start', 'end': 'gene_end'})
        chunk_df = pd.merge(chunk_df, df, how='inner', sort=False,
                            left_on='inter_fid', right_on='frag_id')
    else:
        # The JOIN returns some column names twice; keep the first of each.
        df = df.loc[:, ~df.columns.duplicated()]
        df = df.rename(
            columns={'id': 'gene_id', 'name': 'gene', 'chrom': 'gene_chr',
                     'start': 'gene_start', 'end': 'gene_end'})
        chunk_df = pd.merge(chunk_df, df, how='inner',
                            left_on=['fragment_chr', 'fragment'],
                            right_on=['gene_chr', 'frag_id'])

    enzyme_genes.append(chunk_df)
101 | 146 |
|
102 | 147 |
|
def fetch_3dgi_libs(db, pchic=False):
    """Return the distinct library rows of the interaction metadata table.

    Selects ``library``, ``enzyme`` and ``rep_count`` from ``meta_pchic``
    when ``pchic`` is true and from ``meta_hic`` otherwise, then drops
    duplicate rows.

    Args:
        db: Engine-like object whose ``connect()`` is a context manager.
        pchic: Read the promoter-capture Hi-C metadata table when true.

    Returns:
        DataFrame with columns ``library``, ``enzyme`` and ``rep_count``.
    """
    meta_table = 'meta_pchic' if pchic else 'meta_hic'
    query = 'SELECT library, enzyme, rep_count FROM ' + meta_table
    with db.connect() as con:
        libraries = pd.read_sql_query(query, con=con)
    return libraries.drop_duplicates()
110 | 158 |
|
def get_gene_by_id(
        snp_df,
        inter_df,
        _db,
        logger,
        pchic=False):
    """Identify the genes (or gene promoters) that interact with SNP fragments.

    Interactions are processed enzyme by enzyme: they are split into batches
    (per chromosome for Hi-C, per cell line for PCHi-C), each batch is
    annotated by ``find_snp_genes`` in a worker pool, and the combined result
    is summarised and joined to ``snp_df``.

    Args:
        snp_df: SNP table. Joined on ``fragment``/``enzyme`` (plus ``chrom``
            in Hi-C mode); supplies ``snp``, ``chrom``, ``locus`` and
            ``variant_id`` to the output.
        inter_df: Interaction table with at least ``enzyme`` and
            ``cell_line``; Hi-C mode also reads ``fragment_chr``, PCHi-C mode
            reads ``p_fid``, ``oe_fid``, ``n_reads``, ``score``,
            ``query_type``, ``query_fragment`` and ``replicate``.
        _db: Database handle; stored in the module-level ``db`` because
            ``find_snp_genes`` reads it from there.
        logger: Object with a ``write(str)`` method.
        pchic: Process promoter-capture Hi-C data when true.

    Returns:
        De-duplicated DataFrame. PCHi-C mode yields per SNP/gene/cell-line
        rows with summed ``N_reads`` and mean ``Score``. Hi-C mode yields the
        interaction and replicate counts after dropping SNP-gene pairs
        supported by a single cell line, fewer than two replicates and at
        most one interaction per library replicate. An empty frame with the
        same columns is returned when no batch produced any gene.
    """
    if pchic:
        logger.write('Identifying gene promoters interacting with SNPs in...')
        cols = ['snp', 'chrom', 'locus', 'variant_id',
                'gene', 'gencode_id', 'gene_chr', 'gene_start', 'gene_end',
                'N_reads', 'Score', 'cell_line']
    else:
        logger.write('Identifying genes interacting with SNPs in...')
        cols = ['snp', 'chrom', 'locus', 'variant_id',
                'gene', 'gencode_id', 'gene_chr', 'gene_start', 'gene_end',
                'interactions', 'replicates', 'cell_line', 'cell_line_replicates',
                'sum_interactions', 'sum_replicates', 'sum_cell_lines']
    global db
    db = _db
    start_time = time.time()
    enzymes = inter_df['enzyme'].drop_duplicates().tolist()
    _3dgi_libs = fetch_3dgi_libs(db, pchic)
    _3dgi_libs = _3dgi_libs.rename(columns={'rep_count': 'cell_line_replicates'})
    # Half the cores, capped at 16, but never zero: Pool rejects processes=0.
    num_processes = max(1, min(16, multiprocessing.cpu_count() // 2))
    bar_format = '{desc}: {percentage:3.0f}% |{bar}| {n_fmt}/{total_fmt} {unit}'
    all_genes = []
    for enzyme in enzymes:
        enzyme_inter = inter_df[inter_df['enzyme'] == enzyme]
        if pchic:
            # Copy before adding a column so the caller's frame is untouched.
            subset = enzyme_inter[
                ['p_fid', 'oe_fid', 'n_reads', 'score',
                 'query_type', 'query_fragment', 'replicate', 'cell_line',
                 'enzyme']].copy()
            # The interacting fragment is whichever end is not the query.
            subset['inter_fid'] = np.where(
                subset['query_fragment'] == subset['p_fid'],
                subset['oe_fid'], subset['p_fid'])
            # One batch per distinct cell line.
            snp_interactions = [
                subset[subset['cell_line'] == celline]
                for celline in subset['cell_line'].drop_duplicates().to_list()
            ]
            desc = ' * PCHi-C libraries restricted with {}'.format(enzyme)
        else:
            snp_interactions = [
                enzyme_inter[enzyme_inter['fragment_chr'] == chrom]
                for chrom in enzyme_inter['fragment_chr'].drop_duplicates().to_list()
            ]
            desc = ' * Hi-C libraries restricted with {}'.format(enzyme)

        # The manager is a context manager so its server process is shut
        # down once this enzyme's results have been copied out.
        with multiprocessing.Manager() as manager:
            enzyme_genes = manager.list()
            with multiprocessing.Pool(processes=num_processes) as pool:
                for _ in tqdm.tqdm(pool.istarmap(
                        find_snp_genes,
                        zip(snp_interactions,
                            repeat(enzyme),
                            repeat(enzyme_genes),
                            repeat(pchic))),
                        total=len(snp_interactions), desc=desc, unit='batches',
                        ncols=80, bar_format=bar_format):
                    pass
            # Materialise the proxy list while the manager is still alive.
            enzyme_frames = list(enzyme_genes)

        # No batch found a gene for this enzyme; pd.concat([]) would raise.
        if not enzyme_frames:
            continue
        enzyme_df = pd.concat(enzyme_frames)
        enzyme_df = enzyme_df.merge(
            _3dgi_libs, how='left',
            left_on=['cell_line', 'enzyme'], right_on=['library', 'enzyme'])
        if pchic:
            enzyme_df = enzyme_df.drop(
                columns=['p_fid', 'oe_fid', 'library', 'frag_id',
                         'cell_line_replicates'])
        else:
            group_keys = ['query_chr', 'query_fragment', 'gencode_id', 'cell_line']
            enzyme_df['interactions'] = enzyme_df.groupby(
                group_keys)['fragment'].transform('count')
            # Count distinct replicates supporting each fragment-gene pair.
            rep_df = enzyme_df[group_keys + ['replicate']].drop_duplicates()
            rep_df['replicates'] = rep_df.groupby(
                group_keys)['replicate'].transform('count')
            enzyme_df = enzyme_df.merge(
                rep_df, how='left', on=group_keys + ['replicate'])
            enzyme_df = enzyme_df.drop(
                columns=['fragment_chr', 'fragment',
                         'frag_id', 'library', 'replicate'])
        all_genes.append(enzyme_df.drop_duplicates())

    if not pchic:
        logger.write(' * Filtering gene interactions...')

    if not all_genes:
        gene_df = pd.DataFrame(columns=cols)
    elif pchic:
        all_genes_df = pd.concat(all_genes).drop_duplicates()
        pair_keys = ['query_fragment', 'gencode_id', 'cell_line']
        stats = all_genes_df[pair_keys + ['n_reads', 'score']].drop_duplicates()
        stats['N_reads'] = stats.groupby(pair_keys)['n_reads'].transform('sum')
        stats['Score'] = stats.groupby(
            pair_keys + ['N_reads'])['score'].transform('mean').round(2)
        stats = stats[pair_keys + ['N_reads', 'Score']].drop_duplicates()
        all_genes_df = all_genes_df.merge(
            stats, on=pair_keys, how='left'
        ).drop(['n_reads', 'score', 'inter_fid'], axis=1).drop_duplicates()
        all_genes_df = all_genes_df.merge(
            snp_df, left_on=['query_fragment', 'enzyme'],
            right_on=['fragment', 'enzyme'],
            how='inner'
        ).drop_duplicates()
        gene_df = all_genes_df[cols].drop_duplicates()
    else:
        all_genes_df = pd.concat(all_genes)
        all_genes_df = all_genes_df.merge(
            snp_df, left_on=['query_fragment', 'query_chr', 'enzyme'],
            right_on=['fragment', 'chrom', 'enzyme'], how='inner')
        by_pair = all_genes_df.groupby(['snp', 'gene'])
        all_genes_df['sum_cell_lines'] = by_pair['cell_line'].transform('count')
        all_genes_df['sum_interactions'] = by_pair['interactions'].transform('sum')
        all_genes_df['sum_replicates'] = by_pair['replicates'].transform('sum')
        # Rows meeting all three weak-support criteria are discarded.
        condition = (
            (all_genes_df['interactions'] / all_genes_df['cell_line_replicates'] <= 1) &
            (all_genes_df['sum_replicates'] < 2) &
            (all_genes_df['sum_cell_lines'] < 2))
        gene_df = all_genes_df[~condition].drop_duplicates()
        gene_df = gene_df[cols].drop_duplicates()

    logger.write(' Time elasped: {:.2f} mins.'.format(
        (time.time() - start_time)/60))
    return gene_df
206 | 290 |
|
207 | 291 | def get_gene_by_position(df, db): |
208 | 292 | gene_df = [] |
@@ -288,6 +372,7 @@ def get_gene_info( |
288 | 372 | output_dir, |
289 | 373 | db, |
290 | 374 | logger, |
| 375 | + pchic = False, |
291 | 376 | suppress_intermediate_files=False): |
292 | 377 | enzymes = hic_df['enzyme'].drop_duplicates().tolist() |
293 | 378 | gene_df = [] |
@@ -340,7 +425,10 @@ def get_gene_info( |
340 | 425 | len(omitted_genes)) |
341 | 426 | msg = msg + ':\n\t{}'.format(', '.join(omitted_genes)) |
342 | 427 | logger.write(msg) |
343 | | - |
344 | | - gene_df = get_gene_fragments(gene_df, enzymes, db) |
| 428 | + gene_df = get_gene_fragments(gene_df, enzymes, db, pchic) |
345 | 429 | gene_df = gene_df.rename(columns={'frag_id': 'fragment'}) |
| 430 | + if pchic: |
| 431 | + gene_df = gene_df.drop(['chr','gene'], axis=1) |
| 432 | + else: |
| 433 | + pass |
346 | 434 | return(gene_df) |
0 commit comments