Skip to content

Commit 6d59a6c

Browse files
committed
mergehogs and handle Singleton
1 parent acdf642 commit 6d59a6c

2 files changed

Lines changed: 213 additions & 11 deletions

File tree

FastOMA/_utils_roothog.py

Lines changed: 211 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from ._utils_subhog import logger_hog
88
from . import _config
99

10-
10+
import numpy as np
1111

1212
def parse_proteomes(folder="./"): # list_oma_species
1313
"""
@@ -134,15 +134,17 @@ def group_prots_roothogs(hogmaps):
134134
# prot_recs = prot_recs_all[species_name]
135135

136136
for prot_id, prot_map in prots_map.items():
137-
if len(prot_map)>1:
138-
scores = [float(i[1]) for i in prot_map] # (hogid,score,seqlen,subfamily_medianseqlen)
139-
rhogids =[i[0] for i in prot_map]
140-
# select hog with the best score
141-
max_index = scores.index(max(scores))
142-
hogid =rhogids[max_index]
143-
else:
144-
hogid = prot_map[0][0]
145-
137+
# omamer output is sorted based on normcount. but that's ok
138+
# this helps me in other functions like handle_singleton in this
139+
# if len(prot_map)>1:
140+
# scores = [float(i[1]) for i in prot_map] # (hogid,score,seqlen,subfamily_medianseqlen)
141+
# rhogids =[i[0] for i in prot_map]
142+
# # select hog with the best score
143+
# max_index = scores.index(max(scores))
144+
# hogid =rhogids[max_index]
145+
# else:
146+
# hogid = prot_map[0][0]
147+
hogid = prot_map[0][0]
146148
rhogid = hogid.split(".")[0].split(":")[1]
147149

148150
if rhogid in rhogs_prots:
@@ -154,6 +156,86 @@ def group_prots_roothogs(hogmaps):
154156
return rhogs_prots
155157

156158

159+
def handle_singleton(rhogs_prots,hogmaps):
160+
query_singleton_rhog = [] # they are the only one to hit to this HOG
161+
for rhog, prot_sp_list in rhogs_prots.items(): # 'C0602129': [('GORGO', 'GORGO00025'), ('GORGO', 'GORGO00026')],
162+
if len(prot_sp_list) == 1:
163+
query_singleton_rhog += prot_sp_list
164+
logger_hog.debug(
165+
"There are " + str(len(query_singleton_rhog)) + " singleton. The hog on which only one query proteins mapped.")
166+
167+
# adding singleton to already rhogs with >1 members
168+
query_singleton_rhog_solved = []
169+
for (species, prot) in query_singleton_rhog:
170+
prot_maps = hogmaps[species][prot]
171+
if len(prot_maps) > 1:
172+
prot_maps2 = prot_maps
173+
# omamer output is sorted based on normcount. but that's ok
174+
# family_p family_count family_normcount
175+
scores = [float(i[1]) for i in prot_maps] # (hogid,score,seqlen,subfamily_medianseqlen)
176+
hogids = [i[0] for i in prot_maps]
177+
for hogid in hogids[1:]: # the 0-index is already checked and is a singleton
178+
rhogid = hogid.split(".")[0].split(":")[1]
179+
if rhogid in rhogs_prots:
180+
if len(rhogs_prots[rhogid]) > 1:
181+
rhogs_prots[rhogid].append((species, prot))
182+
rhogid0 = hogids[0].split(".")[0].split(":")[1] # 'HOG:D0903929.1a'
183+
del rhogs_prots[rhogid0]
184+
query_singleton_rhog_solved.append((species, prot))
185+
break
186+
logger_hog.debug("We add " + str(
187+
len(query_singleton_rhog_solved)) + " proteins/singleton to another rHOG based on omamer multi-hit.")
188+
189+
query_singleton_rhog_solved_set = set(query_singleton_rhog_solved)
190+
query_singleton_rhog_remained = [i for i in query_singleton_rhog if i not in query_singleton_rhog_solved_set]
191+
logger_hog.debug("However, " + str(len(query_singleton_rhog_remained)) + " proteins/singletons are remained.")
192+
193+
# try to create group from multi-hits of remained singleton
194+
dic_singlton_remained = {}
195+
for (species, prot) in query_singleton_rhog_remained:
196+
prot_maps = hogmaps[species][prot]
197+
scores = [float(i[1]) for i in prot_maps] # (hogid,score,seqlen,subfamily_medianseqlen)
198+
hogids = [i[0] for i in prot_maps]
199+
for hogid in hogids[1:]:
200+
rhogid = hogid.split(".")[0].split(":")[1]
201+
if rhogid in dic_singlton_remained:
202+
dic_singlton_remained[rhogid].append((species, prot))
203+
else:
204+
dic_singlton_remained[rhogid] = [(species, prot)]
205+
logger_hog.debug("These are associated to " + str(len(dic_singlton_remained)) + " HOGs considering all multi-hits.")
206+
207+
count_new_rhogids = 0
208+
count_updated_rhogids = 0
209+
already_grouped = []
210+
for rhogid, spec_prot_list in dic_singlton_remained.items():
211+
if len(set(spec_prot_list)) > 1:
212+
already_grouped_set = set(already_grouped)
213+
val = sum([1 for i in spec_prot_list if i in already_grouped_set])
214+
215+
if val == 0: # the prot has not yet grouped
216+
if rhogid in rhogs_prots:
217+
count_updated_rhogids += 1
218+
else:
219+
count_new_rhogids += 1
220+
rhogs_prots[rhogid] = spec_prot_list
221+
already_grouped += spec_prot_list
222+
logger_hog.debug("We updated " + str(count_updated_rhogids) + " and created " + str(
223+
count_new_rhogids) + " new HOGs which are not singleton anymore.")
224+
225+
counter_rhog_singlton_final = 0
226+
counter_notsingl_final = 0
227+
for rhogid, spec_prot_list in rhogs_prots.items():
228+
if len(set(spec_prot_list)) > 1:
229+
counter_notsingl_final += 1
230+
else:
231+
counter_rhog_singlton_final += 1
232+
233+
logger_hog.debug("Now, we have " + str(counter_notsingl_final) + " rootHOGs with >1 proteins and and " + str(
234+
counter_rhog_singlton_final) + " singleton rootHOGs ")
235+
236+
return rhogs_prots
237+
238+
157239
def roothogs_postprocess(hogmaps, rhogs_prots):
158240

159241

@@ -243,6 +325,124 @@ def write_rhog(rhogs_prot_records, prot_recs_all, address_rhogs_folder, min_rhog
243325
return rhogid_written_list
244326

245327

328+
def find_rhog_candidate_pairs(hogmaps, rhogs_prots):
329+
threshod_f_score_merging = 70
330+
pair_rhogs_count = {}
331+
for species, prt_prot_maps in hogmaps.items():
332+
for prot, prot_maps in prt_prot_maps.items():
333+
# [('HOG:D0017631.5a', '1771.7328874713658', '253', '234'), ('HOG:D0863448', '163.60700392437903', '253', '244'),
334+
rhogids = []
335+
for prot_map in prot_maps:
336+
hog, score = prot_map[:2]
337+
if float(score) > threshod_f_score_merging:
338+
rhogid = hog.split(".")[0].split(":")[1]
339+
rhogids.append(rhogid)
340+
for ii in range(len(rhogids)):
341+
for jj in range(ii + 1, len(rhogids)):
342+
hogi, hogj = rhogids[ii], rhogids[jj]
343+
if (hogi, hogj) in pair_rhogs_count:
344+
pair_rhogs_count[(hogi, hogj)] += 1
345+
else:
346+
pair_rhogs_count[(hogi, hogj)] = 1
347+
348+
print(len(pair_rhogs_count))
349+
logger_hog.debug("There are " + str(len(pair_rhogs_count)) + " pairs of rhogs.")
350+
351+
rhogs_size = {}
352+
for rhog, list_prot in rhogs_prots.items():
353+
rhogs_size[rhog] = len(list_prot)
354+
355+
candidates_pair = []
356+
# dic_pair_ratio = {}
357+
for (hogi, hogj), count_shared in pair_rhogs_count.items():
358+
if hogi in rhogs_size and hogj in rhogs_size: # during previous functions, we might
359+
ratioMax = count_shared / max(rhogs_size[hogi], rhogs_size[hogj])
360+
ratioMin = count_shared / min(rhogs_size[hogi], rhogs_size[hogj])
361+
362+
if (ratioMax > 0.8 or ratioMin > 0.9) and count_shared > 20 and rhogs_size[hogi] < _config.big_rhog_size / 2 and \
363+
rhogs_size[hogj] < _config.big_rhog_size / 2:
364+
if rhogs_size[hogi] >= rhogs_size[hogj]:
365+
candidates_pair.append((hogi, hogj)) # bigger first
366+
else:
367+
candidates_pair.append((hogj, hogi))
368+
# print(hogi,"(",rhogs_size[hogi],")",hogj,"(",rhogs_size[hogj],")", count_shared,round(ratioMax,2),round(ratioMin,2))
369+
# D0651051 ( 59 ) D0658569 ( 180 ) 62 0.34 1.05 # todo why bigger than 1?
370+
# D0646495 ( 2 ) D0631227 ( 14 ) 25 1.79 12.5
371+
372+
logger_hog.debug("There are " + str(len(candidates_pair)) + " candidate pairs of rhogs for merging.")
373+
print(len(candidates_pair))
374+
375+
return candidates_pair
376+
377+
378+
def cluster_rhogs(candidates_pair):
379+
# init
380+
all_hog_raw = []
381+
for pair in candidates_pair:
382+
all_hog_raw.append(pair[0])
383+
all_hog_raw.append(pair[1])
384+
all_hog = list(set(all_hog_raw))
385+
386+
allcc = [] # connected compoenets
387+
for hog in all_hog:
388+
allcc.append([hog])
389+
390+
dic_where = {}
391+
for idx, cc in enumerate(allcc):
392+
dic_where[cc[0]] = idx # in the begining, each inner list has only on element, a unique hog
393+
394+
# print(len(all_hog_raw),len(all_hog),len(allcc),allcc[:2])
395+
logger_hog.debug("There are " + str(len(all_hog)) + " all_hog.")
396+
397+
# print(dic_where)
398+
for pair in candidates_pair:
399+
# print(pair)
400+
idx_0 = dic_where[pair[0]]
401+
idx_1 = dic_where[pair[1]]
402+
403+
dic_where[pair[1]] = idx_0
404+
405+
# print(idx_0)
406+
allcc[idx_0] += allcc[idx_1]
407+
allcc[idx_1] = []
408+
409+
# print(dic_where,allcc)
410+
411+
cluster_rhogs_list = [i for i in allcc if len(i) > 1]
412+
# print(cluster_rhogs_list)
413+
logger_hog.debug("There are " + str(len(cluster_rhogs_list)) + " cluster_rhogs.")
414+
415+
# allcc_cleaned_len = [len(i) for i in cluster_rhogs]
416+
# len(allcc),len(cluster_rhogs),np.sum(allcc_cleaned_len)
417+
418+
return cluster_rhogs_list
419+
420+
421+
def merge_rhogs(hogmaps, rhogs_prots):
422+
logger_hog.debug("started merging ")
423+
424+
candidates_pair = find_rhog_candidate_pairs(hogmaps, rhogs_prots)
425+
426+
cluster_rhogs_list = cluster_rhogs(candidates_pair)
427+
428+
logger_hog.debug("There are " + str(len(rhogs_prots)) + " rhogs before merging.")
429+
print(len(rhogs_prots))
430+
431+
for cluster in cluster_rhogs_list:
432+
433+
prots = [rhogs_prots[hog] for hog in cluster]
434+
435+
host_hog = cluster[0]
436+
all_prots = []
437+
for hog in cluster:
438+
all_prots += rhogs_prots[hog]
439+
del rhogs_prots[hog]
440+
rhogs_prots[host_hog] = all_prots
441+
442+
print(len(rhogs_prots))
443+
logger_hog.debug("There are " + str(len(rhogs_prots)) + " rhogs after merging.")
444+
445+
return rhogs_prots
246446

247447

248448
# import pyoma.browser.db as db
@@ -621,7 +821,7 @@ def find_nonbest_isoform(species_names, isoform_by_gene_all, hogmaps):
621821
seq_lens = [i[2] for i in prot_maps]
622822
subf_med_lens = [i[3] for i in prot_maps]
623823
family_score = max(scores)
624-
idx_max = scores.index(max_score)
824+
idx_max = scores.index(family_score)
625825
seq_len = seq_lens[idx_max]
626826
subf_med_len = subf_med_lens[idx_max]
627827
# hogid = hogids[max_index]

FastOMA/infer_roothogs.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ def infer_roothogs():
4848

4949

5050
rhogs_prots = _utils_roothog.group_prots_roothogs(hogmaps)
51+
rhogs_prots = _utils_roothog.handle_singleton(rhogs_prots,hogmaps)
52+
rhogs_prots = _utils_roothog.merge_rhogs(hogmaps, rhogs_prots)
5153
rhogs_prots = _utils_roothog.roothogs_postprocess(hogmaps, rhogs_prots)
5254
address_rhogs_folder = "./omamer_rhogs/"
5355
rhogid_written_list = _utils_roothog.write_rhog(rhogs_prots, prot_recs_all, address_rhogs_folder, min_rhog_size=2)

0 commit comments

Comments
 (0)