77from ._utils_subhog import logger_hog
88from . import _config
99
10-
10+ import numpy as np
1111
1212def parse_proteomes (folder = "./" ): # list_oma_species
1313 """
@@ -134,15 +134,17 @@ def group_prots_roothogs(hogmaps):
134134 # prot_recs = prot_recs_all[species_name]
135135
136136 for prot_id , prot_map in prots_map .items ():
137- if len (prot_map )> 1 :
138- scores = [float (i [1 ]) for i in prot_map ] # (hogid,score,seqlen,subfamily_medianseqlen)
139- rhogids = [i [0 ] for i in prot_map ]
140- # select hog with the best score
141- max_index = scores .index (max (scores ))
142- hogid = rhogids [max_index ]
143- else :
144- hogid = prot_map [0 ][0 ]
145-
137+ # omamer output is sorted based on normcount. but that's ok
138+ # this helps me in other functions like handle_singleton in this
139+ # if len(prot_map)>1:
140+ # scores = [float(i[1]) for i in prot_map] # (hogid,score,seqlen,subfamily_medianseqlen)
141+ # rhogids =[i[0] for i in prot_map]
142+ # # select hog with the best score
143+ # max_index = scores.index(max(scores))
144+ # hogid =rhogids[max_index]
145+ # else:
146+ # hogid = prot_map[0][0]
147+ hogid = prot_map [0 ][0 ]
146148 rhogid = hogid .split ("." )[0 ].split (":" )[1 ]
147149
148150 if rhogid in rhogs_prots :
@@ -154,6 +156,86 @@ def group_prots_roothogs(hogmaps):
154156 return rhogs_prots
155157
156158
159+ def handle_singleton (rhogs_prots ,hogmaps ):
160+ query_singleton_rhog = [] # they are the only one to hit to this HOG
161+ for rhog , prot_sp_list in rhogs_prots .items (): # 'C0602129': [('GORGO', 'GORGO00025'), ('GORGO', 'GORGO00026')],
162+ if len (prot_sp_list ) == 1 :
163+ query_singleton_rhog += prot_sp_list
164+ logger_hog .debug (
165+ "There are " + str (len (query_singleton_rhog )) + " singleton. The hog on which only one query proteins mapped." )
166+
167+ # adding singleton to already rhogs with >1 members
168+ query_singleton_rhog_solved = []
169+ for (species , prot ) in query_singleton_rhog :
170+ prot_maps = hogmaps [species ][prot ]
171+ if len (prot_maps ) > 1 :
172+ prot_maps2 = prot_maps
173+ # omamer output is sorted based on normcount. but that's ok
174+ # family_p family_count family_normcount
175+ scores = [float (i [1 ]) for i in prot_maps ] # (hogid,score,seqlen,subfamily_medianseqlen)
176+ hogids = [i [0 ] for i in prot_maps ]
177+ for hogid in hogids [1 :]: # the 0-index is already checked and is a singleton
178+ rhogid = hogid .split ("." )[0 ].split (":" )[1 ]
179+ if rhogid in rhogs_prots :
180+ if len (rhogs_prots [rhogid ]) > 1 :
181+ rhogs_prots [rhogid ].append ((species , prot ))
182+ rhogid0 = hogids [0 ].split ("." )[0 ].split (":" )[1 ] # 'HOG:D0903929.1a'
183+ del rhogs_prots [rhogid0 ]
184+ query_singleton_rhog_solved .append ((species , prot ))
185+ break
186+ logger_hog .debug ("We add " + str (
187+ len (query_singleton_rhog_solved )) + " proteins/singleton to another rHOG based on omamer multi-hit." )
188+
189+ query_singleton_rhog_solved_set = set (query_singleton_rhog_solved )
190+ query_singleton_rhog_remained = [i for i in query_singleton_rhog if i not in query_singleton_rhog_solved_set ]
191+ logger_hog .debug ("However, " + str (len (query_singleton_rhog_remained )) + " proteins/singletons are remained." )
192+
193+ # try to create group from multi-hits of remained singleton
194+ dic_singlton_remained = {}
195+ for (species , prot ) in query_singleton_rhog_remained :
196+ prot_maps = hogmaps [species ][prot ]
197+ scores = [float (i [1 ]) for i in prot_maps ] # (hogid,score,seqlen,subfamily_medianseqlen)
198+ hogids = [i [0 ] for i in prot_maps ]
199+ for hogid in hogids [1 :]:
200+ rhogid = hogid .split ("." )[0 ].split (":" )[1 ]
201+ if rhogid in dic_singlton_remained :
202+ dic_singlton_remained [rhogid ].append ((species , prot ))
203+ else :
204+ dic_singlton_remained [rhogid ] = [(species , prot )]
205+ logger_hog .debug ("These are associated to " + str (len (dic_singlton_remained )) + " HOGs considering all multi-hits." )
206+
207+ count_new_rhogids = 0
208+ count_updated_rhogids = 0
209+ already_grouped = []
210+ for rhogid , spec_prot_list in dic_singlton_remained .items ():
211+ if len (set (spec_prot_list )) > 1 :
212+ already_grouped_set = set (already_grouped )
213+ val = sum ([1 for i in spec_prot_list if i in already_grouped_set ])
214+
215+ if val == 0 : # the prot has not yet grouped
216+ if rhogid in rhogs_prots :
217+ count_updated_rhogids += 1
218+ else :
219+ count_new_rhogids += 1
220+ rhogs_prots [rhogid ] = spec_prot_list
221+ already_grouped += spec_prot_list
222+ logger_hog .debug ("We updated " + str (count_updated_rhogids ) + " and created " + str (
223+ count_new_rhogids ) + " new HOGs which are not singleton anymore." )
224+
225+ counter_rhog_singlton_final = 0
226+ counter_notsingl_final = 0
227+ for rhogid , spec_prot_list in rhogs_prots .items ():
228+ if len (set (spec_prot_list )) > 1 :
229+ counter_notsingl_final += 1
230+ else :
231+ counter_rhog_singlton_final += 1
232+
233+ logger_hog .debug ("Now, we have " + str (counter_notsingl_final ) + " rootHOGs with >1 proteins and and " + str (
234+ counter_rhog_singlton_final ) + " singleton rootHOGs " )
235+
236+ return rhogs_prots
237+
238+
157239def roothogs_postprocess (hogmaps , rhogs_prots ):
158240
159241
@@ -243,6 +325,124 @@ def write_rhog(rhogs_prot_records, prot_recs_all, address_rhogs_folder, min_rhog
243325 return rhogid_written_list
244326
245327
328+ def find_rhog_candidate_pairs (hogmaps , rhogs_prots ):
329+ threshod_f_score_merging = 70
330+ pair_rhogs_count = {}
331+ for species , prt_prot_maps in hogmaps .items ():
332+ for prot , prot_maps in prt_prot_maps .items ():
333+ # [('HOG:D0017631.5a', '1771.7328874713658', '253', '234'), ('HOG:D0863448', '163.60700392437903', '253', '244'),
334+ rhogids = []
335+ for prot_map in prot_maps :
336+ hog , score = prot_map [:2 ]
337+ if float (score ) > threshod_f_score_merging :
338+ rhogid = hog .split ("." )[0 ].split (":" )[1 ]
339+ rhogids .append (rhogid )
340+ for ii in range (len (rhogids )):
341+ for jj in range (ii + 1 , len (rhogids )):
342+ hogi , hogj = rhogids [ii ], rhogids [jj ]
343+ if (hogi , hogj ) in pair_rhogs_count :
344+ pair_rhogs_count [(hogi , hogj )] += 1
345+ else :
346+ pair_rhogs_count [(hogi , hogj )] = 1
347+
348+ print (len (pair_rhogs_count ))
349+ logger_hog .debug ("There are " + str (len (pair_rhogs_count )) + " pairs of rhogs." )
350+
351+ rhogs_size = {}
352+ for rhog , list_prot in rhogs_prots .items ():
353+ rhogs_size [rhog ] = len (list_prot )
354+
355+ candidates_pair = []
356+ # dic_pair_ratio = {}
357+ for (hogi , hogj ), count_shared in pair_rhogs_count .items ():
358+ if hogi in rhogs_size and hogj in rhogs_size : # during previous functions, we might
359+ ratioMax = count_shared / max (rhogs_size [hogi ], rhogs_size [hogj ])
360+ ratioMin = count_shared / min (rhogs_size [hogi ], rhogs_size [hogj ])
361+
362+ if (ratioMax > 0.8 or ratioMin > 0.9 ) and count_shared > 20 and rhogs_size [hogi ] < _config .big_rhog_size / 2 and \
363+ rhogs_size [hogj ] < _config .big_rhog_size / 2 :
364+ if rhogs_size [hogi ] >= rhogs_size [hogj ]:
365+ candidates_pair .append ((hogi , hogj )) # bigger first
366+ else :
367+ candidates_pair .append ((hogj , hogi ))
368+ # print(hogi,"(",rhogs_size[hogi],")",hogj,"(",rhogs_size[hogj],")", count_shared,round(ratioMax,2),round(ratioMin,2))
369+ # D0651051 ( 59 ) D0658569 ( 180 ) 62 0.34 1.05 # todo why bigger than 1?
370+ # D0646495 ( 2 ) D0631227 ( 14 ) 25 1.79 12.5
371+
372+ logger_hog .debug ("There are " + str (len (candidates_pair )) + " candidate pairs of rhogs for merging." )
373+ print (len (candidates_pair ))
374+
375+ return candidates_pair
376+
377+
378+ def cluster_rhogs (candidates_pair ):
379+ # init
380+ all_hog_raw = []
381+ for pair in candidates_pair :
382+ all_hog_raw .append (pair [0 ])
383+ all_hog_raw .append (pair [1 ])
384+ all_hog = list (set (all_hog_raw ))
385+
386+ allcc = [] # connected compoenets
387+ for hog in all_hog :
388+ allcc .append ([hog ])
389+
390+ dic_where = {}
391+ for idx , cc in enumerate (allcc ):
392+ dic_where [cc [0 ]] = idx # in the begining, each inner list has only on element, a unique hog
393+
394+ # print(len(all_hog_raw),len(all_hog),len(allcc),allcc[:2])
395+ logger_hog .debug ("There are " + str (len (all_hog )) + " all_hog." )
396+
397+ # print(dic_where)
398+ for pair in candidates_pair :
399+ # print(pair)
400+ idx_0 = dic_where [pair [0 ]]
401+ idx_1 = dic_where [pair [1 ]]
402+
403+ dic_where [pair [1 ]] = idx_0
404+
405+ # print(idx_0)
406+ allcc [idx_0 ] += allcc [idx_1 ]
407+ allcc [idx_1 ] = []
408+
409+ # print(dic_where,allcc)
410+
411+ cluster_rhogs_list = [i for i in allcc if len (i ) > 1 ]
412+ # print(cluster_rhogs_list)
413+ logger_hog .debug ("There are " + str (len (cluster_rhogs_list )) + " cluster_rhogs." )
414+
415+ # allcc_cleaned_len = [len(i) for i in cluster_rhogs]
416+ # len(allcc),len(cluster_rhogs),np.sum(allcc_cleaned_len)
417+
418+ return cluster_rhogs_list
419+
420+
421+ def merge_rhogs (hogmaps , rhogs_prots ):
422+ logger_hog .debug ("started merging " )
423+
424+ candidates_pair = find_rhog_candidate_pairs (hogmaps , rhogs_prots )
425+
426+ cluster_rhogs_list = cluster_rhogs (candidates_pair )
427+
428+ logger_hog .debug ("There are " + str (len (rhogs_prots )) + " rhogs before merging." )
429+ print (len (rhogs_prots ))
430+
431+ for cluster in cluster_rhogs_list :
432+
433+ prots = [rhogs_prots [hog ] for hog in cluster ]
434+
435+ host_hog = cluster [0 ]
436+ all_prots = []
437+ for hog in cluster :
438+ all_prots += rhogs_prots [hog ]
439+ del rhogs_prots [hog ]
440+ rhogs_prots [host_hog ] = all_prots
441+
442+ print (len (rhogs_prots ))
443+ logger_hog .debug ("There are " + str (len (rhogs_prots )) + " rhogs after merging." )
444+
445+ return rhogs_prots
246446
247447
248448# import pyoma.browser.db as db
@@ -621,7 +821,7 @@ def find_nonbest_isoform(species_names, isoform_by_gene_all, hogmaps):
621821 seq_lens = [i [2 ] for i in prot_maps ]
622822 subf_med_lens = [i [3 ] for i in prot_maps ]
623823 family_score = max (scores )
624- idx_max = scores .index (max_score )
824+ idx_max = scores .index (family_score )
625825 seq_len = seq_lens [idx_max ]
626826 subf_med_len = subf_med_lens [idx_max ]
627827 # hogid = hogids[max_index]
0 commit comments