@@ -136,6 +136,7 @@ def group_prots_roothogs(hogmaps):
136136 for prot_id , prot_map in prots_map .items ():
137137 # omamer output is sorted based on normcount. but that's ok
138138 # this helps me in other functions like handle_singleton in this
139+ # this should be commented
139140 # if len(prot_map)>1:
140141 # scores = [float(i[1]) for i in prot_map] # (hogid,score,seqlen,subfamily_medianseqlen)
141142 # rhogids =[i[0] for i in prot_map]
@@ -328,7 +329,7 @@ def write_rhog(rhogs_prot_records, prot_recs_all, address_rhogs_folder, min_rhog
328329def find_rhog_candidate_pairs (hogmaps , rhogs_prots ):
329330 threshod_f_score_merging = 70
330331 pair_rhogs_count = {}
331- for species , prt_prot_maps in hogmaps .items ():
332+ for rhog , prt_prot_maps in hogmaps .items ():
332333 for prot , prot_maps in prt_prot_maps .items ():
333334 # [('HOG:D0017631.5a', '1771.7328874713658', '253', '234'), ('HOG:D0863448', '163.60700392437903', '253', '244'),
334335 rhogids = []
@@ -359,8 +360,8 @@ def find_rhog_candidate_pairs(hogmaps, rhogs_prots):
359360 ratioMax = count_shared / max (rhogs_size [hogi ], rhogs_size [hogj ])
360361 ratioMin = count_shared / min (rhogs_size [hogi ], rhogs_size [hogj ])
361362
362- if (ratioMax > 0.8 or ratioMin > 0.9 ) and count_shared > 20 and rhogs_size [ hogi ] < _config . big_rhog_size / 2 and \
363- rhogs_size [hogj ] < _config .big_rhog_size / 2 :
 363+ if (ratioMax > _config . mergHOG_ratioMax_thresh or ratioMin > _config . mergHOG_ratioMin_thresh ) and count_shared > _config . mergHOG_shared_thresh and \
364+ rhogs_size [hogi ] < _config . big_rhog_size / 2 and rhogs_size [ hogj ] < _config .big_rhog_size / 2 :
364365 if rhogs_size [hogi ] >= rhogs_size [hogj ]:
365366 candidates_pair .append ((hogi , hogj )) # bigger first
366367 else :
@@ -428,19 +429,32 @@ def merge_rhogs(hogmaps, rhogs_prots):
428429 logger_hog .debug ("There are " + str (len (rhogs_prots )) + " rhogs before merging." )
429430 print (len (rhogs_prots ))
430431
432+ file_out_merge = open ("merging_rhogs.txt" ,"w" )
433+ file_out_merge .write ("#first column is the host hog, the rest will be merged here.\n " )
431434 for cluster in cluster_rhogs_list :
432-
433- prots = [rhogs_prots [hog ] for hog in cluster ]
434-
435+ for cluster_i in cluster :
436+ file_out_merge .write (cluster_i + "\t " )
437+ file_out_merge .write ("\n " )
438+ file_out_merge .close ()
439+ counter_merged_prots = 0
440+ for cluster in cluster_rhogs_list :
441+ #prots = [rhogs_prots[hog] for hog in cluster]
435442 host_hog = cluster [0 ]
443+ rhogs_host_size = len (rhogs_prots [host_hog ])
436444 all_prots = []
437445 for hog in cluster :
438446 all_prots += rhogs_prots [hog ]
439447 del rhogs_prots [hog ]
440- rhogs_prots [host_hog ] = all_prots
441-
442- print (len (rhogs_prots ))
443- logger_hog .debug ("There are " + str (len (rhogs_prots )) + " rhogs after merging." )
448+ all_prots_uniq = list (set (all_prots )) # there might be repeated prots
449+ rhogs_prots [host_hog ] = all_prots_uniq
450+ # merging D0562038 and D0559070
451+ # tr | C3ZG56 | C3ZG56_BRAFL HOG: D0562038, tr | H2Y1V7 | H2Y1V7_CIOIN HOG: D0562038, tr | C3ZG56 | C3ZG56_BRAFL HOG: D0559070, tr | H2Y1V7 | H2Y1V7_CIOIN HOG: D0559070
452+ if len (all_prots_uniq ) > rhogs_host_size :
453+ counter_merged_prots += len (all_prots_uniq )
454+ # otherwise, merging didn't help
455+
456+ print (len (rhogs_prots ),counter_merged_prots )
457+ logger_hog .debug ("There are " + str (len (rhogs_prots )) + " rhogs by merging " + str (counter_merged_prots )+ " proteins in total." )
444458
445459 return rhogs_prots
446460
0 commit comments