@@ -35,7 +35,7 @@ def parse_proteomes(folder="./"): # list_oma_species
3535 prot_recs_lists [species_name ]= prots_record
3636
3737 logger_hog .info ("The are " + str (len (species_names ))+ " species in the proteome folder." )
38- return species_names , prot_recs_lists
38+ return species_names , prot_recs_lists , fasta_format_keep
3939
4040
4141
@@ -78,7 +78,7 @@ def add_species_name_prot_id(species_names, prot_recs_lists):
7878 return prot_recs_all
7979
8080
81- def parse_hogmap_omamer (species_names , folder = "./" ):
81+ def parse_hogmap_omamer (species_names , fasta_format_keep , folder = "./" ):
8282 """
8383 function for parsing output of omamer (hogmap files) located in /hogmap/
8484 Each hogmap file correspond to one fasta file of species, with the same name.
@@ -90,7 +90,7 @@ def parse_hogmap_omamer(species_names, folder="./"):
9090 """
9191 hogmaps = {}
9292 for species_name in species_names :
93- hogmap_address = folder + "/hogmap/" + species_name + ".fa .hogmap"
93+ hogmap_address = folder + "/hogmap/" + species_name + "." + fasta_format_keep + " .hogmap"
9494 hogmap_file = open (hogmap_address , 'r' )
9595
9696 for line in hogmap_file :
@@ -256,20 +256,23 @@ def roothogs_postprocess(hogmaps, rhogs_prots):
256256 sp_prot_list_filt = []
257257 hogids = []
258258
259- # removing protines with low omamer score from big rootHOG
259+ # removing proteins with low omamer score from big rootHOG
260260 for species_name , prot_id in sp_prot_list :
261261 prot_maps = hogmaps [species_name ][prot_id ]
262- if len (prot_maps ) > 1 : # probably for big rootHOG, there won't be multi-hits
263- scores = [float (i [1 ]) for i in prot_maps ] # (hogid,score,seqlen,subfamily_medianseqlen)
264- hogids = [i [0 ] for i in prot_maps ]
265- max_score = max (scores )
266- max_index = scores .index (max_score )
267- hogid = hogids [max_index ]
268- else :
269- hogid = prot_maps [0 ][0 ]
270- score = float (prot_maps [0 ][1 ])
262+ # if len(prot_maps) > 1: # probably for big rootHOG, there won't be multi-hits
263+ # scores = [float(i[1]) for i in prot_maps] # (hogid,score,seqlen,subfamily_medianseqlen)
264+ # hogids = [i[0] for i in prot_maps]
265+ # max_score = max(scores)
266+ # max_index = scores.index(max_score)
267+ # hogid = hogids[max_index]
268+ # else:
269+ # hogid = prot_maps[0][0]
270+ # max_score = float(prot_maps[0][1])
271+ # todo: highest normcount or pavalue , default of omamer ?
272+ hogid = prot_maps [0 ][0 ]
273+ max_score = float (prot_maps [0 ][1 ])
271274
272- if score > _config .omamer_family_threshold :
275+ if max_score > _config .omamer_family_threshold :
273276 sp_prot_list_filt .append ((species_name , prot_id ))
274277 hogids .append (hogid )
275278
@@ -279,21 +282,23 @@ def roothogs_postprocess(hogmaps, rhogs_prots):
279282 if len (sp_prot_list_filt ) < _config .big_rhog_size :
280283 sp_prot_list_filt2 = sp_prot_list_filt
281284
282-
283- else : # removing protines that are mapped to rootHOG (=HOGC123 not the levelsHOGC123.1a) from big rootHOG
284- hogids2 = []
285+ else : # removing proteins that are mapped to rootHOG (= HOGC123 not the levelsHOGC123.1a) from big rootHOG
286+ #hogids2 = []
285287 sp_prot_list_filt2 = []
286- for prot_idx , sp_prot in enumerate (sp_prot_list_filt ):
287- hogid = hogids [prot_idx ]
288-
289- if len (hogid .split ("." )) > 1 :
288+ for prot_idx , sp_prot in enumerate (sp_prot_list_filt ): #[('UP000192223_224129', 'tr|A0A1W4WAU6|A0A1W4WAU6_AGRPL'), ('UP000192223_224129', 'tr|A0A1W4WU99|A0A1W4WU99_AGRPL'),
289+ species , protein = sp_prot
290+ prot_maps = hogmaps [species ][protein ]
291+ hogid = prot_maps [0 ][0 ]
292+ if len (hogid .split ("." )) > 1 : # if mapped to subHOG (not to the rootHOG)
290293 sp_prot_list_filt2 .append (sp_prot )
291- hogids2 .append (hogid )
294+ # hogids2.append(hogid)
292295
293296 logger_hog .info ("For big rootHOG " + rhogid + ", " + str (
294297 len (sp_prot_list_filt2 )) + " proteins left after removing non-child subhogs" )
295-
296- rhogs_prots [rhogid ] = sp_prot_list_filt2
298+ if len (sp_prot_list_filt2 ):
299+ rhogs_prots [rhogid ] = sp_prot_list_filt2
300+ else :
301+ del rhogs_prots [rhogid ]
297302
298303 return rhogs_prots
299304
0 commit comments