Skip to content

Commit f24c9a1

Browse files
committed
checkpoint_1Nov
1 parent f3d5230 commit f24c9a1

7 files changed

Lines changed: 55 additions & 43 deletions

File tree

FastOMA/_config.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,11 @@
3636
# threshold_dubious_sd = float(os.getenv(VARIABLE_threshold_dubious_sd, 0.1))
3737

3838
## output writing files
39-
gene_trees_write = True
40-
msa_write = True
41-
gene_trees_write_all = True
42-
msa_write_all = True
43-
keep_subhog_each_pickle = True
39+
gene_trees_write = False
40+
msa_write = False
41+
gene_trees_write_all = False
42+
msa_write_all = False
43+
keep_subhog_each_pickle = False
4444

4545
big_rhog_size = 60 * 1000
4646
omamer_family_threshold = 90

FastOMA/_hog_class.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,7 @@ def _sorter_key(sh):
201201
element_list = []
202202
for sub_clade, sub_hogs in itertools.groupby(self._subhogs, key=_sorter_key): # sub_clade is the taxrange
203203
list_of_subhogs_of_same_clade = list(sub_hogs)
204+
# list_of_subhogs_of_same_clade is [object of HOGclass hogID=HOG_D0634402_sub10094,len=12, tax_least=Amniota, tax_now= Amniota, object of HOGclass hogID=HOG_D0634402_sub10096,len=20, tax_least=Amniota, tax_now= Amniota, object of HOGclass hogID=HOG_D0634402_sub10081,len=1, tax_least=MONDO, tax_now= Amniota, object of HOGclass hogID=HOG_D0634402_sub10093,len=12, tax_least=Amniota, tax_now= Amniota, object of HOGclass hogID=HOG_D0634402_sub10095,len=7, tax_least=Amniota, tax_now= Amniota]
204205
# following only for debugging, can be deleted later
205206
for subhog in list_of_subhogs_of_same_clade:
206207
if len(subhog._members) == 0:

FastOMA/_utils_frag_SO_detection.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -191,14 +191,20 @@ def find_prot_dubious_sd_remove(gene_tree, all_species_dubious_sd_dic):
191191
prot_dubious_list.append(prot_name)
192192
try:
193193
subhogs_list = [i.split("|_|")[1] for i in prot_dubious_list] # subhog id at child level
194-
if len(set(subhogs_list)) > 1:
195-
# we are removing all sequences of this species on the the side of internal node (gene tree), with least leaves
196-
child_size_min_indx = child_size.index(min(child_size))
197-
prot_dubious_sd_remove_list.append(prot_dubious_list[child_size_min_indx])
198-
199-
else:
200-
logger_hog.debug( "This species (protein from the same subhog) is safe to keep "+ str(node_name)+" "+str(species_dubious_sd))
201-
#all of them are from the same subhog, so it doesn't matter, a duplication event doesn't affect when all are from the same subhog at children level
194+
195+
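# TODO: check whether it is safe to always remove the protein on the smaller child side, now that the same-subhog test below is commented out.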
196+
# if len(set(subhogs_list)) > 1:
197+
# # we are removing all sequences of this species on the side of the internal node (gene tree) with the fewest leaves
198+
# child_size_min_indx = child_size.index(min(child_size))
199+
# prot_dubious_sd_remove_list.append(prot_dubious_list[child_size_min_indx])
200+
#
201+
# else:
202+
# logger_hog.debug( "This species (protein from the same subhog) is safe to keep "+ str(node_name)+" "+str(species_dubious_sd))
203+
# #all of them are from the same subhog, so it doesn't matter, a duplication event doesn't affect when all are from the same subhog at children level
204+
205+
child_size_min_indx = child_size.index(min(child_size))
206+
prot_dubious_sd_remove_list.append(prot_dubious_list[child_size_min_indx])
207+
202208
except:
203209
logger_hog.warning("issue 2495869: prot_dubious_list doesnt include the hog id . so we'll keep it" + str(gene_tree) + " " + str(prot_dubious_list))
204210

FastOMA/_utils_roothog.py

Lines changed: 29 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def parse_proteomes(folder="./"): # list_oma_species
3535
prot_recs_lists[species_name]=prots_record
3636

3737
logger_hog.info("The are "+str(len(species_names))+" species in the proteome folder.")
38-
return species_names, prot_recs_lists
38+
return species_names, prot_recs_lists, fasta_format_keep
3939

4040

4141

@@ -78,7 +78,7 @@ def add_species_name_prot_id(species_names, prot_recs_lists):
7878
return prot_recs_all
7979

8080

81-
def parse_hogmap_omamer(species_names, folder="./"):
81+
def parse_hogmap_omamer(species_names, fasta_format_keep, folder="./"):
8282
"""
8383
function for parsing output of omamer (hogmap files) located in /hogmap/
8484
Each hogmap file correspond to one fasta file of species, with the same name.
@@ -90,7 +90,7 @@ def parse_hogmap_omamer(species_names, folder="./"):
9090
"""
9191
hogmaps = {}
9292
for species_name in species_names:
93-
hogmap_address = folder + "/hogmap/" + species_name + ".fa.hogmap"
93+
hogmap_address = folder + "/hogmap/" + species_name + "."+fasta_format_keep+".hogmap"
9494
hogmap_file = open(hogmap_address, 'r')
9595

9696
for line in hogmap_file:
@@ -256,20 +256,23 @@ def roothogs_postprocess(hogmaps, rhogs_prots):
256256
sp_prot_list_filt = []
257257
hogids = []
258258

259-
# removing protines with low omamer score from big rootHOG
259+
# removing proteins with low omamer score from big rootHOG
260260
for species_name, prot_id in sp_prot_list:
261261
prot_maps = hogmaps[species_name][prot_id]
262-
if len(prot_maps) > 1: # probably for big rootHOG, there won't be multi-hits
263-
scores = [float(i[1]) for i in prot_maps] # (hogid,score,seqlen,subfamily_medianseqlen)
264-
hogids = [i[0] for i in prot_maps]
265-
max_score = max(scores)
266-
max_index = scores.index(max_score)
267-
hogid = hogids[max_index]
268-
else:
269-
hogid = prot_maps[0][0]
270-
score = float(prot_maps[0][1])
262+
# if len(prot_maps) > 1: # probably for big rootHOG, there won't be multi-hits
263+
# scores = [float(i[1]) for i in prot_maps] # (hogid,score,seqlen,subfamily_medianseqlen)
264+
# hogids = [i[0] for i in prot_maps]
265+
# max_score = max(scores)
266+
# max_index = scores.index(max_score)
267+
# hogid = hogids[max_index]
268+
# else:
269+
# hogid = prot_maps[0][0]
270+
# max_score = float(prot_maps[0][1])
271+
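# TODO: decide whether to pick the hit with the highest normcount or the best p-value (check which one omamer uses by default); for now the first hit listed in the hogmap is used.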
272+
hogid = prot_maps[0][0]
273+
max_score = float(prot_maps[0][1])
271274

272-
if score > _config.omamer_family_threshold:
275+
if max_score > _config.omamer_family_threshold:
273276
sp_prot_list_filt.append((species_name, prot_id))
274277
hogids.append(hogid)
275278

@@ -279,21 +282,23 @@ def roothogs_postprocess(hogmaps, rhogs_prots):
279282
if len(sp_prot_list_filt) < _config.big_rhog_size:
280283
sp_prot_list_filt2 = sp_prot_list_filt
281284

282-
283-
else: # removing protines that are mapped to rootHOG (=HOGC123 not the levelsHOGC123.1a) from big rootHOG
284-
hogids2 = []
285+
else: # removing proteins that are mapped to rootHOG (= HOGC123 not the levelsHOGC123.1a) from big rootHOG
286+
#hogids2 = []
285287
sp_prot_list_filt2 = []
286-
for prot_idx, sp_prot in enumerate(sp_prot_list_filt):
287-
hogid = hogids[prot_idx]
288-
289-
if len(hogid.split(".")) > 1:
288+
for prot_idx, sp_prot in enumerate(sp_prot_list_filt): #[('UP000192223_224129', 'tr|A0A1W4WAU6|A0A1W4WAU6_AGRPL'), ('UP000192223_224129', 'tr|A0A1W4WU99|A0A1W4WU99_AGRPL'),
289+
species , protein = sp_prot
290+
prot_maps =hogmaps[species][protein]
291+
hogid = prot_maps[0][0]
292+
if len(hogid.split(".")) > 1: # if mapped to subHOG (not to the rootHOG)
290293
sp_prot_list_filt2.append(sp_prot)
291-
hogids2.append(hogid)
294+
#hogids2.append(hogid)
292295

293296
logger_hog.info("For big rootHOG " + rhogid + ", " + str(
294297
len(sp_prot_list_filt2)) + " proteins left after removing non-child subhogs")
295-
296-
rhogs_prots[rhogid] = sp_prot_list_filt2
298+
if len(sp_prot_list_filt2):
299+
rhogs_prots[rhogid] = sp_prot_list_filt2
300+
else:
301+
del rhogs_prots[rhogid]
297302

298303
return rhogs_prots
299304

FastOMA/collect_subhogs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ def collect_subhogs():
6060
pickle_files_adress = [i for i in pickle_files_adress_raw if i.endswith(".pickle") and i.startswith("file_")]
6161

6262
logger_hog.info("number of pickle files is "+str(len(pickle_files_adress))+".")
63-
logger_hog.debug("pickle files are " + str(pickle_files_adress) + ".")
63+
logger_hog.debug("pickle files are " + str(len(pickle_files_adress)) + ".")
6464
hogs_a_rhog_xml_all = []
6565
for pickle_file_adress in pickle_files_adress:
6666
with open(pickle_folder + pickle_file_adress, 'rb') as handle:

FastOMA/infer_roothogs.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,10 @@ def infer_roothogs():
3333
# sys.exit(0)
3434

3535
#folder = "/scratch/smajidi1/euk_omamer200.dev8_2/test/hogmap/FastOMA-main/testdata/in_folder/"
36-
species_names, prot_recs_lists = _utils_roothog.parse_proteomes()# folder
36+
species_names, prot_recs_lists,fasta_format_keep = _utils_roothog.parse_proteomes() # optional input folder
3737
prot_recs_all = _utils_roothog.add_species_name_prot_id(species_names, prot_recs_lists)
3838

39-
hogmaps = _utils_roothog.parse_hogmap_omamer(species_names)#folder
39+
hogmaps = _utils_roothog.parse_hogmap_omamer(species_names,fasta_format_keep) # optional input folder
4040

4141
splice_files = os.path.exists("./splice/")
4242
if splice_files:
@@ -48,7 +48,7 @@ def infer_roothogs():
4848

4949

5050
rhogs_prots = _utils_roothog.group_prots_roothogs(hogmaps)
51-
# todo
51+
5252
rhogs_prots = _utils_roothog.handle_singleton(rhogs_prots,hogmaps)
5353
rhogs_prots = _utils_roothog.merge_rhogs(hogmaps, rhogs_prots)
5454
rhogs_prots = _utils_roothog.roothogs_postprocess(hogmaps, rhogs_prots)

archive/test_curn.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@
99
# --input-rhog-folder ./bb/ --parrallel True --species-tree species_tree.nwk
1010

1111
#a=2
12-
infer_subhogs()
13-
#infer_roothogs()
12+
#infer_subhogs()
13+
infer_roothogs()
1414

1515
#
1616
# from FastOMA.zoo.hog import transform

0 commit comments

Comments
 (0)