Skip to content

Commit f3d5230

Browse files
committed
update
1 parent 7dbafa3 commit f3d5230

6 files changed

Lines changed: 104 additions & 67 deletions

File tree

FastOMA/_config.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,16 +28,19 @@
2828
threshold_dubious_sd = 1/10
2929
overlap_fragments = 0.15
3030

31+
mergHOG_ratioMax_thresh = 0.8
32+
mergHOG_ratioMin_thresh = 0.9
33+
mergHOG_shared_thresh = 50
3134

3235
# VARIABLE_threshold_dubious_sd
3336
# threshold_dubious_sd = float(os.getenv(VARIABLE_threshold_dubious_sd, 0.1))
3437

3538
## output writing files
36-
gene_trees_write = False
37-
msa_write = False
38-
gene_trees_write_all = False
39-
msa_write_all = False
40-
keep_subhog_each_pickle = False
39+
gene_trees_write = True
40+
msa_write = True
41+
gene_trees_write_all = True
42+
msa_write_all = True
43+
keep_subhog_each_pickle = True
4144

4245
big_rhog_size = 60 * 1000
4346
omamer_family_threshold = 90

FastOMA/_utils_frag_SO_detection.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def read_msa(input_msa):
3737
if len(ii):
3838
coords.append((np.min(ii), np.max(ii)))
3939
else:
40-
logger_hog.warning("issue 1321230 all of the seq is gap"+str(rec))
40+
logger_hog.warning("issue 1321230 all of the seq is gap "+str(rec))
4141
coords.append((0, 0))
4242

4343
ids = np.array(ids)
@@ -219,7 +219,10 @@ def handle_fragment_sd(node_species_tree, gene_tree, genetree_msa_file_addr, all
219219
if prot_dubious_sd_remove_list:
220220
rest_leaves = set([i.name for i in gene_tree.get_leaves()]) - set(prot_dubious_sd_remove_list)
221221
gene_tree.prune(rest_leaves, preserve_branch_length=True)
222-
(gene_tree, all_species_dubious_sd_dic_updated) = _utils_subhog.genetree_sd(node_species_tree, gene_tree, genetree_msa_file_addr + "_dubious_sd"+str(itr_so))
222+
if _config.gene_trees_write_all or _config.rooting_method=="mad":
223+
gene_tree.write(format=1, outfile=genetree_msa_file_addr + "_dubious_sd"+str(itr_so)+".nwk" )
224+
225+
(gene_tree, all_species_dubious_sd_dic_updated) = _utils_subhog.genetree_sd(node_species_tree, gene_tree, genetree_msa_file_addr + "_dubious_sd"+str(itr_so)+".nwk")
223226

224227
hogs_children_level_list_raw = hogs_children_level_list
225228
for prot_dubious_sd_remove in prot_dubious_sd_remove_list:
@@ -409,8 +412,11 @@ def handle_fragment_msa(prot_dubious_msa_list, seq_dubious_msa_list, gene_tree,
409412
else:
410413
gene_tree.prune(rest_leaves, preserve_branch_length=True)
411414

415+
if _config.gene_trees_write_all or _config.rooting_method == "mad":
416+
gene_tree.write(outfile=genetree_msa_file_addr+"_dubiousMSA.nwk",format=1)
417+
412418
if len(gene_tree) > 1:
413-
(gene_tree, all_species_dubious_sd_dic2) = _utils_subhog.genetree_sd(node_species_tree, gene_tree, genetree_msa_file_addr+"_dubiousMSA")
419+
(gene_tree, all_species_dubious_sd_dic2) = _utils_subhog.genetree_sd(node_species_tree, gene_tree, genetree_msa_file_addr+"_dubiousMSA.nwk")
414420

415421

416422

FastOMA/_utils_roothog.py

Lines changed: 24 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ def group_prots_roothogs(hogmaps):
136136
for prot_id, prot_map in prots_map.items():
137137
# omamer output is sorted based on normcount. but that's ok
138138
# this helps me in other functions like handle_singleton in this
139+
# this should be commented
139140
# if len(prot_map)>1:
140141
# scores = [float(i[1]) for i in prot_map] # (hogid,score,seqlen,subfamily_medianseqlen)
141142
# rhogids =[i[0] for i in prot_map]
@@ -328,7 +329,7 @@ def write_rhog(rhogs_prot_records, prot_recs_all, address_rhogs_folder, min_rhog
328329
def find_rhog_candidate_pairs(hogmaps, rhogs_prots):
329330
threshod_f_score_merging = 70
330331
pair_rhogs_count = {}
331-
for species, prt_prot_maps in hogmaps.items():
332+
for rhog, prt_prot_maps in hogmaps.items():
332333
for prot, prot_maps in prt_prot_maps.items():
333334
# [('HOG:D0017631.5a', '1771.7328874713658', '253', '234'), ('HOG:D0863448', '163.60700392437903', '253', '244'),
334335
rhogids = []
@@ -359,8 +360,8 @@ def find_rhog_candidate_pairs(hogmaps, rhogs_prots):
359360
ratioMax = count_shared / max(rhogs_size[hogi], rhogs_size[hogj])
360361
ratioMin = count_shared / min(rhogs_size[hogi], rhogs_size[hogj])
361362

362-
if (ratioMax > 0.8 or ratioMin > 0.9) and count_shared > 20 and rhogs_size[hogi] < _config.big_rhog_size / 2 and \
363-
rhogs_size[hogj] < _config.big_rhog_size / 2:
363+
if (ratioMax > _config.mergHOG_ratioMax_thresh or ratioMin > _config.mergHOG_ratioMin_thresh ) and _config.mergHOG_shared_thresh > 20 and\
364+
rhogs_size[hogi] < _config.big_rhog_size / 2 and rhogs_size[hogj] < _config.big_rhog_size / 2:
364365
if rhogs_size[hogi] >= rhogs_size[hogj]:
365366
candidates_pair.append((hogi, hogj)) # bigger first
366367
else:
@@ -428,19 +429,32 @@ def merge_rhogs(hogmaps, rhogs_prots):
428429
logger_hog.debug("There are " + str(len(rhogs_prots)) + " rhogs before merging.")
429430
print(len(rhogs_prots))
430431

432+
file_out_merge = open("merging_rhogs.txt","w")
433+
file_out_merge.write("#first column is the host hog, the rest will be merged here.\n")
431434
for cluster in cluster_rhogs_list:
432-
433-
prots = [rhogs_prots[hog] for hog in cluster]
434-
435+
for cluster_i in cluster:
436+
file_out_merge.write(cluster_i+"\t")
437+
file_out_merge.write("\n")
438+
file_out_merge.close()
439+
counter_merged_prots=0
440+
for cluster in cluster_rhogs_list:
441+
#prots = [rhogs_prots[hog] for hog in cluster]
435442
host_hog = cluster[0]
443+
rhogs_host_size= len(rhogs_prots[host_hog])
436444
all_prots = []
437445
for hog in cluster:
438446
all_prots += rhogs_prots[hog]
439447
del rhogs_prots[hog]
440-
rhogs_prots[host_hog] = all_prots
441-
442-
print(len(rhogs_prots))
443-
logger_hog.debug("There are " + str(len(rhogs_prots)) + " rhogs after merging.")
448+
all_prots_uniq = list(set(all_prots)) # there might be repeated prots
449+
rhogs_prots[host_hog] = all_prots_uniq
450+
# merging D0562038 and D0559070
451+
# tr | C3ZG56 | C3ZG56_BRAFL HOG: D0562038, tr | H2Y1V7 | H2Y1V7_CIOIN HOG: D0562038, tr | C3ZG56 | C3ZG56_BRAFL HOG: D0559070, tr | H2Y1V7 | H2Y1V7_CIOIN HOG: D0559070
452+
if len(all_prots_uniq) > rhogs_host_size:
453+
counter_merged_prots += len(all_prots_uniq)
454+
# otherwise, merging didn't help
455+
456+
print(len(rhogs_prots),counter_merged_prots)
457+
logger_hog.debug("There are " + str(len(rhogs_prots)) + " rhogs by merging "+str(counter_merged_prots)+" proteins in total.")
444458

445459
return rhogs_prots
446460

FastOMA/_wrappers.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ def mad_rooting(input_tree_file_path: str, mad_executable_path: str = "./mad"):
137137
#os.system(mad_command)
138138

139139
import subprocess
140-
140+
logger_hog.debug("MAD rooting started")
141141
#bashCommand = f"mad {gene_tree_address}"
142142
process = subprocess.Popen(mad_command.split(), stdout=subprocess.PIPE)
143143
output, error = process.communicate()
@@ -146,8 +146,21 @@ def mad_rooting(input_tree_file_path: str, mad_executable_path: str = "./mad"):
146146
# print("error:\n", error)
147147

148148
if "Error analyzing file" in str(output) or error:
149-
rooted_tree = Tree(input_tree_file_path)
150-
else:
149+
try:
150+
rooted_tree = Tree(input_tree_file_path)
151+
except:
152+
rooted_tree = Tree(input_tree_file_path,format=1)
153+
154+
155+
elif "rooted trees written" in str(output): # 3 rooted trees written to tree_664187_Eukaryota.nwk.rooted Warning: Trees with repeating branch lengths are suspicious (3 repeating values).
156+
file_multiple_tree_handle= open(input_tree_file_path + ".rooted",'r')
157+
trees=[]
158+
for line_tree1 in file_multiple_tree_handle:
159+
if line_tree1.strip():
160+
trees.append(line_tree1.strip())
161+
# todo which one to choose ?
162+
rooted_tree = Tree(trees[0])
163+
else:#Rooted tree written to 'tree_672375_Theria.nwk.rooted'
151164
rooted_tree = Tree(input_tree_file_path + ".rooted")
152-
165+
logger_hog.debug("MAD rooting finished")
153166
return rooted_tree

FastOMA/infer_roothogs.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ def infer_roothogs():
4848

4949

5050
rhogs_prots = _utils_roothog.group_prots_roothogs(hogmaps)
51+
# todo
5152
rhogs_prots = _utils_roothog.handle_singleton(rhogs_prots,hogmaps)
5253
rhogs_prots = _utils_roothog.merge_rhogs(hogmaps, rhogs_prots)
5354
rhogs_prots = _utils_roothog.roothogs_postprocess(hogmaps, rhogs_prots)

archive/test_curn.py

Lines changed: 45 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -9,53 +9,53 @@
99
# --input-rhog-folder ./bb/ --parrallel True --species-tree species_tree.nwk
1010

1111
#a=2
12-
#infer_subhogs()
12+
infer_subhogs()
1313
#infer_roothogs()
1414

15-
16-
from FastOMA.zoo.hog import transform
17-
18-
#from zoo.tree_utils import collapse, gene_species, transform, HOG_coverages
19-
20-
import io
21-
import lxml.etree
22-
orthoxml_file = "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/gethog3_qfo/benchmark-webservice3/orthoxml/euk_omamer200.dev8_13oct.orthoxml"
23-
24-
25-
orthxml_str = []
26-
with open(orthoxml_file, "r") as f:
27-
for i in f:
28-
orthxml_str.append(i)
29-
print(len(orthxml_str))
30-
dic_gene_integer={}
31-
for line in orthxml_str:
32-
if "gene id" in line:
33-
found=False
34-
gene_int= line.split("\"")[1]
35-
gene_name = line.split("\"")[3]
36-
dic_gene_integer[gene_int] = gene_name
37-
38-
39-
40-
orthoxml_etree=lxml.etree.parse(orthoxml_file)
41-
42-
pw_orthologs_integer = sorted(list(transform.iter_pairwise_relations(orthoxml_etree)))
43-
# iter_pairwise_relations(obj, rel_type=None (def:'ortholog' , but possible to use 'paralog')
44-
print(len(pw_orthologs_integer))
45-
print(pw_orthologs_integer[:2])
46-
pw_orthologs_gene =[]
47-
for pair in pw_orthologs_integer:
48-
pw_orthologs_gene.append((dic_gene_integer[pair[0]],dic_gene_integer[pair[1]]))
49-
50-
51-
52-
print(len(pw_orthologs_gene))
53-
54-
output_file = open(orthoxml_file+"_pairs.tsv","w")
55-
for pair in pw_orthologs_gene:
56-
output_file.write(pair[0]+"\t"+pair[1]+"\n")
57-
58-
output_file.close()
15+
#
16+
# from FastOMA.zoo.hog import transform
17+
#
18+
# #from zoo.tree_utils import collapse, gene_species, transform, HOG_coverages
19+
#
20+
# import io
21+
# import lxml.etree
22+
# orthoxml_file = "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/gethog3_qfo/benchmark-webservice3/orthoxml/euk_omamer200.dev8_13oct.orthoxml"
23+
#
24+
#
25+
# orthxml_str = []
26+
# with open(orthoxml_file, "r") as f:
27+
# for i in f:
28+
# orthxml_str.append(i)
29+
# print(len(orthxml_str))
30+
# dic_gene_integer={}
31+
# for line in orthxml_str:
32+
# if "gene id" in line:
33+
# found=False
34+
# gene_int= line.split("\"")[1]
35+
# gene_name = line.split("\"")[3]
36+
# dic_gene_integer[gene_int] = gene_name
37+
#
38+
#
39+
#
40+
# orthoxml_etree=lxml.etree.parse(orthoxml_file)
41+
#
42+
# pw_orthologs_integer = sorted(list(transform.iter_pairwise_relations(orthoxml_etree)))
43+
# # iter_pairwise_relations(obj, rel_type=None (def:'ortholog' , but possible to use 'paralog')
44+
# print(len(pw_orthologs_integer))
45+
# print(pw_orthologs_integer[:2])
46+
# pw_orthologs_gene =[]
47+
# for pair in pw_orthologs_integer:
48+
# pw_orthologs_gene.append((dic_gene_integer[pair[0]],dic_gene_integer[pair[1]]))
49+
#
50+
#
51+
#
52+
# print(len(pw_orthologs_gene))
53+
#
54+
# output_file = open(orthoxml_file+"_pairs.tsv","w")
55+
# for pair in pw_orthologs_gene:
56+
# output_file.write(pair[0]+"\t"+pair[1]+"\n")
57+
#
58+
# output_file.close()
5959

6060

6161
#

0 commit comments

Comments
 (0)