Skip to content

Commit 64fc2a7

Browse files
committed
Update the Nextflow (nf) pipeline: the rhogs/pickle folders should be output within nf; otherwise sbatch fails with the error ".command.run too long".
1 parent ebce1c7 commit 64fc2a7

7 files changed

Lines changed: 49 additions & 80 deletions

File tree

FastOMA.nf

Lines changed: 19 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,8 @@ params.proteomes = params.proteome_folder + "/*"
88
params.hogmap_in = params.input_folder + "/hogmap_in"
99

1010
params.hogmap_folder = params.output_folder + "/hogmap"
11-
//params.rhogs_folder = params.output_folder + "/rhogs_all"
1211
params.species_tree = params.input_folder + "/species_tree.nwk"
13-
//params.pickles_rhogs_folder = params.output_folder + "/pickle_rhogs"
12+
params.pickles_temp = params.output_folder + "/pickles_temp"
1413
params.genetrees_folder = params.output_folder + "/genetrees"
1514

1615

@@ -35,13 +34,13 @@ process omamer_run{
3534
}
3635

3736

38-
process infer_roothogs{ // publishDir params.rhogs_folder
37+
process infer_roothogs{
3938
input:
4039
val ready_omamer_run
4140
path hogmap_folder
4241
path proteome_folder
4342
output:
44-
path "rhogs_all" // path "rhogs_all/*"
43+
path "omamer_rhogs"
4544
path "gene_id_dic_xml.pickle"
4645
val true // nextflow-io.github.io/patterns/state-dependency/
4746
script:
@@ -51,11 +50,10 @@ process infer_roothogs{ // publishDir params.rhogs_folder
5150
}
5251

5352

54-
process batch_roothogs{ // publishDir params.output_folder
53+
process batch_roothogs{
5554
input:
5655
val ready_infer_roothogs
57-
//path rhogs_folder
58-
path "rhogs_all"
56+
path "omamer_rhogs"
5957
output:
6058
path "rhogs_rest/*", optional: true
6159
path "rhogs_big/*" , optional: true
@@ -66,15 +64,15 @@ process batch_roothogs{ // publishDir params.output_folder
6664
"""
6765
}
6866

69-
process hog_big{ //publishDir params.pickles_rhogs_folder
67+
process hog_big{
68+
publishDir params.pickles_temp
7069
cpus 6
7170
time {20.h} // for very big rhog it might need more, or you could re-run and add `-resume`
7271
memory {20.GB}
7372
input:
7473
val rhogsbig_tree_ready
7574
output:
76-
path "pickle_rhogs/*.pickle"
77-
// path "*.pickle"
75+
path "*.pickle"
7876
path "*.fa", optional: true // msa if write True
7977
path "*.nwk", optional: true // gene trees if write True
8078
val true
@@ -84,13 +82,12 @@ process hog_big{ //publishDir params.pickles_rhogs_folder
8482
"""
8583
}
8684

87-
process hog_rest{ //publishDir params.pickles_rhogs_folder
85+
process hog_rest{
86+
publishDir params.pickles_temp
8887
input:
8988
val rhogsrest_tree_ready
9089
output:
91-
path "pickle_rhogs/*.pickle"
92-
// path "*.pickle"
93-
//path "pickle_rhogs/*.pickle"
90+
path "*.pickle"
9491
path "*.fa" , optional: true // msa if write True
9592
path "*.nwk" , optional: true // gene trees if write True
9693
val true
@@ -106,10 +103,10 @@ process collect_subhogs{
106103
publishDir params.output_folder, mode: 'copy'
107104
input:
108105
val ready_hog_rest
109-
val ready_hog_big // path pickle_rhogs // this is for depenedcy
110-
path "pickle_rhogs/" // "*.pickle" // path "pickle_rhogs" // this is the folder includes pickles_rhogs
106+
val ready_hog_big
107+
path "pickles_temp" // this is the folder includes pickles_rhogs
111108
path "gene_id_dic_xml.pickle"
112-
path "rhogs_all"
109+
path "omamer_rhogs"
113110
output:
114111
path "output_hog.orthoxml"
115112
path "OrthologousGroupsFasta"
@@ -121,28 +118,25 @@ process collect_subhogs{
121118
"""
122119
}
123120

124-
125121
workflow {
126122
proteomes = Channel.fromPath(params.proteomes, type:'any' ,checkIfExists:true)
127123
proteome_folder = Channel.fromPath(params.proteome_folder)
128124
hogmap_folder = Channel.fromPath(params.hogmap_folder)
129-
// rhogs_folder = Channel.fromPath(params.rhogs_folder)
130125

131126
genetrees_folder = Channel.fromPath(params.genetrees_folder)
132127
hogmap_in = Channel.fromPath(params.hogmap_in)
133128

134-
// pickles_rhogs_folder = Channel.fromPath(params.pickles_rhogs_folder)
135-
omamerdb = Channel.fromPath(params.input_folder+"/omamerdb.h5") // proteomes.view{"prot ${it}"}
129+
pickles_temp = Channel.fromPath(params.pickles_temp)
130+
omamerdb = Channel.fromPath(params.input_folder+"/omamerdb.h5")
136131
proteomes_omamerdb = proteomes.combine(omamerdb)
137132
proteomes_omamerdb_inputhog = proteomes_omamerdb.combine(hogmap_in) // proteomes_omamerdb_inputhog.view{" rhogsbig ${it}"}
138133
(hogmap, ready_omamer_run)= omamer_run(proteomes_omamerdb_inputhog)
139134
ready_omamer_run_c = ready_omamer_run.collect()
140135

141-
(rhogs_folder, gene_id_dic_xml, ready_infer_roothogs) = infer_roothogs(ready_omamer_run_c, hogmap_folder, proteome_folder)
142-
// rhogs_folder and "rhogs_all" are the same
136+
(omamer_rhogs, gene_id_dic_xml, ready_infer_roothogs) = infer_roothogs(ready_omamer_run_c, hogmap_folder, proteome_folder)
143137
ready_infer_roothogs_c = ready_infer_roothogs.collect()
144138

145-
(rhogs_rest_list, rhogs_big_list, ready_batch_roothogs) = batch_roothogs(ready_infer_roothogs_c, rhogs_folder)
139+
(rhogs_rest_list, rhogs_big_list, ready_batch_roothogs) = batch_roothogs(ready_infer_roothogs_c, omamer_rhogs)
146140
ready_batch_roothogs_c = ready_batch_roothogs.collect()
147141

148142
species_tree = Channel.fromPath(params.species_tree)
@@ -155,16 +149,8 @@ workflow {
155149
rhogsrest_tree = rhogsrest.combine(species_tree)
156150
rhogsrest_tree_ready = rhogsrest_tree.combine(ready_batch_roothogs_c)
157151
(pickle_rest_rhog, msas_out_rest, genetrees_out_test, ready_hog_rest) = hog_rest(rhogsrest_tree_ready)
158-
all_pickles = pickle_big_rhog.mix(pickle_rest_rhog).collect() // all_pickles.view() // pickle_rhogs_folder = Channel.fromPath(params.output_folder+"/pickle_rhogs")
159152

160-
(orthoxml_file, OrthologousGroupsFasta, OrthologousGroups_tsv, rootHOGs_tsv) = collect_subhogs(ready_hog_rest.collect(), ready_hog_big.collect(), all_pickles, gene_id_dic_xml, rhogs_folder) // pickles_rhogs_folder
153+
(orthoxml_file, OrthologousGroupsFasta, OrthologousGroups_tsv, rootHOGs_tsv) = collect_subhogs(ready_hog_rest.collect(), ready_hog_big.collect(), pickles_temp, gene_id_dic_xml, omamer_rhogs)
161154
orthoxml_file.view{" output orthoxml file ${it}"}
162155

163156
}
164-
165-
// memory {12.GB * (2*task.attempt - 1)}
166-
// time {24.hour}
167-
// errorStrategy {
168-
// task.exitStatus in [1,99,143,137,104,134,139,145,140] ? ‘retry’ : ‘terminate’
169-
// }
170-
// maxRetries 4

FastOMA/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11

22
__packagename__ = "FastOMA"
3-
__version__ = "0.0.6"
3+
__version__ = "0.1.0"

FastOMA/batch_roothogs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ def folder_1h_rhog(address_rhogs_folder, output_folder_big, output_folder_rest):
9191

9292
def batch_roothogs():
9393

94-
input_rhog = "./rhogs_all/" #
94+
input_rhog = "./omamer_rhogs/" #
9595
output_folder_big = "./rhogs_big/"
9696
output_folder_rest = "./rhogs_rest/"
9797
folder_1h_rhog(input_rhog, output_folder_big, output_folder_rest)

FastOMA/collect_subhogs.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def collect_subhogs():
2727
# tr|A0A0N7KCI6|A0A0N7KCI6_ORYSJ
2828
# for qfo benchmark, the middle should be written in the file
2929

30-
pickle_folder = "./pickle_rhogs/" #pickle_rhogs
30+
pickle_folder = "./pickles_temp/" #pickle_rhogs
3131
output_xml_name = "./output_hog.orthoxml"
3232
gene_id_pickle_file = "./gene_id_dic_xml.pickle"
3333

@@ -136,8 +136,8 @@ def max_og_tree(tree):
136136
return og_prot_list
137137

138138
input_orthoxml = output_xml_name # sys.argv[1] # "out_folder/output_hog_.orthoxml"
139-
rhog_all_folder = "./rhogs_all/" #sys.argv[2] + "/" # "out_folder/rhogs_all/"
140-
fasta_format = "fa" # of the rhogs_all
139+
rhog_all_folder = "./omamer_rhogs/" #sys.argv[2] + "/" # "out_folder/rhogs_all/"
140+
fasta_format = "fa" # of the rhogs
141141

142142
output_file_og_tsv = "OrthologousGroups.tsv"
143143

@@ -168,12 +168,12 @@ def max_og_tree(tree):
168168
for hog_id, og_prot_list in OGs.items(): # hog_id="HOG_0667494_sub10524"
169169
rhog_id = "_".join(hog_id.split("_")[:2])
170170

171-
rhogs_all_address = rhog_all_folder + rhog_id + "." + fasta_format
172-
rhogs_all_prots = list(SeqIO.parse(rhogs_all_address, "fasta"))
171+
omamer_rhogs_all_address = rhog_all_folder + rhog_id + "." + fasta_format
172+
omamer_rhogs_all_prots = list(SeqIO.parse(omamer_rhogs_all_address, "fasta"))
173173

174174
og_prots = []
175175
og_prot_list = OGs[hog_id]
176-
for rhogs_prot in rhogs_all_prots:
176+
for rhogs_prot in omamer_rhogs_all_prots:
177177
if rhogs_prot.id.split("||")[0] in og_prot_list:
178178
sp = rhogs_prot.id.split("||")[1]
179179
rhogs_prot.description += " [" + sp + "]"

FastOMA/infer_roothogs.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,11 +57,11 @@ def infer_roothogs():
5757
query_species_names,
5858
query_prot_names_species_mapped)
5959
# for pure usage of this python file, you can set the output folder
60-
# output_folder_rhog = _config.in_folder + "rhogs_all/"
60+
# output_folder_rhog = _config.in_folder + "rhogs_all/" // omamer_rhogs
6161
# using nextflow
6262

6363

6464

6565
# import sys
66-
output_folder_rhog = "./rhogs_all/" # sys.argv[1] #
66+
output_folder_rhog = "./omamer_rhogs/" # sys.argv[1] #
6767
rhogid_num_list_filt1 = _utils_roothog.write_rhog(rhogids_list_filt, rhogids_prot_records_query_filt, output_folder_rhog, 2) # min_rhog_size, max_rhog_size

FastOMA/infer_subhogs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def infer_subhogs():
5353
if inferhog_concurrent_on:
5454
print("parallelization for subhog inference is on.")
5555

56-
pickles_rhog_folder = "./pickle_rhogs/"
56+
pickles_rhog_folder = "./" # pickles_temp/ pickle_rhogs
5757
if not os.path.exists(pickles_rhog_folder):
5858
os.makedirs(pickles_rhog_folder)
5959

FastOMA_light.nf

Lines changed: 19 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,13 @@ params.proteomes = params.proteome_folder + "/*"
88
params.hogmap_in = params.input_folder + "/hogmap_in"
99

1010
params.hogmap_folder = params.output_folder + "/hogmap"
11-
//params.rhogs_folder = params.output_folder + "/rhogs_all"
1211
params.species_tree = params.input_folder + "/species_tree.nwk"
13-
//params.pickles_rhogs_folder = params.output_folder + "/pickle_rhogs"
12+
params.pickles_temp = params.output_folder + "/pickles_temp"
1413
params.genetrees_folder = params.output_folder + "/genetrees"
1514

1615

1716
process omamer_run{
1817
time {4.h}
19-
memory {4.GB}
2018
publishDir params.hogmap_folder
2119
input:
2220
path proteomes_omamerdb_inputhog
@@ -35,13 +33,13 @@ process omamer_run{
3533
}
3634

3735

38-
process infer_roothogs{ // publishDir params.rhogs_folder
36+
process infer_roothogs{
3937
input:
4038
val ready_omamer_run
4139
path hogmap_folder
4240
path proteome_folder
4341
output:
44-
path "rhogs_all" // path "rhogs_all/*"
42+
path "omamer_rhogs"
4543
path "gene_id_dic_xml.pickle"
4644
val true // nextflow-io.github.io/patterns/state-dependency/
4745
script:
@@ -51,11 +49,10 @@ process infer_roothogs{ // publishDir params.rhogs_folder
5149
}
5250

5351

54-
process batch_roothogs{ // publishDir params.output_folder
52+
process batch_roothogs{
5553
input:
5654
val ready_infer_roothogs
57-
//path rhogs_folder
58-
path "rhogs_all"
55+
path "omamer_rhogs"
5956
output:
6057
path "rhogs_rest/*", optional: true
6158
path "rhogs_big/*" , optional: true
@@ -66,15 +63,14 @@ process batch_roothogs{ // publishDir params.output_folder
6663
"""
6764
}
6865

69-
process hog_big{ //publishDir params.pickles_rhogs_folder
66+
process hog_big{
67+
publishDir params.pickles_temp
7068
cpus 2
7169
time {20.h} // for very big rhog it might need more, or you could re-run and add `-resume`
72-
memory {4.GB}
7370
input:
7471
val rhogsbig_tree_ready
7572
output:
76-
path "pickle_rhogs/*.pickle"
77-
// path "*.pickle"
73+
path "*.pickle"
7874
path "*.fa", optional: true // msa if write True
7975
path "*.nwk", optional: true // gene trees if write True
8076
val true
@@ -84,13 +80,12 @@ process hog_big{ //publishDir params.pickles_rhogs_folder
8480
"""
8581
}
8682

87-
process hog_rest{ //publishDir params.pickles_rhogs_folder
83+
process hog_rest{
84+
publishDir params.pickles_temp
8885
input:
8986
val rhogsrest_tree_ready
9087
output:
91-
path "pickle_rhogs/*.pickle"
92-
// path "*.pickle"
93-
//path "pickle_rhogs/*.pickle"
88+
path "*.pickle"
9489
path "*.fa" , optional: true // msa if write True
9590
path "*.nwk" , optional: true // gene trees if write True
9691
val true
@@ -102,14 +97,13 @@ process hog_rest{ //publishDir params.pickles_rhogs_folder
10297

10398

10499
process collect_subhogs{
105-
memory {4.GB}
106100
publishDir params.output_folder, mode: 'copy'
107101
input:
108102
val ready_hog_rest
109-
val ready_hog_big // path pickle_rhogs // this is for depenedcy
110-
path "pickle_rhogs/" // "*.pickle" // path "pickle_rhogs" // this is the folder includes pickles_rhogs
103+
val ready_hog_big
104+
path "pickles_temp" // this is the folder includes pickles_rhogs
111105
path "gene_id_dic_xml.pickle"
112-
path "rhogs_all"
106+
path "omamer_rhogs"
113107
output:
114108
path "output_hog.orthoxml"
115109
path "OrthologousGroupsFasta"
@@ -121,28 +115,25 @@ process collect_subhogs{
121115
"""
122116
}
123117

124-
125118
workflow {
126119
proteomes = Channel.fromPath(params.proteomes, type:'any' ,checkIfExists:true)
127120
proteome_folder = Channel.fromPath(params.proteome_folder)
128121
hogmap_folder = Channel.fromPath(params.hogmap_folder)
129-
// rhogs_folder = Channel.fromPath(params.rhogs_folder)
130122

131123
genetrees_folder = Channel.fromPath(params.genetrees_folder)
132124
hogmap_in = Channel.fromPath(params.hogmap_in)
133125

134-
// pickles_rhogs_folder = Channel.fromPath(params.pickles_rhogs_folder)
135-
omamerdb = Channel.fromPath(params.input_folder+"/omamerdb.h5") // proteomes.view{"prot ${it}"}
126+
pickles_temp = Channel.fromPath(params.pickles_temp)
127+
omamerdb = Channel.fromPath(params.input_folder+"/omamerdb.h5")
136128
proteomes_omamerdb = proteomes.combine(omamerdb)
137129
proteomes_omamerdb_inputhog = proteomes_omamerdb.combine(hogmap_in) // proteomes_omamerdb_inputhog.view{" rhogsbig ${it}"}
138130
(hogmap, ready_omamer_run)= omamer_run(proteomes_omamerdb_inputhog)
139131
ready_omamer_run_c = ready_omamer_run.collect()
140132

141-
(rhogs_folder, gene_id_dic_xml, ready_infer_roothogs) = infer_roothogs(ready_omamer_run_c, hogmap_folder, proteome_folder)
142-
// rhogs_folder and "rhogs_all" are the same
133+
(omamer_rhogs, gene_id_dic_xml, ready_infer_roothogs) = infer_roothogs(ready_omamer_run_c, hogmap_folder, proteome_folder)
143134
ready_infer_roothogs_c = ready_infer_roothogs.collect()
144135

145-
(rhogs_rest_list, rhogs_big_list, ready_batch_roothogs) = batch_roothogs(ready_infer_roothogs_c, rhogs_folder)
136+
(rhogs_rest_list, rhogs_big_list, ready_batch_roothogs) = batch_roothogs(ready_infer_roothogs_c, omamer_rhogs)
146137
ready_batch_roothogs_c = ready_batch_roothogs.collect()
147138

148139
species_tree = Channel.fromPath(params.species_tree)
@@ -155,16 +146,8 @@ workflow {
155146
rhogsrest_tree = rhogsrest.combine(species_tree)
156147
rhogsrest_tree_ready = rhogsrest_tree.combine(ready_batch_roothogs_c)
157148
(pickle_rest_rhog, msas_out_rest, genetrees_out_test, ready_hog_rest) = hog_rest(rhogsrest_tree_ready)
158-
all_pickles = pickle_big_rhog.mix(pickle_rest_rhog).collect() // all_pickles.view() // pickle_rhogs_folder = Channel.fromPath(params.output_folder+"/pickle_rhogs")
159149

160-
(orthoxml_file, OrthologousGroupsFasta, OrthologousGroups_tsv, rootHOGs_tsv) = collect_subhogs(ready_hog_rest.collect(), ready_hog_big.collect(), all_pickles, gene_id_dic_xml, rhogs_folder) // pickles_rhogs_folder
150+
(orthoxml_file, OrthologousGroupsFasta, OrthologousGroups_tsv, rootHOGs_tsv) = collect_subhogs(ready_hog_rest.collect(), ready_hog_big.collect(), pickles_temp, gene_id_dic_xml, omamer_rhogs)
161151
orthoxml_file.view{" output orthoxml file ${it}"}
162152

163153
}
164-
165-
// memory {12.GB * (2*task.attempt - 1)}
166-
// time {24.hour}
167-
// errorStrategy {
168-
// task.exitStatus in [1,99,143,137,104,134,139,145,140] ? ‘retry’ : ‘terminate’
169-
// }
170-
// maxRetries 4

0 commit comments

Comments
 (0)