Skip to content

Commit ebce1c7

Browse files
committed
update readme
1 parent 87877e9 commit ebce1c7

3 files changed

Lines changed: 85 additions & 145 deletions

File tree

FastOMA.nf

Lines changed: 78 additions & 142 deletions
Original file line numberDiff line numberDiff line change
@@ -1,229 +1,165 @@
11

2-
2+
// NXF_WRAPPER_STAGE_FILE_THRESHOLD='50000'
33

44
params.input_folder = "./in_folder/"
55
params.output_folder = "./out_folder/"
66
params.proteome_folder = params.input_folder + "/proteome"
77
params.proteomes = params.proteome_folder + "/*"
8-
params.hogmap_input_folder = params.input_folder + "/hogmap_input_folder"
9-
8+
params.hogmap_in = params.input_folder + "/hogmap_in"
109

1110
params.hogmap_folder = params.output_folder + "/hogmap"
12-
params.rhogs_folder = params.output_folder + "/rhogs_all"
11+
//params.rhogs_folder = params.output_folder + "/rhogs_all"
1312
params.species_tree = params.input_folder + "/species_tree.nwk"
14-
params.pickles_rhogs_folder = params.output_folder + "/pickle_rhogs"
13+
//params.pickles_rhogs_folder = params.output_folder + "/pickle_rhogs"
1514
params.genetrees_folder = params.output_folder + "/genetrees"
1615

1716

18-
// https://github.com/nextflow-io/nextflow/issues/1629
19-
// https://www.nextflow.io/docs/latest/process.html?highlight=cache#cache
20-
21-
// todo clean up this file
22-
2317
process omamer_run{
2418
time {4.h}
25-
memory {50.GB}
26-
cpus 10
19+
memory {16.GB}
2720
publishDir params.hogmap_folder
2821
input:
29-
path proteomes_omamerdb_inputhog
22+
path proteomes_omamerdb_inputhog
3023
output:
31-
path "*.hogmap"
32-
val true // ready_omamer_run
24+
path "*.hogmap"
25+
val true
3326
script:
34-
// omamer search --db ${proteomes_omamerdb[1]} --query ${proteomes_omamerdb[0]} --nthreads 1 --out ${proteomes_omamerdb[0]}.hogmap
35-
// cp /work/FAC/FBM/DBC/cdessim2/default/smajidi1/qfo_hogmap/${proteomes_omamerdb[0]}.hogmap .
3627
"""
3728
if [ -f ${proteomes_omamerdb_inputhog[2]}/${proteomes_omamerdb_inputhog[0]}.hogmap ]
3829
then
3930
cp ${proteomes_omamerdb_inputhog[2]}/${proteomes_omamerdb_inputhog[0]}.hogmap ${proteomes_omamerdb_inputhog[0]}.hogmap
4031
else
41-
omamer search --db ${proteomes_omamerdb_inputhog[1]} --query ${proteomes_omamerdb_inputhog[0]} --nthreads 10 --out ${proteomes_omamerdb_inputhog[0]}.hogmap
32+
omamer search --db ${proteomes_omamerdb_inputhog[1]} --query ${proteomes_omamerdb_inputhog[0]} --out ${proteomes_omamerdb_inputhog[0]}.hogmap
4233
fi
43-
"""
44-
34+
""" // --nthreads 10
4535
}
4636

4737

48-
process infer_roothogs{
49-
publishDir params.rhogs_folder // "${params.output_folder}/rhogs_all"
38+
process infer_roothogs{ // publishDir params.rhogs_folder
5039
input:
51-
val ready_omamer_run
52-
path hogmap_folder
53-
path proteome_folder
40+
val ready_omamer_run
41+
path hogmap_folder
42+
path proteome_folder
5443
output:
55-
path "*.fa"
56-
path "gene_id_dic_xml.pickle"
57-
val true // ready_infer_roothogs nextflow-io.github.io/patterns/state-dependency/
44+
path "rhogs_all" // path "rhogs_all/*"
45+
path "gene_id_dic_xml.pickle"
46+
val true // nextflow-io.github.io/patterns/state-dependency/
5847
script:
59-
"""
60-
infer-roothogs --logger-level DEBUG
61-
"""
48+
"""
49+
infer-roothogs --logger-level DEBUG
50+
"""
6251
}
6352

64-
process batch_roothogs{
65-
publishDir params.output_folder
53+
54+
process batch_roothogs{ // publishDir params.output_folder
6655
input:
67-
val ready_infer_roothogs
68-
path rhogs_folder //"${params.output_folder}/rhogs_all"
56+
val ready_infer_roothogs
57+
//path rhogs_folder
58+
path "rhogs_all"
6959
output:
70-
path "rhogs_rest/*", optional: true
71-
path "rhogs_big/*" , optional: true
72-
val true
60+
path "rhogs_rest/*", optional: true
61+
path "rhogs_big/*" , optional: true
62+
val true
7363
script:
74-
"""
75-
batch-roothogs
76-
"""
64+
"""
65+
batch-roothogs
66+
"""
7767
}
7868

79-
process hog_big{
69+
process hog_big{ //publishDir params.pickles_rhogs_folder
8070
cpus 6
81-
time {20.h} // for very big rhog it might need more, or you could re-run and add `-resume`
82-
memory {80.GB}
83-
84-
publishDir params.pickles_rhogs_folder
85-
71+
time {20.h} // for very big rhog it might need more, or you could re-run and add `-resume`
72+
memory {20.GB}
8673
input:
87-
// val ready_batch_roothogs
88-
// path rhogsbig_tree // = rhogsbig.combine(species_tree)
89-
// rhogs_big_i //"$rhogs_big/*.fa"
90-
// path "species_tree.nwk"
91-
val rhogsbig_tree_ready
74+
val rhogsbig_tree_ready
9275
output:
93-
path "*.pickle"
94-
95-
path "*.fa", optional: true // msa if write True
96-
path "*.nwk", optional: true // gene trees if write True
97-
98-
val true
99-
// path "pi_big_subhog/*"
100-
// pi_big rhogs_big
101-
// params.species_tree
102-
76+
path "pickle_rhogs/*.pickle"
77+
// path "*.pickle"
78+
path "*.fa", optional: true // msa if write True
79+
path "*.nwk", optional: true // gene trees if write True
80+
val true
10381
script:
104-
"""
105-
infer-subhogs --input-rhog-folder ${rhogsbig_tree_ready[0]} --species-tree ${rhogsbig_tree_ready[1]} --parallel --fragment-detection --low-so-detection
106-
"""
82+
"""
83+
infer-subhogs --input-rhog-folder ${rhogsbig_tree_ready[0]} --species-tree ${rhogsbig_tree_ready[1]} --parallel --fragment-detection --low-so-detection
84+
"""
10785
}
10886

109-
110-
process hog_rest{
111-
112-
publishDir params.pickles_rhogs_folder
113-
// publishDir(
114-
// path: {params.pickles_rhogs_folder},
115-
// pattern: {"*.pickle"}
116-
// )
117-
// publishDir(
118-
// path: {params.genetrees_folder},
119-
// pattern: {"*.nwk"}
120-
// )
121-
87+
process hog_rest{ //publishDir params.pickles_rhogs_folder
12288
input:
123-
// val ready_batch_roothogs
124-
//path rhogsrest_tree // = rhogsrest.combine(species_tree)
125-
val rhogsrest_tree_ready
126-
89+
val rhogsrest_tree_ready
12790
output:
128-
path "*.pickle"
129-
130-
path "*.fa" , optional: true // msa if write True
131-
path "*.nwk" , optional: true // gene trees if write True
132-
133-
val true
91+
path "pickle_rhogs/*.pickle"
92+
// path "*.pickle"
93+
//path "pickle_rhogs/*.pickle"
94+
path "*.fa" , optional: true // msa if write True
95+
path "*.nwk" , optional: true // gene trees if write True
96+
val true
13497
script:
135-
"""
136-
infer-subhogs --input-rhog-folder ${rhogsrest_tree_ready[0]} --species-tree ${rhogsrest_tree_ready[1]} --fragment-detection --low-so-detection
137-
""" // --parrallel False
98+
"""
99+
infer-subhogs --input-rhog-folder ${rhogsrest_tree_ready[0]} --species-tree ${rhogsrest_tree_ready[1]} --fragment-detection --low-so-detection
100+
""" // --parrallel False
138101
}
139102

140-
process collect_subhogs{
141103

104+
process collect_subhogs{
142105
memory {50.GB}
143106
publishDir params.output_folder, mode: 'copy'
144107
input:
145-
val ready_hog_rest
146-
val ready_hog_big
147-
// path pickle_rhogs // this is for depenedcy
148-
path "pickle_rhogs" // this is the folder includes pickles_rhogs
149-
path "gene_id_dic_xml.pickle"
150-
108+
val ready_hog_rest
109+
val ready_hog_big // path pickle_rhogs // this is for depenedcy
110+
path "pickle_rhogs/" // "*.pickle" // path "pickle_rhogs" // this is the folder includes pickles_rhogs
111+
path "gene_id_dic_xml.pickle"
112+
path "rhogs_all"
151113
output:
152-
path "output_hog_.orthoxml"
153-
114+
path "output_hog.orthoxml"
115+
path "OrthologousGroupsFasta"
116+
path "OrthologousGroups.tsv"
117+
path "rootHOGs.tsv"
154118
script:
155-
"""
156-
collect-subhogs
157-
"""
119+
"""
120+
collect-subhogs
121+
"""
158122
}
159123

160124

161-
162125
workflow {
163126
proteomes = Channel.fromPath(params.proteomes, type:'any' ,checkIfExists:true)
164127
proteome_folder = Channel.fromPath(params.proteome_folder)
165128
hogmap_folder = Channel.fromPath(params.hogmap_folder)
166-
rhogs_folder = Channel.fromPath(params.rhogs_folder)
129+
// rhogs_folder = Channel.fromPath(params.rhogs_folder)
167130

168131
genetrees_folder = Channel.fromPath(params.genetrees_folder)
169-
hogmap_input_folder = Channel.fromPath(params.hogmap_input_folder)
132+
hogmap_in = Channel.fromPath(params.hogmap_in)
170133

171-
pickles_rhogs_folder = Channel.fromPath(params.pickles_rhogs_folder)
172-
omamerdb = Channel.fromPath(params.input_folder+"/omamerdb.h5")
173-
// proteomes.view{"prot ${it}"}
134+
// pickles_rhogs_folder = Channel.fromPath(params.pickles_rhogs_folder)
135+
omamerdb = Channel.fromPath(params.input_folder+"/omamerdb.h5") // proteomes.view{"prot ${it}"}
174136
proteomes_omamerdb = proteomes.combine(omamerdb)
175-
proteomes_omamerdb_inputhog = proteomes_omamerdb.combine(hogmap_input_folder)
176-
// proteomes_omamerdb_inputhog.view{" rhogsbig ${it}"}
177-
137+
proteomes_omamerdb_inputhog = proteomes_omamerdb.combine(hogmap_in) // proteomes_omamerdb_inputhog.view{" rhogsbig ${it}"}
178138
(hogmap, ready_omamer_run)= omamer_run(proteomes_omamerdb_inputhog)
179-
// (hogmap, ready_omamer_run)= omamer_run(proteomes_omamerdb)
180139
ready_omamer_run_c = ready_omamer_run.collect()
181-
// hogmaps.view{"hogmap ${it}"}
182-
183-
// proteome_folder.view{"proteome_folder ${it} "}
184-
// (rhogs, gene_id_dic_xml) = infer_roothogs(hogmaps, hogmap_folder, proteome_folder)
185-
(rhogs, gene_id_dic_xml, ready_infer_roothogs) = infer_roothogs(ready_omamer_run_c, hogmap_folder, proteome_folder)
186-
// rhogs.view{"rhogs ${it}"}
187-
// rhogs_folder.view{"rhogs_folder xx ${it}"}
188140

141+
(rhogs_folder, gene_id_dic_xml, ready_infer_roothogs) = infer_roothogs(ready_omamer_run_c, hogmap_folder, proteome_folder)
142+
// rhogs_folder and "rhogs_all" are the same
189143
ready_infer_roothogs_c = ready_infer_roothogs.collect()
144+
190145
(rhogs_rest_list, rhogs_big_list, ready_batch_roothogs) = batch_roothogs(ready_infer_roothogs_c, rhogs_folder)
191146
ready_batch_roothogs_c = ready_batch_roothogs.collect()
192147

193-
// ready_batch_roothogs_c.view{" ready_batch_roothogs_c 44 ${it}"}
194-
195148
species_tree = Channel.fromPath(params.species_tree)
196149
rhogsbig = rhogs_big_list.flatten()
197-
// rhogsbig.view{" rhogsbig ${it}"}
198150
rhogsbig_tree = rhogsbig.combine(species_tree)
199-
rhogsbig_tree_ready = rhogsbig_tree.combine(ready_batch_roothogs)
200-
rhogsbig_tree_ready.view{"rhogsbig_tree_ready ${it}"}
151+
rhogsbig_tree_ready = rhogsbig_tree.combine(ready_batch_roothogs) // rhogsbig_tree_ready.view{"rhogsbig_tree_ready ${it}"}
201152
(pickle_big_rhog, msas_out, genetrees_out, ready_hog_big) = hog_big(rhogsbig_tree_ready)
202153

203154
rhogsrest = rhogs_rest_list.flatten()
204-
// rhogsrest.view{" rhogs rest ${it}"}
205155
rhogsrest_tree = rhogsrest.combine(species_tree)
206-
207-
208156
rhogsrest_tree_ready = rhogsrest_tree.combine(ready_batch_roothogs_c)
209-
// rhogsrest_tree_ready.view{"rhogsrest_tree_ready ${it}"}
210-
211157
(pickle_rest_rhog, msas_out_rest, genetrees_out_test, ready_hog_rest) = hog_rest(rhogsrest_tree_ready)
158+
all_pickles = pickle_big_rhog.mix(pickle_rest_rhog).collect() // all_pickles.view() // pickle_rhogs_folder = Channel.fromPath(params.output_folder+"/pickle_rhogs")
212159

213-
// pickle_rest_rhog.flatten().view{" pickle_rest_rhog rest ${it}"}
214-
// pickle_big_rhog.flatten().view{" pickle_big_rhog rest ${it}"}
215-
prb = pickle_big_rhog.collect()
216-
prr = pickle_rest_rhog.collect()
217-
all_pickles = prb.mix(prr)
218-
// gene_id_dic_xml = Channel.fromPath("gene_id_dic_xml.pickle")
219-
pickle_rhogs_folder = Channel.fromPath(params.output_folder+"/pickle_rhogs")
220-
// orthoxml_file = collect_subhogs(all_pickles.collect(), pickle_rhogs_folder, gene_id_dic_xml)
221-
222-
orthoxml_file = collect_subhogs(ready_hog_rest.collect(), ready_hog_big.collect(), pickles_rhogs_folder, gene_id_dic_xml)
160+
(orthoxml_file, OrthologousGroupsFasta, OrthologousGroups_tsv, rootHOGs_tsv) = collect_subhogs(ready_hog_rest.collect(), ready_hog_big.collect(), all_pickles, gene_id_dic_xml, rhogs_folder) // pickles_rhogs_folder
223161
orthoxml_file.view{" output orthoxml file ${it}"}
224162

225-
226-
227163
}
228164

229165
// memory {12.GB * (2*task.attempt - 1)}

FastOMA/_config.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,11 @@
8181

8282
# batch_roothogs
8383
big_rhog_filesize_thresh = 600 * 1000
84-
sum_list_rhogs_filesize_thresh = 2 * 1e6
84+
sum_list_rhogs_filesize_thresh = 1 * 1e5
85+
86+
#big_rhog_filesize_thresh = 600 * 1000
87+
#sum_list_rhogs_filesize_thresh = 2 * 1e6
88+
8589

8690
# big_rhog_filesize_thresh = 1.6 * 1000 # 600 would be better
8791
# sum_list_rhogs_filesize_thresh = 5 * 1e3

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -176,12 +176,12 @@ This means that if you remove the `work` folder, you will not have access to the
176176
If you are working on a large scale project, you may need to change the limitation on the number of files opened in linux using `ulimit -n 271072`.
177177

178178
### using omamer's output
179-
The first step of the FastOMA pipeline is to run [OMAmer](https://github.com/DessimozLab/omamer). If you already have the hogmap files, you can put them in the `in_folder/hogmap_input_folder`.
179+
The first step of the FastOMA pipeline is to run [OMAmer](https://github.com/DessimozLab/omamer). If you already have the hogmap files, you can put them in the `in_folder/hogmap_in`.
180180
Then your structure of files will be
181181
```
182182
$ tree ../testdata/
183183
├── in_folder
184-
│ ├── hogmap_input_folder
184+
│ ├── hogmap_in
185185
│ │ ├── CHLTR.fa.hogmap
186186
│ │ ├── MYCGE.fa.hogmap
187187
│ ├── omamerdb.h5

0 commit comments

Comments
 (0)