Skip to content

Commit 1abbc98

Browse files
committed
Parallelized PhyML, retains temp files, reuses tree files.
1 parent 5f1c7e7 commit 1abbc98

2 files changed

Lines changed: 232 additions & 144 deletions

File tree

bin/04_CallCodeML.py

Lines changed: 31 additions & 144 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
'''This program will run CodeML on a directory of single gene alignments.
22
It will generate a unique control file and tree file for each input gene
33
before invoking CodeML using the number of CPUs specified by the user
4-
(default =1).
4+
(default = 1).
55
66
Copyright 2016 by Shawn Rupp'''
77

@@ -14,6 +14,7 @@
1414
from multiprocessing import Pool, cpu_count
1515
import argparse
1616
import shutil
17+
import math
1718
import os
1819
import re
1920

@@ -40,142 +41,22 @@ def outputFiles(outdir):
4041
completed = fin.readlines()
4142
return finished, completed
4243

43-
def controlFiles(indir, outdir, forward, completed):
44+
def controlFiles(indir, outdir, forward, cpu):
4445
'''Reads input files and stores them in memory'''
45-
go = False
46-
pairwise = False
47-
multiple = False
4846
# Make temp directory
47+
tmp = outdir + "tmp/"
4948
try:
50-
os.mkdir(outdir + "tmp/")
49+
os.mkdir(tmp)
5150
except FileExistsError:
5251
pass
53-
path = outdir.split("/")[:-2]
54-
out = ""
55-
for i in path:
56-
out += i + "/"
57-
control = glob(out + "*.ctl")
58-
if len(control) > 1:
59-
# Quit if multiple .ctl files are present
60-
print("\n\tPlease provide only one control file for CodeML.\n")
61-
quit()
62-
with open(control[0], "r") as infile:
63-
ctl = infile.readlines()
64-
for line in ctl:
65-
# Determine if a phylogenic tree is needed
66-
if "runmode = 0" in line or "runmode = 1" in line:
67-
go = makeTree(indir, outdir, forward, completed, ctl)
68-
multiple = True
69-
elif "runmode = -2" in line:
70-
go = pairwiseControl(indir, outdir, completed, ctl)
71-
pairwise = True
72-
if go == True:
73-
return pairwise, multiple
74-
75-
#-----------------------------------------------------------------------------
76-
77-
def makeTree(indir, outdir, forward, completed, ctl):
78-
'''Calls PhyML to create a gene tree.'''
79-
print("\tRunnning PhyMl to create gene trees...")
80-
genes = glob(indir + "*.phylip")
81-
for gene in genes:
82-
filename = gene.split("/")[-1]
83-
geneid = filename.split(".")[0]
84-
if (geneid + "\n") in completed:
85-
# Skip genes which have finished CodeML
86-
pass
87-
elif filename.split(".")[1] == "2":
88-
# Skip pairwise genes (PhyMl will not make a tree)
89-
pass
90-
else:
91-
# Create temp directory
92-
wd = outdir + "tmp/" + geneid + "/"
93-
try:
94-
os.mkdir(wd)
95-
except FileExistsError:
96-
pass
97-
# Set unique file names
98-
outfile = (outdir + geneid +"."+ filename.split(".")[1] + ".mlc")
99-
tempctl = wd + geneid + ".ctl"
100-
treefile = wd + filename + "_phyml_tree.txt"
101-
# Make unique control file
102-
makeCtl(gene, outfile, tempctl, treefile, ctl)
103-
# Call PhyML to make gene tree
104-
phy = Popen(split("PhyML/PhyML -q -i " + gene), stdout = DEVNULL)
105-
phy.wait()
106-
if phy.returncode == 0:
107-
# Move PhyML output to temp directory
108-
output = glob(gene + "_phyml_*")
109-
for i in output:
110-
shutil.copy(i, wd)
111-
os.remove(i)
112-
# Read in PhyML tree
113-
with open(treefile, "r") as genetree:
114-
try:
115-
tree = genetree.readlines()[0]
116-
except IndexError:
117-
pass
118-
# Remove branch lables introduced by PhyML
119-
tree = re.sub(r"\d+\.\d+:", ":", tree)
120-
# Add forward node to tree if specified
121-
if forward:
122-
if forward in tree:
123-
# Determine location and length of species name
124-
i = tree.index(forward) + len(forward)
125-
if ":" in tree:
126-
# Find end of branch length
127-
comma = tree.find(",", i)
128-
paren = tree.find(")", i)
129-
i = min([comma, paren])
130-
# Insert space and node symbol after species name
131-
tree = (tree[:i] + " #1" + tree[i:])
132-
elif forward not in tree:
133-
pass
134-
with open(treefile, "w") as outtree:
135-
# Overwrite treefile
136-
string = ""
137-
for i in tree:
138-
string += i
139-
outtree.write(string)
140-
return True
141-
142-
def pairwiseControl(indir, outdir, completed, ctl):
143-
'''Makes control files for pairwsie comparisons.'''
144-
print("\tMaking pairwise CodeML control files...")
145-
genes = glob(indir + "*.phylip")
146-
for gene in genes:
147-
filename = gene.split("/")[-1]
148-
geneid = filename.split(".")[0]
149-
if (geneid + "\n") in completed:
150-
# Skip genes which have finished CodeML
151-
pass
152-
else:
153-
# Create temp directory
154-
wd = outdir + "tmp/" + geneid + "/"
155-
try:
156-
os.mkdir(wd)
157-
except FileExistsError:
158-
pass
159-
# Set unique file names
160-
outfile = (outdir + geneid +"."+ filename.split(".")[1] + ".mlc")
161-
tempctl = wd + geneid + ".ctl"
162-
treefile = ""
163-
# Make unique control file
164-
makeCtl(gene, outfile, tempctl, treefile, ctl)
165-
return True
166-
167-
def makeCtl(gene, outfile, tempctl, treefile, ctl):
168-
'''Creates unique control file'''
169-
with open(tempctl, "w") as temp:
170-
for line in ctl:
171-
if "seqfile" in line:
172-
temp.write("\tseqfile = " + gene + "\n")
173-
elif "outfile" in line:
174-
temp.write("\toutfile = " + outfile + "\n")
175-
elif "treefile" in line:
176-
temp.write("\ttreefile = " + treefile + "\n")
177-
else:
178-
temp.write(line)
52+
cmd = ("python bin/04_callPhyML.py -i" + indir + " -o " + tmp +
53+
" -t " + str(cpu))
54+
if forward:
55+
cmd += " -f " + forward
56+
phyml = Popen(split(cmd))
57+
phyml.wait()
58+
#if phyml.returncode() == 0:
59+
return True
17960

18061
#-----------------------------------------------------------------------------
18162

@@ -184,11 +65,11 @@ def runCodeml(ap, outdir, finished, completed, gene):
18465
filename = gene.split("/")[-1]
18566
geneid = filename.split(".")[0]
18667
wd = outdir + "tmp/" + geneid + "/"
187-
if len(glob(wd + "*")) > 1:
188-
if filename.split(".")[1] == "2":
68+
if filename.split(".")[1] == "2":
69+
if len(glob(wd + "*")) > 1:
18970
# Skip pairwise genes if tree files are present
19071
pass
191-
if (geneid + "\n") in completed:
72+
elif (geneid + "\n") in completed:
19273
pass
19374
else:
19475
tempctl = wd + geneid + ".ctl"
@@ -197,9 +78,10 @@ def runCodeml(ap, outdir, finished, completed, gene):
19778
cm = Popen(split(ap + "/paml/bin/codeml " + tempctl),
19879
stdout = DEVNULL)
19980
cm.wait()
200-
# Append gene ID to list of finishedCodeML.txt
201-
with open(finished, "a") as fin:
202-
fin.write(geneid + "\n")
81+
if cm.returncode == 0:
82+
# Append gene ID to list of finishedCodeML.txt
83+
with open(finished, "a") as fin:
84+
fin.write(geneid + "\n")
20385

20486
#-----------------------------------------------------------------------------
20587

@@ -216,8 +98,8 @@ def main():
21698
parser.add_argument("-t", type=int, default=1, help="Number of threads.")
21799
parser.add_argument("-f", default="",
218100
help="Forward species (name must be the same as it appears in input files.")
219-
parser.add_argument("--noCleanUp", action="store_false",
220-
help="Keep temporary files.")
101+
parser.add_argument("--cleanUp", action="store_true",
102+
help="Remove temporary files (it may be useful to retain phylogenic trees for future use).")
221103
args = parser.parse_args()
222104
# Assign arguments
223105
indir = args.i
@@ -230,18 +112,23 @@ def main():
230112
if cpu > MAXCPU:
231113
cpu = MAXCPU
232114
forward = args.f
233-
cleanup = args.noCleanUp
115+
cleanup = args.cleanUp
234116
# Reads in required data
235117
finished, completed = outputFiles(outdir)
236-
pairwise, multiple = controlFiles(indir, outdir, forward, completed)
237-
if pairwise == True or multiple == True:
118+
run = controlFiles(indir, outdir, forward, cpu)
119+
if run == True:
238120
# Call CodeML after PhyML completes.
239121
genes = glob(indir + "*.phylip")
240122
l = int(len(genes))
123+
# Determine chunksize
124+
if l <= cpu:
125+
chunk = 1
126+
elif l > cpu:
127+
chunk = int(math.ceil(l/cpu))
241128
pool = Pool(processes = cpu)
242129
func = partial(runCodeml, ap, outdir, finished, completed)
243130
print("\tRunning CodeML with", str(cpu), "threads....")
244-
rcml = pool.imap(func, genes, chunksize = int(l/cpu))
131+
rcml = pool.imap(func, genes, chunksize = chunk)
245132
pool.close()
246133
pool.join()
247134
# Remove tmp directory

0 commit comments

Comments
 (0)