11'''This program will run CodeML on a directory of single gene alignments.
22It will generate a unique control file and tree file for each input gene
33before invoking CodeML using the number of CPUs specified by the user
4- (default =1).
4+ (default = 1).
55
66 Copyright 2016 by Shawn Rupp'''
77
1414from multiprocessing import Pool , cpu_count
1515import argparse
1616import shutil
17+ import math
1718import os
1819import re
1920
@@ -40,142 +41,22 @@ def outputFiles(outdir):
4041 completed = fin .readlines ()
4142 return finished , completed
4243
43- def controlFiles (indir , outdir , forward , completed ):
44+ def controlFiles (indir , outdir , forward , cpu ):
4445 '''Reads input files and stores them in memory'''
45- go = False
46- pairwise = False
47- multiple = False
4846 # Make temp directory
47+ tmp = outdir + "tmp/"
4948 try :
50- os .mkdir (outdir + " tmp/" )
49+ os .mkdir (tmp )
5150 except FileExistsError :
5251 pass
53- path = outdir .split ("/" )[:- 2 ]
54- out = ""
55- for i in path :
56- out += i + "/"
57- control = glob (out + "*.ctl" )
58- if len (control ) > 1 :
59- # Quit if multiple .ctl files are present
60- print ("\n \t Please provide only one control file for CodeML.\n " )
61- quit ()
62- with open (control [0 ], "r" ) as infile :
63- ctl = infile .readlines ()
64- for line in ctl :
65- # Determine if a phylogenic tree is needed
66- if "runmode = 0" in line or "runmode = 1" in line :
67- go = makeTree (indir , outdir , forward , completed , ctl )
68- multiple = True
69- elif "runmode = -2" in line :
70- go = pairwiseControl (indir , outdir , completed , ctl )
71- pairwise = True
72- if go == True :
73- return pairwise , multiple
74-
75- #-----------------------------------------------------------------------------
76-
77- def makeTree (indir , outdir , forward , completed , ctl ):
78- '''Calls PhyML to create a gene tree.'''
79- print ("\t Runnning PhyMl to create gene trees..." )
80- genes = glob (indir + "*.phylip" )
81- for gene in genes :
82- filename = gene .split ("/" )[- 1 ]
83- geneid = filename .split ("." )[0 ]
84- if (geneid + "\n " ) in completed :
85- # Skip genes which have finished CodeML
86- pass
87- elif filename .split ("." )[1 ] == "2" :
88- # Skip pairwise genes (PhyMl will not make a tree)
89- pass
90- else :
91- # Create temp directory
92- wd = outdir + "tmp/" + geneid + "/"
93- try :
94- os .mkdir (wd )
95- except FileExistsError :
96- pass
97- # Set unique file names
98- outfile = (outdir + geneid + "." + filename .split ("." )[1 ] + ".mlc" )
99- tempctl = wd + geneid + ".ctl"
100- treefile = wd + filename + "_phyml_tree.txt"
101- # Make unique control file
102- makeCtl (gene , outfile , tempctl , treefile , ctl )
103- # Call PhyML to make gene tree
104- phy = Popen (split ("PhyML/PhyML -q -i " + gene ), stdout = DEVNULL )
105- phy .wait ()
106- if phy .returncode == 0 :
107- # Move PhyML output to temp directory
108- output = glob (gene + "_phyml_*" )
109- for i in output :
110- shutil .copy (i , wd )
111- os .remove (i )
112- # Read in PhyML tree
113- with open (treefile , "r" ) as genetree :
114- try :
115- tree = genetree .readlines ()[0 ]
116- except IndexError :
117- pass
118- # Remove branch lables introduced by PhyML
119- tree = re .sub (r"\d+\.\d+:" , ":" , tree )
120- # Add forward node to tree if specified
121- if forward :
122- if forward in tree :
123- # Determine location and length of species name
124- i = tree .index (forward ) + len (forward )
125- if ":" in tree :
126- # Find end of branch length
127- comma = tree .find ("," , i )
128- paren = tree .find (")" , i )
129- i = min ([comma , paren ])
130- # Insert space and node symbol after species name
131- tree = (tree [:i ] + " #1" + tree [i :])
132- elif forward not in tree :
133- pass
134- with open (treefile , "w" ) as outtree :
135- # Overwrite treefile
136- string = ""
137- for i in tree :
138- string += i
139- outtree .write (string )
140- return True
141-
142- def pairwiseControl (indir , outdir , completed , ctl ):
143- '''Makes control files for pairwsie comparisons.'''
144- print ("\t Making pairwise CodeML control files..." )
145- genes = glob (indir + "*.phylip" )
146- for gene in genes :
147- filename = gene .split ("/" )[- 1 ]
148- geneid = filename .split ("." )[0 ]
149- if (geneid + "\n " ) in completed :
150- # Skip genes which have finished CodeML
151- pass
152- else :
153- # Create temp directory
154- wd = outdir + "tmp/" + geneid + "/"
155- try :
156- os .mkdir (wd )
157- except FileExistsError :
158- pass
159- # Set unique file names
160- outfile = (outdir + geneid + "." + filename .split ("." )[1 ] + ".mlc" )
161- tempctl = wd + geneid + ".ctl"
162- treefile = ""
163- # Make unique control file
164- makeCtl (gene , outfile , tempctl , treefile , ctl )
165- return True
166-
167- def makeCtl (gene , outfile , tempctl , treefile , ctl ):
168- '''Creates unique control file'''
169- with open (tempctl , "w" ) as temp :
170- for line in ctl :
171- if "seqfile" in line :
172- temp .write ("\t seqfile = " + gene + "\n " )
173- elif "outfile" in line :
174- temp .write ("\t outfile = " + outfile + "\n " )
175- elif "treefile" in line :
176- temp .write ("\t treefile = " + treefile + "\n " )
177- else :
178- temp .write (line )
52+ cmd = ("python bin/04_callPhyML.py -i" + indir + " -o " + tmp +
53+ " -t " + str (cpu ))
54+ if forward :
55+ cmd += " -f " + forward
56+ phyml = Popen (split (cmd ))
57+ phyml .wait ()
58+ #if phyml.returncode() == 0:
59+ return True
17960
18061#-----------------------------------------------------------------------------
18162
@@ -184,11 +65,11 @@ def runCodeml(ap, outdir, finished, completed, gene):
18465 filename = gene .split ("/" )[- 1 ]
18566 geneid = filename .split ("." )[0 ]
18667 wd = outdir + "tmp/" + geneid + "/"
187- if len ( glob ( wd + "*" )) > 1 :
188- if filename . split ( "." )[ 1 ] == "2" :
68+ if filename . split ( "." )[ 1 ] == "2" :
69+ if len ( glob ( wd + "*" )) > 1 :
18970 # Skip pairwise genes if tree files are present
19071 pass
191- if (geneid + "\n " ) in completed :
72+ elif (geneid + "\n " ) in completed :
19273 pass
19374 else :
19475 tempctl = wd + geneid + ".ctl"
@@ -197,9 +78,10 @@ def runCodeml(ap, outdir, finished, completed, gene):
19778 cm = Popen (split (ap + "/paml/bin/codeml " + tempctl ),
19879 stdout = DEVNULL )
19980 cm .wait ()
200- # Append gene ID to list of finishedCodeML.txt
201- with open (finished , "a" ) as fin :
202- fin .write (geneid + "\n " )
81+ if cm .returncode == 0 :
82+ # Append gene ID to list of finishedCodeML.txt
83+ with open (finished , "a" ) as fin :
84+ fin .write (geneid + "\n " )
20385
20486#-----------------------------------------------------------------------------
20587
@@ -216,8 +98,8 @@ def main():
21698 parser .add_argument ("-t" , type = int , default = 1 , help = "Number of threads." )
21799 parser .add_argument ("-f" , default = "" ,
218100help = "Forward species (name must be the same as it appears in input files." )
219- parser .add_argument ("--noCleanUp " , action = "store_false " ,
220- help = "Keep temporary files." )
101+ parser .add_argument ("--cleanUp " , action = "store_true " ,
102+ help = "Remove temporary files (it may be useful to retain phylogenic trees for future use) ." )
221103 args = parser .parse_args ()
222104 # Assign arguments
223105 indir = args .i
@@ -230,18 +112,23 @@ def main():
230112 if cpu > MAXCPU :
231113 cpu = MAXCPU
232114 forward = args .f
233- cleanup = args .noCleanUp
115+ cleanup = args .cleanUp
234116 # Reads in required data
235117 finished , completed = outputFiles (outdir )
236- pairwise , multiple = controlFiles (indir , outdir , forward , completed )
237- if pairwise == True or multiple == True :
118+ run = controlFiles (indir , outdir , forward , cpu )
119+ if run == True :
238120 # Call CodeML after PhyML completes.
239121 genes = glob (indir + "*.phylip" )
240122 l = int (len (genes ))
123+ # Determine chunksize
124+ if l <= cpu :
125+ chunk = 1
126+ elif l > cpu :
127+ chunk = int (math .ceil (l / cpu ))
241128 pool = Pool (processes = cpu )
242129 func = partial (runCodeml , ap , outdir , finished , completed )
243130 print ("\t Running CodeML with" , str (cpu ), "threads...." )
244- rcml = pool .imap (func , genes , chunksize = int ( l / cpu ) )
131+ rcml = pool .imap (func , genes , chunksize = chunk )
245132 pool .close ()
246133 pool .join ()
247134 # Remove tmp directory
0 commit comments