Skip to content

Commit 348a036

Browse files
committed
added modes for KaKs_Calculator, --noCleanUp option for CodeML, and streamlined input functions.
1 parent 6e8e265 commit 348a036

20 files changed

Lines changed: 1435 additions & 184 deletions

AlignmentProcessor.py

Lines changed: 35 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def readList(task, build, common):
3333
names.write(build + "\t" + common + "\n")
3434
quit()
3535

36-
def checkInput(axt, kaks, phylip, codeml):
36+
def checkInput(axt, kaks, phylip, codeml, outdir):
3737
'''Makes sure necessary programs are installed and proper file conversion
3838
is run for optional analysis program.'''
3939
if kaks == True:
@@ -60,6 +60,11 @@ def checkInput(axt, kaks, phylip, codeml):
6060
if phylip == False:
6161
print("\n\tError: Files must be converted into phylip format for use\
6262
with CodeML.\n")
63+
control = glob(outdir + "*.ctl")
64+
if len(control) == 0:
65+
print("\n\tPlease supply a control file for CodeML.\n")
66+
elif len(control) > 1:
67+
print("\n\tPlease supply only one CodeML control file.\n")
6368
quit()
6469

6570

@@ -169,10 +174,10 @@ def axtConvert(outdir, kaks, starttime):
169174
print("Finished converting files.")
170175
print("Total runtime: ", datetime.now() - starttime)
171176

172-
def calculateKaKs(outdir):
177+
def calculateKaKs(outdir, method):
173178
'''Calculates substition rates.'''
174179
print("Calculating Ka/Ks values...")
175-
ck = Popen(split("python bin/07_KaKsonDir.py " + outdir))
180+
ck = Popen(split("python bin/07_KaKsonDir.py " + outdir + " " + method))
176181
ck.wait()
177182
if ck.returncode == 0:
178183
return True
@@ -194,19 +199,24 @@ def phylipConvert(outdir, starttime, codeml):
194199
print("Total runtime: ", datetime.now() - starttime)
195200
return True
196201

197-
def runcodeml(cpu, outdir, forward, starttime):
202+
def runcodeml(cpu, outdir, forward, cleanup, starttime):
198203
'''Runs codeml on a directory.'''
199204
print("Running codeml...")
200-
cm = Popen(split("python bin/07_CodeMLonDir.py -i " + outdir
201-
+ " -t " + cpu + " -f " + forward))
205+
# Build commands and add options if necessary
206+
string = "python bin/07_CodeMLonDir.py -t " + cpu + " -i " + outdir
207+
if cleanup == False:
208+
string += " --noCleanUp"
209+
if forward:
210+
string += " -f " + forward
211+
cm = Popen(split(string))
202212
cm.wait()
203213
if cm.returncode == 0:
204214
print("Total runtime: ", datetime.now() - starttime)
205215

206216
#-----------------------------------------------------------------------------
207217

208218
def helplist():
209-
print("\nAlignmentProcessor0.21 Copyright 2016 by Shawn Rupp\n")
219+
print("\nAlignmentProcessor1.0 Copyright 2016 by Shawn Rupp\n")
210220
print("\texample usage: python AlignmentProcessor.py -% <decimal> \
211221
--axt/phylip --kaks/codeml -i <input fasta file> -o \
212222
<path to output directory> -r <reference species>\n")
@@ -217,16 +227,20 @@ def helplist():
217227
determine the open reading frame")
218228
print("\t--ucsc\tconverts headers of CDS fasta files obtained from \
219229
the UCSC genome browser")
220-
print("\t--axt\tConverts files to axt for use in KaKs_Calcuator.")
221-
print("\t--phylip\tConverts files to phylip for use in PhyML.")
222-
print("\t--kaks\tRuns KaKs_Calcuator if --axt is also specified")
230+
print("\t--axt\tconverts files to axt for use in KaKs_Calcuator.")
231+
print("\t--phylip\tconverts files to phylip for use in PhyML.")
232+
print("\t--kaks\truns KaKs_Calcuator if --axt is also specified")
233+
print("\t-m\tsets the method to calculate substitution rates in \
234+
KaKs_Calculator (NG by default).")
223235
print("\t--codeml\tRuns codeml if --phylip is also specified")
224236
print("\t-t\tnumber of threads to use for CodeML.")
225237
print("\t-f\tspecifies the forward branch of the CodeML input tree.")
226238
print("\t-%\tSets the percentage cutoff for the countBases step (50% \
227239
by default).")
228240
print("\t--changeNames\tchange genome build names to common names.")
229241
print("\t--retainStops\tretain sequences with internal stop codons")
242+
print("\t--noCleanUp\ttells the program to keep temporary control files, \
243+
tree files, etc. (These files are removed by default)")
230244
print("\t--printNameList\tprints list of genome build names and\
231245
associated common names")
232246
print("\t--addNameToList\tadd new genome build name and\
@@ -237,9 +251,10 @@ def helplist():
237251

238252
def main():
239253
starttime = datetime.now()
240-
# Set optional parameters to False:
254+
# Set optional parameters:
241255
commonNames = False
242256
retainStops = False
257+
cleanup = True
243258
axt = False
244259
phylip = False
245260
kaks = False
@@ -250,7 +265,7 @@ def main():
250265
percent = "0.5"
251266
ref = "void"
252267
forward = ""
253-
268+
method = "NG"
254269
for i in argv:
255270
# Print help list
256271
if len(argv) == 1:
@@ -261,7 +276,7 @@ def main():
261276
quit()
262277
# Other functions
263278
elif i == "-v" or i == "--version":
264-
print("\nAlignmentProcessor0.21 Copyright 2016 by Shawn Rupp\n")
279+
print("\nAlignmentProcessor1.0 Copyright 2016 by Shawn Rupp\n")
265280
print("This program comes with ABSOLUTELY NO WARRANTY\n")
266281
print("This is free software, and you are welcome to redistribute\
267282
it under certain conditions\n")
@@ -291,6 +306,8 @@ def main():
291306
phylip = True
292307
elif i == "--kaks":
293308
kaks = True
309+
elif i == "-m":
310+
method = argv[argv.index(i) + 1]
294311
elif i == "--codeml":
295312
codeml = True
296313
elif i == "--ucsc":
@@ -299,11 +316,13 @@ def main():
299316
commonNames = True
300317
elif i == "--retainStops":
301318
retainStops = True
319+
elif i == "--noCleanUp":
320+
cleanup = False
302321
# Check inout commands prior to running:
303322
if ref == "void":
304323
print("\n\tPlease pecify a reference species.\n")
305324
quit()
306-
checkInput(axt, kaks, phylip, codeml)
325+
checkInput(axt, kaks, phylip, codeml, outdir)
307326
# Save working directory to variable to call other scripts:
308327
path = os.getcwd()
309328
path = path + "/"
@@ -342,7 +361,7 @@ def main():
342361
# Run KaKs_Calculator:
343362
if kaks == True and codeml == False:
344363
if ac == True:
345-
ck = calculateKaKs(outdir)
364+
ck = calculateKaKs(outdir, method)
346365
if ck == True:
347366
printCSV(outdir, starttime)
348367
# Optionally covert files to phylip format:
@@ -352,7 +371,7 @@ def main():
352371
# Run codeml
353372
if codeml == True and kaks == False:
354373
if pc == True:
355-
runcodeml(cpu, outdir, forward, starttime)
374+
runcodeml(cpu, outdir, forward, cleanup, starttime)
356375

357376
if __name__ == "__main__":
358377
main()
71.2 KB
Binary file not shown.
Binary file not shown.

README.md

Lines changed: 59 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,15 @@
1111
GNU General Public License for more details.
1212

1313

14-
###############################################################
14+
##############################################################################
1515
# AlignmentProcessor0.21 Package
1616

1717
# Dependencies:
1818
Python 3
1919
Python 3 version of Biopython
2020
PAML (if using CodeML)
2121
PhyML (if using CodeML)
22-
###############################################################
22+
##############################################################################
2323

2424

2525
### Contents ###
@@ -155,9 +155,9 @@ invoke KaKs_Calculator on the whole directory again.
155155

156156
# Example Usage:
157157

158-
python AlignmentProcessor.py --ucsc --axt/phylip --kaks/codeml \
159-
--retainStops -% <decimal> -f <forward branch of codeml tree> \
160-
-r <reference species> -i <input fasta file> \
158+
python AlignmentProcessor.py --ucsc --axt/phylip --kaks/codeml
159+
--retainStops -% <decimal> -f <forward branch of codeml tree>
160+
-r <reference species> -i <input fasta file>
161161
-o <path to output directory>
162162

163163
# Required Arguments:
@@ -195,6 +195,9 @@ invoke KaKs_Calculator on the whole directory again.
195195
otherwise it will not run). Otherwise the program will quit
196196
after converting the files.
197197

198+
-m indicates the method for KaKs_Calculator to use to calculate
199+
substitution rates (see below).
200+
198201
--codeml will run codeml on all of the files in the
199202
06_phylipFiles directory. You must also supply a
200203
control file for CodeML which must be located in the
@@ -207,10 +210,16 @@ invoke KaKs_Calculator on the whole directory again.
207210
AlignmentProcessor can call multiple instances of CodeML to
208211
shorten overall run time. (Default = 1)
209212

210-
-f the build name (or common name if you use the --changeNames flag) of
211-
the species on the forward branch of the phylogneic tree supplied by
212-
PhyML. This species does not have to be the same as the reference
213-
species.
213+
-f the first ten characters (standard phylip format truncates the species
214+
names to ten characters) of the build name (or common name if you use
215+
the --changeNames flag) of the species on the forward branch of the
216+
phylogenetic tree supplied by PhyML. This species does not have to be
217+
the same as the reference species.
218+
219+
--noCleanUp tells the program to keep temporary CodeML control files,
220+
tree files, and other PhyML and CodeML output/temporary
221+
files which will be located in the tmp directory.
222+
These files are removed by default.
214223

215224
# Additional Commands
216225

@@ -223,7 +232,7 @@ invoke KaKs_Calculator on the whole directory again.
223232
common names. Must be run without any other arguments
224233

225234
--addNameToList will add an entry to the 02_nameList.txt file.
226-
e.g. python AlignmentProcessor.py -- addNameToList \
235+
e.g. python AlignmentProcessor.py --addNameToList
227236
<build> <common name>
228237
229238

@@ -238,25 +247,49 @@ invoke KaKs_Calculator on the whole directory again.
238247
new row, followed by a tab, then the desired common name. Make sure
239248
there are no spaces in either name.
240249

250+
# KaKs_Calculator Method
251+
252+
KaKs_Calculator can calculate substitution rates in a number of different
253+
ways which can be specified to AlignmentProcessor using the "-m" flag.
254+
These methods include estimations, that are generally much faster, and
255+
maximum likelihood models, that should be more accurate. See the
256+
KaKs_Calculator documentation in the KaKs_Calculator folder for more
257+
information.
258+
259+
Estimations:
260+
NG (default in AlignmentProcessor)
261+
LWL
262+
LPB
263+
MLWL
264+
YN
265+
MYN
266+
267+
Maximum Likelihood:
268+
GY
269+
MS (recommended for maximum likelihood)
270+
241271
# The CodeML control file
242272

243273
Codeml requires that all of its parameters be specified in one control
244274
file (http://abacus.gene.ucl.ac.uk/software/pamlDOC.pdf). Provide a
245275
control file with your desired parameters and AlignmentProcessor will
246276
use it as template.
247277

248-
The control file must be titled titled “codeml.ctl”, and it must be
249-
located in the output directory. Examples are included with PAML and one
250-
is included in AlignmentProcessor's test directory.
278+
The control file must have a “.ctl” extension, and it must be
279+
located in the output directory. Only provide one control file in this
280+
directory. Examples are included with PAML and the controlFiles directory
281+
contains example control files for branch site, branch specific, and
282+
pairwise analyses. These can simply be copied into your output directory
283+
or you may supply one of your own.
251284

252285
# Invoking the Ka/Ks pipeline with a UCSC alignment:
253286

254-
python AlignmentProcessor0.21.py --axt --kaks --ucsc -r anoCar2 \
287+
python AlignmentProcessor0.21.py --axt --kaks --ucsc -r anoCar2
255288
-i anolis_gallus.fa -o pairwiseKaKs/
256289

257290
# Invoking the CodeML pipeline with a de novo alignment:
258291

259-
python AlignmentProcessor0.21.py --phylip --codeml -% 0.6 \
292+
python AlignmentProcessor0.21.py --phylip --codeml -% 0.6
260293
-r anoCar2 -i anolis_gallus.fa -o codemlOutput/
261294

262295
-------------------------------
@@ -285,7 +318,7 @@ Remember that the order of the arguments does matter for these scripts.
285318
per gene. It will produce an output file for a gene if it has at least
286319
two sequences.
287320

288-
python 01_splitFastaFiles.py <input fasta alignment> \
321+
python 01_splitFastaFiles.py <input fasta alignment>
289322
<path to output directory>
290323

291324
02_RemoveHeader.py
@@ -294,7 +327,7 @@ Remember that the order of the arguments does matter for these scripts.
294327
FASTA files and remove gene IDs from the fasta headers. It will replace
295328
build names with each species' common name if "--changeNames" is specified.
296329

297-
python 02_RemoveHeaderOnDir.py <path to input and output directories> \
330+
python 02_RemoveHeaderOnDir.py <path to input and output directories>
298331
--changeNames(optional)
299332

300333
03_CheckFrame.py
@@ -306,7 +339,7 @@ Remember that the order of the arguments does matter for these scripts.
306339
frame. It will then replace codons with missing nucleotides with gaps
307340
to remove unknown amino acids from the sequence.
308341

309-
python 03_CheckFrameOnDir.py <path to input and output directories> \
342+
python 03_CheckFrameOnDir.py <path to input and output directories>
310343
<reference_species>
311344

312345
04_CountBases.py
@@ -319,7 +352,7 @@ Remember that the order of the arguments does matter for these scripts.
319352
but the script itself does not, so you MUST specify one if you invoke
320353
it on its own.
321354

322-
python 05_CountBasesOnDir.py <threshold percentage as a decimal> \
355+
python 05_CountBasesOnDir.py <threshold percentage as a decimal>
323356
<path to input and output directories>
324357

325358
05_ReplaceStopCodons.py
@@ -335,7 +368,7 @@ Remember that the order of the arguments does matter for these scripts.
335368
any gene which does not have at least two remaining sequences will not be
336369
written to file.
337370

338-
python 05_ReplaceStopCodonsOnDir.py \
371+
python 05_ReplaceStopCodonsOnDir.py
339372
<path to input and output directories> --retainStops(optional)
340373

341374
06_FASTAtoAXT.py
@@ -349,15 +382,15 @@ Remember that the order of the arguments does matter for these scripts.
349382
This program will convert all files in an input directory
350383
from fasta format to a phylip format.
351384

352-
python 07_FASTAtoPhylip.py <number of species> \
385+
python 07_FASTAtoPhylip.py <number of species>
353386
<path to input and output directories>
354387

355388
07_KaKsonDir.py
356389

357390
This program executes KaKs_Calculator on every file in a directory.
358391

359-
python 07_KaKsonDirectory.py <path to input and output directories> \
360-
<name of reference species>
392+
python 07_KaKsonDir.py <path to input and output directories>
393+
<method>
361394

362395
07_CodeMLonDir.py
363396

@@ -371,7 +404,7 @@ Remember that the order of the arguments does matter for these scripts.
371404
Note: Since this script has greater utility as a stand-alone program, it
372405
utilizes flags so that the order of the arguments does not matter.
373406

374-
python 07_CodeMLonDir.py -t <# of threads> -f <name of forward branch> \
407+
python 07_CodeMLonDir.py -t <# of threads> -f <name of forward branch>
375408
-i <path to input and output directories>
376409

377410

@@ -421,7 +454,7 @@ memory.
421454
Change directory into the AlignmentProcessor folder. Paste the following into
422455
a terminal:
423456

424-
python AlignmentProcessor.py --ucsc --axt --kaks -r anoCar2 \
457+
python AlignmentProcessor.py --ucsc --axt --kaks -r anoCar2
425458
-i test/kaksTest.fa -o test/
426459

427460
This will return a tsv file with 11 lines.
@@ -431,7 +464,7 @@ The test directory already contains a sample CodeML control file, so
431464
all you need to do is change into the AlignmentProcessor directory and paste
432465
the following:
433466

434-
python AlignmentProcessor.py --ucsc --phylip --codeml -t 2 -r anoCar2 \
467+
python AlignmentProcessor.py --ucsc --phylip --codeml -t 2 -r anoCar2
435468
-f anoCar2 -i test/codemlTest.fa -o test/
436469

437470
There should be 8 .mlc files in the 07_codeml directory.

0 commit comments

Comments
 (0)