Skip to content

Commit ea1970d

Browse files
committed
Incorporated PhyML for creation of gene trees.
1 parent e34dc24 commit ea1970d

9 files changed

Lines changed: 212 additions & 519 deletions

AlignmentProcessor.py

Lines changed: 40 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -48,14 +48,20 @@ def checkInput(axt, kaks, phylip, codeml):
4848
quit()
4949
if codeml == True:
5050
indir = os.path.isfile("paml/bin/codeml")
51+
phyml = os.path.isfile("PhyML/PhyML")
5152
if indir == False:
5253
print("\n\tError: Please install PAML in the\
5354
AlignmentProcessor folder.\n")
5455
quit()
56+
if phyml == False:
57+
print("\n\tError: Please install and rename PhyML for use\
58+
with CodeML.\n")
59+
quit()
5560
if phylip == False:
5661
print("\n\tError: Files must be converted into phylip format for use\
5762
with CodeML.\n")
5863
quit()
64+
5965

6066
def makeDir(path, outdir, axt, phylip):
6167
'''Makes all sub-directories used by program.'''
@@ -80,6 +86,8 @@ def makeDir(path, outdir, axt, phylip):
8086
pass
8187
os.chdir(path)
8288

89+
#-----------------------------------------------------------------------------
90+
8391
def convertHeaders(fasta):
8492
'''Changes headers of UCSC CDS fasta files to includ eonly build name and
8593
gene ID'''
@@ -186,42 +194,45 @@ def phylipConvert(outdir, starttime, codeml):
186194
print("Total runtime: ", datetime.now() - starttime)
187195
return True
188196

189-
def runcodeml(cpu, outdir, starttime):
197+
def runcodeml(cpu, outdir, forward, starttime):
190198
'''Runs codeml on a directory.'''
191199
print("Running codeml...")
192200
cm = Popen(split("python bin/07_CodeMLonDir.py -i " + outdir
193-
+ " -n " + cpu))
201+
+ " -t " + cpu + " -f " + forward))
194202
cm.wait()
195203
if cm.returncode == 0:
196204
print("Total runtime: ", datetime.now() - starttime)
197205

206+
#-----------------------------------------------------------------------------
207+
198208
def helplist():
199209
print("\n### AlignmentProcessor will run the subsituion rate pipeline to \
200210
produce trimmed axt files for use with KaKs_calculator. ###\n")
201-
print(" example usage: python AlignmentProcessor.py -% <decimal> \
211+
print("\texample usage: python AlignmentProcessor.py -% <decimal> \
202212
--axt/phylip --kaks/codeml -i <input fasta file> -o \
203213
<path to output directory> -r <reference species>")
204-
print(" --printNameList prints list of genome build names and\
205-
associated common names")
206-
print(" --addNameToList add new genome build name and\
207-
associated common name to list")
208-
print(" --axt Converts files to axt for use in KaKs_Calcuator.")
209-
print(" --phylip Converts files to phylip for use in PhyML.")
210-
print(" --kaks Runs KaKs_Calcuator if --axt is also specified")
211-
print(" --codeml Runs codeml if --phylip is also specified")
212-
print(" --ucsc converts headers of CDS fasta files obtained from \
213-
the UCSC genome browser")
214-
print(" --retainStops retain sequences with internal stop codons")
215-
print(" -r the name of the reference species which will be used to \
214+
print("\t-i path to input file containing a multifasta alignment")
215+
print("\t-o AlignmentProcessor will use this as its working \
216+
directory and print output to this directory")
217+
print("\t-r the name of the reference species which will be used to \
216218
determine the open reading frame")
217-
print(" -% Sets the percentage cutoff for the countBases step (50% \
219+
print("\t--ucsc converts headers of CDS fasta files obtained from \
220+
the UCSC genome browser")
221+
print("\t--axt Converts files to axt for use in KaKs_Calcuator.")
222+
print(" \t--phylip Converts files to phylip for use in PhyML.")
223+
print("\t--kaks Runs KaKs_Calcuator if --axt is also specified")
224+
print("\t--codeml Runs codeml if --phylip is also specified")
225+
print("\t-t number of threads to use for CodeML.")
226+
print("\t-f specifies the forward branch of the CodeML input tree.")
227+
print("\t--retainStops retain sequences with internal stop codons")
228+
print("\t-% Sets the percentage cutoff for the countBases step (50% \
218229
by default).")
219-
print(" -t number of threads to use for CodeML.")
220-
print(" -i path to input file containing a multifasta alignment")
221-
print(" -o AlignmentProcessor will use this as its working \
222-
directory and print output to this directory")
223-
print(" -v prints coyright and version information to the screen")
224-
print("### Please note that you must be in the substituionManager \
230+
print("\t--printNameList prints list of genome build names and\
231+
associated common names")
232+
print("\t--addNameToList add new genome build name and\
233+
associated common name to list")
234+
print("\t-v prints coyright and version information to the screen")
235+
print("\n### Please note that you must be in the substituionManager \
225236
directory to run the program.###\n")
226237

227238
def main():
@@ -238,13 +249,14 @@ def main():
238249
cpu = "1"
239250
percent = "0.5"
240251
ref = "void"
252+
forward = ""
241253
# Extract values from command line:
242254
for i in argv:
243255
if i == "-h" or i == "--help":
244256
helplist()
245257
quit()
246258
elif i == "-v" or i == "--version":
247-
print("\nAlignmentProcessor0.11 Copyright 2016 by Shawn Rupp\n")
259+
print("\nAlignmentProcessor0.20 Copyright 2016 by Shawn Rupp\n")
248260
print("This program comes with ABSOLUTELY NO WARRANTY\n")
249261
print("This is free software, and you are welcome to redistribute\
250262
it under certain conditions\n")
@@ -261,6 +273,10 @@ def main():
261273
outdir = outdir + "/"
262274
elif i == "-r":
263275
ref = argv[argv.index(i) + 1]
276+
elif i == "-t":
277+
cpu = str(argv[argv.index(i) + 1])
278+
elif i == "-f":
279+
forward = argv[argv.index(i) + 1]
264280
elif i == "-%":
265281
percent = str(argv[argv.index(i) + 1])
266282
elif i == "--axt":
@@ -273,8 +289,6 @@ def main():
273289
codeml = True
274290
elif i == "--ucsc":
275291
conv = True
276-
elif i == "-n":
277-
cpu = str(argv[argv.index(i) + 1])
278292
elif i == "--changeNames":
279293
commonNames = True
280294
elif i == "--retainStops":
@@ -332,7 +346,7 @@ def main():
332346
# Run codeml
333347
if codeml == True and kaks == False:
334348
if pc == True:
335-
runcodeml(cpu, outdir, starttime)
349+
runcodeml(cpu, outdir, forward, starttime)
336350

337351
if __name__ == "__main__":
338352
main()

AlignmentProcessorReadMe.txt

Lines changed: 42 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,13 @@
1212

1313

1414
###############################################################
15-
# AlignmentProcessor0.11 Package
15+
# AlignmentProcessor0.20 Package
1616
#
1717
# Dependencies: Python 3
1818
# Python 3 version of Biopython
1919
# Perl (if using KaKs_Calculator)
2020
# PAML (if using CodeML)
21-
# R (if using CodeML)
22-
# ape R package (if using CodeML)
21+
# PhyML (if using CodeML)
2322
###############################################################
2423

2524
### Contents ###
@@ -37,10 +36,9 @@
3736
AlignmentProcessor is a pipeline meant to quickly convert a multi-fasta
3837
alignment file into a format that can be read by KaKs_Calculator or PAML and
3938
optionally run those programs. When running codeml, alignment processor will
40-
use input control and tree files as templates to create unique control and
41-
tree files for each gene. It will call the ape R package to dynamically trim
42-
the given phylogenic tree so that only species which remain in each gene's
43-
alignment after trimming are represented in that gene's tree.
39+
use an input control file as a template to create a unique control for each
40+
gene. It will call PhyML to create a unique phylogenic tree for each gene
41+
based off of its sequences.
4442

4543
You can run the AlignmentProcessor wrapper which will call all of the python
4644
scripts in sequence, or you may call each script individually.
@@ -63,34 +61,27 @@ a terminal and Anaconda will install Biopython for you:
6361

6462
# KaKs_Calculator
6563

66-
AlignmentProcessor0.11 is packaged with KaKs_Calculator2.0 binaries for Linux
64+
AlignmentProcessor0.20 is packaged with KaKs_Calculator2.0 binaries for Linux
6765
and Windows, and a KaKs_Calculator1.2 binary for Mac (there is no 2.0 binary
6866
available for OSX). Before using, copy or move the appropriate binary for your
6967
system into the AlignmentProcessor bin which contains the python scripts.
7068

71-
# PAML 4.8
69+
# PAML
7270

7371
If you plan to use CodeML, you must first download PAML
7472
(http://abacus.gene.ucl.ac.uk/software/paml.html) and move the folder into the
7573
AlignmentProcessor directory. Make sure that it is titled "paml".
7674

77-
# Ape
78-
79-
The most straightforward way to install ape, and most R packages, is through
80-
Bioconductor. If you do not have Bioconductor installed, open R and paste:
81-
82-
source("https://bioconductor.org/biocLite.R")
83-
biocLite()
84-
85-
To install ape, enter:
86-
87-
library("BiocInstaller")
88-
biocLite("ape")
89-
75+
# PhyML
76+
If you plan to use CodeML, you must also download PhyML
77+
(http://www.atgc-montpellier.fr/phyml/binaries.php). Similar to PAML, you must
78+
move the folder into the AlignmentProcessor directory and change the name of
79+
both the folder and the binary for your operating system to "PhyML".
9080

9181
#-------------------------------
9282
# 1. Obtaining a fasta alignment
9383
#-------------------------------
84+
9485
# UCSC Fasta Alignment
9586
It is possible to download CDS fasta alignments from the UCSC Table browser.
9687
This does, unfortunately, limit you to currently available alignments.
@@ -112,10 +103,16 @@ If you did not use a UCSC genome for the reference species in your alignment,
112103
you may need to upload the reference genome that you used as a custom build.
113104
Make sure that the genome, maf file, and BED file are all set to the custom
114105
build, and that the reference species build name is identical in all three
115-
files. Additionally, you may not be able to use the UCSC BED12 file if you did
106+
files.
107+
108+
Additionally, you may not be able to use the UCSC BED12 file if you did
116109
not use a UCSC genome. If that is the case, you can either upload your own,
117110
or, if you used an Ensembl genome, you can just remove the "chr_UN" and "chr"
118-
chromosome prefixes from the file, and resubmit the file to Galaxy.
111+
chromosome prefixes from the file, and resubmit the file to Galaxy. For NCBI
112+
genomes, you may download the gff from NCBI genome, use the UCSC utility
113+
"gff3ToGenePred" with the -useName and -honorStartStopCodons options, and
114+
use the UCSC utility "genePredToBed". This will return a BED12 file which
115+
may be submitted to Galaxy.
119116

120117
Upload your maf and BED files to Galaxy (usegalaxy.org) (or retrieve a BED
121118
file of the genes for your reference species using the UCSC Main link under
@@ -134,7 +131,7 @@ output file.
134131

135132
This process will take a few hours, so plan accordingly.
136133

137-
Since this method offer the most flexibility for working with alignments,
134+
Since this method offers the most flexibility for working with alignments,
138135
AlignmentProcessor was written with this output format in mind and no further
139136
formatting is required.
140137

@@ -156,14 +153,15 @@ If you are running CodeML and the program is interrupted, you may call the
156153
07_CodeMLonDir.py script and it will continue where CodeML left off. This
157154
will save the time of having to run those genes through CodeML again (It will
158155
do the same thing if you call the entire pipeline again, but there is no need
159-
to rerun the previous steps). It will not do the same for KaKs_Calculator
156+
to re-run the previous steps). It will not do the same for KaKs_Calculator
160157
since KaKs_Calculator is much faster, so it should not be a problem to just
161158
invoke KaKs_Calculator on the whole directory again.
162159

163160
# Example Usage:
164161

165162
python AlignmentProcessor.py --ucsc --axt/phylip --kaks/codeml \
166-
--retainStops -% <decimal> -r <reference species> -i <input fasta file> \
163+
--retainStops -% <decimal> -f <forward branch of codeml tree> \
164+
-r <reference species> -i <input fasta file> \
167165
-o <path to output directory>
168166

169167
# Required Arguments:
@@ -214,6 +212,11 @@ invoke KaKs_Calculator on the whole directory again.
214212
AlignmentProcessor can call multiple instances of CodeML to
215213
shorten overall run time. (Default = 1)
216214

215+
-f the build or common name (if you use the --changeNames flag) of the
216+
species on the forward branch of the phylogneic tree supplied to
217+
CodeML. This species does not have to be the same as the reference
218+
species.
219+
217220
# Additional Commands
218221

219222
-h/--help will print the program's help dialogue
@@ -245,39 +248,19 @@ invoke KaKs_Calculator on the whole directory again.
245248
Codeml requires that all of its parameters be specified in one control
246249
file (http://abacus.gene.ucl.ac.uk/software/pamlDOC.pdf). Provide a
247250
control file with your desired parameters and AlignmentProcessor will
248-
use it as template. AlignmentProcessor uses the Biopython codeml module
249-
to more easily edit the names of the input and output files. Because of
250-
this, however, you must make sure that the first three lines (starting
251-
with "seqfile", "treefile", or "outfile" have been removed from the
252-
control file. Otherwise, CodeML will presented only with the data
253-
contained in the control file, and not the whole directory of alingments.
251+
use it as template.
254252

255253
The control file must be titled titled “codeml.ctl”, and it must be
256254
located in the output directory.
257255

258-
# The CodeML tree file
259-
260-
If your CodeML analysis requires a phylogenic tree, provide your
261-
desired tree, titled "codeml.tree" in the output directory. Specify
262-
the tree as you want to appear to CodeML and be sure that the species
263-
names are specified as the common names in the 02_nameList.txt file,
264-
but trimmed to ten characters (some programs still set a ten character
265-
limit on the length of names, so AlignmentProcessor trims the names).
266-
The 07_CodeMLonDir.py script will save any nodes you have specified
267-
with a "#" before sending a plain Newick tree to ape (which will not
268-
work if there are PAML node symbols). It will then add any nodes back
269-
into the tree after it has been trimmed. AlignmentProcessor will not
270-
currently save nodes specified with "$" since it is difficult to
271-
determine where a nested clade begins and ends.
272-
273256
# Invoking the Ka/Ks pipeline with a UCSC alignment:
274257

275-
python AlignmentProcessor0.11.py --axt --kaks --ucsc -r anoCar2 \
258+
python AlignmentProcessor0.20.py --axt --kaks --ucsc -r anoCar2 \
276259
-i anolis_gallus.fa -o pairwiseKaKs/
277260

278261
# Invoking the CodeML pipeline with a de novo alignment:
279262

280-
python AlignmentProcessor0.11.py --phylip --codeml -% 0.6 \
263+
python AlignmentProcessor0.20.py --phylip --codeml -% 0.6 \
281264
-r anoCar2 -i anolis_gallus.fa -o codemlOutput/
282265

283266
#-------------------------------
@@ -388,26 +371,15 @@ Remember that the order of the arguments does matter for these scripts.
388371
# 07_CodeMLonDir.py
389372

390373
This script will run codeml on every file in a directory. It requires
391-
the codeml.ctl file, and likely a tree file which it will supply to
392-
codeml. It will overwrite the "seqfile", "treefile", "outfile" lines
393-
include the paths to the input phylip file, the output file, and the
394-
tree file. It will also call the ape R package to trim the tree file
395-
so that it only includes species which have not been filtered out. If you
396-
are running CodeML and the program is interrupted, you may invoke this
374+
the codeml.ctl file. It will overwrite the "seqfile", "treefile",
375+
"outfile" lines include the paths to the input phylip file, the output
376+
file, and the tree file. It will also PhyML to create the tree file. If
377+
you are running CodeML and the program is interrupted, you may invoke this
397378
script to pick up where you left off.
398379

399-
python 07_CodeMLonDir.py <path to codeml control file> \
400-
<path to input and output directories>
401-
402-
# 07_pruneTree.py
403-
404-
This script will dynamically trim input trees for CodeML if any sequences
405-
have been removed. Species whose sequences were removed in steps 4 or 5
406-
and are no longer in the phylip alignment will be removed from
407-
the temporary tree given to CodeML.
380+
python 07_CodeMLonDir.py -t <# of threads> -f <name of forward branch> \
381+
-i <path to input and output directories>
408382

409-
python 07_pruneTree.py <path to input directory> \
410-
<list of species remaining in alignment> <path to tmep output directory>
411383

412384
# 08_compileKaKs.py
413385

@@ -441,7 +413,7 @@ attempt to concatenate specific parts from the output files.
441413

442414
If you wish to convert convert the files to both formats, specify both --axt
443415
and --phylip the program will convert the fasta files to both formats. You may
444-
also run one of the individual scripts on the 07_rmStops directory to convert
416+
also run one of the individual scripts on the 05_rmStops directory to convert
445417
the files in a separate step. AlignmentProcessor will not, however, run
446418
KaKs_Calculator and CodeML simultaneously, as this could require too much
447419
memory.
@@ -460,11 +432,11 @@ python AlignmentProcessor.py --axt --kaks --ucsc -r anoCar2 \
460432
This will return a text file with 11 lines.
461433

462434
# To test CodeML:
463-
The test directory already contains sample CodeML control and tree files, so
435+
The test directory already contains a sample CodeML control file, so
464436
all you need to do is change into the AlignmentProcessor directory and paste
465437
the following:
466438

467439
python AlignmentProcessor.py --phylip --codeml --ucsc -t 2 -r anoCar2 \
468-
-i codemlTest.fa -o test/
440+
-f anoCar2 -i codemlTest.fa -o test/
469441

470442
There should be 8 .mlc files in the 07_codeml directory.

0 commit comments

Comments
 (0)