Skip to content

Commit 062b1f6

Browse files
committed
Removes premature stop codons by default, allows command line access to 02_nameList.txt.
1 parent 0153123 commit 062b1f6

219 files changed

Lines changed: 281 additions & 59708 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

AlignmentProcessor.py

Lines changed: 63 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,7 @@
66
77
This package is free software: you can redistribute it and/or modify
88
it under the terms of the GNU General Public License as published by
9-
the Free Software Foundation, either version 3 of the License, or
10-
(at your option) any later version.
9+
the Free Software Foundation version 3 of the License.
1110
1211
This program is distributed in the hope that it will be useful,
1312
but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -21,6 +20,51 @@
2120
from glob import glob
2221
import os
2322

23+
def readList(task, build, common):
    '''Prints the list of genome builds and associated common names, or adds
    a new entry to it.

    The list is read from/written to bin/02_nameList.txt (relative to the
    current working directory) with one tab-separated entry per line:
    <build><TAB><common name>.

    task   -- "read" prints every entry; "add" appends a new entry.
    build  -- genome build name (only used when task is "add").
    common -- common name for the build (only used when task is "add").

    The program exits once a recognised task has finished: status 0 on
    success, status 1 if a name given to "add" is empty or contains
    whitespace. Any other task value returns None without doing anything.'''
    nameList = "bin/02_nameList.txt"
    if task == "read":
        with open(nameList, "r") as names:
            for line in names:
                print(line.rstrip())
        # Raise SystemExit directly; quit() is an interactive helper that is
        # only present when the site module has been loaded.
        raise SystemExit(0)
    elif task == "add":
        # The list is whitespace-delimited when it is parsed, so an empty
        # name or one containing whitespace would corrupt the entry.
        for name in (build, common):
            if not name or any(char.isspace() for char in name):
                print()
                print("\tError: Build and common names must not be empty or "
                      "contain whitespace.")
                print()
                raise SystemExit(1)
        # "a+" lets us inspect the existing content; writes still go to the
        # end of the file.
        with open(nameList, "a+") as names:
            names.seek(0)
            existing = names.read()
            # Avoid gluing the new entry onto an unterminated last line.
            if existing and not existing.endswith("\n"):
                names.write("\n")
            names.write(build + "\t" + common + "\n")
        raise SystemExit(0)
35+
36+
def _exitWithError(message):
    '''Prints message surrounded by blank lines and exits with status 1.'''
    print()
    print(message)
    print()
    # Non-zero status so calling shells/scripts can detect the failure.
    raise SystemExit(1)

def checkInput(axt, kaks, phylip, codeml):
    '''Makes sure necessary programs are installed and proper file conversion
    is run for optional analysis program.

    axt    -- True if files will be converted into axt format.
    kaks   -- True if KaKs_Calculator was requested; requires the
              bin/KaKs_Calculator binary and axt conversion.
    phylip -- True if files will be converted into phylip format.
    codeml -- True if CodeML was requested; requires the paml/bin/codeml
              binary and phylip conversion.

    Paths are checked relative to the current working directory. Returns
    None if every requested program can be run; otherwise prints an error
    message and exits with status 1.'''
    if kaks:
        if not os.path.isfile("bin/KaKs_Calculator"):
            _exitWithError("\tError: Please install KaKs_Calculator in the "
                           "AlignmentProcessor bin.")
        if not axt:
            _exitWithError("\tError: Files must be converted into axt format "
                           "for use with KaKs_Calculator.")
    if codeml:
        if not os.path.isfile("paml/bin/codeml"):
            _exitWithError("\tError: Please install PAML in the "
                           "AlignmentProcessor folder.")
        if not phylip:
            _exitWithError("\tError: Files must be converted into phylip "
                           "format for use with CodeML.")
67+
2468
def makeDir(path, outdir, axt, phylip):
2569
'''Makes all sub-directories used by program.'''
2670
os.chdir(outdir)
@@ -31,7 +75,7 @@ def makeDir(path, outdir, axt, phylip):
3175
except FileExistsError:
3276
pass
3377
if axt == True:
34-
for i in ["06_axtFiles", "KaKsOutput"]:
78+
for i in ["06_axtFiles", "07_KaKsOutput"]:
3579
try:
3680
os.mkdir(i)
3781
except FileExistsError:
@@ -95,11 +139,11 @@ def countBases(outdir, percent):
95139
if cb.returncode == 0:
96140
return True
97141

98-
def replaceStop(outdir):
142+
def replaceStop(outdir, retainStops):
99143
'''Removes stop codons for downstream analysis.'''
100144
print("Removing stop codons...")
101145
rs = Popen(split("python bin/05_ReplaceStopCodons.py " + " "
102-
+ outdir))
146+
+ outdir + " " + str(retainStops)))
103147
rs.wait()
104148
if rs.returncode == 0:
105149
return True
@@ -159,6 +203,7 @@ def helplist():
159203
print()
160204
print("### AlignmentProcessor will run the subsituion rate pipeline to \
161205
produce trimmed axt files for use with KaKs_calculator. ###")
206+
print()
162207
print(" example usage: python AlignmentProcessor.py -% <decimal> \
163208
--axt/phylip --kaks/codeml -i <input fasta file> -o \
164209
<path to output directory> -r <reference species>")
@@ -168,6 +213,7 @@ def helplist():
168213
print(" --codeml Runs codeml if --phylip is also specified")
169214
print(" --ucsc converts headers of CDS fasta files obtained from \
170215
the UCSC genome browser")
216+
print(" --retainStops retain sequences with internal stop codons")
171217
print(" -r the name of the reference species which will be used to \
172218
determine the open reading frame")
173219
print(" -% Sets the percentage cutoff for the countBases step (50% \
@@ -184,6 +230,7 @@ def main():
184230
starttime = datetime.now()
185231
# Set optional parameters to False:
186232
axt = False
233+
retainStops = False
187234
phylip = False
188235
kaks = False
189236
codeml = False
@@ -195,11 +242,15 @@ def main():
195242
helplist()
196243
quit()
197244
elif i == "-v" or i == "--version":
198-
print("\nAlignmentProcessor Copyright 2016 by Shawn Rupp\n")
245+
print("\nAlignmentProcessor0.8 Copyright 2016 by Shawn Rupp\n")
199246
print("This program comes with ABSOLUTELY NO WARRANTY")
200247
print("This is free software, and you are welcome to redistribute\
201248
it under certain conditions\n")
202249
quit()
250+
elif i == "--printNameList":
251+
readList("read", "void", "void")
252+
elif i == "--addNameToList":
253+
readList("add", argv[argv.index(i) + 1], argv[argv.index(i) + 2])
203254
elif i == "-i":
204255
fasta = argv[argv.index(i) + 1]
205256
elif i == "-o":
@@ -209,7 +260,7 @@ def main():
209260
elif i == "-r":
210261
ref = argv[argv.index(i) + 1]
211262
elif i == "-%":
212-
percent = argv[argv.index(i) + 1]
263+
percent = str(argv[argv.index(i) + 1])
213264
elif i == "--axt":
214265
axt = True
215266
elif i == "--phylip":
@@ -220,6 +271,10 @@ def main():
220271
codeml = True
221272
elif i == "--ucsc":
222273
conv = True
274+
elif i == "--retainStops":
275+
retainStops = True
276+
# Check input commands prior to running:
277+
checkInput(axt, kaks, phylip, codeml)
223278
# Save working directory to variable to call other scripts:
224279
path = os.getcwd()
225280
path = path + "/"
@@ -250,7 +305,7 @@ def main():
250305
if cf == True:
251306
cb =countBases(outdir, percent)
252307
if cb == True:
253-
rs = replaceStop(outdir)
308+
rs = replaceStop(outdir, retainStops)
254309
# Optionally convert files to axt format:
255310
if axt == True:
256311
if rs == True:

AlignmentProcessorReadMe.txt

Lines changed: 47 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,16 @@
88
This program is distributed in the hope that it will be useful,
99
but WITHOUT ANY WARRANTY; without even the implied warranty of
1010
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11-
GNU General Public License (GPL3.txt) for more details.
11+
GNU General Public License for more details.
1212

1313

1414
###############################################################
15-
# AlignmentProcessor0.7 Package
15+
# AlignmentProcessor0.8 Package
1616
#
1717
# Dependencies: Python 3
1818
# Python 3 version of Biopython
1919
# Perl
20-
# PAML (comes packaged with Linux binaries)
20+
# PAML
2121
###############################################################
2222

2323
### Contents ###
@@ -55,18 +55,16 @@ a terminal and Anaconda will install Biopython for you:
5555

5656
# KaKs_Calculator
5757

58-
AlignmentProcessor0.7 is packaged with KaKs_Calculator2.0 binaries for Linux
58+
AlignmentProcessor0.8 is packaged with KaKs_Calculator2.0 binaries for Linux
5959
and Windows, and a KaKs_Calculator1.2 binary for Mac (there is no 2.0 binary
6060
available for OSX). Before using, copy or move the appropriate binary for your
6161
system into the AlignmentProcessor bin which contains the python scripts.
6262

6363
# PAML 4.8
6464

65-
AlignmentProcessor0.7 comes packaged with Linux binaries of PAML 4.8. If you
66-
are using Mac or Windows, you will have to replace the Linux binaries with the
67-
appropriate package for you system
68-
(http://abacus.gene.ucl.ac.uk/software/paml.html). Make sure that it is
69-
titled "paml".
65+
If you plan to use CodeML, you must first download PAML
66+
(http://abacus.gene.ucl.ac.uk/software/paml.html) and move the folder into the
67+
AlignmentProcessor directory. Make sure that it is titled "paml".
7068

7169
#-------------------------------
7270
# 1. Obtaining a fasta alignment
@@ -136,19 +134,11 @@ in the bin/ directory.
136134

137135
# Example Usage:
138136

139-
python AlignmentProcessor.py -% <decimal> \
137+
python AlignmentProcessor.py -% <decimal> --retainStops \
140138
--axt/phylip --kaks/codeml --ucsc -i <input fasta file> \
141139
-o <path to output directory> -r <reference species>
142140

143141
# Required Arguments:
144-
145-
--axt/phylip Specifies which format to convert the files into and
146-
which program to use to calculate substitution rates
147-
(kaks will tell the prgram to convert files into axt
148-
format and optionally run KaKs_Calculator; phylip will
149-
convert files into phylip format for other programs;
150-
both commands may be run at once).
151-
152142

153143
-i the path to your input fasta alignment.
154144
-o the path to your working/output directory.
@@ -158,37 +148,57 @@ in the bin/ directory.
158148

159149
--ucsc This will invoke 00_convertHeader.py, which will convert the
160150
headers from UCSC fasta files so they only contain build
161-
nmaes and gene IDs.
151+
names and gene IDs.
152+
153+
--axt/phylip Specifies which format to convert the files into
154+
(axt format for KaKs_Calculator; phylip for CodeML
155+
or other programs; both commands may be run at once).
162156

163157
--kaks will run KaKs_Calculator (you must also specify --axt,
164158
otherwise it will not run). Otherwise the program will quit
165159
after converting the files.
166160

167161
--codeml will run codeml on all of the files in the
168162
06_phylipFiles directory. You must also supply a
169-
control file for PAML which must be located in the
163+
control file for CodeML which must be located in the
170164
output directory you specified with the -o option.
171165
This file must be titled "codeml.ctl" (the default
172166
name given by PAML. The name is hard coded into
173167
AlignmentProcessor to simplify the commands).
174168

169+
--retainStops This will tell AlignmentProcessor to retain sequences
170+
that contain internal stop codons. By default,
171+
sequences with internal stop codons will be removed
172+
from the analysis.
173+
175174
-% a decimal value specifying the minimum percentage of reads
176175
that must remain after replacing unknown codons with gaps
177176
(Default = 0.5). You may wish to use a lower threshold
178177
for highly diverged species or for low quality genomes.
178+
179+
# Additional Commands
179180

180181
-h/--help will print the program's help dialogue
181182

182183
-v/--version will print the program version and copyright info
183184

184-
# The -r option
185+
--printNameList will print the contents of 02_nameList.txt which
186+
contains the list of genome builds and associated
187+
common names.
188+
189+
--addNameToList will add an entry to the 02_nameList.txt file.
190+
e.g. python AlignmentProcessor.py --addNameToList <build> <common name>
191+
185192

186-
This specifies the reference species. To find it, look in
187-
02_nameList.txt in the bin. Check if the genome build is present
188-
in column 1 of the list. If it is, use the common name in column 2 of
189-
the list. If it is not, you may add an entry to the list with the
193+
# Genome Builds and Common Names
194+
195+
This specifies the reference species. To find it, either use the
196+
--printNameList option or look in 02_nameList.txt in the bin. Check if
197+
the genome build is present in column 1 of the list. If it is, use the
198+
common name in column 2 of the list. If it is not, you may either use
199+
the --addNameToList option or add an entry to the file with the
190200
build name that is present in your alignment as the first entry of a
191-
new row, followed by a space, then the desired common name. Make sure
201+
new row, followed by a tab, then the desired common name. Make sure
192202
there are no spaces in either name.
193203

194204
# The codeml control file
@@ -205,12 +215,12 @@ in the bin/ directory.
205215

206216
# Invoking the Ka/Ks pipeline with a UCSC alignment:
207217

208-
python AlignmentProcessor0.7.py --axt --kaks --ucsc -r green_anole \
218+
python AlignmentProcessor0.8.py --axt --kaks --ucsc -r green_anole \
209219
-i anolis_gallus.fa -o pairwiseKaKs/
210220

211221
# Invoking the CodeML pipeline with a de novo alignment:
212222

213-
python AlignmentProcessor0.7.py --phylip --codeml -% 0.6 \
223+
python AlignmentProcessor0.8.py --phylip --codeml -% 0.6 \
214224
-r green_anole -i anolis_gallus.fa -o codemlOutput/
215225

216226
#-------------------------------
@@ -283,6 +293,11 @@ Remember that the order of the arguments does matter for these scripts.
283293
programs will not run properly if they encounter a premature stop
284294
codon.
285295

296+
Terminal stop codons will be replaced, while sequences with internal
297+
stop codons will have their gene id, as well as the sequence name and
298+
location of the first occurring stop codon, recorded in the
299+
internalStops.txt file.
300+
286301
python 05_ReplaceStopCodonsOnDir.py <number of species> \
287302
<path to input and output directories>
288303

@@ -323,9 +338,9 @@ Remember that the order of the arguments does matter for these scripts.
323338

324339
# 08_compileKaKs_CSV.py
325340

326-
This script concatonates the output from KaKs_Calculator into one csv
341+
This script concatenates the output from KaKs_Calculator into a text
327342
file. It adds a column for gene (or sequence) IDs, and prints the gene
328-
ID from the filename.
343+
ID from the filename.
329344

330345
python compileCSV.py <path to input and output directories>
331346

@@ -365,7 +380,7 @@ simultaneously, as this could require too much memory.
365380
Change directory into the AlignmentProcessor folder. Paste the following into
366381
a terminal (change the output directory to the desired location):
367382

368-
python AlignmentProcessor.py --axt --kaks --ucsc -r green_anole \
383+
python AlignmentProcessor.py --axt --kaks --ucsc -r Green_anole \
369384
-i test.fa -o test/
370385

371-
It should produce a text file with 17 lines.
386+
This will return a text file with 11 lines.

bin/KaKs_CalculatorBin/KaKs_Calculator1.2/Mac/KaKs_Calculator renamed to KaKs_Calculator/KaKs_Calculator1.2/Mac/KaKs_Calculator

File renamed without changes.

bin/KaKs_CalculatorBin/KaKs_Calculator1.2/Mac/KaKs_Calculator.tar.gz renamed to KaKs_Calculator/KaKs_Calculator1.2/Mac/KaKs_Calculator.tar.gz

File renamed without changes.

bin/KaKs_CalculatorBin/KaKs_Calculator2.0/Linux/KaKs_Calculator renamed to KaKs_Calculator/KaKs_Calculator2.0/Linux/KaKs_Calculator

File renamed without changes.

bin/KaKs_CalculatorBin/KaKs_Calculator2.0/Windows/KaKs_Calculator renamed to KaKs_Calculator/KaKs_Calculator2.0/Windows/KaKs_Calculator

File renamed without changes.

bin/02_RemoveHeader.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def changeNames(header):
4040
with open("bin/02_nameList.txt", "r") as nameList:
4141
# Create dictionary of species names from file
4242
for line in nameList:
43-
speciesDict[line.split()[0]] = line.split()[1].rstrip()
43+
speciesDict[line.split("\t")[0]] = line.split()[1].rstrip()
4444
# Compare the build name against the species dictionary. If it is matched
4545
# to a key, return the value (the common name).
4646
if build in speciesDict:

0 commit comments

Comments
 (0)