Skip to content

Commit fc75c11

Browse files
committed
Streamlined pipeline, reduced i/o operations, and made individual scripts more user-friendly.
1 parent fef3424 commit fc75c11

21 files changed

Lines changed: 834 additions & 1609 deletions

AlignmentProcessor.py

Lines changed: 133 additions & 275 deletions
Large diffs are not rendered by default.

AlignmentProcessorReadMe.pdf

301 KB
Binary file not shown.

README.md

Lines changed: 14 additions & 408 deletions
Large diffs are not rendered by default.

bin/00_ConvertHeader.py

Lines changed: 0 additions & 40 deletions
This file was deleted.

bin/01_SplitFasta.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
'''This will take an aligned multiple FASTA file with multiple genes
2+
and create individual FASTA alignment files for each gene and simplify the
3+
fasta headers.
4+
5+
6+
Copyright 2016 by Shawn Rupp'''
7+
8+
import argparse
9+
from glob import glob
10+
11+
def splitFasta(infile, outdir):
12+
# Open input file and split into one alignment per gene
13+
print("\tSplitting fasta file into one file per gene...")
14+
passed = 0
15+
excluded = 0
16+
# Create log file
17+
log = ""
18+
path = outdir.split("/")[:-2]
19+
for i in path:
20+
log += i + "/"
21+
log += "splitFastaLog.txt"
22+
with open(log, "w") as logfile:
23+
logfile.write("Transcripts with only one species\n\n")
24+
# Parse input fasta
25+
with open(infile, "r") as fasta:
26+
newid = True
27+
seq = ""
28+
n = 0
29+
for line in fasta:
30+
if line != "\n":
31+
if line[0] == ">":
32+
line, geneid = convertHeader(line)
33+
# Concatenate lines for all species for each gene
34+
seq += str(line)
35+
# Determine number of sequences and species names
36+
n += 1
37+
if newid == True:
38+
# Set reference species ID as file name
39+
filename = geneid
40+
newid = False
41+
else:
42+
# Concatenate remaining lines
43+
seq += str(line)
44+
elif line == "\n" and newid == False:
45+
# Use empty lines to determine where genes end
46+
if n >= 2:
47+
# Print gene sequences to file if there are at least two
48+
# species and reset for next gene
49+
outfile = (outdir + filename + "." + str(n) + ".fa")
50+
with open(outfile, "w") as output:
51+
output.write(seq)
52+
newid = True
53+
seq = ""
54+
n = 0
55+
passed += 1
56+
elif n < 2:
57+
# Skip genes with only one sequence and save ID in log
58+
with open(log, "a") as logfile:
59+
logfile.write(geneid + "\n")
60+
excluded += 1
61+
with open(log, "a") as logfile:
62+
logfile.write(("\nTotal transcripts written to file: {}\n").format(passed))
63+
logfile.write(("Total transcripts with only one sequence: {}").format(excluded))
64+
65+
def convertHeader(line):
66+
'''Returns a header containing only the genome build name.'''
67+
if line.startswith(">ENS") == True:
68+
# Extract relevant data from UCSC header
69+
genebuild = line[1:].split()[0]
70+
genebuild = genebuild.split("_")
71+
build = ">" + str(genebuild[-1]) + "\n"
72+
geneid = str(genebuild[0].split(".")[0])
73+
else:
74+
# Extract build without ">"
75+
build = ">" + line.split(".")[0][1:].rstrip() + "\n"
76+
geneid = str(line.split(".")[1]).rstrip()
77+
return build, geneid
78+
79+
def main():
80+
parser = argparse.ArgumentParser(description="This will take the \
81+
aligned multiple FASTA file with multiple genes and create individual FASTA \
82+
alignment files for each gene and simplify the fasta headers.")
83+
parser.add_argument("-i", help="path to input file.")
84+
parser.add_argument("-o", help="path to output directory.")
85+
args = parser.parse_args()
86+
infile = args.i
87+
outdir = args.o
88+
if outdir[-1] != "/":
89+
outdir += "/"
90+
splitFasta(infile, outdir)
91+
92+
if __name__ == "__main__":
93+
main()

bin/01_SplitFastaFiles.py

Lines changed: 0 additions & 67 deletions
This file was deleted.

0 commit comments

Comments
 (0)