-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathS2_score_table_updated.py
More file actions
200 lines (164 loc) · 7.02 KB
/
S2_score_table_updated.py
File metadata and controls
200 lines (164 loc) · 7.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
#! /usr/bin/env python
#
# This work is copyright 2025 Ansgar Gruber and Marta Vohnoutová, University of South Bohemia,
# and licensed under CC BY-SA 4.0 (Creative Commons Attribution-ShareAlike 4.0 International) licence,
# available at: https://creativecommons.org/licenses/by-sa/4.0/
#
#
# For updates see our repository on GitHub: https://github.com/ASAFind/ASAFind-2
# ASAFind web service: https://asafind.jcu.cz/
# Publication: https://doi.org/10.1111/tpj.70138 (please cite if you publish results of the scripts, or derivative work)
#
# Contacts:
# Ansgar Gruber <ansgar.gruber@paru.cas.cz>
# Marta Vohnoutová <mvohnoutova@prf.jcu.cz>
#
#
# This work is derivative of Score_Table.py, which is copyright 2020 Cedar McKay and Gabrielle Rocap,
# University of Washington, licensed under the Creative Commons Attribution-ShareAlike 3.0 Unported License,
# available at: http://creativecommons.org/licenses/by-sa/3.0/
# Score_Table.py is accessible via the following url:
# http://dx.doi.org/10.13140/RG.2.2.23059.39203
# Main changes are the adjustments of package requirements.
#
# Script identity; surfaced via the --version flag and the argparse description below.
VERSION = '1.0'
PROJ = 'Score_Table'
import sys
import os.path
import inspect
import argparse
from Bio import SeqIO
from Bio import AlignIO
from Bio.Align.AlignInfo import SummaryInfo
# The FreqTable import has been replaced with dictionary-based frequency handling.
import math
import pickle
from fill_constants import *
# Warn (but do not abort) when running on a Python older than the tested 3.10.
# Fixed the warning text (missing period after "3.10") and replaced
# str.format with an f-string; inspect.cleandoc strips the common indentation.
if sys.version_info < (3, 10):
    print(inspect.cleandoc(f'''\n\nWARNING: This version of Score_Table was developed on Python 3.6
        and tested on Python 3.10. It is not tested on Python version:
        {sys.version_info[0]}.{sys.version_info[1]}\n\n'''))
#### Collect Input ####
# Command-line interface: one required fasta input, optional output path.
parser = argparse.ArgumentParser(
    description=f'''
{PROJ} {VERSION}
Takes a Fasta and generates a scoring table based on the aa frequencies at each position.
The output of this script is a tab delimited table.
Python >= 3.6 required.''',
    formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-f', '--fasta_file',
                    help='Specify the input fasta FILE. Must not contain gaps.', required=True)
# Bug fix: the two implicitly-concatenated help fragments previously joined as
# "youwish" — a space was missing at the fragment boundary.
parser.add_argument('-o', '--out_file',
                    help='Specify the path and name of the output file you '
                         'wish to create. Default will be the same as the fasta_file, but with a '
                         '".tab" suffix.')
parser.add_argument('--version', action='version', version=f'{PROJ} {VERSION}')
args = parser.parse_args()
# Derive the absolute input path and split it into the pieces used for
# default output naming and for the log-file name below.
fasta_file = os.path.abspath(args.fasta_file)
fasta_file_path, fasta_file_name = os.path.split(fasta_file)
fasta_file_base_name, fasta_file_ext = os.path.splitext(fasta_file_name)

# Output destination: an explicit -o wins; otherwise write
# "<input base name>.tab" alongside the input file.
out_file = (os.path.abspath(args.out_file)
            if args.out_file
            else os.path.abspath(os.path.join(fasta_file_path,
                                              fasta_file_base_name + '.tab')))
##### open log file ####
# Redirect stdout into a per-input log file so every subsequent print()
# in this script lands there. `temp_file` comes from the star import of
# fill_constants above — presumably a directory/prefix for temporary
# files; TODO confirm against fill_constants.
# NOTE(review): consider restoring sys.stdout and closing log_file at the
# end of the script so the log is flushed deterministically.
orig_stdout = sys.stdout
log_file=open(f'{temp_file}log_S2_{fasta_file_name}.txt', 'w')
sys.stdout = log_file
print(f'Log file for S2 program {fasta_file_name}')
###################
#### Main ####
##############
# Sanity-check: the input must be readable as fasta before anything else runs.
try:
    records_dict = SeqIO.to_dict(SeqIO.parse(fasta_file, 'fasta'))
except ValueError as parse_error:
    raise Exception(f'The input fasta could not be parsed. The error is: {parse_error}')
# Validate the input: every sequence must be gap-free, use only the 20
# canonical amino-acid letters, and all sequences must share one length.
valid_letters = 'ACDEFGHIKLMNPQRSTVWY'
valid_letter_set = set(valid_letters)  # O(1) membership test in the inner loop
lengths = []
for gene, record in records_dict.items():  # items() avoids the double dict lookup
    seq = record.seq.upper()
    for aa in seq:
        if aa == '-':
            raise ValueError(f'No gaps permitted in alignment, found gap in {gene}')
        if aa not in valid_letter_set:
            raise ValueError(f'Letters must be one of {valid_letters}, found: "{aa}" in {gene}')
    lengths.append(len(seq))
if len(set(lengths)) > 1:
    # dropped the stray f-prefix: the message has no placeholders
    raise Exception('Sequences must be of equal length')
# Amino-acid alphabet plus a uniform (1/20) expected-frequency background
# model; a plain dict stands in for the legacy Biopython FreqTable.
aas = list('ACDEFGHIKLMNPQRSTVWY')
random_freq_table = {aa: 0.05 for aa in aas}

# Read the alignment and record its dimensions.
transit_align = AlignIO.read(fasta_file, 'fasta')
num_taxa = len(transit_align)
align_length = transit_align.get_alignment_length()

# Small-sample correction term for the information estimate — looks like the
# e_n approximation-error term from information-theoretic sequence-logo
# methods; TODO confirm against the cited publication.
e_n = (1/(math.log(2))) * (19/(2*num_taxa))
# Position-specific score matrix: per-column residue counts over the alignment.
Alignment_Info = SummaryInfo(transit_align)
PSSM = Alignment_Info.pos_specific_score_matrix()

# Residues never observed in the alignment are absent from the PSSM rows;
# pad them in with zero counts so every row covers the full 20-letter alphabet.
observed = PSSM.pssm[0][1].keys()
missing_keys = [aa for aa in aas if aa not in observed]
if missing_keys:
    zero_fill = {aa: 0 for aa in missing_keys}
    for position_row in PSSM.pssm:
        position_row[1].update(zero_fill)
    print(f'{missing_keys} were added to the PSSM')
# Per-column information content, corrected by the small-sample term e_n.
Pos_Info = [
    Alignment_Info.information_content(start=pos, end=pos + 1,
                                       e_freq_table=random_freq_table) - e_n
    for pos in range(align_length)
]
# Weight matrix: a header row of amino-acid letters, then one row per
# alignment column where weight = (observed fraction of aa) * (column
# information content). Rows are appended directly rather than written
# into a pre-allocated zero matrix.
weight_matrix = [aas]
max_score = 0
for pos in range(align_length):
    weights = []
    for aa in aas:
        try:
            weights.append((PSSM[pos][aa] / num_taxa) * Pos_Info[pos])
        except KeyError:
            print(f'KeyError at position {pos} and aa {aa}')
    weight_matrix.append(weights)
    max_score += max(weights)

# Transpose so rows are amino acids and columns are positions, then append
# the 'X' (unknown residue) row — X carries no information, so all zeros.
weight_matrix_transposed = [list(column) for column in zip(*weight_matrix)]
weight_matrix_transposed.append(['X'] + [0] * align_length)
# Report the theoretical maximum sequence score (sum of per-column maxima);
# this lands in the log file because stdout is still redirected here.
print(f'''The maximum possible sequence score for this scoring table based on {num_taxa}
sequences is {max_score}''')

# Serialize the matrix as tab-delimited text. Each row keeps the original
# trailing tab before its newline so the file format is unchanged, but the
# string is assembled with join() instead of quadratic += concatenation.
scoring_table_string = ''.join(
    '\t'.join(str(elem) for elem in row) + '\t\n'
    for row in weight_matrix_transposed)

# Write the table; `with` guarantees the handle is closed even on error
# (the original open/write/close leaked the handle on exceptions).
with open(out_file, 'w') as out_handle:
    out_handle.write(scoring_table_string)

# save to pickle after targetp step
with open(out_file + '.pkl', 'wb') as f:
    pickle.dump(scoring_table_string, f, pickle.HIGHEST_PROTOCOL)

print(type(scoring_table_string))  # debug leftover, kept for log parity
print('Scoring table saved to', out_file)
print('Scoring table in pickle saved to', out_file + '.pkl')

# Restore stdout and close the log file (it was redirected near the top of
# the script and previously never restored or closed; closing explicitly
# flushes the log deterministically).
sys.stdout = orig_stdout
log_file.close()