|
| 1 | +# |
| 2 | +# Author: Hani Z. Girgis, PhD |
| 3 | +# |
| 4 | +# |
| 5 | +# Purpose: This program converts all-versus-all identity scores produced |
| 6 | +# by Identity to a Phylip distance matrix. |
| 7 | +# Requirements: Python 3 with NumPy. |
| 8 | +# |
| 9 | + |
| 10 | +import numpy as np |
| 11 | +import sys |
| 12 | + |
| 13 | + |
def make_matrix(file_in, file_out):
    """Convert all-versus-all identity scores into a Phylip distance matrix.

    Each non-blank line of ``file_in`` holds three tab-separated fields: two
    sequence labels followed by their identity score.  The distance of a pair
    is ``1 - identity`` clamped to ``[0, 1]``.  Pairs that never appear in the
    input keep the default distance of 1.0, and the diagonal is 0.0.  Rows are
    written in the order in which labels first appear in the input.

    Args:
        file_in: Path of the tab-separated identity file.
        file_out: Path of the Phylip matrix file to write.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
        ValueError: If an identity score cannot be parsed as a float.
    """
    # Maps each label to its row/column index (order of first appearance).
    label_dict = {}

    # The context manager closes the input even if parsing raises.
    with open(file_in, 'r') as file:
        # First pass: collect the labels so the matrix size is known.
        for line in file:
            stripped = line.strip()
            if not stripped:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            token_list = stripped.split("\t")
            for label in token_list[:2]:
                if label not in label_dict:
                    label_dict[label] = len(label_dict)

        label_count = len(label_dict)

        # Unreported pairs default to the maximum distance of 1.0.
        matrix = np.ones((label_count, label_count))
        # Identity does not report a sequence versus itself.
        np.fill_diagonal(matrix, 0.0)

        # Second pass: go back to the beginning of the file and fill in the
        # distances.  Re-reading keeps memory bounded by the matrix itself.
        file.seek(0)
        for line in file:
            stripped = line.strip()
            if not stripped:
                continue
            token_list = stripped.split("\t")
            distance = 1.0 - float(token_list[2])

            # Clamp to the valid distance range.
            if distance < 0.0:
                distance = 0.0
            elif distance > 1.0:
                distance = 1.0

            id_1 = label_dict[token_list[0]]
            id_2 = label_dict[token_list[1]]
            matrix[id_1][id_2] = distance
            matrix[id_2][id_1] = distance

    # Write the Phylip matrix, one row per label in index order.
    sorted_key_list = sorted(label_dict, key=label_dict.get)
    with open(file_out, 'w') as file_object:
        file_object.write(str(label_count) + '\n')
        for key in sorted_key_list:
            row = matrix[label_dict[key]]
            values = ' '.join(format(value, '.8f') for value in row)
            # The first character of the label is dropped (presumably the
            # '>' of a FASTA header -- confirm against Identity's output).
            # End a species name with \t.
            file_object.write(key[1:] + '\t' + values + '\n')
| 70 | + |
# Command-line entry point: exactly two arguments are expected (the
# all-versus-all identity file and the output file name).  With any other
# argument count the usage text is printed instead.
if len(sys.argv) == 3:
    make_matrix(sys.argv[1], sys.argv[2])
else:
    print("Use: python3 ", sys.argv[0], "allVsAllIdentityFile outputFileName",
          end="\n\n")
    print("Please provide an all-versus-all file produced by Identity and an output file name.",
          end="\n\n")
    print("Example: python3 " + sys.argv[0] + " all_vs_all_identity_scores matrix.phylip",
          end="\n\n")
0 commit comments