Skip to content

Commit d79be5c

Browse files
authored
Added py directory
1 parent 6f1bd38 commit d79be5c

2 files changed

Lines changed: 109 additions & 0 deletions

File tree

py/README.txt

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
Author
2+
3+
Hani Z. Girgis, PhD
4+
5+
Purpose
6+
7+
This program performs hierarchical clustering using all-versus-all identity scores produced by Identity.
8+
9+
Requirements
10+
11+
Python 3 with SciPy and Matplotlib packages.
12+
13+
Examples
14+
15+
1. Display only sorted leaves to the terminal, i.e. no graphics:
16+
python3 makeTree.py all_vs_all_identity_scores 0
17+
18+
2. Display tree:
19+
python3 makeTree.py all_vs_all_identity_scores 1
20+
21+
Note
22+
23+
The hierarchical clustering algorithm may take a long time.

py/makeTree.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
#
2+
# Author: Hani Z. Girgis, PhD
3+
# Purpose: This program performs hierarchical clustering using the
4+
# all-versus-all identity scores produced by Identity.
5+
# Requirements: Python 3 with SciPy and Matplotlib packages.
6+
#
7+
8+
import numpy as np
9+
import sys
10+
11+
from scipy.cluster.hierarchy import linkage
12+
from scipy.cluster.hierarchy import dendrogram
13+
from scipy.cluster.hierarchy import leaves_list
14+
import matplotlib.pyplot as plt
15+
16+
17+
def generate_tree(file, displayTree):
    """Cluster sequences hierarchically from all-versus-all identity scores.

    Every non-blank line of the input must hold at least three tab-separated
    fields: two sequence labels followed by their identity score. A score is
    converted to a distance (1 - identity, clamped to [0, 1]). Pairs that
    never appear in the file keep the default distance of 1. The sequences
    are clustered with average linkage and optimal leaf ordering.

    Args:
        file: Path of the all-versus-all identity file.
        displayTree: 0 prints the ordered leaf labels, one per line; any
            other value draws the dendrogram with Matplotlib.

    Raises:
        ValueError: If a non-blank line has fewer than three fields, a score
            is not a number, or the file names fewer than two sequences.
    """

    def read_records(handle):
        # Yield the tab-separated fields of every non-blank line.
        for line_number, line in enumerate(handle, start=1):
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) carry no data.
                continue
            token_list = stripped.split("\t")
            if len(token_list) < 3:
                raise ValueError(
                    "{}: line {}: expected two labels and a score "
                    "separated by tabs".format(file, line_number))
            yield token_list

    # Maps each label to its row/column index, in order of first appearance.
    label_dict = {}

    # The context manager closes the file even if parsing fails part-way.
    with open(file, 'r') as handle:
        # First pass: collect the labels so the matrix size is known.
        for token_list in read_records(handle):
            for label in token_list[:2]:
                if label not in label_dict:
                    label_dict[label] = len(label_dict)

        label_count = len(label_dict)
        if label_count < 2:
            raise ValueError(
                "{}: at least two distinct sequences are required for "
                "clustering, found {}".format(file, label_count))

        # Second pass: fill the upper triangle with distances. The file is
        # read twice instead of being buffered because an all-versus-all
        # listing can be large.
        matrix = np.ones((label_count, label_count))
        handle.seek(0)
        for token_list in read_records(handle):
            distance = min(max(1.0 - float(token_list[2]), 0.0), 1.0)
            id_1 = label_dict[token_list[0]]
            id_2 = label_dict[token_list[1]]
            # Store above the diagonal whatever the order of the pair is; a
            # self-pair lands on the diagonal, which is never read back.
            if id_1 < id_2:
                matrix[id_1, id_2] = distance
            else:
                matrix[id_2, id_1] = distance

    # Labels ordered by their matrix index.
    name_list = sorted(label_dict, key=label_dict.get)

    # The strict upper triangle in row-major order is the condensed distance
    # vector that linkage() expects.
    condensed = matrix[np.triu_indices(label_count, 1)]
    row_clusters = linkage(condensed, method='average', optimal_ordering=True)

    if displayTree == 0:
        # Print ordered leaves
        for index in leaves_list(row_clusters):
            print(name_list[index])
    else:
        plt.figure()
        dendrogram(row_clusters, orientation='left', labels=name_list,
                   leaf_font_size=5, distance_sort='descending',
                   show_leaf_counts=True)
        plt.tight_layout()
        plt.show()
75+
76+
77+
# Command-line entry point. The guard keeps the argument check and the
# clustering from running when this file is imported as a module.
if __name__ == "__main__":
    if len(sys.argv) != 3:
        # Wrong number of arguments: explain the expected usage.
        print("Use:", sys.argv[0], "allVsAllIdentityFile sortedLeavesOrTree")
        print()
        print("Please provide an all-versus-all file produced by Identity and 0 (display sorted leaves) or 1 (display a tree).")
        print()
        print("Display only sorted leaves: python3 makeTree.py all_vs_all_identity_scores 0")
        print("Display tree: python3 makeTree.py all_vs_all_identity_scores 1")
        print()
    else:
        # argv[1] is the identity file; argv[2] selects leaves (0) or tree.
        generate_tree(sys.argv[1], int(sys.argv[2]))

0 commit comments

Comments
 (0)