-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathhierarchical.d
More file actions
273 lines (221 loc) · 8.23 KB
/
hierarchical.d
File metadata and controls
273 lines (221 loc) · 8.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
/**
This file contains functions for performing hierarchical clustering, and
can be used for drawing heatmaps and, eventually, dendrograms.
Bugs: Not very efficient, though it probably doesn't need to be because
the use case is visualizations, and all the information has to fit
reasonably on the visualization. Therefore, N will always be fairly
small.
Copyright (C) 2011 David Simcha
License:
Boost Software License - Version 1.0 - August 17th, 2003
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
module plot2kill.hierarchical;
import plot2kill.util, std.typetuple;
/// Used for mean linkage.
double mean(double[] stuff) {
return reduce!"a + b"(0.0, stuff) / stuff.length;
}
/// Euclidean distance.
double euclidean(R1, R2)(R1 a, R2 b)
if(allSatisfy!(isInputRange, R1, R2) && is(ElementType!R1 : double) &&
is(ElementType!R2 : double)) {
return sqrt(
reduce!"a + (b[0] - b[1]) ^^ 2"(0.0, zip(a, b))
);
}
/**
A tree for defining hierarchical clusters.
*/
struct Cluster {
private this
(Cluster* left, Cluster* right, double dist, size_t index) {
this.left = left;
this.right = right;
this.distance = dist;
this.index = index;
}
///
Cluster* left;
///
Cluster* right;
///
double distance;
/**
The index of the data w.r.t. matrix, if this is a leaf node, or size_t.max
if this is not a leaf node.
*/
size_t index = size_t.max;
/// The name of the sample, populated only for leaf nodes.
string name;
/// True if this cluster doesn't have children.
bool isLeaf() @property pure nothrow const {
if(left is null) assert(right is null);
return left is null;
}
// Tracks distances to other clusters that have already been computed.
// Is always null once hierarchicalCluster returns, because it's no
// longer needed.
private double[Cluster*] distCache;
private double calculateDistance
(alias linkage)(ref Cluster rhs, double[][] distances) {
if(&rhs in distCache) {
return distCache[&rhs];
}
auto app = appender!(double[])();
void addDists(ref Cluster a, ref Cluster b) {
if(a.isLeaf) {
if(b.isLeaf) {
auto index1 = max(a.index, b.index);
auto index2 = min(a.index, b.index);
assert(index1 != index2);
app.put(distances[index1][index2]);
} else {
addDists(a, *(b.left));
addDists(a, *(b.right));
}
} else {
addDists(*(a.left), b);
addDists(*(a.right), b);
}
}
addDists(this, rhs);
auto ret = linkage(app.data);
distCache[&rhs] = ret;
return ret;
}
/// Iterate over the leaf nodes.
int opApply(int delegate(ref Cluster) dg) {
int res;
if(isLeaf) {
res = dg(this);
return res;
}
assert(left);
assert(right);
res = left.opApply(dg);
if(res) return res;
res = right.opApply(dg);
return res;
}
/// The number of leaf nodes in this cluster.
int nLeafNodes() const pure nothrow @property {
if(isLeaf) return 1;
return left.nLeafNodes + right.nLeafNodes;
}
}
/**
Cluster by rows or columns?
*/
enum ClusterBy {
///
rows,
///
columns
}
/**
Perform hierarchical clustering. matrix must be rectangular and represents
the data matrix. distance is the distance metric, which must be a function
that accepts two equal-length input ranges of doubles. linkage is the linkage
function, which must accept a double[] that represents all possible pairwise
distances between two clusters and return a summary of these distances.
clusterBy indicates whether the rows or the columns of the matrix should
be clustered.
names is an optional string array of names, one per sample. If provided,
this information will be placed in the Cluster objects, allowing samples
to be tracked by name.
*/
Cluster* hierarchicalCluster(alias distance = euclidean, alias linkage = mean)(
double[][] matrix, ClusterBy clusterBy, string[] names = null
) {
enforce(matrix.length > 0, "Cannot cluster zero elements.");
foreach(i; 1..matrix.length) {
enforce(matrix[i].length == matrix[0].length,
"matrix must be rectangular for hierarchicalCluster.");
}
Cluster*[] clusters;
// Make distance matrix.
double[][] distances;
if(clusterBy == ClusterBy.rows) {
clusters = new Cluster*[matrix.length];
distances = new double[][matrix.length];
enforce(names.length == 0 || names.length == matrix.length,
"names.length must be equal to matrix.length for " ~
"hierarchical clustering by row.");
foreach(i; 0..clusters.length) {
distances[i] = new double[i];
foreach(j; 0..i) {
distances[i][j] = distance(matrix[i], matrix[j]);
}
}
} else {
assert(clusterBy == ClusterBy.columns);
clusters = new Cluster*[matrix[0].length];
distances = new double[][matrix[0].length];
enforce(names.length == 0 || names.length == matrix[0].length,
"names.length must be equal to matrix[0].length for " ~
"hierarchical clustering by row.");
foreach(i; 0..clusters.length) {
distances[i] = new double[i];
foreach(j; 0..i) {
distances[i][j] = distance(
transversal(matrix, i),
transversal(matrix, j)
);
}
}
}
foreach(i; 0..clusters.length) {
clusters[i] = new Cluster(null, null, double.nan, i);
if(names.length) clusters[i].name = names[i];
}
while(clusters.length > 1) {
// Find min dist pair.
size_t minPair1, minPair2;
double minDist = double.infinity;
foreach(i; 0..clusters.length) foreach(j; i + 1..clusters.length) {
immutable dist =
clusters[i].calculateDistance!(linkage)(*clusters[j], distances);
if(dist < minDist) {
minPair1 = i;
minPair2 = j;
minDist = dist;
}
}
// Clean up excess distCache stuff, let it get GC'd.
clusters[minPair1].distCache = null;
clusters[minPair2].distCache = null;
foreach(cluster; clusters) {
if(clusters[minPair1] in cluster.distCache) {
cluster.distCache.remove(clusters[minPair1]);
}
if(clusters[minPair2] in cluster.distCache) {
cluster.distCache.remove(clusters[minPair2]);
}
}
clusters[minPair1] = new Cluster(
clusters[minPair1], clusters[minPair2], minDist, size_t.max);
clusters = clusters.remove(minPair2);
}
distances[] = null; // Make sure it gets gc'd.
clusters[0].distCache = null;
return clusters[0];
}