-
Notifications
You must be signed in to change notification settings - Fork 26
Expand file tree
/
Copy pathkmeans.py
More file actions
271 lines (225 loc) · 10.6 KB
/
kmeans.py
File metadata and controls
271 lines (225 loc) · 10.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
#
# This is part of "python-cluster". A library to group similar items together.
# Copyright (C) 2006 Michel Albert
#
# This library is free software; you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by the
# Free Software Foundation; either version 2.1 of the License, or (at your
# option) any later version.
# This library is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
# for more details.
# You should have received a copy of the GNU Lesser General Public License
# along with this library; if not, write to the Free Software Foundation,
# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# new functions: HDgetCluster() and HDassignItem by:
# 2015 Jose Javier Garcia Aranda, Juan Ramos Diaz
from cluster.util import ClusteringError, centroid, minkowski_distance, HDcentroid
import time
import datetime
class KMeansClustering(object):
    """
    Implementation of the kmeans clustering method as explained in a tutorial_
    by *matteucc*.

    .. _tutorial: http://www.elet.polimi.it/upload/matteucc/Clustering/tutorial_html/kmeans.html

    Example:

    >>> from cluster import KMeansClustering
    >>> cl = KMeansClustering([(1,1), (2,1), (5,3), ...])
    >>> clusters = cl.getclusters(2)

    :param data: A list of tuples or integers.
    :param distance: A function determining the distance between two items.
        Default (if ``None`` is passed): It assumes the tuples contain numeric
        values and applies a generalised form of the euclidian-distance
        algorithm on them.
    :param equality: A function to test equality of items. By default the
        standard python equality operator (``==``) is applied.
    :raises ValueError: if the list contains heterogeneous items or if the
        distance between items cannot be determined.
    """

    def __init__(self, data, distance=None, equality=None):
        self.__clusters = []
        self.__data = data
        self.distance = distance
        self.__initial_length = len(data)
        self.equality = equality

        # Every tuple item must have the same number of dimensions; otherwise
        # no meaningful distance can be computed between them.
        if len(data) > 1 and isinstance(data[0], tuple):
            control_length = len(data[0])
            for item in data[1:]:
                if len(item) != control_length:
                    raise ValueError("Each item in the data list must have "
                                     "the same amount of dimensions. Item "
                                     "%r was out of line!" % (item,))

        # Non-tuple items require a user-supplied distance function.
        if (len(data) > 1 and not isinstance(data[0], tuple) and
                distance is None):
            raise ValueError("You supplied non-standard items but no "
                             "distance function! We cannot continue!")
        elif distance is None:
            # We now know we have tuples and assume their members are numeric.
            self.distance = minkowski_distance

    def _check_count(self, count):
        """
        Shared precondition checks for :meth:`getclusters` and
        :meth:`HDgetclusters`.

        :param count: the requested amount of clusters.
        :return: the raw data list when clustering is trivial (nothing to
            cluster, or one cluster per item), otherwise ``None`` meaning
            "proceed with clustering".
        :raises ClusteringError: if *count* is out of bounds.
        """
        if count <= 1:
            raise ClusteringError("When clustering, you need to ask for at "
                                  "least two clusters! "
                                  "You asked for %d" % count)

        # return the data straight away if there is nothing to cluster
        if (self.__data == [] or len(self.__data) == 1 or
                count == self.__initial_length):
            return self.__data

        # It makes no sense to ask for more clusters than data-items available
        if count > self.__initial_length:
            raise ClusteringError(
                "Unable to generate more clusters than "
                "items available. You supplied %d items, and asked for "
                "%d clusters." % (self.__initial_length, count))
        return None

    def getclusters(self, count):
        """
        Generates *count* clusters.

        :param count: The amount of clusters that should be generated. count
            must be greater than ``1``.
        :raises ClusteringError: if *count* is out of bounds.
        """
        trivial = self._check_count(count)
        if trivial is not None:
            return trivial

        self.initialise_clusters(self.__data, count)

        # As we just initialised the clusters, we assume at least one item
        # will move; iterate until a full sweep moves nothing.
        items_moved = True
        while items_moved:
            items_moved = False
            for cluster in self.__clusters:
                # Iterate over a snapshot: assign_item() may pop items out of
                # `cluster`, and mutating a list while iterating it would
                # silently skip elements.
                for item in tuple(cluster):
                    res = self.assign_item(item, cluster)
                    if not items_moved:
                        items_moved = res
        return self.__clusters

    def assign_item(self, item, origin):
        """
        Assigns an item from a given cluster to the closest located cluster.

        :param item: the item to be moved.
        :param origin: the originating cluster.
        :return: ``True`` if the item was moved, ``False`` otherwise.
        """
        closest_cluster = origin
        # Track the best distance so each candidate centroid is computed
        # exactly once per call.
        best_distance = self.distance(item, centroid(origin))
        for cluster in self.__clusters:
            dist = self.distance(item, centroid(cluster))
            if dist < best_distance:
                best_distance = dist
                closest_cluster = cluster

        if closest_cluster is not origin:
            self.move_item(item, origin, closest_cluster)
            return True
        return False

    def move_item(self, item, origin, destination):
        """
        Moves an item from one cluster to another cluster.

        :param item: the item to be moved.
        :param origin: the originating cluster.
        :param destination: the target cluster.
        """
        if self.equality:
            # NOTE(review): if no element satisfies the equality test, index 0
            # is popped — callers only move items known to be in `origin`.
            item_index = 0
            for i, element in enumerate(origin):
                if self.equality(element, item):
                    item_index = i
                    break
        else:
            item_index = origin.index(item)

        destination.append(origin.pop(item_index))

    def initialise_clusters(self, input_, clustercount):
        """
        Initialises the clusters by distributing the items from the data
        evenly across n clusters.

        :param input_: the data set (a list of tuples).
        :param clustercount: the amount of clusters (n).
        """
        # initialise the clusters with empty lists
        self.__clusters = [[] for _ in range(clustercount)]

        # distribute the items round-robin into the clusters
        for count, item in enumerate(input_):
            self.__clusters[count % clustercount].append(item)

    def HDgetclusters(self, count, max_iterations):
        """
        Generates *count* clusters, recomputing centroids only once per sweep
        so each pass over the N items is linear.

        :param count: The amount of clusters that should be generated. count
            must be greater than ``1``.
        :param max_iterations: hard upper bound on the number of sweeps; the
            loop stops earlier if it converges before reaching it.
        :raises ClusteringError: if *count* is out of bounds.
        """
        trivial = self._check_count(count)
        if trivial is not None:
            return trivial

        self.initialise_clusters(self.__data, count)

        # As we just initialised the clusters, we assume at least one item
        # will move. Do not force iterations: follow the algorithm and stop
        # as soon as it converges, but never exceed max_iterations.
        items_moved = True
        iteration = 0
        while items_moved:
            items_moved = False
            print("iterating", iteration)
            ts = time.time()
            st = datetime.datetime.fromtimestamp(ts).strftime(
                '%Y-%m-%d %H:%M:%S')
            print(st)
            iteration = iteration + 1

            # Compute every centroid once for this sweep. `centroid_by_id`
            # maps id(cluster) -> centroid so each cluster's own centroid is
            # recovered in O(1) (the old linear scan of `my_centroids` could
            # also pick a stale value if two clusters shared a centroid).
            my_centroids = {}
            centroid_by_id = {}
            for cluster in self.__clusters:
                one_centroid = HDcentroid(cluster)
                my_centroids[one_centroid] = cluster
                centroid_by_id[id(cluster)] = one_centroid

            # Scan the N items WITHOUT recalculating centroids -> linear pass.
            for cluster in self.__clusters:
                centroid_cluster = centroid_by_id[id(cluster)]
                # Snapshot: HDassign_item() may pop items out of `cluster`,
                # and mutating a list while iterating it skips elements.
                for item in tuple(cluster):
                    res = self.HDassign_item(item, cluster, centroid_cluster,
                                             my_centroids)
                    if not items_moved:
                        items_moved = res

            if iteration == max_iterations:
                items_moved = False
        return self.__clusters

    def HDassign_item(self, item, origin, origin_centroid, my_centroids):
        """
        Assigns an item from a given cluster to the closest located cluster,
        using the pre-computed centroids.

        :param item: the item to be moved.
        :param origin: the originating cluster.
        :param origin_centroid: centroid of the originating cluster.
        :param my_centroids: mapping of centroid -> cluster.
        :return: ``True`` if the item was moved, ``False`` otherwise.
        """
        closest_cluster = origin
        best_distance = self.distance(item, origin_centroid)
        for centro, cluster in my_centroids.items():
            dist = self.distance(item, centro)
            if dist < best_distance:
                best_distance = dist
                closest_cluster = cluster

        if closest_cluster is not origin:
            self.move_item(item, origin, closest_cluster)
            return True
        return False