-
Notifications
You must be signed in to change notification settings - Fork 26
Expand file tree
/
Copy pathkmeans.py
More file actions
271 lines (225 loc) · 10.6 KB
/
kmeans.py
File metadata and controls
271 lines (225 loc) · 10.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
#
# This is part of "python-cluster". A library to group similar items together.
# Copyright (C) 2006 Michel Albert
#
# This library is free software; you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by the
# Free Software Foundation; either version 2.1 of the License, or (at your
# option) any later version.
# This library is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
# for more details.
# You should have received a copy of the GNU Lesser General Public License
# along with this library; if not, write to the Free Software Foundation,
# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# new functions: HDgetCluster() and HDassignItem by:
# 2015 Jose Javier Garcia Aranda, Juan Ramos Diaz
from cluster.util import ClusteringError, centroid, minkowski_distance, HDcentroid
import time
import datetime
class KMeansClustering(object):
    """
    Implementation of the kmeans clustering method as explained in a tutorial_
    by *matteucc*.

    .. _tutorial: http://www.elet.polimi.it/upload/matteucc/Clustering/tutorial_html/kmeans.html

    Example:

    >>> from cluster import KMeansClustering
    >>> cl = KMeansClustering([(1,1), (2,1), (5,3), ...])
    >>> clusters = cl.getclusters(2)

    :param data: A list of tuples or integers.
    :param distance: A function determining the distance between two items.
        Default (if ``None`` is passed): It assumes the tuples contain numeric
        values and applies a generalised form of the euclidian-distance
        algorithm on them.
    :param equality: A function to test equality of items. By default the
        standard python equality operator (``==``) is applied.
    :raises ValueError: if the list contains heterogeneous items or if the
        distance between items cannot be determined.
    """

    def __init__(self, data, distance=None, equality=None):
        self.__clusters = []
        self.__data = data
        self.distance = distance
        self.__initial_length = len(data)
        self.equality = equality

        # Every tuple item must have the same number of dimensions; otherwise
        # no meaningful distance can be computed between them.
        if len(data) > 1 and isinstance(data[0], tuple):
            control_length = len(data[0])
            for item in data[1:]:
                if len(item) != control_length:
                    raise ValueError("Each item in the data list must have "
                                     "the same amount of dimensions. Item "
                                     "%r was out of line!" % (item,))

        # Non-tuple items require a user-supplied distance function.
        if (len(data) > 1 and not isinstance(data[0], tuple) and
                distance is None):
            raise ValueError("You supplied non-standard items but no "
                             "distance function! We cannot continue!")
        elif distance is None:
            # We now know we have tuples and assume their members are numeric.
            self.distance = minkowski_distance

    def _check_count(self, count):
        """
        Shared precondition checks for :meth:`getclusters` and
        :meth:`HDgetclusters`.

        :param count: the requested amount of clusters.
        :return: the raw data list when clustering is trivial (nothing to
            cluster, or one cluster per item), otherwise ``None`` meaning
            "proceed with clustering".
        :raises ClusteringError: if *count* is out of bounds.
        """
        if count <= 1:
            raise ClusteringError("When clustering, you need to ask for at "
                                  "least two clusters! "
                                  "You asked for %d" % count)

        # return the data straight away if there is nothing to cluster
        if (self.__data == [] or len(self.__data) == 1 or
                count == self.__initial_length):
            return self.__data

        # It makes no sense to ask for more clusters than data-items available
        if count > self.__initial_length:
            raise ClusteringError(
                "Unable to generate more clusters than "
                "items available. You supplied %d items, and asked for "
                "%d clusters." % (self.__initial_length, count))
        return None

    def getclusters(self, count):
        """
        Generates *count* clusters.

        :param count: The amount of clusters that should be generated. count
            must be greater than ``1``.
        :raises ClusteringError: if *count* is out of bounds.
        """
        trivial = self._check_count(count)
        if trivial is not None:
            return trivial

        self.initialise_clusters(self.__data, count)

        # As we just initialised the clusters, we assume at least one item
        # will move; iterate until a full sweep moves nothing.
        items_moved = True
        while items_moved:
            items_moved = False
            for cluster in self.__clusters:
                # Iterate over a snapshot: assign_item() may pop items out of
                # `cluster`, and mutating a list while iterating it would
                # silently skip elements.
                for item in tuple(cluster):
                    res = self.assign_item(item, cluster)
                    if not items_moved:
                        items_moved = res
        return self.__clusters

    def assign_item(self, item, origin):
        """
        Assigns an item from a given cluster to the closest located cluster.

        :param item: the item to be moved.
        :param origin: the originating cluster.
        :return: ``True`` if the item was moved, ``False`` otherwise.
        """
        closest_cluster = origin
        # Track the best distance so each candidate centroid is computed
        # exactly once per call.
        best_distance = self.distance(item, centroid(origin))
        for cluster in self.__clusters:
            dist = self.distance(item, centroid(cluster))
            if dist < best_distance:
                best_distance = dist
                closest_cluster = cluster

        if closest_cluster is not origin:
            self.move_item(item, origin, closest_cluster)
            return True
        return False

    def move_item(self, item, origin, destination):
        """
        Moves an item from one cluster to another cluster.

        :param item: the item to be moved.
        :param origin: the originating cluster.
        :param destination: the target cluster.
        """
        if self.equality:
            # NOTE(review): if no element satisfies the equality test, index 0
            # is popped — callers only move items known to be in `origin`.
            item_index = 0
            for i, element in enumerate(origin):
                if self.equality(element, item):
                    item_index = i
                    break
        else:
            item_index = origin.index(item)

        destination.append(origin.pop(item_index))

    def initialise_clusters(self, input_, clustercount):
        """
        Initialises the clusters by distributing the items from the data
        evenly across n clusters.

        :param input_: the data set (a list of tuples).
        :param clustercount: the amount of clusters (n).
        """
        # initialise the clusters with empty lists
        self.__clusters = [[] for _ in range(clustercount)]

        # distribute the items round-robin into the clusters
        for count, item in enumerate(input_):
            self.__clusters[count % clustercount].append(item)

    def HDgetclusters(self, count, max_iterations):
        """
        Generates *count* clusters, recomputing centroids only once per sweep
        so each pass over the N items is linear.

        :param count: The amount of clusters that should be generated. count
            must be greater than ``1``.
        :param max_iterations: hard upper bound on the number of sweeps; the
            loop stops earlier if it converges before reaching it.
        :raises ClusteringError: if *count* is out of bounds.
        """
        trivial = self._check_count(count)
        if trivial is not None:
            return trivial

        self.initialise_clusters(self.__data, count)

        # As we just initialised the clusters, we assume at least one item
        # will move. Do not force iterations: follow the algorithm and stop
        # as soon as it converges, but never exceed max_iterations.
        items_moved = True
        iteration = 0
        while items_moved:
            items_moved = False
            print("iterating", iteration)
            ts = time.time()
            st = datetime.datetime.fromtimestamp(ts).strftime(
                '%Y-%m-%d %H:%M:%S')
            print(st)
            iteration = iteration + 1

            # Compute every centroid once for this sweep. `centroid_by_id`
            # maps id(cluster) -> centroid so each cluster's own centroid is
            # recovered in O(1) (the old linear scan of `my_centroids` could
            # also pick a stale value if two clusters shared a centroid).
            my_centroids = {}
            centroid_by_id = {}
            for cluster in self.__clusters:
                one_centroid = HDcentroid(cluster)
                my_centroids[one_centroid] = cluster
                centroid_by_id[id(cluster)] = one_centroid

            # Scan the N items WITHOUT recalculating centroids -> linear pass.
            for cluster in self.__clusters:
                centroid_cluster = centroid_by_id[id(cluster)]
                # Snapshot: HDassign_item() may pop items out of `cluster`,
                # and mutating a list while iterating it skips elements.
                for item in tuple(cluster):
                    res = self.HDassign_item(item, cluster, centroid_cluster,
                                             my_centroids)
                    if not items_moved:
                        items_moved = res

            if iteration == max_iterations:
                items_moved = False
        return self.__clusters

    def HDassign_item(self, item, origin, origin_centroid, my_centroids):
        """
        Assigns an item from a given cluster to the closest located cluster,
        using the pre-computed centroids.

        :param item: the item to be moved.
        :param origin: the originating cluster.
        :param origin_centroid: centroid of the originating cluster.
        :param my_centroids: mapping of centroid -> cluster.
        :return: ``True`` if the item was moved, ``False`` otherwise.
        """
        closest_cluster = origin
        best_distance = self.distance(item, origin_centroid)
        for centro, cluster in my_centroids.items():
            dist = self.distance(item, centro)
            if dist < best_distance:
                best_distance = dist
                closest_cluster = cluster

        if closest_cluster is not origin:
            self.move_item(item, origin, closest_cluster)
            return True
        return False