Skip to content

Commit 9d6e5c6

Browse files
committed
Version 0.5.1
1 parent f700b07 commit 9d6e5c6

7 files changed

Lines changed: 70 additions & 60 deletions

File tree

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,14 @@
22

33
For changes to the main Rust package, please see <https://github.com/kno10/rust-kmedoids/blob/main/CHANGELOG.md>
44

5+
## kmedoids 0.5.1 (2024-03-14)
6+
7+
- DynMSC: best loss reported incorrectly if best k=2
8+
- add minimum k parameter
9+
- bump rayon version (no changes)
10+
- use the pointer-sized np.uintp type for medoids, which matches Rust usize
11+
(needed for the wasm32 target, where usize is 32 bits wide).
12+
513
## kmedoids 0.5.0 (2023-12-10)
614

715
- add DynMSC, Silhouette clustering with optimal number of clusters

CITATION.cff

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ authors:
1010
title: "Fast k-medoids Clustering in Rust and Python"
1111
journal: "J. Open Source Softw."
1212
doi: 10.21105/joss.04183
13-
version: 0.5.0
13+
version: 0.5.1
1414
date-released: 2024-03-14
1515
license: GPL-3.0
1616
preferred-citation:

Cargo.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[package]
22
edition = "2021"
33
name = "kmedoids"
4-
version = "0.5.0"
4+
version = "0.5.1"
55
authors = ["Erich Schubert <erich.schubert@tu-dortmund.de>", "Lars Lenssen <lars.lenssen@tu-dortmund.de>"]
66
description = "k-Medoids clustering with the FasterPAM algorithm"
77
homepage = "https://github.com/kno10/python-kmedoids"
@@ -14,11 +14,11 @@ name = "kmedoids"
1414
crate-type = ["cdylib"]
1515

1616
[dependencies]
17-
rustkmedoids = { version = "0.5.0", package = "kmedoids", git = "https://github.com/kno10/rust-kmedoids" }
17+
rustkmedoids = { version = "0.5.1", package = "kmedoids", git = "https://github.com/kno10/rust-kmedoids" }
1818
numpy = "0.20"
1919
ndarray = "0.15"
2020
rand = "0.8"
21-
rayon = "1.8"
21+
rayon = "1.9"
2222

2323
[dependencies.pyo3]
2424
version = "0.20"

README.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -153,14 +153,15 @@ from sklearn.metrics.pairwise import euclidean_distances
153153
X, _ = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)
154154
X = X[:10000]
155155
diss = euclidean_distances(X)
156-
kmin = 10
157-
kmax = 20
156+
kmin, kmax = 10, 20
158157
dm = kmedoids.dynmsc(diss, kmax, kmin)
159158
print("Optimal number of clusters according to the Medoid Silhouette:", dm.bestk)
160159
print("Medoid Silhouette over range of k:", dm.losses)
161160
print("Range of k:", dm.rangek)
162161
```
163162

163+
[Full Colab notebook example](https://colab.research.google.com/drive/14vop12NwZ5Si5EuzXHIksKnxZxabecWW).
164+
164165
### Memory Requirements
165166

166167
Because the algorithms require a distance matrix as input, you need O(N²) memory to use these implementations. With single precision, this matrix needs 4·N² bytes, so a typical laptop with 8 GB of RAM could handle data sets of over 40,000 instances, but if your computation of the distance matrix involves copying the matrix, only 30,000 or fewer may be feasible.

kmedoids/__init__.py

Lines changed: 53 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ def _check_medoids(diss, medoids, init, random_state):
166166
if isinstance(medoids, np.ndarray):
167167
if random_state is not None:
168168
warnings.warn("Seed will be ignored if initial medoids are given")
169-
return medoids
169+
return medoids.astype(np.uintp)
170170
if isinstance(medoids, int):
171171
if init.lower() == "build":
172172
return pam_build(diss, medoids).medoids
@@ -177,8 +177,8 @@ def _check_medoids(diss, medoids, init, random_state):
177177
elif isinstance(random_state, numbers.Integral):
178178
random_state = np.random.RandomState(random_state)
179179
if not isinstance(random_state, np.random.RandomState):
180-
raise ValueError("Pass a numpy random generator, state or integer seed")
181-
return random_state.choice(diss.shape[0], medoids, False)
180+
raise ValueError("Pass a numpy random generator, RandomState or integer seed")
181+
return random_state.choice(diss.shape[0], medoids, False).astype(np.uintp)
182182
raise ValueError("Specify the number of medoids, or give a numpy array of initial medoids")
183183

184184
def fasterpam(diss, medoids, max_iter=100, init="random", random_state=None, n_cpu=-1):
@@ -245,22 +245,22 @@ def fasterpam(diss, medoids, max_iter=100, init="random", random_state=None, n_c
245245
else:
246246
raise ValueError("Pass a numpy random generator, state or integer seed")
247247
if dtype == np.float32:
248-
return KMedoidsResult(*_par_fasterpam_f32(diss, medoids.astype(np.uint64), max_iter, seed, n_cpu))
248+
return KMedoidsResult(*_par_fasterpam_f32(diss, medoids, max_iter, seed, n_cpu))
249249
elif dtype == np.float64:
250-
return KMedoidsResult(*_par_fasterpam_f64(diss, medoids.astype(np.uint64), max_iter, seed, n_cpu))
250+
return KMedoidsResult(*_par_fasterpam_f64(diss, medoids, max_iter, seed, n_cpu))
251251
elif dtype == np.int32:
252-
return KMedoidsResult(*_par_fasterpam_i32(diss, medoids.astype(np.uint64), max_iter, seed, n_cpu))
252+
return KMedoidsResult(*_par_fasterpam_i32(diss, medoids, max_iter, seed, n_cpu))
253253
elif dtype == np.int64:
254-
return KMedoidsResult(*_par_fasterpam_i64(diss, medoids.astype(np.uint64), max_iter, seed, n_cpu))
254+
return KMedoidsResult(*_par_fasterpam_i64(diss, medoids, max_iter, seed, n_cpu))
255255
elif random_state is None:
256256
if dtype == np.float32:
257-
return KMedoidsResult(*_fasterpam_f32(diss, medoids.astype(np.uint64), max_iter))
257+
return KMedoidsResult(*_fasterpam_f32(diss, medoids, max_iter))
258258
elif dtype == np.float64:
259-
return KMedoidsResult(*_fasterpam_f64(diss, medoids.astype(np.uint64), max_iter))
259+
return KMedoidsResult(*_fasterpam_f64(diss, medoids, max_iter))
260260
elif dtype == np.int32:
261-
return KMedoidsResult(*_fasterpam_i32(diss, medoids.astype(np.uint64), max_iter))
261+
return KMedoidsResult(*_fasterpam_i32(diss, medoids, max_iter))
262262
elif dtype == np.int64:
263-
return KMedoidsResult(*_fasterpam_i64(diss, medoids.astype(np.uint64), max_iter))
263+
return KMedoidsResult(*_fasterpam_i64(diss, medoids, max_iter))
264264
else:
265265
seed = None
266266
if random_state is np.random:
@@ -272,13 +272,13 @@ def fasterpam(diss, medoids, max_iter=100, init="random", random_state=None, n_c
272272
else:
273273
raise ValueError("Pass a numpy random generator, state or integer seed")
274274
if dtype == np.float32:
275-
return KMedoidsResult(*_rand_fasterpam_f32(diss, medoids.astype(np.uint64), max_iter, seed))
275+
return KMedoidsResult(*_rand_fasterpam_f32(diss, medoids, max_iter, seed))
276276
elif dtype == np.float64:
277-
return KMedoidsResult(*_rand_fasterpam_f64(diss, medoids.astype(np.uint64), max_iter, seed))
277+
return KMedoidsResult(*_rand_fasterpam_f64(diss, medoids, max_iter, seed))
278278
elif dtype == np.int32:
279-
return KMedoidsResult(*_rand_fasterpam_i32(diss, medoids.astype(np.uint64), max_iter, seed))
279+
return KMedoidsResult(*_rand_fasterpam_i32(diss, medoids, max_iter, seed))
280280
elif dtype == np.int64:
281-
return KMedoidsResult(*_rand_fasterpam_i64(diss, medoids.astype(np.uint64), max_iter, seed))
281+
return KMedoidsResult(*_rand_fasterpam_i64(diss, medoids, max_iter, seed))
282282
raise ValueError("Input data not supported. Use a numpy array of floats.")
283283

284284
def fastpam1(diss, medoids, max_iter=100, init="random", random_state=None):
@@ -327,13 +327,13 @@ def fastpam1(diss, medoids, max_iter=100, init="random", random_state=None):
327327
if isinstance(diss, np.ndarray):
328328
dtype = diss.dtype
329329
if dtype == np.float32:
330-
return KMedoidsResult(*_fastpam1_f32(diss, medoids.astype(np.uint64), max_iter))
330+
return KMedoidsResult(*_fastpam1_f32(diss, medoids, max_iter))
331331
elif dtype == np.float64:
332-
return KMedoidsResult(*_fastpam1_f64(diss, medoids.astype(np.uint64), max_iter))
332+
return KMedoidsResult(*_fastpam1_f64(diss, medoids, max_iter))
333333
elif dtype == np.int32:
334-
return KMedoidsResult(*_fastpam1_i32(diss, medoids.astype(np.uint64), max_iter))
334+
return KMedoidsResult(*_fastpam1_i32(diss, medoids, max_iter))
335335
elif dtype == np.int64:
336-
return KMedoidsResult(*_fastpam1_i64(diss, medoids.astype(np.uint64), max_iter))
336+
return KMedoidsResult(*_fastpam1_i64(diss, medoids, max_iter))
337337
raise ValueError("Input data not supported. Use a numpy array of floats.")
338338

339339
def pam_build(diss, k):
@@ -419,13 +419,13 @@ def pam(diss, medoids, max_iter=100, init="build", random_state=None):
419419
if isinstance(diss, np.ndarray):
420420
dtype = diss.dtype
421421
if dtype == np.float32:
422-
return KMedoidsResult(*_pam_swap_f32(diss, medoids.astype(np.uint64), max_iter))
422+
return KMedoidsResult(*_pam_swap_f32(diss, medoids, max_iter))
423423
elif dtype == np.float64:
424-
return KMedoidsResult(*_pam_swap_f64(diss, medoids.astype(np.uint64), max_iter))
424+
return KMedoidsResult(*_pam_swap_f64(diss, medoids, max_iter))
425425
elif dtype == np.int32:
426-
return KMedoidsResult(*_pam_swap_i32(diss, medoids.astype(np.uint64), max_iter))
426+
return KMedoidsResult(*_pam_swap_i32(diss, medoids, max_iter))
427427
elif dtype == np.int64:
428-
return KMedoidsResult(*_pam_swap_i64(diss, medoids.astype(np.uint64), max_iter))
428+
return KMedoidsResult(*_pam_swap_i64(diss, medoids, max_iter))
429429
raise ValueError("Input data not supported. Use a numpy array of floats.")
430430

431431
def pammedsil(diss, medoids, max_iter=100, init="build", random_state=None):
@@ -466,9 +466,9 @@ def pammedsil(diss, medoids, max_iter=100, init="build", random_state=None):
466466
if isinstance(diss, np.ndarray):
467467
dtype = diss.dtype
468468
if dtype == np.float32:
469-
return KMedoidsResult(*_pammedsil_swap_f32(diss, medoids.astype(np.uint64), max_iter))
469+
return KMedoidsResult(*_pammedsil_swap_f32(diss, medoids, max_iter))
470470
elif dtype == np.float64:
471-
return KMedoidsResult(*_pammedsil_swap_f64(diss, medoids.astype(np.uint64), max_iter))
471+
return KMedoidsResult(*_pammedsil_swap_f64(diss, medoids, max_iter))
472472
raise ValueError("Input data not supported. Use a numpy array of floats.")
473473

474474
def pamsil(diss, medoids, max_iter=100, init="build", random_state=None):
@@ -508,9 +508,9 @@ def pamsil(diss, medoids, max_iter=100, init="build", random_state=None):
508508
if isinstance(diss, np.ndarray):
509509
dtype = diss.dtype
510510
if dtype == np.float32:
511-
return KMedoidsResult(*_pamsil_swap_f32(diss, medoids.astype(np.uint64), max_iter))
511+
return KMedoidsResult(*_pamsil_swap_f32(diss, medoids, max_iter))
512512
elif dtype == np.float64:
513-
return KMedoidsResult(*_pamsil_swap_f64(diss, medoids.astype(np.uint64), max_iter))
513+
return KMedoidsResult(*_pamsil_swap_f64(diss, medoids, max_iter))
514514
raise ValueError("Input data not supported. Use a numpy array of floats.")
515515

516516
def fastmsc(diss, medoids, max_iter=100, init="random", random_state=None):
@@ -558,9 +558,9 @@ def fastmsc(diss, medoids, max_iter=100, init="random", random_state=None):
558558
if isinstance(diss, np.ndarray):
559559
dtype = diss.dtype
560560
if dtype == np.float32:
561-
return KMedoidsResult(*_fastmsc_f32(diss, medoids.astype(np.uint64), max_iter))
561+
return KMedoidsResult(*_fastmsc_f32(diss, medoids, max_iter))
562562
elif dtype == np.float64:
563-
return KMedoidsResult(*_fastmsc_f64(diss, medoids.astype(np.uint64), max_iter))
563+
return KMedoidsResult(*_fastmsc_f64(diss, medoids, max_iter))
564564
raise ValueError("Input data not supported. Use a numpy array of floats.")
565565

566566
def fastermsc(diss, medoids, max_iter=100, init="random", random_state=None):
@@ -608,9 +608,9 @@ def fastermsc(diss, medoids, max_iter=100, init="random", random_state=None):
608608
if isinstance(diss, np.ndarray):
609609
dtype = diss.dtype
610610
if dtype == np.float32:
611-
return KMedoidsResult(*_fastermsc_f32(diss, medoids.astype(np.uint64), max_iter))
611+
return KMedoidsResult(*_fastermsc_f32(diss, medoids, max_iter))
612612
elif dtype == np.float64:
613-
return KMedoidsResult(*_fastermsc_f64(diss, medoids.astype(np.uint64), max_iter))
613+
return KMedoidsResult(*_fastermsc_f64(diss, medoids, max_iter))
614614
raise ValueError("Input data not supported. Use a numpy array of floats.")
615615

616616
def dynmsc(diss, medoids, minimum_k=2, max_iter=100, init="random", random_state=None):
@@ -632,12 +632,12 @@ def dynmsc(diss, medoids, minimum_k=2, max_iter=100, init="random", random_state
632632
:type diss: ndarray
633633
:param medoids: maximum number of clusters to find or existing medoids with length of maximum number of clusters to find
634634
:type medoids: int or ndarray
635+
:param minimum_k: minimum number of clusters to find
636+
:type minimum_k: int
635637
:param max_iter: maximum number of iterations
636638
:type max_iter: int
637639
:param init: initialization method
638640
:type init: str, "random", "first" or "build"
639-
:param minimum_k: minimum number of clusters to find
640-
:type minimum_k: int
641641
:param random_state: random seed if no medoids are given
642642
:type random_state: int, RandomState instance or None
643643
@@ -657,9 +657,9 @@ def dynmsc(diss, medoids, minimum_k=2, max_iter=100, init="random", random_state
657657
if isinstance(diss, np.ndarray):
658658
dtype = diss.dtype
659659
if dtype == np.float32:
660-
return DynkResult(*_dynmsc_f32(diss, medoids.astype(np.uint64), minimum_k, max_iter))
660+
return DynkResult(*_dynmsc_f32(diss, medoids, minimum_k, max_iter))
661661
elif dtype == np.float64:
662-
return DynkResult(*_dynmsc_f64(diss, medoids.astype(np.uint64), minimum_k, max_iter))
662+
return DynkResult(*_dynmsc_f64(diss, medoids, minimum_k, max_iter))
663663
raise ValueError("Input data not supported. Use a numpy array of floats.")
664664

665665
def alternating(diss, medoids, max_iter=100, init="random", random_state=None):
@@ -692,13 +692,13 @@ def alternating(diss, medoids, max_iter=100, init="random", random_state=None):
692692
if isinstance(diss, np.ndarray):
693693
dtype = diss.dtype
694694
if dtype == np.float32:
695-
return KMedoidsResult(*_alternating_f32(diss, medoids.astype(np.uint64), max_iter))
695+
return KMedoidsResult(*_alternating_f32(diss, medoids, max_iter))
696696
elif dtype == np.float64:
697-
return KMedoidsResult(*_alternating_f64(diss, medoids.astype(np.uint64), max_iter))
697+
return KMedoidsResult(*_alternating_f64(diss, medoids, max_iter))
698698
elif dtype == np.int32:
699-
return KMedoidsResult(*_alternating_i32(diss, medoids.astype(np.uint64), max_iter))
699+
return KMedoidsResult(*_alternating_i32(diss, medoids, max_iter))
700700
elif dtype == np.int64:
701-
return KMedoidsResult(*_alternating_i64(diss, medoids.astype(np.uint64), max_iter))
701+
return KMedoidsResult(*_alternating_i64(diss, medoids, max_iter))
702702
raise ValueError("Input data not supported. Use a numpy array of floats.")
703703

704704
def silhouette(diss, labels, samples=False, n_cpu=-1):
@@ -735,7 +735,7 @@ def silhouette(diss, labels, samples=False, n_cpu=-1):
735735

736736
if not isinstance(diss, np.ndarray):
737737
diss = np.array(diss)
738-
labels = np.unique(labels, return_inverse=True)[1].astype(np.uint64) # ensure labels are 0..k-1
738+
labels = np.unique(labels, return_inverse=True)[1].astype(np.uintp) # ensure labels are 0..k-1
739739

740740
if isinstance(diss, np.ndarray):
741741
dtype = diss.dtype
@@ -797,16 +797,17 @@ def medoid_silhouette(diss, meds, samples=False):
797797
if not isinstance(diss, np.ndarray):
798798
diss = np.array(diss)
799799
if not isinstance(meds, np.ndarray):
800-
meds = np.array(meds, dtype=np.uint64)
800+
meds = np.array(meds)
801+
meds = meds.astype(np.uintp)
801802

802803
if isinstance(diss, np.ndarray):
803804
dtype = diss.dtype
804805
if dtype == np.float32:
805-
return _medoid_silhouette_f32(diss, meds.astype(np.uint64), samples)
806+
return _medoid_silhouette_f32(diss, meds, samples)
806807
elif dtype == np.float64:
807-
return _medoid_silhouette_f64(diss, meds.astype(np.uint64), samples)
808+
return _medoid_silhouette_f64(diss, meds, samples)
808809
elif dtype == np.int32:
809-
return _medoid_silhouette_i32(diss, meds.astype(np.uint64), samples)
810+
return _medoid_silhouette_i32(diss, meds, samples)
810811
elif dtype == np.int64:
811812
raise ValueError("Input of int64 is currently not supported, as it could overflow the float64 used internally when computing Silhouette. Use diss.astype(numpy.float64) if that is acceptable and you have the necessary memory for this copy.")
812813
raise ValueError("Input data not supported. Use a numpy array of floats.")
@@ -881,14 +882,14 @@ class KMedoids(SKLearnClusterer):
881882
:param random_state: random seed if no medoids are given
882883
:type random_state: int, RandomState instance or None
883884
884-
:ivar cluster_centers\_: None for 'precomputed'
885-
:type cluster_centers\_: array
886-
:ivar medoid_indices\_: The indices of the medoid rows in X
887-
:type medoid_indices\_: array, shape = (n_clusters,)
888-
:ivar labels\_: Labels of each point
889-
:type labels\_: array, shape = (n_samples,)
890-
:ivar inertia\_: Sum of distances of samples to their closest cluster center
891-
:type inertia\_: float
885+
:ivar cluster_centers_: None for 'precomputed'
886+
:type cluster_centers_: array
887+
:ivar medoid_indices_: The indices of the medoid rows in X
888+
:type medoid_indices_: array, shape = (n_clusters,)
889+
:ivar labels_: Labels of each point
890+
:type labels_: array, shape = (n_samples,)
891+
:ivar inertia_: Sum of distances of samples to their closest cluster center
892+
:type inertia_: float
892893
"""
893894
def __init__(
894895
self,

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "maturin"
44

55
[project]
66
name = "kmedoids"
7-
version = "0.5.0"
7+
version = "0.5.1"
88
description = "k-Medoids Clustering in Python with FasterPAM"
99
requires-dist = ["numpy"]
1010
classifier = [

tests/test_integration.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ def test_fastermsc(self):
7878
def test_dynmsc(self):
7979
dist = np.array([[0, 2, 3, 4, 5], [2, 0, 6, 7, 8], [3, 6, 0, 9, 10], [4, 7, 9, 0, 11], [5, 8, 10, 11, 0]], dtype=np.float32)
8080
dmsc = kmedoids.dynmsc(dist, 3, init='build')
81-
dmsc_rust = kmedoids.kmedoids._dynmsc_f32(dist, dmsc.medoids, 100)
81+
dmsc_rust = kmedoids.kmedoids._dynmsc_f32(dist, dmsc.medoids, 2, 100)
8282
assert dmsc.loss == 0.8761904761904762
8383
assert np.array_equal(dmsc.medoids, dmsc_rust[2])
8484
assert dmsc.loss == dmsc_rust[0]

0 commit comments

Comments
 (0)