Skip to content

Commit 5d2924e

Browse files
larslenssen authored and kno10 committed
Update dynmsc
1 parent cebf30b commit 5d2924e

3 files changed

Lines changed: 37 additions & 90 deletions

File tree

README.md

Lines changed: 4 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -146,10 +146,10 @@ from sklearn.metrics.pairwise import euclidean_distances
146146
X, _ = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)
147147
X = X[:10000]
148148
diss = euclidean_distances(X)
149-
bk = kmedoids.bestk(diss, 100)
150-
print("Optimal number of clusters according to the Medoid Silhouette:", bk.bestk)
151-
print("Medoid Silhouette over range of k:", bk.losses)
152-
print("Range of k:", bk.rangek)
149+
dm = kmedoids.dynmsc(diss, 100)
150+
print("Optimal number of clusters according to the Medoid Silhouette:", dm.bestk)
151+
print("Medoid Silhouette over range of k:", dm.losses)
152+
print("Range of k:", dm.rangek)
153153
```
154154

155155
### Memory Requirements
@@ -170,7 +170,6 @@ For larger data sets, it is recommended to only cluster a representative sample
170170
* **FasterMSC** (Lenssen and Schubert, 2022)
171171
* FastMSC (Lenssen and Schubert, 2022)
172172
* DynMSC (Lenssen and Schubert, 2023)
173-
* Bestk (Lenssen and Schubert, 2023)
174173
* PAMSIL (Van der Laan and Pollard, 2003)
175174
* PAMMEDSIL (Van der Laan and Pollard, 2003)
176175
* Medoid Silhouette index for evaluation (Van der Laan and Pollard, 2003)

kmedoids/__init__.py

Lines changed: 28 additions & 52 deletions
Original file line number | Diff line number | Diff line change
@@ -111,9 +111,24 @@ def __repr__(self):
111111
return f"KMedoidsResult(loss={self.loss}, labels={self.labels}, medoids={self.medoids}, n_iter={self.n_iter}, n_swaps={self.n_swap})"
112112

113113

114-
class BestkResult:
114+
class DynkResult:
115115
"""
116-
Result of choosing the optimal number of clusters according to the Medoid Silhouette.
116+
K-medoids clustering result with automatic number of clusters
117+
118+
:param loss: Loss of this clustering (sum of deviations)
119+
:type loss: float
120+
121+
:param labels: Cluster assignment
122+
:type labels: ndarray
123+
124+
:param medoids: Chosen medoid indexes
125+
:type medoids: ndarray
126+
127+
:param n_iter: Number of iterations
128+
:type n_iter: int
129+
130+
:param n_swap: Number of swaps performed
131+
:type n_swap: int
117132
118133
:param bestk: Best k by Medoid Silhouette
119134
:type bestk: int
@@ -124,13 +139,18 @@ class BestkResult:
124139
:param rangek: range of k
125140
:type rangek: range
126141
"""
127-
def __init__(self, bestk, losses, rangek):
142+
def __init__(self, loss, labels, medoids, bestk, losses, rangek, n_iter=None, n_swap=None):
143+
self.loss = loss
144+
self.labels = labels
145+
self.medoids = medoids
146+
self.n_iter = n_iter
147+
self.n_swap = n_swap
128148
self.bestk = bestk
129149
self.losses = losses
130150
self.rangek = rangek
131151

132152
def __repr__(self):
133-
return f"BestkResult(bestk={self.bestk}, losses={self.losses}, rangek={self.rangek})"
153+
return f"DynkResult(loss={self.loss}, labels={self.labels}, medoids={self.medoids}, bestk={self.bestk}, losses={self.losses}, rangek={self.rangek}, n_iter={self.n_iter}, n_swaps={self.n_swap})"
134154

135155
def _check_medoids(diss, medoids, init, random_state):
136156
"""Check the medoids and random_state parameters."""
@@ -609,8 +629,8 @@ def dynmsc(diss, medoids, max_iter=100, init="random", random_state=None):
609629
:param random_state: random seed if no medoids are given
610630
:type random_state: int, RandomState instance or None
611631
612-
:return: k-medoids clustering result
613-
:rtype: KMedoidsResult
632+
:return: k-medoids clustering with automatic number of clusters
633+
:rtype: DynkResult
614634
"""
615635
import numpy as np
616636
from .kmedoids import _dynmsc_f32, _dynmsc_f64
@@ -623,53 +643,9 @@ def dynmsc(diss, medoids, max_iter=100, init="random", random_state=None):
623643
if isinstance(diss, np.ndarray):
624644
dtype = diss.dtype
625645
if dtype == np.float32:
626-
return KMedoidsResult(*_dynmsc_f32(diss, medoids.astype(np.uint64), max_iter))
627-
elif dtype == np.float64:
628-
return KMedoidsResult(*_dynmsc_f64(diss, medoids.astype(np.uint64), max_iter))
629-
raise ValueError("Input data not supported. Use a numpy array of floats.")
630-
631-
def bestk(diss, medoids=100, max_iter=100, init="random", random_state=None):
632-
"""Optimal number of clusters according to the Medoid Silhouette
633-
634-
This version uses DynMSC to choose the ptimal number of clusters according
635-
to the Medoid Silhouette, that performs DynMSC for k = 2 to the number of input medoids
636-
and returns k with the highest Average Medoid Silhouette.
637-
638-
References:
639-
640-
| Lars Lenssen, Erich Schubert:
641-
| Medoid silhouette clustering with automatic cluster number selection
642-
| Information Systems (120), 2024, 102290
643-
| <https://doi.org/10.1016/j.is.2023.102290>
644-
645-
:param diss: square numpy array of dissimilarities
646-
:type diss: ndarray
647-
:param medoids: maximum number of clusters to find or existing medoids with length of maximum number of clusters to find
648-
:type medoids: int or ndarray
649-
:param max_iter: maximum number of iterations
650-
:type max_iter: int
651-
:param init: initialization method
652-
:type init: str, "random", "first" or "build"
653-
:param random_state: random seed if no medoids are given
654-
:type random_state: int, RandomState instance or None
655-
656-
:return: Result of choosing the optimal number of clusters according to the Medoid Silhouette
657-
:rtype: BestkResult
658-
"""
659-
import numpy as np
660-
from .kmedoids import _bestk_f32, _bestk_f64
661-
662-
if not isinstance(diss, np.ndarray):
663-
diss = np.array(diss)
664-
665-
medoids = _check_medoids(diss, medoids, init, random_state)
666-
667-
if isinstance(diss, np.ndarray):
668-
dtype = diss.dtype
669-
if dtype == np.float32:
670-
return BestkResult(*_bestk_f32(diss, medoids.astype(np.uint64), max_iter))
646+
return DynkResult(*_dynmsc_f32(diss, medoids.astype(np.uint64), max_iter))
671647
elif dtype == np.float64:
672-
return BestkResult(*_bestk_f64(diss, medoids.astype(np.uint64), max_iter))
648+
return DynkResult(*_dynmsc_f64(diss, medoids.astype(np.uint64), max_iter))
673649
raise ValueError("Input data not supported. Use a numpy array of floats.")
674650

675651
def alternating(diss, medoids, max_iter=100, init="random", random_state=None):

src/lib.rs

Lines changed: 5 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -181,49 +181,23 @@ macro_rules! dynmsc_call {
181181
/// :param max_iter: maximum number of iterations
182182
/// :type max_iter: int
183183
/// :return: k-medoids clustering result
184-
/// :rtype: KMedoidsResult
184+
/// :rtype: DynkResult
185185
#[pyfunction]
186186
fn $name(dist: PyReadonlyArray2<'_, $type>, meds: PyReadonlyArray1<'_, usize>, max_iter: usize) -> PyResult<Py<PyAny>> {
187187
assert_eq!(dist.ndim(), 2);
188188
assert_eq!(dist.shape()[0], dist.shape()[1]);
189189
let mut meds = meds.to_vec()?;
190-
let (loss, assi, n_iter, n_swap, best_meds, _losses): ($ltype, _, _, _, _, _) = rustkmedoids::dynmsc(&dist.as_array(), &mut meds, max_iter);
190+
let maxk = meds.len() + 1;
191+
let (loss, assi, n_iter, n_swap, best_meds, losses): ($ltype, _, _, _, _, _) = rustkmedoids::dynmsc(&dist.as_array(), &mut meds, max_iter);
192+
let bestk = best_meds.len();
191193
Python::with_gil(|py| -> PyResult<Py<PyAny>> {
192-
Ok((loss, PyArray1::from_vec(py, assi), PyArray1::from_vec(py, best_meds), n_iter, n_swap).to_object(py))
194+
Ok((loss, PyArray1::from_vec(py, assi), PyArray1::from_vec(py, best_meds), bestk, PyArray1::from_vec(py, losses), (2..maxk).collect::<Vec<usize>>(), n_iter, n_swap).to_object(py))
193195
})
194196
}
195197
}}
196198
dynmsc_call!(dynmsc_f32, f32, f64);
197199
dynmsc_call!(dynmsc_f64, f64, f64);
198200

199-
200-
macro_rules! bestk_call {
201-
($name:ident, $type: ty, $ltype: ty) => {
202-
/// Run $variant k-medoids clustering function for $type precision
203-
///
204-
/// :param dist: distance matrix
205-
/// :type dist: ndarray
206-
/// :param meds: initial medoids
207-
/// :type meds: ndarray
208-
/// :param max_iter: maximum number of iterations
209-
/// :type max_iter: int
210-
/// :return: k-medoids clustering result
211-
/// :rtype: KMedoidsResult
212-
#[pyfunction]
213-
fn $name(dist: PyReadonlyArray2<'_, $type>, meds: PyReadonlyArray1<'_, usize>, max_iter: usize) -> PyResult<Py<PyAny>> {
214-
assert_eq!(dist.ndim(), 2);
215-
assert_eq!(dist.shape()[0], dist.shape()[1]);
216-
let mut meds = meds.to_vec()?;
217-
let maxk = meds.len() + 1;
218-
let (_loss, _assi, _n_iter, _n_swap, best_meds, losses): ($ltype, _, _, _, _, _) = rustkmedoids::dynmsc(&dist.as_array(), &mut meds, max_iter);
219-
Python::with_gil(|py| -> PyResult<Py<PyAny>> {
220-
Ok((best_meds.len(), PyArray1::from_vec(py, losses), (2..maxk).collect::<Vec<usize>>()).to_object(py))
221-
})
222-
}
223-
}}
224-
bestk_call!(bestk_f32, f32, f64);
225-
bestk_call!(bestk_f64, f64, f64);
226-
227201
macro_rules! silhouette_call {
228202
($name:ident, $type: ty) => {
229203
/// Run the Silhouette index evaluation for $type precision
@@ -327,8 +301,6 @@ fn kmedoids(py: Python, m: &PyModule) -> PyResult<()> {
327301
m.add("_fastermsc_f64", wrap_pyfunction!(fastermsc_f64, m)?)?;
328302
m.add("_dynmsc_f32", wrap_pyfunction!(dynmsc_f32, m)?)?;
329303
m.add("_dynmsc_f64", wrap_pyfunction!(dynmsc_f64, m)?)?;
330-
m.add("_bestk_f32", wrap_pyfunction!(bestk_f32, m)?)?;
331-
m.add("_bestk_f64", wrap_pyfunction!(bestk_f64, m)?)?;
332304
m.add("_pam_swap_f32", wrap_pyfunction!(pam_swap_f32, m)?)?;
333305
m.add("_pam_swap_f64", wrap_pyfunction!(pam_swap_f64, m)?)?;
334306
m.add("_pam_swap_i32", wrap_pyfunction!(pam_swap_i32, m)?)?;

0 commit comments

Comments
 (0)