Skip to content

Commit 360cf03

Browse files
committed
Add FasterMSC, FastMSC, PAMSIL, PAMMEDSIL
1 parent 283a03a commit 360cf03

2 files changed

Lines changed: 199 additions & 3 deletions

File tree

kmedoids/__init__.py

Lines changed: 183 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,15 @@
1010
- BUILD (the initialization of PAM)
1111
- Silhouette evaluation
1212
13+
Additionally, the package implements clustering algorithms
14+
for direct optimization of the (Medoid) Silhouette,
15+
in decreasing order of run-time performance (fastest first):
16+
17+
- FasterMSC
18+
- FastMSC (same result as PAMMEDSIL, but faster)
19+
- PAMMEDSIL
20+
- PAMSIL
21+
1322
References:
1423
1524
| Erich Schubert, Peter J. Rousseeuw
@@ -24,6 +33,10 @@
2433
| https://doi.org/10.1007/978-3-030-32047-8_16
2534
| Preprint: https://arxiv.org/abs/1810.05691
2635
36+
| Lars Lenssen, Erich Schubert:
37+
| Clustering by Direct Optimization of the Medoid Silhouette
38+
| In: 15th International Conference on Similarity Search and Applications (SISAP 2022).
39+
2740
| Leonard Kaufman, Peter J. Rousseeuw:
2841
| Clustering by means of medoids.
2942
| In: Dodge Y (ed) Statistical Data Analysis Based on the L1-Norm and Related Methods, 405-416, 1987
@@ -41,6 +54,7 @@
4154
"pam",
4255
"fastpam1",
4356
"fasterpam",
57+
"fastmsc",
4458
"alternating",
4559
"pam_build",
4660
"silhouette",
@@ -341,8 +355,175 @@ def pam(diss, medoids, max_iter=100, init="build", random_state=None):
341355
return KMedoidsResult(*_pam_swap_f64(diss, medoids.astype(np.uint64), max_iter))
342356
elif dtype == np.int32:
343357
return KMedoidsResult(*_pam_swap_i32(diss, medoids.astype(np.uint64), max_iter))
344-
elif dtype == np.int64:
345-
return KMedoidsResult(*_pam_swap_i64(diss, medoids.astype(np.uint64), max_iter))
358+
raise ValueError("Input data not supported. Use a numpy array of floats.")
359+
360+
def pammedsil(diss, medoids, max_iter=100, init="build", random_state=None):
361+
"""PAMMEDSIL clustering
362+
363+
This is an implementation of the original PAMMEDSIL
364+
clustering algorithm. For improved versions, see the fastmsc and fastermsc methods.
365+
366+
References:
367+
368+
| Mark Van der Laan, Katherine Pollard, Jennifer Bryan:
369+
| A new partitioning around medoids algorithm.
370+
| In: Journal of Statistical Computation and Simulation, pp 575-584, 2003
371+
372+
:param diss: square numpy array of dissimilarities
373+
:type diss: ndarray
374+
:param medoids: number of clusters to find or existing medoids
375+
:type medoids: int or ndarray
376+
:param max_iter: maximum number of iterations
377+
:type max_iter: int
378+
:param init: initialization method
379+
:type init: str, "random", "first" or "build"
380+
:param random_state: random seed if no medoids are given
381+
:type random_state: int, RandomState instance or None
382+
383+
:return: k-medoids clustering result
384+
:rtype: KMedoidsResult
385+
"""
386+
import numpy as np
387+
from .kmedoids import _pammedsil_swap_f32, _pammedsil_swap_f64
388+
389+
if not isinstance(diss, np.ndarray):
390+
diss = np.array(diss)
391+
392+
medoids = _check_medoids(diss, medoids, init, random_state)
393+
394+
if isinstance(diss, np.ndarray):
395+
dtype = diss.dtype
396+
if dtype == np.float32:
397+
return KMedoidsResult(*_pammedsil_swap_f32(diss, medoids.astype(np.uint64), max_iter))
398+
elif dtype == np.float64:
399+
return KMedoidsResult(*_pammedsil_swap_f64(diss, medoids.astype(np.uint64), max_iter))
400+
raise ValueError("Input data not supported. Use a numpy array of floats.")
401+
402+
def pamsil(diss, medoids, max_iter=100, init="build", random_state=None):
403+
"""PAMSIL k-medoids clustering
404+
405+
This is an implementation of the original PAMSIL clustering algorithm.
406+
407+
References:
408+
409+
| Mark Van der Laan, Katherine Pollard, Jennifer Bryan:
410+
| A new partitioning around medoids algorithm.
411+
| In: Journal of Statistical Computation and Simulation, pp 575-584, 2003
412+
413+
:param diss: square numpy array of dissimilarities
414+
:type diss: ndarray
415+
:param medoids: number of clusters to find or existing medoids
416+
:type medoids: int or ndarray
417+
:param max_iter: maximum number of iterations
418+
:type max_iter: int
419+
:param init: initialization method
420+
:type init: str, "random", "first" or "build"
421+
:param random_state: random seed if no medoids are given
422+
:type random_state: int, RandomState instance or None
423+
424+
:return: k-medoids clustering result
425+
:rtype: KMedoidsResult
426+
"""
427+
import numpy as np
428+
from .kmedoids import _pamsil_swap_f32, _pamsil_swap_f64
429+
430+
if not isinstance(diss, np.ndarray):
431+
diss = np.array(diss)
432+
433+
medoids = _check_medoids(diss, medoids, init, random_state)
434+
435+
if isinstance(diss, np.ndarray):
436+
dtype = diss.dtype
437+
if dtype == np.float32:
438+
return KMedoidsResult(*_pamsil_swap_f32(diss, medoids.astype(np.uint64), max_iter))
439+
elif dtype == np.float64:
440+
return KMedoidsResult(*_pamsil_swap_f64(diss, medoids.astype(np.uint64), max_iter))
441+
raise ValueError("Input data not supported. Use a numpy array of floats.")
442+
443+
def fastmsc(diss, medoids, max_iter=100, init="random", random_state=None):
444+
"""FastMSC clustering
445+
446+
This is an accelerated version of PAMMEDSIL clustering that performs the
447+
same swaps as the original PAMMEDSIL (given the same starting conditions),
448+
but finds the best swap faster by a factor of O(k^2).
449+
450+
References:
451+
452+
| Lars Lenssen, Erich Schubert:
453+
| Clustering by Direct Optimization of the Medoid Silhouette
454+
| In: 15th International Conference on Similarity Search and Applications (SISAP 2022).
455+
456+
:param diss: square numpy array of dissimilarities
457+
:type diss: ndarray
458+
:param medoids: number of clusters to find or existing medoids
459+
:type medoids: int or ndarray
460+
:param max_iter: maximum number of iterations
461+
:type max_iter: int
462+
:param init: initialization method
463+
:type init: str, "random", "first" or "build"
464+
:param random_state: random seed if no medoids are given
465+
:type random_state: int, RandomState instance or None
466+
467+
:return: k-medoids clustering result
468+
:rtype: KMedoidsResult
469+
"""
470+
import numpy as np
471+
from .kmedoids import _fastmsc_f32, _fastmsc_f64
472+
473+
if not isinstance(diss, np.ndarray):
474+
diss = np.array(diss)
475+
476+
medoids = _check_medoids(diss, medoids, init, random_state)
477+
478+
if isinstance(diss, np.ndarray):
479+
dtype = diss.dtype
480+
if dtype == np.float32:
481+
return KMedoidsResult(*_fastmsc_f32(diss, medoids.astype(np.uint64), max_iter))
482+
elif dtype == np.float64:
483+
return KMedoidsResult(*_fastmsc_f64(diss, medoids.astype(np.uint64), max_iter))
484+
raise ValueError("Input data not supported. Use a numpy array of floats.")
485+
486+
def fastermsc(diss, medoids, max_iter=100, init="random", random_state=None):
487+
"""FasterMSC clustering
488+
489+
This is an accelerated version of PAMMEDSIL clustering that eagerly
490+
performs any swap found, and contains the O(k^2) improvement to find
491+
the best swaps faster.
492+
493+
References:
494+
495+
| Lars Lenssen, Erich Schubert:
496+
| Clustering by Direct Optimization of the Medoid Silhouette
497+
| In: 15th International Conference on Similarity Search and Applications (SISAP 2022).
498+
499+
:param diss: square numpy array of dissimilarities
500+
:type diss: ndarray
501+
:param medoids: number of clusters to find or existing medoids
502+
:type medoids: int or ndarray
503+
:param max_iter: maximum number of iterations
504+
:type max_iter: int
505+
:param init: initialization method
506+
:type init: str, "random", "first" or "build"
507+
:param random_state: random seed if no medoids are given
508+
:type random_state: int, RandomState instance or None
509+
510+
:return: k-medoids clustering result
511+
:rtype: KMedoidsResult
512+
"""
513+
import numpy as np
514+
from .kmedoids import _fastermsc_f32, _fastermsc_f64
515+
516+
if not isinstance(diss, np.ndarray):
517+
diss = np.array(diss)
518+
519+
medoids = _check_medoids(diss, medoids, init, random_state)
520+
521+
if isinstance(diss, np.ndarray):
522+
dtype = diss.dtype
523+
if dtype == np.float32:
524+
return KMedoidsResult(*_fastermsc_f32(diss, medoids.astype(np.uint64), max_iter))
525+
elif dtype == np.float64:
526+
return KMedoidsResult(*_fastermsc_f64(diss, medoids.astype(np.uint64), max_iter))
346527
raise ValueError("Input data not supported. Use a numpy array of floats.")
347528

348529
def alternating(diss, medoids, max_iter=100, init="random", random_state=None):
@@ -447,7 +628,6 @@ def silhouette(diss, labels, samples=False, n_cpu=-1):
447628
raise ValueError("Input of int64 is currently not supported, as it could overflow the float64 used internally when computing Silhouette. Use diss.astype(numpy.float64) if that is acceptable and you have the necessary memory for this copy.")
448629
raise ValueError("Input data not supported. Use a numpy array of floats.")
449630

450-
451631
# This is a hack to make sklearn an optional dependency only:
452632
try:
453633
from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin

src/lib.rs

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,14 @@ variant_call!(pam_swap_f32, pam_swap, f32, f64);
4040
variant_call!(pam_swap_f64, pam_swap, f64, f64);
4141
variant_call!(pam_swap_i32, pam_swap, i32, i64);
4242
variant_call!(pam_swap_i64, pam_swap, i64, i64);
43+
variant_call!(pammedsil_swap_f32, pammedsil_swap, f32, f64);
44+
variant_call!(pammedsil_swap_f64, pammedsil_swap, f64, f64);
45+
variant_call!(pamsil_swap_f32, pamsil_swap, f32, f64);
46+
variant_call!(pamsil_swap_f64, pamsil_swap, f64, f64);
47+
variant_call!(fastmsc_f32, fastmsc, f32, f64);
48+
variant_call!(fastmsc_f64, fastmsc, f64, f64);
49+
variant_call!(fastermsc_f32, fastermsc, f32, f64);
50+
variant_call!(fastermsc_f64, fastermsc, f64, f64);
4351

4452
macro_rules! rand_call {
4553
($name:ident, $variant:ident, $type: ty, $ltype: ty) => {
@@ -232,6 +240,10 @@ fn kmedoids(py: Python, m: &PyModule) -> PyResult<()> {
232240
m.add("_fastpam1_f64", wrap_pyfunction!(fastpam1_f64, m)?)?;
233241
m.add("_fastpam1_i32", wrap_pyfunction!(fastpam1_i32, m)?)?;
234242
m.add("_fastpam1_i64", wrap_pyfunction!(fastpam1_i64, m)?)?;
243+
m.add("_fastmsc_f32", wrap_pyfunction!(fastmsc_f32, m)?)?;
244+
m.add("_fastmsc_f64", wrap_pyfunction!(fastmsc_f64, m)?)?;
245+
m.add("_fastermsc_f32", wrap_pyfunction!(fastermsc_f32, m)?)?;
246+
m.add("_fastermsc_f64", wrap_pyfunction!(fastermsc_f64, m)?)?;
235247
m.add("_pam_swap_f32", wrap_pyfunction!(pam_swap_f32, m)?)?;
236248
m.add("_pam_swap_f64", wrap_pyfunction!(pam_swap_f64, m)?)?;
237249
m.add("_pam_swap_i32", wrap_pyfunction!(pam_swap_i32, m)?)?;
@@ -240,6 +252,10 @@ fn kmedoids(py: Python, m: &PyModule) -> PyResult<()> {
240252
m.add("_pam_build_f64", wrap_pyfunction!(pam_build_f64, m)?)?;
241253
m.add("_pam_build_i32", wrap_pyfunction!(pam_build_i32, m)?)?;
242254
m.add("_pam_build_i64", wrap_pyfunction!(pam_build_i64, m)?)?;
255+
m.add("_pammedsil_swap_f32", wrap_pyfunction!(pammedsil_swap_f32, m)?)?;
256+
m.add("_pammedsil_swap_f64", wrap_pyfunction!(pammedsil_swap_f64, m)?)?;
257+
m.add("_pamsil_swap_f32", wrap_pyfunction!(pamsil_swap_f32, m)?)?;
258+
m.add("_pamsil_swap_f64", wrap_pyfunction!(pamsil_swap_f64, m)?)?;
243259
m.add("_alternating_f32", wrap_pyfunction!(alternating_f32, m)?)?;
244260
m.add("_alternating_f64", wrap_pyfunction!(alternating_f64, m)?)?;
245261
m.add("_alternating_i32", wrap_pyfunction!(alternating_i32, m)?)?;

0 commit comments

Comments
 (0)