|
10 | 10 | - BUILD (the initialization of PAM) |
11 | 11 | - Silhouette evaluation |
12 | 12 |
|
| 13 | +Additionally, the package implements clustering algorithms |
| 14 | +for direct optimization of the (Medoid) Silhouette, |
| 15 | +listed from fastest to slowest: |
| 16 | +
|
| 17 | +- FasterMSC |
| 18 | +- FastMSC (same result as PAMMEDSIL, but faster) |
| 19 | +- PAMMEDSIL |
| 20 | +- PAMSIL |
| 21 | +
|
13 | 22 | References: |
14 | 23 |
|
15 | 24 | | Erich Schubert, Peter J. Rousseeuw |
|
24 | 33 | | https://doi.org/10.1007/978-3-030-32047-8_16 |
25 | 34 | | Preprint: https://arxiv.org/abs/1810.05691 |
26 | 35 |
|
| 36 | +| Lars Lenssen, Erich Schubert: |
| 37 | +| Clustering by Direct Optimization of the Medoid Silhouette |
| 38 | +| In: 15th International Conference on Similarity Search and Applications (SISAP 2022). |
| 39 | +
|
27 | 40 | | Leonard Kaufman, Peter J. Rousseeuw: |
28 | 41 | | Clustering by means of medoids. |
29 | 42 | | In: Dodge Y (ed) Statistical Data Analysis Based on the L1-Norm and Related Methods, 405-416, 1987 |
|
41 | 54 | "pam", |
42 | 55 | "fastpam1", |
43 | 56 | "fasterpam", |
| 57 | + "fastmsc", |
44 | 58 | "alternating", |
45 | 59 | "pam_build", |
46 | 60 | "silhouette", |
@@ -341,8 +355,175 @@ def pam(diss, medoids, max_iter=100, init="build", random_state=None): |
341 | 355 | return KMedoidsResult(*_pam_swap_f64(diss, medoids.astype(np.uint64), max_iter)) |
342 | 356 | elif dtype == np.int32: |
343 | 357 | return KMedoidsResult(*_pam_swap_i32(diss, medoids.astype(np.uint64), max_iter)) |
344 | | - elif dtype == np.int64: |
345 | | - return KMedoidsResult(*_pam_swap_i64(diss, medoids.astype(np.uint64), max_iter)) |
| 358 | + raise ValueError("Input data not supported. Use a numpy array of floats.") |
| 359 | + |
| 360 | +def pammedsil(diss, medoids, max_iter=100, init="build", random_state=None): |
| 361 | + """PAMMEDSIL clustering |
| 362 | +
|
| 363 | + This is an implementation of the original PAMMEDSIL |
| 364 | + clustering algorithm. For improved versions, see the fastmsc and fastermsc methods. |
| 365 | +
|
| 366 | + References: |
| 367 | +
|
| 368 | + | Mark Van der Laan, Katherine Pollard, Jennifer Bryan: |
| 369 | + | A new partitioning around medoids algorithm. |
| 370 | + | In: Journal of Statistical Computation and Simulation, pp 575-584, 2003 |
| 371 | +
|
| 372 | + :param diss: square numpy array of dissimilarities |
| 373 | + :type diss: ndarray |
| 374 | + :param medoids: number of clusters to find or existing medoids |
| 375 | + :type medoids: int or ndarray |
| 376 | + :param max_iter: maximum number of iterations |
| 377 | + :type max_iter: int |
| 378 | + :param init: initialization method |
| 379 | + :type init: str, "random", "first" or "build" |
| 380 | + :param random_state: random seed if no medoids are given |
| 381 | + :type random_state: int, RandomState instance or None |
| 382 | +
|
| 383 | + :return: k-medoids clustering result |
| 384 | + :rtype: KMedoidsResult |
| 385 | + """ |
| 386 | + import numpy as np |
| 387 | + from .kmedoids import _pammedsil_swap_f32, _pammedsil_swap_f64 |
| 388 | + |
| 389 | + if not isinstance(diss, np.ndarray): |
| 390 | + diss = np.array(diss) |
| 391 | + |
| 392 | + medoids = _check_medoids(diss, medoids, init, random_state) |
| 393 | + |
| 394 | + if isinstance(diss, np.ndarray): |
| 395 | + dtype = diss.dtype |
| 396 | + if dtype == np.float32: |
| 397 | + return KMedoidsResult(*_pammedsil_swap_f32(diss, medoids.astype(np.uint64), max_iter)) |
| 398 | + elif dtype == np.float64: |
| 399 | + return KMedoidsResult(*_pammedsil_swap_f64(diss, medoids.astype(np.uint64), max_iter)) |
| 400 | + raise ValueError("Input data not supported. Use a numpy array of floats.") |
| 401 | + |
| 402 | +def pamsil(diss, medoids, max_iter=100, init="build", random_state=None): |
| 403 | + """PAMSIL k-medoids clustering |
| 404 | +
|
| 405 | + This is an implementation of the original PAMSIL. |
| 406 | +
|
| 407 | + References: |
| 408 | +
|
| 409 | + | Mark Van der Laan, Katherine Pollard, Jennifer Bryan: |
| 410 | + | A new partitioning around medoids algorithm. |
| 411 | + | In: Journal of Statistical Computation and Simulation, pp 575-584, 2003 |
| 412 | +
|
| 413 | + :param diss: square numpy array of dissimilarities |
| 414 | + :type diss: ndarray |
| 415 | + :param medoids: number of clusters to find or existing medoids |
| 416 | + :type medoids: int or ndarray |
| 417 | + :param max_iter: maximum number of iterations |
| 418 | + :type max_iter: int |
| 419 | + :param init: initialization method |
| 420 | + :type init: str, "random", "first" or "build" |
| 421 | + :param random_state: random seed if no medoids are given |
| 422 | + :type random_state: int, RandomState instance or None |
| 423 | +
|
| 424 | + :return: k-medoids clustering result |
| 425 | + :rtype: KMedoidsResult |
| 426 | + """ |
| 427 | + import numpy as np |
| 428 | + from .kmedoids import _pamsil_swap_f32, _pamsil_swap_f64 |
| 429 | + |
| 430 | + if not isinstance(diss, np.ndarray): |
| 431 | + diss = np.array(diss) |
| 432 | + |
| 433 | + medoids = _check_medoids(diss, medoids, init, random_state) |
| 434 | + |
| 435 | + if isinstance(diss, np.ndarray): |
| 436 | + dtype = diss.dtype |
| 437 | + if dtype == np.float32: |
| 438 | + return KMedoidsResult(*_pamsil_swap_f32(diss, medoids.astype(np.uint64), max_iter)) |
| 439 | + elif dtype == np.float64: |
| 440 | + return KMedoidsResult(*_pamsil_swap_f64(diss, medoids.astype(np.uint64), max_iter)) |
| 441 | + raise ValueError("Input data not supported. Use a numpy array of floats.") |
| 442 | + |
| 443 | +def fastmsc(diss, medoids, max_iter=100, init="random", random_state=None): |
| 444 | + """FastMSC clustering |
| 445 | +
|
| 446 | + This is an accelerated version of PAMMEDSIL clustering that performs the |
| 447 | + same swaps as the original PAMMEDSIL (given the same starting conditions), |
| 448 | + but finds the best swap O(k^2) times faster. |
| 449 | +
|
| 450 | + References: |
| 451 | +
|
| 452 | + | Lars Lenssen, Erich Schubert: |
| 453 | + | Clustering by Direct Optimization of the Medoid Silhouette |
| 454 | + | In: 15th International Conference on Similarity Search and Applications (SISAP 2022). |
| 455 | +
|
| 456 | + :param diss: square numpy array of dissimilarities |
| 457 | + :type diss: ndarray |
| 458 | + :param medoids: number of clusters to find or existing medoids |
| 459 | + :type medoids: int or ndarray |
| 460 | + :param max_iter: maximum number of iterations |
| 461 | + :type max_iter: int |
| 462 | + :param init: initialization method |
| 463 | + :type init: str, "random", "first" or "build" |
| 464 | + :param random_state: random seed if no medoids are given |
| 465 | + :type random_state: int, RandomState instance or None |
| 466 | +
|
| 467 | + :return: k-medoids clustering result |
| 468 | + :rtype: KMedoidsResult |
| 469 | + """ |
| 470 | + import numpy as np |
| 471 | + from .kmedoids import _fastmsc_f32, _fastmsc_f64 |
| 472 | + |
| 473 | + if not isinstance(diss, np.ndarray): |
| 474 | + diss = np.array(diss) |
| 475 | + |
| 476 | + medoids = _check_medoids(diss, medoids, init, random_state) |
| 477 | + |
| 478 | + if isinstance(diss, np.ndarray): |
| 479 | + dtype = diss.dtype |
| 480 | + if dtype == np.float32: |
| 481 | + return KMedoidsResult(*_fastmsc_f32(diss, medoids.astype(np.uint64), max_iter)) |
| 482 | + elif dtype == np.float64: |
| 483 | + return KMedoidsResult(*_fastmsc_f64(diss, medoids.astype(np.uint64), max_iter)) |
| 484 | + raise ValueError("Input data not supported. Use a numpy array of floats.") |
| 485 | + |
| 486 | +def fastermsc(diss, medoids, max_iter=100, init="random", random_state=None): |
| 487 | + """FasterMSC clustering |
| 488 | +
|
| 489 | + This is an accelerated version of PAMMEDSIL clustering that eagerly |
| 490 | + performs any swap found, and contains the O(k^2) improvement to find |
| 491 | + the best swaps faster. |
| 492 | +
|
| 493 | + References: |
| 494 | +
|
| 495 | + | Lars Lenssen, Erich Schubert: |
| 496 | + | Clustering by Direct Optimization of the Medoid Silhouette |
| 497 | + | In: 15th International Conference on Similarity Search and Applications (SISAP 2022). |
| 498 | +
|
| 499 | + :param diss: square numpy array of dissimilarities |
| 500 | + :type diss: ndarray |
| 501 | + :param medoids: number of clusters to find or existing medoids |
| 502 | + :type medoids: int or ndarray |
| 503 | + :param max_iter: maximum number of iterations |
| 504 | + :type max_iter: int |
| 505 | + :param init: initialization method |
| 506 | + :type init: str, "random", "first" or "build" |
| 507 | + :param random_state: random seed if no medoids are given |
| 508 | + :type random_state: int, RandomState instance or None |
| 509 | +
|
| 510 | + :return: k-medoids clustering result |
| 511 | + :rtype: KMedoidsResult |
| 512 | + """ |
| 513 | + import numpy as np |
| 514 | + from .kmedoids import _fastermsc_f32, _fastermsc_f64 |
| 515 | + |
| 516 | + if not isinstance(diss, np.ndarray): |
| 517 | + diss = np.array(diss) |
| 518 | + |
| 519 | + medoids = _check_medoids(diss, medoids, init, random_state) |
| 520 | + |
| 521 | + if isinstance(diss, np.ndarray): |
| 522 | + dtype = diss.dtype |
| 523 | + if dtype == np.float32: |
| 524 | + return KMedoidsResult(*_fastermsc_f32(diss, medoids.astype(np.uint64), max_iter)) |
| 525 | + elif dtype == np.float64: |
| 526 | + return KMedoidsResult(*_fastermsc_f64(diss, medoids.astype(np.uint64), max_iter)) |
346 | 527 | raise ValueError("Input data not supported. Use a numpy array of floats.") |
347 | 528 |
|
348 | 529 | def alternating(diss, medoids, max_iter=100, init="random", random_state=None): |
@@ -447,7 +628,6 @@ def silhouette(diss, labels, samples=False, n_cpu=-1): |
447 | 628 | raise ValueError("Input of int64 is currently not supported, as it could overflow the float64 used internally when computing Silhouette. Use diss.astype(numpy.float64) if that is acceptable and you have the necessary memory for this copy.") |
448 | 629 | raise ValueError("Input data not supported. Use a numpy array of floats.") |
449 | 630 |
|
450 | | - |
451 | 631 | # This is a hack to make sklearn an optional dependency only: |
452 | 632 | try: |
453 | 633 | from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin |
|
0 commit comments