Skip to content

Commit f700b07

Browse files
committed
Add a minimum k to DynMSC
1 parent fe7f3e5 commit f700b07

4 files changed

Lines changed: 20 additions & 12 deletions

File tree

README.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ print("Loss with PAM:", pam.loss)
141141
### Choose the optimal number of clusters
142142

143143
This package includes DynMSC, an algorithm that optimizes the Medoid Silhouette,
144-
and chooses the "optimal" number of clusters in a range of 2..kmax.
144+
and chooses the "optimal" number of clusters in a range of kmin..kmax.
145145
Beware that if you allow too large a kmax, the optimal result will likely have many
146146
single-element clusters. Too high a kmax may mask more desirable results, hence it
147147
is recommended that you choose only 2-3 times the number of clusters you expect as maximum.
@@ -153,7 +153,9 @@ from sklearn.metrics.pairwise import euclidean_distances
153153
X, _ = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)
154154
X = X[:10000]
155155
diss = euclidean_distances(X)
156-
dm = kmedoids.dynmsc(diss, 100)
156+
kmin = 10
157+
kmax = 20
158+
dm = kmedoids.dynmsc(diss, kmax, kmin)
157159
print("Optimal number of clusters according to the Medoid Silhouette:", dm.bestk)
158160
print("Medoid Silhouette over range of k:", dm.losses)
159161
print("Range of k:", dm.rangek)

docs/index.rst

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ Choosing the optimal number of clusters
118118
---------------------------------------
119119

120120
This package includes :ref:`DynMSC<dynmsc>`, an algorithm that optimizes the Medoid Silhouette,
121-
and chooses the "optimal" number of clusters in a range of 2..kmax.
121+
and chooses the "optimal" number of clusters in a range of kmin..kmax.
122122
Beware that if you allow too large a kmax, the optimal result will likely have many
123123
single-element clusters. Too high a kmax may mask more desirable results, hence it
124124
is recommended that you choose only 2-3 times the number of clusters you expect as maximum.
@@ -131,7 +131,9 @@ is recommended that you choose only 2-3 times the number of clusters you expect
131131
X, _ = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)
132132
X = X[:10000]
133133
diss = euclidean_distances(X)
134-
dm = kmedoids.dynmsc(diss, 100)
134+
kmin = 10
135+
kmax = 20
136+
dm = kmedoids.dynmsc(diss, kmax, kmin)
135137
print("Optimal number of clusters according to the Medoid Silhouette:", dm.bestk)
136138
print("Medoid Silhouette over range of k:", dm.losses)
137139
print("Range of k:", dm.rangek)

kmedoids/__init__.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -613,11 +613,11 @@ def fastermsc(diss, medoids, max_iter=100, init="random", random_state=None):
613613
return KMedoidsResult(*_fastermsc_f64(diss, medoids.astype(np.uint64), max_iter))
614614
raise ValueError("Input data not supported. Use a numpy array of floats.")
615615

616-
def dynmsc(diss, medoids, max_iter=100, init="random", random_state=None):
616+
def dynmsc(diss, medoids, minimum_k=2, max_iter=100, init="random", random_state=None):
617617
"""DynMSC clustering
618618
619619
This is a version of FasterMSC with automatic cluster number selection, that
620-
performs FasterMSC for k = 2 to the number of input medoids and returns
620+
performs FasterMSC for every k from minimum_k up to the number of input medoids and returns
621621
the clustering with the highest Average Medoid Silhouette.
622622
623623
References:
@@ -636,6 +636,8 @@ def dynmsc(diss, medoids, max_iter=100, init="random", random_state=None):
636636
:type max_iter: int
637637
:param init: initialization method
638638
:type init: str, "random", "first" or "build"
639+
:param minimum_k: minimum number of clusters to find
640+
:type minimum_k: int
639641
:param random_state: random seed if no medoids are given
640642
:type random_state: int, RandomState instance or None
641643
@@ -650,12 +652,14 @@ def dynmsc(diss, medoids, max_iter=100, init="random", random_state=None):
650652

651653
medoids = _check_medoids(diss, medoids, init, random_state)
652654

655+
if medoids.shape[0] < minimum_k:
656+
raise ValueError("Maximum k should be at least minimum k.")
653657
if isinstance(diss, np.ndarray):
654658
dtype = diss.dtype
655659
if dtype == np.float32:
656-
return DynkResult(*_dynmsc_f32(diss, medoids.astype(np.uint64), max_iter))
660+
return DynkResult(*_dynmsc_f32(diss, medoids.astype(np.uint64), minimum_k, max_iter))
657661
elif dtype == np.float64:
658-
return DynkResult(*_dynmsc_f64(diss, medoids.astype(np.uint64), max_iter))
662+
return DynkResult(*_dynmsc_f64(diss, medoids.astype(np.uint64), minimum_k, max_iter))
659663
raise ValueError("Input data not supported. Use a numpy array of floats.")
660664

661665
def alternating(diss, medoids, max_iter=100, init="random", random_state=None):
@@ -933,7 +937,7 @@ def fit(self, X, y=None):
933937
elif self.method == "fastermsc":
934938
result = fastermsc(X, self.n_clusters, self.max_iter, self.init, random_state=self.random_state)
935939
elif self.method == "dynmsc":
936-
result = dynmsc(X, self.n_clusters, self.max_iter, self.init, random_state=self.random_state)
940+
result = dynmsc(X, self.n_clusters, 2, self.max_iter, self.init, random_state=self.random_state)
937941
elif self.method == "fastmsc":
938942
result = fastmsc(X, self.n_clusters, self.max_iter, self.init, random_state=self.random_state)
939943
elif self.method == "pamsil":

src/lib.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -183,15 +183,15 @@ macro_rules! dynmsc_call {
183183
/// :return: k-medoids clustering result
184184
/// :rtype: DynkResult
185185
#[pyfunction]
186-
fn $name(dist: PyReadonlyArray2<'_, $type>, meds: PyReadonlyArray1<'_, usize>, max_iter: usize) -> PyResult<Py<PyAny>> {
186+
fn $name(dist: PyReadonlyArray2<'_, $type>, meds: PyReadonlyArray1<'_, usize>, minimum_k: usize, max_iter: usize) -> PyResult<Py<PyAny>> {
187187
assert_eq!(dist.ndim(), 2);
188188
assert_eq!(dist.shape()[0], dist.shape()[1]);
189189
let mut meds = meds.to_vec()?;
190190
let maxk = meds.len() + 1;
191-
let (loss, assi, n_iter, n_swap, best_meds, losses): ($ltype, _, _, _, _, _) = rustkmedoids::dynmsc(&dist.as_array(), &mut meds, max_iter);
191+
let (loss, assi, n_iter, n_swap, best_meds, losses): ($ltype, _, _, _, _, _) = rustkmedoids::dynmsc(&dist.as_array(), &mut meds, minimum_k, max_iter);
192192
let bestk = best_meds.len();
193193
Python::with_gil(|py| -> PyResult<Py<PyAny>> {
194-
Ok((loss, PyArray1::from_vec(py, assi), PyArray1::from_vec(py, best_meds), bestk, PyArray1::from_vec(py, losses), (2..maxk).collect::<Vec<usize>>(), n_iter, n_swap).to_object(py))
194+
Ok((loss, PyArray1::from_vec(py, assi), PyArray1::from_vec(py, best_meds), bestk, PyArray1::from_vec(py, losses), (minimum_k..maxk).collect::<Vec<usize>>(), n_iter, n_swap).to_object(py))
195195
})
196196
}
197197
}}

0 commit comments

Comments
 (0)