Skip to content

Commit 1477e24

Browse files
committed
initial test batchSOM
1 parent 80529c6 commit 1477e24

8 files changed

Lines changed: 596 additions & 0 deletions

File tree

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
from ._som import SOM_Batch, map_data_to_codes # isort:skip
2+
from .som_estimator import SOMEstimator_batch_init # isort:skip

src/flowsom/models/batch/_som.py

Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
from __future__ import annotations

from typing import Callable

import numpy as np
from numba import jit, prange
from scipy.spatial import cKDTree
from sklearn.neighbors import BallTree

from flowsom.models.numpy_numba import nb_median_axis_0
11+
12+
@jit(nopython=True, fastmath=True)
def eucl_without_sqrt(p1: np.ndarray, p2: np.ndarray):
    """Return the squared Euclidean distance between two points.

    The square root is deliberately skipped for speed: squaring is monotonic,
    so nearest-neighbour comparisons based on this value rank points in the
    same order as the true Euclidean distance.

    Args:
        p1 (np.ndarray): The first point.
        p2 (np.ndarray): The second point.

    Returns
    -------
    float: The squared Euclidean distance between the two points.

    >>> eucl_without_sqrt(np.array([1, 2, 3]), np.array([4, 5, 6]))
    27.0
    """
    total = 0.0
    for idx in range(p1.shape[0]):
        delta = p1[idx] - p2[idx]
        total += delta * delta
    return total
35+
36+
37+
@jit(nopython=True, parallel=True, fastmath=True)
def SOM_Batch(
    data: np.ndarray,
    codes: np.ndarray,
    nhbrdist: np.ndarray,
    alphas: tuple,
    radii: tuple,
    ncodes: int,
    rlen: int,
    nr_batches: int = 10,
    distf: Callable[[np.ndarray, np.ndarray], float] = eucl_without_sqrt,
    seed=None,
):
    """Function that computes the Self-Organizing Map.

    One serial pass over the data produces a reasonable initial clustering;
    the remaining iterations train one SOM per batch in parallel and merge the
    batch results with an element-wise median after every iteration.

    Args:
        data (np.ndarray): The data to be clustered, one slice per batch
            (all batches padded to the same length by the caller).
        codes (np.ndarray): The initial codes.
        nhbrdist (np.ndarray): The neighbourhood distances between codes.
        alphas (tuple): (start, end) learning rates.
        radii (tuple): (start, end) neighbourhood radii.
        ncodes (int): The number of codes.
        rlen (int): The number of iterations.
        nr_batches (int): The number of batches.
        distf (function): The distance function.
        seed (int): The seed for the random number generator.

    Returns
    -------
    np.ndarray: The computed codes.
    """
    if seed is not None:
        np.random.seed(seed)

    # Number of data points per batch
    n = data[-1].shape[0]

    # Dimension of the data
    px = data[0].shape[1]

    # Number of iterations per pass
    niter = n

    # The threshold is the radius of the neighbourhood, meaning in which range codes are updated.
    # The threshold step decides how much the threshold is decreased each iteration.
    threshold_step = (radii[0] - radii[1]) / niter

    # Keep the temporary codes, using the given codes as the initial codes, for every batch
    tmp_codes_all = np.empty((nr_batches, ncodes, px), dtype=np.float64)

    # Copy the codes as a float64, because the codes are updated in the algorithm
    copy_codes = codes.copy().astype(np.float64)

    # Execute some initial serial iterations to get a good init clustering
    xdist = np.empty(ncodes, dtype=np.float64)
    init_threshold = radii[0]
    init_alpha = alphas[0]

    for it in range(niter):
        # Choose a random data point.
        # BUGFIX: the original reused the loop variable for this random index,
        # which made the learning-rate schedule below depend on a random data
        # index instead of the iteration number (so alpha jumped around instead
        # of decaying). The iteration counter `it` and sample index `i` are now
        # kept separate.
        i = np.random.choice(n)

        # Compute the nearest code
        nearest = 0
        for cd in range(ncodes):
            xdist[cd] = distf(data[0][i, :], copy_codes[cd, :])
            if xdist[cd] < xdist[nearest]:
                nearest = cd

        # Learning rate decays linearly with the iteration number
        init_alpha = alphas[0] - (alphas[0] - alphas[1]) * it / (niter * rlen)

        for cd in range(ncodes):
            # The neighbourhood distance decides whether the code is updated. This states that the code is only updated
            # if they are close enough to each other. Otherwise, the value stays the same.
            if nhbrdist[cd, nearest] <= init_threshold:
                # Update the code based on the difference between the used data point and the code.
                for j in range(px):
                    tmp = data[0][i, j] - copy_codes[cd, j]
                    copy_codes[cd, j] += tmp * init_alpha

        init_threshold -= threshold_step

    # Choose random data points, for the different batches, and the rlen iterations
    data_points_random = np.random.choice(n, nr_batches * rlen * n, replace=True)

    # Decrease the number of iterations, because the first iterations are already done
    rlen = int(rlen / 2)

    for iteration in range(rlen):
        # Execute the batches in parallel
        for batch_nr in prange(nr_batches):
            # Keep the temporary codes, using the given codes as the initial codes
            tmp_codes = copy_codes.copy()

            # Array for the distances
            xdists = np.empty(ncodes, dtype=np.float64)

            # IMPORTANT: When setting the threshold to radii[0], this causes big changes every iteration. This is not
            # wanted, because the algorithm should converge. Therefore, the threshold is decreased every iteration.
            # Update: factor 2 is added, to make the threshold decrease faster.
            threshold = init_threshold - radii[0] * 2 * iteration / rlen

            for k in range(iteration * niter, (iteration + 1) * niter):
                # Get the data point assigned to this batch and iteration
                i = data_points_random[n * rlen * batch_nr + k]

                # Compute the nearest code
                nearest = 0
                for cd in range(ncodes):
                    xdists[cd] = distf(data[batch_nr][i, :], tmp_codes[cd, :])
                    if xdists[cd] < xdists[nearest]:
                        nearest = cd

                # Clamp the threshold so a small neighbourhood always remains
                if threshold < 1.0:
                    threshold = 0.5
                alpha = init_alpha - (alphas[0] - alphas[1]) * k / (niter * rlen)

                for cd in range(ncodes):
                    # The neighbourhood distance decides whether the code is updated. This states that the code is only updated
                    # if they are close enough to each other. Otherwise, the value stays the same.
                    if nhbrdist[cd, nearest] <= threshold:
                        # Update the code based on the difference between the used data point and the code.
                        for j in range(px):
                            tmp = data[batch_nr][i, j] - tmp_codes[cd, j]
                            tmp_codes[cd, j] += tmp * alpha

                threshold -= threshold_step

            tmp_codes_all[batch_nr] = tmp_codes

        # Merge the different SOM's together via an element-wise median over batches
        copy_codes = nb_median_axis_0(tmp_codes_all).astype(np.float64)

    return copy_codes
171+
172+
173+
def map_data_to_codes(data, codes):
    """Returns a tuple with the indices and distances of the nearest code for each data point.

    Uses scipy's cKDTree instead of sklearn's BallTree: behavior is identical
    for euclidean 1-NN queries, and it removes this module's only sklearn
    dependency (scipy is already required elsewhere in the package).

    Args:
        data (np.ndarray): The data points.
        codes (np.ndarray): The codes that the data points are mapped to.

    Returns
    -------
    np.ndarray: The indices of the nearest code for each data point.
    np.ndarray: The distances of the nearest code for each data point.

    >>> data_ = np.array([[1, 2, 3], [4, 5, 6]])
    >>> codes_ = np.array([[1, 2, 3], [4, 5, 6]])
    >>> map_data_to_codes(data_, codes_)
    (array([0, 1]), array([0., 0.]))
    """
    # Build a k-d tree over the codes (efficient structure for nearest-neighbour search)
    tree = cKDTree(codes)

    # Query the tree for the single nearest code of each data point (k=1).
    # For k=1, query returns 1-D arrays of distances and indices.
    dists, indices = tree.query(data, k=1)

    # Flatten defensively (no-op for 1-D results) and return
    return indices.flatten(), dists.flatten()
Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
import igraph as ig
2+
import numpy as np
3+
from scipy.spatial.distance import cdist, pdist, squareform
4+
from sklearn.utils.validation import check_is_fitted
5+
6+
from flowsom.models.base_cluster_estimator import BaseClusterEstimator
7+
8+
from . import SOM_Batch, map_data_to_codes
9+
10+
11+
class SOMEstimator_batch_init(BaseClusterEstimator):
    """Estimate a Self-Organizing Map (SOM) clustering model.

    Batch variant: the training data is split into ``nr_batches`` interleaved
    batches that are trained in parallel by ``SOM_Batch`` and merged afterwards.
    """

    def __init__(
        self,
        xdim=10,
        ydim=10,
        rlen=10,
        mst=1,
        alpha=(0.05, 0.01),
        init=False,
        initf=None,
        map=True,
        codes=None,
        importance=None,
        seed=None,
    ):
        """Store the SOM hyperparameters.

        :param xdim: x dimension of the SOM grid
        :param ydim: y dimension of the SOM grid
        :param rlen: number of loops over the training data per MST step
        :param mst: number of MST-based training rounds (1 = single round)
        :param alpha: (start, end) learning rate
        :param init: whether to use ``initf`` for the initial codes
        :param initf: callable ``initf(X, xdim, ydim)`` producing initial codes
        :param map: kept for interface compatibility; not read in this class
        :param codes: optional initial codes, shape (xdim * ydim, n_features)
        :param importance: optional per-feature scaling weights
        :param seed: seed for the global NumPy random number generator
        """
        super().__init__()
        self.xdim = xdim
        self.ydim = ydim
        self.rlen = rlen
        self.mst = mst
        self.alpha = alpha
        self.init = init
        self.initf = initf
        self.map = map
        self.codes = codes
        self.importance = importance
        self.seed = seed

    # Core of the algorithm, where the SOM is executed
    def fit(
        self,
        X,
        y=None,
    ):
        """Perform SOM clustering.

        :param X: An array of the columns to use for clustering
        :type X: np.array
        :param y: ignored, present for scikit-learn API compatibility
        :returns: self, with ``codes``, ``labels_`` and ``distances`` set
        """
        codes = self.codes
        xdim = self.xdim
        ydim = self.ydim
        importance = self.importance
        init = self.init
        mst = self.mst
        alpha = self.alpha

        # User-supplied codes must match the data dimensionality and grid size
        if codes is not None:
            assert (
                (codes.shape[1] == X.shape[1]) and (codes.shape[0] == xdim * ydim)
            ), "If codes is not NULL, it should have the same number of columns as the data and the number of rows should correspond with xdim*ydim"

        # Scale each column by its importance weight before clustering
        if importance is not None:
            X = np.stack([X[:, i] * importance[i] for i in range(len(importance))], axis=1)

        # Initialize the grid
        grid = [(x, y) for x in range(xdim) for y in range(ydim)]
        n_codes = len(grid)

        # Seed the global RNG so code initialization and SOM training are reproducible
        if self.seed is not None:
            np.random.seed(self.seed)

        if codes is None:
            if init:
                codes = self.initf(X, xdim, ydim)
            else:
                # If no codes are provided, choose n_codes different random rows from the data
                codes = X[np.random.choice(X.shape[0], n_codes, replace=False), :]

        # Initialize the neighbourhood
        # First the distances are computed (using the chebyshev distance this means the distance between (1, 1) and
        # (1, 2) is one because the highest difference between two coordinates is 1. Using the squareform these are
        # converted to a square matrix. This is a symmetric matrix, where the diagonal is 0.
        nhbrdist = squareform(pdist(grid, metric="chebyshev"))

        # Initialize the radius: start at the 67th percentile of grid distances, shrink to 0
        radius = (np.quantile(nhbrdist, 0.67), 0)

        # MST defines the amount of times the data is looped over. If mst is 1, only one radius and alpha is used.
        # If mst is higher, the radius and alpha are linearly spaced between the given values
        if mst == 1:
            radius = [radius]
            alpha = [alpha]
        else:
            radius = np.linspace(radius[0], radius[1], num=mst + 1)
            radius = [tuple(radius[i : i + 2]) for i in range(mst)]
            alpha = np.linspace(alpha[0], alpha[1], num=mst + 1)
            alpha = [tuple(alpha[i : i + 2]) for i in range(mst)]

        # Define the number of batches
        nr_batches = 10

        # Split the data for the different batches, where batch with number 0 contains datapoint 0, batch_size, 2*batch_size, ...
        data = []
        for i in range(nr_batches):
            data.append(X[i::nr_batches, :])

        # Make sure all the batches have the same amount of data, if not add the last data point to the batch
        # (SOM_Batch requires equally sized batches so they can be stacked into one array)
        for i in range(nr_batches):
            if data[i].shape[0] < data[0].shape[0]:
                data[i] = np.vstack([data[i], X[-1, :]])

        # Compute the SOM: mst defines the amount of times the data is looped over
        for i in range(mst):
            codes = SOM_Batch(
                np.array(data, dtype=np.float32),
                codes,
                nhbrdist,
                alphas=alpha[i],
                radii=radius[i],
                ncodes=n_codes,
                rlen=self.rlen,
                seed=self.seed,
                nr_batches=nr_batches,
            )
            # After each round, replace the grid neighbourhood by MST path lengths between codes
            if mst != 1:
                nhbrdist: list[list[int]] = _dist_mst(codes)

        # Assign every point of the full data set to its nearest final code
        clusters, dists = map_data_to_codes(data=X, codes=codes)
        self.codes, self.labels_, self.distances = codes.copy(), clusters, dists
        self._is_fitted = True
        return self

    def predict(self, X, y=None):
        """Predict labels using the model."""
        check_is_fitted(self)
        # self.distances = cdist(X, self.codes, metric="euclidean") => Not used in the original code
        clusters, dists = map_data_to_codes(X, self.codes)
        self.labels_ = clusters.astype(int)
        self.distances = dists
        return self.labels_

    # Called by the BASE FlowSOM Estimator
    def fit_predict(self, X, y=None):
        """Fit the model and predict labels."""
        self.fit(X)
        # Makes no sense here to call predict again, since the labels are already computed in the fit method
        return self.labels_
160+
161+
162+
def _dist_mst(codes):
    """Return pairwise hop counts between codes along their minimum spanning tree.

    A complete weighted graph is built from the euclidean distances between
    codes, reduced to its MST, and the result is a square list-of-lists where
    entry [i][j] is the number of MST edges on the path from code i to code j.
    """
    weights = cdist(
        codes,
        codes,
        metric="euclidean",
    )
    complete_graph = ig.Graph.Weighted_Adjacency(weights, mode="undirected", loops=False)
    mst = ig.Graph.spanning_tree(complete_graph, weights=complete_graph.es["weight"])
    # Path length in edges = number of vertices on the path minus one
    return [
        [len(path) - 1 for path in mst.get_shortest_paths(v=source, to=mst.vs.indices, weights=None)]
        for source in mst.vs.indices
    ]
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
from . import BaseFlowSOMEstimator, ConsensusCluster # isort:skip
2+
from .batch import SOMEstimator_batch_init # isort:skip
3+
4+
5+
class BatchFlowSOMEstimator(BaseFlowSOMEstimator):
    """FlowSOM estimator wired to the batch-initialized SOM clustering model."""

    def __init__(self, cluster_model=SOMEstimator_batch_init, metacluster_model=ConsensusCluster, **kwargs):
        """Initialize the FlowSOMEstimator object."""
        super().__init__(cluster_model=cluster_model, metacluster_model=metacluster_model, **kwargs)

0 commit comments

Comments
 (0)