Skip to content

Commit 1477e24

Browse files
committed
initial test batchSOM
1 parent 80529c6 commit 1477e24

8 files changed

Lines changed: 596 additions & 0 deletions

File tree

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
from ._som import SOM_Batch, map_data_to_codes # isort:skip
2+
from .som_estimator import SOMEstimator_batch_init # isort:skip

src/flowsom/models/batch/_som.py

Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
from __future__ import annotations

from typing import Callable

import numpy as np
from numba import jit, prange
from scipy.spatial import cKDTree
from sklearn.neighbors import BallTree

from flowsom.models.numpy_numba import nb_median_axis_0
11+
12+
@jit(nopython=True, fastmath=True)
def eucl_without_sqrt(p1: np.ndarray, p2: np.ndarray):
    """Return the squared Euclidean distance between two points.

    The square root is deliberately skipped for speed: squaring is monotonic,
    so nearest-neighbour comparisons based on this value rank points in the
    same order as the true Euclidean distance.

    Args:
        p1 (np.ndarray): The first point.
        p2 (np.ndarray): The second point.

    Returns
    -------
    float: The squared Euclidean distance between the two points.

    >>> eucl_without_sqrt(np.array([1, 2, 3]), np.array([4, 5, 6]))
    27.0
    """
    total = 0.0
    for idx in range(p1.shape[0]):
        delta = p1[idx] - p2[idx]
        total += delta * delta
    return total
35+
36+
37+
@jit(nopython=True, parallel=True, fastmath=True)
def SOM_Batch(
    data: np.ndarray,
    codes: np.ndarray,
    nhbrdist: np.ndarray,
    alphas: tuple,
    radii: tuple,
    ncodes: int,
    rlen: int,
    nr_batches: int = 10,
    distf: Callable[[np.ndarray, np.ndarray], float] = eucl_without_sqrt,
    seed=None,
):
    """Function that computes the Self-Organizing Map.

    One serial pass over the data produces a reasonable initial clustering;
    the remaining iterations train one SOM per batch in parallel and merge the
    batch results with an element-wise median after every iteration.

    Args:
        data (np.ndarray): The data to be clustered, one slice per batch
            (all batches padded to the same length by the caller).
        codes (np.ndarray): The initial codes.
        nhbrdist (np.ndarray): The neighbourhood distances between codes.
        alphas (tuple): (start, end) learning rates.
        radii (tuple): (start, end) neighbourhood radii.
        ncodes (int): The number of codes.
        rlen (int): The number of iterations.
        nr_batches (int): The number of batches.
        distf (function): The distance function.
        seed (int): The seed for the random number generator.

    Returns
    -------
    np.ndarray: The computed codes.
    """
    if seed is not None:
        np.random.seed(seed)

    # Number of data points per batch
    n = data[-1].shape[0]

    # Dimension of the data
    px = data[0].shape[1]

    # Number of iterations per pass
    niter = n

    # The threshold is the radius of the neighbourhood, meaning in which range codes are updated.
    # The threshold step decides how much the threshold is decreased each iteration.
    threshold_step = (radii[0] - radii[1]) / niter

    # Keep the temporary codes, using the given codes as the initial codes, for every batch
    tmp_codes_all = np.empty((nr_batches, ncodes, px), dtype=np.float64)

    # Copy the codes as a float64, because the codes are updated in the algorithm
    copy_codes = codes.copy().astype(np.float64)

    # Execute some initial serial iterations to get a good init clustering
    xdist = np.empty(ncodes, dtype=np.float64)
    init_threshold = radii[0]
    init_alpha = alphas[0]

    for it in range(niter):
        # Choose a random data point.
        # BUGFIX: the original reused the loop variable for this random index,
        # which made the learning-rate schedule below depend on a random data
        # index instead of the iteration number (so alpha jumped around instead
        # of decaying). The iteration counter `it` and sample index `i` are now
        # kept separate.
        i = np.random.choice(n)

        # Compute the nearest code
        nearest = 0
        for cd in range(ncodes):
            xdist[cd] = distf(data[0][i, :], copy_codes[cd, :])
            if xdist[cd] < xdist[nearest]:
                nearest = cd

        # Learning rate decays linearly with the iteration number
        init_alpha = alphas[0] - (alphas[0] - alphas[1]) * it / (niter * rlen)

        for cd in range(ncodes):
            # The neighbourhood distance decides whether the code is updated. This states that the code is only updated
            # if they are close enough to each other. Otherwise, the value stays the same.
            if nhbrdist[cd, nearest] <= init_threshold:
                # Update the code based on the difference between the used data point and the code.
                for j in range(px):
                    tmp = data[0][i, j] - copy_codes[cd, j]
                    copy_codes[cd, j] += tmp * init_alpha

        init_threshold -= threshold_step

    # Choose random data points, for the different batches, and the rlen iterations
    data_points_random = np.random.choice(n, nr_batches * rlen * n, replace=True)

    # Decrease the number of iterations, because the first iterations are already done
    rlen = int(rlen / 2)

    for iteration in range(rlen):
        # Execute the batches in parallel
        for batch_nr in prange(nr_batches):
            # Keep the temporary codes, using the given codes as the initial codes
            tmp_codes = copy_codes.copy()

            # Array for the distances
            xdists = np.empty(ncodes, dtype=np.float64)

            # IMPORTANT: When setting the threshold to radii[0], this causes big changes every iteration. This is not
            # wanted, because the algorithm should converge. Therefore, the threshold is decreased every iteration.
            # Update: factor 2 is added, to make the threshold decrease faster.
            threshold = init_threshold - radii[0] * 2 * iteration / rlen

            for k in range(iteration * niter, (iteration + 1) * niter):
                # Get the data point assigned to this batch and iteration
                i = data_points_random[n * rlen * batch_nr + k]

                # Compute the nearest code
                nearest = 0
                for cd in range(ncodes):
                    xdists[cd] = distf(data[batch_nr][i, :], tmp_codes[cd, :])
                    if xdists[cd] < xdists[nearest]:
                        nearest = cd

                # Clamp the threshold so a small neighbourhood always remains
                if threshold < 1.0:
                    threshold = 0.5
                alpha = init_alpha - (alphas[0] - alphas[1]) * k / (niter * rlen)

                for cd in range(ncodes):
                    # The neighbourhood distance decides whether the code is updated. This states that the code is only updated
                    # if they are close enough to each other. Otherwise, the value stays the same.
                    if nhbrdist[cd, nearest] <= threshold:
                        # Update the code based on the difference between the used data point and the code.
                        for j in range(px):
                            tmp = data[batch_nr][i, j] - tmp_codes[cd, j]
                            tmp_codes[cd, j] += tmp * alpha

                threshold -= threshold_step

            tmp_codes_all[batch_nr] = tmp_codes

        # Merge the different SOM's together via an element-wise median over batches
        copy_codes = nb_median_axis_0(tmp_codes_all).astype(np.float64)

    return copy_codes
171+
172+
173+
def map_data_to_codes(data, codes):
    """Returns a tuple with the indices and distances of the nearest code for each data point.

    Uses scipy's cKDTree instead of sklearn's BallTree: behavior is identical
    for euclidean 1-NN queries, and it removes this module's only sklearn
    dependency (scipy is already required elsewhere in the package).

    Args:
        data (np.ndarray): The data points.
        codes (np.ndarray): The codes that the data points are mapped to.

    Returns
    -------
    np.ndarray: The indices of the nearest code for each data point.
    np.ndarray: The distances of the nearest code for each data point.

    >>> data_ = np.array([[1, 2, 3], [4, 5, 6]])
    >>> codes_ = np.array([[1, 2, 3], [4, 5, 6]])
    >>> map_data_to_codes(data_, codes_)
    (array([0, 1]), array([0., 0.]))
    """
    # Build a k-d tree over the codes (efficient structure for nearest-neighbour search)
    tree = cKDTree(codes)

    # Query the tree for the single nearest code of each data point (k=1).
    # For k=1, query returns 1-D arrays of distances and indices.
    dists, indices = tree.query(data, k=1)

    # Flatten defensively (no-op for 1-D results) and return
    return indices.flatten(), dists.flatten()
Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
import igraph as ig
2+
import numpy as np
3+
from scipy.spatial.distance import cdist, pdist, squareform
4+
from sklearn.utils.validation import check_is_fitted
5+
6+
from flowsom.models.base_cluster_estimator import BaseClusterEstimator
7+
8+
from . import SOM_Batch, map_data_to_codes
9+
10+
11+
class SOMEstimator_batch_init(BaseClusterEstimator):
    """Estimate a Self-Organizing Map (SOM) clustering model.

    Batch variant: the training data is split into ``nr_batches`` interleaved
    batches that are trained in parallel by ``SOM_Batch`` and merged afterwards.
    """

    def __init__(
        self,
        xdim=10,
        ydim=10,
        rlen=10,
        mst=1,
        alpha=(0.05, 0.01),
        init=False,
        initf=None,
        map=True,
        codes=None,
        importance=None,
        seed=None,
    ):
        """Store the SOM hyperparameters.

        :param xdim: x dimension of the SOM grid
        :param ydim: y dimension of the SOM grid
        :param rlen: number of loops over the training data per MST step
        :param mst: number of MST-based training rounds (1 = single round)
        :param alpha: (start, end) learning rate
        :param init: whether to use ``initf`` for the initial codes
        :param initf: callable ``initf(X, xdim, ydim)`` producing initial codes
        :param map: kept for interface compatibility; not read in this class
        :param codes: optional initial codes, shape (xdim * ydim, n_features)
        :param importance: optional per-feature scaling weights
        :param seed: seed for the global NumPy random number generator
        """
        super().__init__()
        self.xdim = xdim
        self.ydim = ydim
        self.rlen = rlen
        self.mst = mst
        self.alpha = alpha
        self.init = init
        self.initf = initf
        self.map = map
        self.codes = codes
        self.importance = importance
        self.seed = seed

    # Core of the algorithm, where the SOM is executed
    def fit(
        self,
        X,
        y=None,
    ):
        """Perform SOM clustering.

        :param X: An array of the columns to use for clustering
        :type X: np.array
        :param y: ignored, present for scikit-learn API compatibility
        :returns: self, with ``codes``, ``labels_`` and ``distances`` set
        """
        codes = self.codes
        xdim = self.xdim
        ydim = self.ydim
        importance = self.importance
        init = self.init
        mst = self.mst
        alpha = self.alpha

        # User-supplied codes must match the data dimensionality and grid size
        if codes is not None:
            assert (
                (codes.shape[1] == X.shape[1]) and (codes.shape[0] == xdim * ydim)
            ), "If codes is not NULL, it should have the same number of columns as the data and the number of rows should correspond with xdim*ydim"

        # Scale each column by its importance weight before clustering
        if importance is not None:
            X = np.stack([X[:, i] * importance[i] for i in range(len(importance))], axis=1)

        # Initialize the grid
        grid = [(x, y) for x in range(xdim) for y in range(ydim)]
        n_codes = len(grid)

        # Seed the global RNG so code initialization and SOM training are reproducible
        if self.seed is not None:
            np.random.seed(self.seed)

        if codes is None:
            if init:
                codes = self.initf(X, xdim, ydim)
            else:
                # If no codes are provided, choose n_codes different random rows from the data
                codes = X[np.random.choice(X.shape[0], n_codes, replace=False), :]

        # Initialize the neighbourhood
        # First the distances are computed (using the chebyshev distance this means the distance between (1, 1) and
        # (1, 2) is one because the highest difference between two coordinates is 1. Using the squareform these are
        # converted to a square matrix. This is a symmetric matrix, where the diagonal is 0.
        nhbrdist = squareform(pdist(grid, metric="chebyshev"))

        # Initialize the radius: start at the 67th percentile of grid distances, shrink to 0
        radius = (np.quantile(nhbrdist, 0.67), 0)

        # MST defines the amount of times the data is looped over. If mst is 1, only one radius and alpha is used.
        # If mst is higher, the radius and alpha are linearly spaced between the given values
        if mst == 1:
            radius = [radius]
            alpha = [alpha]
        else:
            radius = np.linspace(radius[0], radius[1], num=mst + 1)
            radius = [tuple(radius[i : i + 2]) for i in range(mst)]
            alpha = np.linspace(alpha[0], alpha[1], num=mst + 1)
            alpha = [tuple(alpha[i : i + 2]) for i in range(mst)]

        # Define the number of batches
        nr_batches = 10

        # Split the data for the different batches, where batch with number 0 contains datapoint 0, batch_size, 2*batch_size, ...
        data = []
        for i in range(nr_batches):
            data.append(X[i::nr_batches, :])

        # Make sure all the batches have the same amount of data, if not add the last data point to the batch
        # (SOM_Batch requires equally sized batches so they can be stacked into one array)
        for i in range(nr_batches):
            if data[i].shape[0] < data[0].shape[0]:
                data[i] = np.vstack([data[i], X[-1, :]])

        # Compute the SOM: mst defines the amount of times the data is looped over
        for i in range(mst):
            codes = SOM_Batch(
                np.array(data, dtype=np.float32),
                codes,
                nhbrdist,
                alphas=alpha[i],
                radii=radius[i],
                ncodes=n_codes,
                rlen=self.rlen,
                seed=self.seed,
                nr_batches=nr_batches,
            )
            # After each round, replace the grid neighbourhood by MST path lengths between codes
            if mst != 1:
                nhbrdist: list[list[int]] = _dist_mst(codes)

        # Assign every point of the full data set to its nearest final code
        clusters, dists = map_data_to_codes(data=X, codes=codes)
        self.codes, self.labels_, self.distances = codes.copy(), clusters, dists
        self._is_fitted = True
        return self

    def predict(self, X, y=None):
        """Predict labels using the model."""
        check_is_fitted(self)
        # self.distances = cdist(X, self.codes, metric="euclidean") => Not used in the original code
        clusters, dists = map_data_to_codes(X, self.codes)
        self.labels_ = clusters.astype(int)
        self.distances = dists
        return self.labels_

    # Called by the BASE FlowSOM Estimator
    def fit_predict(self, X, y=None):
        """Fit the model and predict labels."""
        self.fit(X)
        # Makes no sense here to call predict again, since the labels are already computed in the fit method
        return self.labels_
160+
161+
162+
def _dist_mst(codes):
    """Return pairwise hop counts between codes along their minimum spanning tree.

    A complete weighted graph is built from the euclidean distances between
    codes, reduced to its MST, and the result is a square list-of-lists where
    entry [i][j] is the number of MST edges on the path from code i to code j.
    """
    weights = cdist(
        codes,
        codes,
        metric="euclidean",
    )
    complete_graph = ig.Graph.Weighted_Adjacency(weights, mode="undirected", loops=False)
    mst = ig.Graph.spanning_tree(complete_graph, weights=complete_graph.es["weight"])
    # Path length in edges = number of vertices on the path minus one
    return [
        [len(path) - 1 for path in mst.get_shortest_paths(v=source, to=mst.vs.indices, weights=None)]
        for source in mst.vs.indices
    ]
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
from . import BaseFlowSOMEstimator, ConsensusCluster # isort:skip
2+
from .batch import SOMEstimator_batch_init # isort:skip
3+
4+
5+
class BatchFlowSOMEstimator(BaseFlowSOMEstimator):
    """FlowSOM estimator wired to the batch-initialized SOM clustering model."""

    def __init__(self, cluster_model=SOMEstimator_batch_init, metacluster_model=ConsensusCluster, **kwargs):
        """Initialize the FlowSOMEstimator object."""
        super().__init__(cluster_model=cluster_model, metacluster_model=metacluster_model, **kwargs)

0 commit comments

Comments
 (0)