Commit 56888eb

Merge pull request #235 from theislab/feature/predict_batch
Feature and Speedup: `predict_batch`
2 parents d169628 + 9068a93 commit 56888eb

5 files changed

Lines changed: 195 additions & 27 deletions


src/cellflow/model/_cellflow.py

Lines changed: 5 additions & 1 deletion
```diff
@@ -10,6 +10,7 @@
 import flax.linen as nn
 import jax
 import jax.numpy as jnp
+import numpy as np
 import optax
 import pandas as pd
 from ott.neural.methods.flows import dynamics
@@ -625,6 +626,8 @@ def predict(
         batch = pred_loader.sample()
         src = batch["source"]
         condition = batch.get("condition", None)
+        # using jax.tree.map to batch the prediction
+        # because PredictionSampler can return a different number of cells for each condition
         out = jax.tree.map(
             functools.partial(self.solver.predict, rng=rng, **kwargs),
             src,
@@ -637,9 +640,10 @@ def predict(
                 f"When saving predictions to `adata`, all control cells must be from the same control \
                     population, but found {len(pred_data.control_to_perturbation)} control populations."
             )
+        out_np = {k: np.array(v) for k, v in out.items()}
        _write_predictions(
            adata=adata,
-            predictions=out,
+            predictions=out_np,
            key_added_prefix=key_added_prefix,
        )
```

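The `jax.tree.map` call is kept here because it handles the ragged case that `batched=True` cannot: `PredictionSampler` may return a different number of cells per condition, so the sources cannot be stacked for `jax.vmap`. A minimal standalone sketch of that behaviour (not CellFlow code; `predict_one` is a hypothetical stand-in for `solver.predict`):

```python
import functools
import jax
import numpy as np

def predict_one(x, condition, rng=None):
    # hypothetical stand-in for a single-condition solver.predict
    return x * 2.0

src = {"cond_a": np.random.rand(10, 5), "cond_b": np.random.rand(7, 5)}  # ragged cell counts
condition = {"cond_a": {"drug": np.random.rand(1, 3)},
             "cond_b": {"drug": np.random.rand(1, 3)}}

# each array leaf of `src` is paired with the matching subtree of `condition`,
# so predict_one runs once per condition, sequentially
out = jax.tree.map(functools.partial(predict_one, rng=None), src, condition)
assert out["cond_a"].shape == (10, 5) and out["cond_b"].shape == (7, 5)
```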
src/cellflow/solvers/_genot.py

Lines changed: 37 additions & 1 deletion
```diff
@@ -234,6 +234,7 @@ def predict(
         condition: dict[str, ArrayLike] | None = None,
         rng: ArrayLike | None = None,
         rng_genot: ArrayLike | None = None,
+        batched: bool = False,
         **kwargs: Any,
     ) -> ArrayLike | tuple[ArrayLike, diffrax.Solution]:
         """Generate the push-forward of ``x`` under condition ``condition``.
@@ -253,13 +254,48 @@
             mean embedding is used.
         rng_genot
             Random generator used to sample from the latent distribution in cell space.
+        batched
+            Whether to use batched prediction. This is only supported if the input has
+            the same number of cells for each condition. For example, this works when using
+            :class:`~cellflow.data.ValidationSampler` to sample the validation data.
         kwargs
             Keyword arguments for :func:`diffrax.diffeqsolve`.

         Returns
         -------
         The push-forward distribution of ``x`` under condition ``condition``.
         """
+        if batched and not x:
+            return {}
+
+        if batched:
+            keys = sorted(x.keys())
+            condition_keys = sorted(set().union(*(condition[k].keys() for k in keys)))
+            _predict_jit = jax.jit(lambda x, condition: self._predict_jit(x, condition, rng, **kwargs))
+            batched_predict = jax.vmap(_predict_jit, in_axes=(0, dict.fromkeys(condition_keys, 0)))
+            # assert that the number of cells is the same for each condition
+            n_cells = x[keys[0]].shape[0]
+            for k in keys:
+                assert x[k].shape[0] == n_cells, "The number of cells must be the same for each condition"
+            src_inputs = jnp.stack([x[k] for k in keys], axis=0)
+            batched_conditions = {}
+            for cond_key in condition_keys:
+                batched_conditions[cond_key] = jnp.stack([condition[k][cond_key] for k in keys])
+
+            pred_targets = batched_predict(src_inputs, batched_conditions)
+            return {k: pred_targets[i] for i, k in enumerate(keys)}
+        else:
+            x_pred = self._predict_jit(x, condition, rng, rng_genot, **kwargs)
+            return np.array(x_pred)
+
+    def _predict_jit(
+        self,
+        x: ArrayLike,
+        condition: dict[str, ArrayLike] | None = None,
+        rng: ArrayLike | None = None,
+        rng_genot: ArrayLike | None = None,
+        **kwargs: Any,
+    ) -> ArrayLike | tuple[ArrayLike, diffrax.Solution]:
         kwargs.setdefault("dt0", None)
         kwargs.setdefault("solver", diffrax.Tsit5())
         kwargs.setdefault("stepsize_controller", diffrax.PIDController(rtol=1e-5, atol=1e-5))
@@ -291,7 +327,7 @@ def solve_ode(
             return sol.ys[0]

         x_pred = jax.jit(jax.vmap(solve_ode, in_axes=[0, 0, None, None]))(latent, x, condition, encoder_noise)
-        return np.array(x_pred)
+        return x_pred

     @property
     def is_trained(self) -> bool:
```

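The core of the speedup is the `jax.vmap` over a stacked leading axis, with `in_axes=(0, dict.fromkeys(condition_keys, 0))` telling vmap to map over axis 0 of the sources and of every array in the condition dict, so XLA compiles one program for all conditions instead of dispatching per condition. A self-contained sketch of the same trick (toy `predict_one` in place of the real ODE solve):

```python
import jax
import jax.numpy as jnp

def predict_one(x, condition):
    # toy stand-in for the single-condition prediction
    return x + condition["drug"].sum()

keys = ["drug_1", "drug_2"]
x = {k: jnp.ones((10, 5)) for k in keys}
condition = {k: {"drug": jnp.full((1, 3), float(i))} for i, k in enumerate(keys)}

condition_keys = sorted({ck for k in keys for ck in condition[k]})
# map over axis 0 of the stacked sources and of every condition array
batched_predict = jax.vmap(jax.jit(predict_one), in_axes=(0, dict.fromkeys(condition_keys, 0)))

src_inputs = jnp.stack([x[k] for k in keys])  # [n_conditions, n_cells, dim]
batched_conditions = {ck: jnp.stack([condition[k][ck] for k in keys]) for ck in condition_keys}

pred = batched_predict(src_inputs, batched_conditions)  # [n_conditions, n_cells, dim]
out = {k: pred[i] for i, k in enumerate(keys)}
assert out["drug_1"].shape == (10, 5)
```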
src/cellflow/solvers/_otfm.py

Lines changed: 63 additions & 24 deletions
```diff
@@ -174,31 +174,10 @@ def get_condition_embedding(self, condition: dict[str, ArrayLike], return_as_num
             return np.asarray(cond_mean), np.asarray(cond_logvar)
         return cond_mean, cond_logvar

-    def predict(
+    def _predict_jit(
         self, x: ArrayLike, condition: dict[str, ArrayLike], rng: jax.Array | None = None, **kwargs: Any
     ) -> ArrayLike:
-        """Predict the translated source ``x`` under condition ``condition``.
-
-        This function solves the ODE learnt with
-        the :class:`~cellflow.networks.ConditionalVelocityField`.
-
-        Parameters
-        ----------
-        x
-            Input data of shape [batch_size, ...].
-        condition
-            Condition of the input data of shape [batch_size, ...].
-        rng
-            Random number generator to sample from the latent distribution,
-            only used if ``condition_mode='stochastic'``. If :obj:`None`, the
-            mean embedding is used.
-        kwargs
-            Keyword arguments for :func:`diffrax.diffeqsolve`.
-
-        Returns
-        -------
-        The push-forward distribution of ``x`` under condition ``condition``.
-        """
+        """See :meth:`OTFlowMatching.predict`."""
         kwargs.setdefault("dt0", None)
         kwargs.setdefault("solver", diffrax.Tsit5())
         kwargs.setdefault("stepsize_controller", diffrax.PIDController(rtol=1e-5, atol=1e-5))
@@ -226,7 +205,67 @@ def solve_ode(x: jnp.ndarray, condition: dict[str, jnp.ndarray], encoder_noise:
             return result.ys[0]

         x_pred = jax.jit(jax.vmap(solve_ode, in_axes=[0, None, None]))(x, condition, encoder_noise)
-        return np.array(x_pred)
+        return x_pred
+
+    def predict(
+        self,
+        x: ArrayLike | dict[str, ArrayLike],
+        condition: dict[str, ArrayLike] | dict[str, dict[str, ArrayLike]],
+        rng: jax.Array | None = None,
+        batched: bool = False,
+        **kwargs: Any,
+    ) -> ArrayLike | dict[str, ArrayLike]:
+        """Predict the translated source ``x`` under condition ``condition``.
+
+        This function solves the ODE learnt with
+        the :class:`~cellflow.networks.ConditionalVelocityField`.
+
+        Parameters
+        ----------
+        x
+            A dictionary with keys indicating the name of the condition and values containing
+            the input data as arrays. If ``batched=False``, provide an array of shape [batch_size, ...].
+        condition
+            A dictionary with keys indicating the name of the condition and values containing
+            the condition of the input data as arrays. If ``batched=False``, provide an array of shape
+            [batch_size, ...].
+        rng
+            Random number generator to sample from the latent distribution,
+            only used if ``condition_mode='stochastic'``. If :obj:`None`, the
+            mean embedding is used.
+        batched
+            Whether to use batched prediction. This is only supported if the input has
+            the same number of cells for each condition. For example, this works when using
+            :class:`~cellflow.data.ValidationSampler` to sample the validation data.
+        kwargs
+            Keyword arguments for :func:`diffrax.diffeqsolve`.
+
+        Returns
+        -------
+        The push-forward distribution of ``x`` under condition ``condition``.
+        """
+        if batched and not x:
+            return {}
+
+        if batched:
+            keys = sorted(x.keys())
+            condition_keys = sorted(set().union(*(condition[k].keys() for k in keys)))
+            _predict_jit = jax.jit(lambda x, condition: self._predict_jit(x, condition, rng, **kwargs))
+            batched_predict = jax.vmap(_predict_jit, in_axes=(0, dict.fromkeys(condition_keys, 0)))
+            # assert that the number of cells is the same for each condition
+            n_cells = x[keys[0]].shape[0]
+            for k in keys:
+                assert x[k].shape[0] == n_cells, "The number of cells must be the same for each condition"
+            src_inputs = jnp.stack([x[k] for k in keys], axis=0)
+            batched_conditions = {}
+            for cond_key in condition_keys:
+                batched_conditions[cond_key] = jnp.stack([condition[k][cond_key] for k in keys])
+
+            pred_targets = batched_predict(src_inputs, batched_conditions)
+            return {k: pred_targets[i] for i, k in enumerate(keys)}
+        else:
+            x_pred = self._predict_jit(x, condition, rng, **kwargs)
+            return np.array(x_pred)

     @property
     def is_trained(self) -> bool:
```

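Taken together, the new `predict` surface can be called in two ways. A usage sketch, assuming `solver` is an already trained `OTFlowMatching` instance and every source array has the same number of cells (required for `batched=True`); the condition keys are hypothetical. The test file below constructs such a solver end to end:

```python
import functools
import jax
import numpy as np

src = {("drug_a",): np.random.rand(10, 5), ("drug_b",): np.random.rand(10, 5)}
cond = {("drug_a",): {"drug": np.random.rand(1, 1, 3)},
        ("drug_b",): {"drug": np.random.rand(1, 1, 3)}}

# one vmapped ODE solve across all conditions
preds = solver.predict(src, cond, batched=True)

# equivalent per-condition fallback: one solve per condition, so it is slower
preds_seq = jax.tree.map(functools.partial(solver.predict, batched=False), src, cond)
```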
src/cellflow/training/_trainer.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -61,7 +61,7 @@ def _validation_step(
             condition = batch.get("condition", None)
             true_tgt = batch["target"]
             valid_source_data[val_key] = src
-            valid_pred_data[val_key] = jax.tree.map(self.solver.predict, src, condition)
+            valid_pred_data[val_key] = self.solver.predict(src, condition=condition, batched=True)
             valid_true_data[val_key] = true_tgt

         return valid_source_data, valid_true_data, valid_pred_data
```

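This one-line trainer change is valid because, per the docstrings above, `ValidationSampler` yields the same number of cells for every condition, which is exactly what `batched=True` requires. A sketch of that shape contract, with hypothetical sizes:

```python
import numpy as np

# every source array must share the same leading dimension (n_cells);
# otherwise the stack for jax.vmap inside predict(batched=True) would fail
batch = {
    "source": {
        ("cond_a",): np.random.rand(256, 50),
        ("cond_b",): np.random.rand(256, 50),  # same 256 cells as cond_a
    },
    "condition": {
        ("cond_a",): {"drug": np.random.rand(1, 1, 3)},
        ("cond_b",): {"drug": np.random.rand(1, 1, 3)},
    },
}
```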
tests/solver/test_solver.py

Lines changed: 89 additions & 0 deletions
```diff
@@ -0,0 +1,89 @@
+import functools
+import time
+
+import jax
+import numpy as np
+import optax
+import pytest
+from ott.neural.methods.flows import dynamics
+
+import cellflow
+from cellflow.solvers import _genot, _otfm
+from cellflow.utils import match_linear
+
+src = {
+    ("drug_1",): np.random.rand(10, 5),
+    ("drug_2",): np.random.rand(10, 5),
+}
+cond = {
+    ("drug_1",): {"drug": np.random.rand(1, 1, 3)},
+    ("drug_2",): {"drug": np.random.rand(1, 1, 3)},
+}
+vf_rng = jax.random.PRNGKey(111)
+
+
+class TestSolver:
+    @pytest.mark.parametrize("solver_class", ["otfm", "genot"])
+    def test_predict_batch(self, dataloader, solver_class):
+        if solver_class == "otfm":
+            vf_class = cellflow.networks.ConditionalVelocityField
+        else:
+            vf_class = cellflow.networks.GENOTConditionalVelocityField
+
+        opt = optax.adam(1e-3)
+        vf = vf_class(
+            output_dim=5,
+            max_combination_length=2,
+            condition_embedding_dim=12,
+            hidden_dims=(32, 32),
+            decoder_dims=(32, 32),
+        )
+        if solver_class == "otfm":
+            solver = _otfm.OTFlowMatching(
+                vf=vf,
+                match_fn=match_linear,
+                probability_path=dynamics.ConstantNoiseFlow(0.0),
+                optimizer=opt,
+                conditions={"drug": np.random.rand(2, 1, 3)},
+                rng=vf_rng,
+            )
+        else:
+            solver = _genot.GENOT(
+                vf=vf,
+                data_match_fn=match_linear,
+                probability_path=dynamics.ConstantNoiseFlow(0.0),
+                optimizer=opt,
+                source_dim=5,
+                target_dim=5,
+                conditions={"drug": np.random.rand(2, 1, 3)},
+                rng=vf_rng,
+            )
+
+        trainer = cellflow.training.CellFlowTrainer(solver=solver)
+        trainer.train(
+            dataloader=dataloader,
+            num_iterations=2,
+            valid_freq=1,
+        )
+        start_batched = time.time()
+        x_pred_batched = solver.predict(src, cond, batched=True)
+        end_batched = time.time()
+        diff_batched = end_batched - start_batched
+
+        start_nonbatched = time.time()
+        x_pred_nonbatched = jax.tree.map(
+            functools.partial(solver.predict, batched=False),
+            src,
+            cond,  # type: ignore[attr-defined]
+        )
+        end_nonbatched = time.time()
+        diff_nonbatched = end_nonbatched - start_nonbatched
+
+        assert x_pred_batched[("drug_1",)].shape == x_pred_nonbatched[("drug_1",)].shape
+        assert np.allclose(
+            x_pred_batched[("drug_1",)],
+            x_pred_nonbatched[("drug_1",)],
+            atol=1e-1,
+            rtol=1e-2,
+        )
+        assert diff_nonbatched - diff_batched > 2
```
