Skip to content

Commit d4975e9

Browse files
committed
Re-add simplified cache for compiled kernel code.
1 parent 40ee8ef commit d4975e9

6 files changed

Lines changed: 234 additions & 50 deletions

File tree

src/loch/_platforms/_base.py

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -190,6 +190,18 @@ def platform_name(self) -> str:
190190
"""
191191
pass
192192

193+
@property
194+
def cache_hit(self) -> bool:
195+
"""
196+
Whether the last compile_kernels() call was a cache hit.
197+
198+
Returns
199+
-------
200+
bool
201+
True if kernels were loaded from cache, False if freshly compiled.
202+
"""
203+
return getattr(self, "_cache_hit", False)
204+
193205
@property
194206
def compiler_log(self) -> str:
195207
"""

src/loch/_platforms/_cuda.py

Lines changed: 51 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,12 @@
3535
from .._kernels import code as _kernel_code
3636
from ._base import PlatformBackend as _PlatformBackend
3737

38+
# Module-level kernel compilation cache. Keyed on
39+
# (device_index, compiler_optimisations). Since the kernel source no longer
40+
# depends on system-specific parameters, the same compiled binary can be
41+
# reused across all samplers on a given device.
42+
_kernel_cache = {}
43+
3844

3945
class CUDAPlatform(_PlatformBackend):
4046
"""
@@ -123,38 +129,51 @@ def compile_kernels(self) -> _Dict[str, _Callable]:
123129
"""
124130
Compile CUDA kernels and return callable functions.
125131
132+
Uses a module-level cache so that only the first sampler on a given
133+
device pays the nvcc compilation cost.
134+
126135
Returns
127136
-------
128137
dict
129138
Dictionary mapping kernel names to callable kernel functions.
130139
"""
131-
# Compile kernel source.
132-
# Suppress stderr but capture it for error reporting.
133-
stderr_capture = _io.StringIO()
134-
old_stderr = _sys.stderr
135-
136-
options = []
137-
if self._compiler_optimisations:
138-
options.append("--use_fast_math")
139-
140-
try:
141-
_sys.stderr = stderr_capture
142-
cubin = _compile(
143-
_kernel_code,
144-
no_extern_c=True,
145-
nvcc=self._nvcc,
146-
options=options,
147-
)
148-
except Exception as e:
149-
stderr_output = stderr_capture.getvalue().strip()
150-
error_msg = f"CUDA kernel compilation failed: {e}"
151-
if stderr_output:
152-
error_msg += f"\n{stderr_output}"
153-
raise RuntimeError(error_msg)
154-
finally:
155-
_sys.stderr = old_stderr
156-
157-
self._compiler_log = stderr_capture.getvalue().strip()
140+
cache_key = (self._device_index, self._compiler_optimisations)
141+
142+
if cache_key in _kernel_cache:
143+
cubin = _kernel_cache[cache_key]
144+
self._compiler_log = ""
145+
self._cache_hit = True
146+
else:
147+
# Compile kernel source.
148+
# Suppress stderr but capture it for error reporting.
149+
stderr_capture = _io.StringIO()
150+
old_stderr = _sys.stderr
151+
152+
options = []
153+
if self._compiler_optimisations:
154+
options.append("--use_fast_math")
155+
156+
try:
157+
_sys.stderr = stderr_capture
158+
cubin = _compile(
159+
_kernel_code,
160+
no_extern_c=True,
161+
nvcc=self._nvcc,
162+
options=options,
163+
)
164+
except Exception as e:
165+
stderr_output = stderr_capture.getvalue().strip()
166+
error_msg = f"CUDA kernel compilation failed: {e}"
167+
if stderr_output:
168+
error_msg += f"\n{stderr_output}"
169+
raise RuntimeError(error_msg)
170+
finally:
171+
_sys.stderr = old_stderr
172+
173+
self._compiler_log = stderr_capture.getvalue().strip()
174+
self._cache_hit = False
175+
_kernel_cache[cache_key] = cubin
176+
158177
mod = _cuda.module_from_buffer(cubin)
159178

160179
# Extract kernel functions
@@ -168,6 +187,11 @@ def compile_kernels(self) -> _Dict[str, _Callable]:
168187

169188
return kernels
170189

190+
@staticmethod
191+
def clear_cache():
192+
"""Clear the kernel compilation cache."""
193+
_kernel_cache.clear()
194+
171195
def to_gpu(self, array: _np.ndarray) -> _Any:
172196
"""
173197
Transfer a NumPy array to GPU memory.

src/loch/_platforms/_opencl.py

Lines changed: 68 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,10 @@
3535
from .._kernels import code as _kernel_code
3636
from ._base import PlatformBackend as _PlatformBackend
3737

38+
# Module-level kernel compilation cache. Keyed on
39+
# (device_index, compiler_optimisations). Stores compiled program binaries.
40+
_kernel_cache = {}
41+
3842

3943
class OpenCLPlatform(_PlatformBackend):
4044
"""
@@ -122,39 +126,75 @@ def compile_kernels(self) -> _Dict[str, _Callable]:
122126
"""
123127
Compile OpenCL kernels and return callable functions.
124128
129+
Uses a module-level cache so that only the first sampler on a given
130+
device pays the compilation cost.
131+
125132
Returns
126133
-------
127134
dict
128135
Dictionary mapping kernel names to callable kernel functions.
129136
"""
137+
cache_key = (self._device_index, self._compiler_optimisations)
138+
130139
# Build compiler options
131140
build_options = []
132141
if self._compiler_optimisations:
133142
build_options.extend(["-cl-mad-enable", "-cl-no-signed-zeros"])
134143

135-
# Compile program from source, suppressing stderr and warnings.
136-
stderr_capture = _io.StringIO()
137-
old_stderr = _sys.stderr
138-
try:
139-
_sys.stderr = stderr_capture
140-
with _warnings.catch_warnings():
141-
_warnings.simplefilter("ignore")
142-
program = _cl.Program(self._context, _kernel_code).build(
143-
options=build_options
144-
)
145-
except _cl.RuntimeError as e:
146-
stderr_output = stderr_capture.getvalue().strip()
147-
error_msg = f"OpenCL kernel compilation failed: {e}"
148-
if stderr_output:
149-
error_msg += f"\n{stderr_output}"
150-
raise RuntimeError(error_msg)
151-
finally:
152-
_sys.stderr = old_stderr
153-
154-
# Capture the compiler log (including any warnings).
155-
self._compiler_log = program.get_build_info(
156-
self._device, _cl.program_build_info.LOG
157-
).strip()
144+
if cache_key in _kernel_cache:
145+
cached_binary = _kernel_cache[cache_key]
146+
147+
# Create program from cached binary.
148+
stderr_capture = _io.StringIO()
149+
old_stderr = _sys.stderr
150+
try:
151+
_sys.stderr = stderr_capture
152+
with _warnings.catch_warnings():
153+
_warnings.simplefilter("ignore")
154+
program = _cl.Program(
155+
self._context, [self._device], [cached_binary]
156+
)
157+
program.build(options=build_options)
158+
except _cl.RuntimeError as e:
159+
stderr_output = stderr_capture.getvalue().strip()
160+
error_msg = f"OpenCL kernel build from cached binary failed: {e}"
161+
if stderr_output:
162+
error_msg += f"\n{stderr_output}"
163+
raise RuntimeError(error_msg)
164+
finally:
165+
_sys.stderr = old_stderr
166+
167+
self._compiler_log = ""
168+
self._cache_hit = True
169+
else:
170+
# Compile program from source, suppressing stderr and warnings.
171+
stderr_capture = _io.StringIO()
172+
old_stderr = _sys.stderr
173+
try:
174+
_sys.stderr = stderr_capture
175+
with _warnings.catch_warnings():
176+
_warnings.simplefilter("ignore")
177+
program = _cl.Program(self._context, _kernel_code).build(
178+
options=build_options
179+
)
180+
except _cl.RuntimeError as e:
181+
stderr_output = stderr_capture.getvalue().strip()
182+
error_msg = f"OpenCL kernel compilation failed: {e}"
183+
if stderr_output:
184+
error_msg += f"\n{stderr_output}"
185+
raise RuntimeError(error_msg)
186+
finally:
187+
_sys.stderr = old_stderr
188+
189+
# Capture the compiler log (including any warnings).
190+
self._compiler_log = program.get_build_info(
191+
self._device, _cl.program_build_info.LOG
192+
).strip()
193+
194+
self._cache_hit = False
195+
196+
# Cache the compiled binary.
197+
_kernel_cache[cache_key] = program.get_info(_cl.program_info.BINARIES)[0]
158198

159199
# Create kernel wrappers that match PyCUDA calling convention.
160200
# OpenCL kernels need (queue, global_size, local_size, *args)
@@ -189,6 +229,11 @@ def wrapper(*args, **kwargs):
189229

190230
return kernels
191231

232+
@staticmethod
233+
def clear_cache():
234+
"""Clear the kernel compilation cache."""
235+
_kernel_cache.clear()
236+
192237
def to_gpu(self, array: _np.ndarray) -> _Any:
193238
"""
194239
Transfer a NumPy array to GPU memory.

src/loch/_sampler.py

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -759,6 +759,19 @@ def pop(self) -> None:
759759
"""Pop the GPU context from the calling thread's context stack."""
760760
self._backend.pop_context()
761761

762+
@property
763+
def kernel_cache_hit(self) -> bool:
764+
"""
765+
Whether kernel compilation was satisfied from cache.
766+
767+
Returns
768+
-------
769+
770+
cache_hit: bool
771+
True if kernels were loaded from cache, False if freshly compiled.
772+
"""
773+
return self._backend.cache_hit
774+
762775
def system(self) -> _Any:
763776
"""
764777
Return the GCMC system.

tests/test_compiler.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -113,6 +113,9 @@ def test_compilation_error_raises_exception(self):
113113
nvcc=_get_nvcc(),
114114
)
115115

116+
# Clear the cache so the patched code is actually compiled.
117+
CUDAPlatform.clear_cache()
118+
116119
# Patch kernel code directly in the cuda module (not the kernels module,
117120
# since it's already imported as _kernel_code at module load time).
118121
original_code = cuda_module._kernel_code

tests/test_energy.py

Lines changed: 87 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -357,3 +357,90 @@ def test_energy_regression(fixture, platform, request):
357357
assert math.isclose(
358358
energy_lj, ref["energy_lj"], abs_tol=1e-4
359359
), f"LJ energy changed: {energy_lj!r} != {ref['energy_lj']!r}"
360+
361+
362+
@pytest.mark.skipif(
363+
"CUDA_VISIBLE_DEVICES" not in os.environ,
364+
reason="Requires CUDA enabled GPU.",
365+
)
366+
@pytest.mark.parametrize("platform", ["cuda", "opencl"])
367+
def test_cached_kernel_correctness(platform, water_box):
368+
"""
369+
A second sampler using cached kernels must produce the same energies
370+
as the first.
371+
"""
372+
373+
mols, reference = water_box
374+
375+
schedule = sr.cas.LambdaSchedule.standard_morph()
376+
377+
def _create_and_run(seed):
378+
sampler = GCMCSampler(
379+
mols,
380+
cutoff_type="rf",
381+
cutoff="10 A",
382+
reference=reference,
383+
lambda_schedule=schedule,
384+
lambda_value=0.5,
385+
log_level="debug",
386+
ghost_file=None,
387+
log_file=None,
388+
test=True,
389+
platform=platform,
390+
seed=seed,
391+
)
392+
393+
d = sampler.system().dynamics(
394+
cutoff_type="rf",
395+
cutoff="10 A",
396+
temperature="298 K",
397+
pressure=None,
398+
constraint="h_bonds",
399+
timestep="2 fs",
400+
schedule=schedule,
401+
lambda_value=0.5,
402+
coulomb_power=sampler._coulomb_power,
403+
shift_coulomb=str(sampler._shift_coulomb),
404+
shift_delta=str(sampler._shift_delta),
405+
platform=platform,
406+
)
407+
408+
is_accepted = False
409+
while not is_accepted:
410+
moves = sampler.move(d.context())
411+
if len(moves) > 0 and moves[0] == 0:
412+
is_accepted = True
413+
414+
return sampler
415+
416+
# Clear the cache so the first sampler compiles from source.
417+
if platform == "cuda":
418+
from loch._platforms._cuda import CUDAPlatform
419+
420+
CUDAPlatform.clear_cache()
421+
else:
422+
from loch._platforms._opencl import OpenCLPlatform
423+
424+
OpenCLPlatform.clear_cache()
425+
426+
# First sampler compiles kernels, second uses the cache.
427+
# Both use the same seed so random water positions are identical.
428+
sampler1 = _create_and_run(seed=42)
429+
sampler2 = _create_and_run(seed=42)
430+
431+
# Verify cache behaviour.
432+
assert not sampler1.kernel_cache_hit, "First sampler should compile from source"
433+
assert sampler2.kernel_cache_hit, "Second sampler should use cached kernels"
434+
435+
# Verify energy consistency.
436+
energy1_coul = sampler1._debug["energy_coul"]
437+
energy1_lj = sampler1._debug["energy_lj"]
438+
energy2_coul = sampler2._debug["energy_coul"]
439+
energy2_lj = sampler2._debug["energy_lj"]
440+
441+
assert math.isclose(
442+
energy1_coul, energy2_coul, abs_tol=1e-4
443+
), f"Coulomb energy mismatch: {energy1_coul!r} vs {energy2_coul!r}"
444+
assert math.isclose(
445+
energy1_lj, energy2_lj, abs_tol=1e-4
446+
), f"LJ energy mismatch: {energy1_lj!r} vs {energy2_lj!r}"

0 commit comments

Comments
 (0)