diff --git a/.github/workflows/build_kernel_windows.yaml b/.github/workflows/build_kernel_windows.yaml
index 64a265db..6082f7ad 100644
--- a/.github/workflows/build_kernel_windows.yaml
+++ b/.github/workflows/build_kernel_windows.yaml
@@ -26,31 +26,94 @@ jobs:
     runs-on: ${{ matrix.os }}
 
     steps:
-      - uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
-        with:
-          key: cuda-toolkit-v${{ matrix.cuda }}-${{ matrix.os }}
-          path: |
-            C:\Program Files\NVIDIA GPU Computing Toolkit
-            ~/.cargo/registry
-            ~/.cargo/git
-
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
 
-      # CUDA environment setup
-      - uses: huggingface/cuda-toolkit@714c97b32958862237b96401fb253a4261453c3b # v0.1.0
-        id: setup-cuda-toolkit
+      # ---- CUDA toolkit (cache + skip installer on hit) ----
+      # On a cache hit we restore C:\Program Files\NVIDIA GPU Computing Toolkit
+      # and skip the cuda-toolkit action entirely (which otherwise spends ~7
+      # min running the MSI even when the files are already on disk). We then
+      # replicate the env/MSBuild setup the action would have done — see the
+      # "Restore CUDA env vars (cache hit only)" step below.
+      - name: Cache CUDA toolkit
+        id: cuda-cache
+        uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
         with:
-          cuda: ${{ matrix.torch.cuda }}  # TODO(mfuntowicz): How can we test multiple CUDA versions than align with torch?
+          path: C:\Program Files\NVIDIA GPU Computing Toolkit
+          # Key components (matrix.os keeps runner images apart):
+          #  - matrix.torch.cuda — different CUDA versions get separate caches
+          #  - 714c97b3 — short SHA of the huggingface/cuda-toolkit pin below;
+          #    update it together with the pin so a stale tree is not reused
+          key: cuda-toolkit-${{ matrix.torch.cuda }}-714c97b3-${{ matrix.os }}
+
+      - name: Install CUDA toolkit
+        if: steps.cuda-cache.outputs.cache-hit != 'true'
+        uses: huggingface/cuda-toolkit@714c97b32958862237b96401fb253a4261453c3b # v0.1.0
+        with:
+          cuda: ${{ matrix.torch.cuda }}
+
+      - name: Restore CUDA env vars (cache hit only)
+        # huggingface/cuda-toolkit's updatePath sets CUDA_PATH, CUDA_PATH_VX_Y,
+        # and prepends <CUDA_PATH>\bin to PATH. When the install step is skipped
+        # those env mutations don't happen — replicate them here, deriving
+        # CUDA\v<major>.<minor> from matrix.torch.cuda, so nvcc and builds find it.
+        #
+        # Also re-install the MSBuild integration: the CUDA installer normally
+        # copies CUDA <ver>.{props,targets,xml} from the toolkit's
+        # extras\visual_studio_integration\MSBuildExtensions\ into the VS
+        # BuildCustomizations dir. Without that, CMake's CUDA language detection
+        # fails with "No CUDA toolset found". The cache only restores the toolkit
+        # tree, so we copy those files into every VS 2022 edition found.
+        if: steps.cuda-cache.outputs.cache-hit == 'true'
+        shell: pwsh
+        run: |
+          $parts = "${{ matrix.torch.cuda }}".Split('.')
+          $major = $parts[0]
+          $minor = $parts[1]
+          $cudaPath = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v$major.$minor"
+          "CUDA_PATH=$cudaPath"                              | Out-File $env:GITHUB_ENV  -Append -Encoding utf8
+          "CUDA_PATH_V${major}_${minor}=$cudaPath"           | Out-File $env:GITHUB_ENV  -Append -Encoding utf8
+          "$cudaPath\bin"                                    | Out-File $env:GITHUB_PATH -Append -Encoding utf8
+
+          $msBuildExt = Join-Path $cudaPath 'extras\visual_studio_integration\MSBuildExtensions'
+          if (-not (Test-Path $msBuildExt)) {
+            throw "MSBuild integration not found in cached toolkit at $msBuildExt — cache may be incomplete."
+          }
+          # GitHub-hosted windows-2022 ships VS 2022 Enterprise; glob anyway so
+          # we don't silently break if the image switches edition.
+          $vsRoots = Get-ChildItem 'C:\Program Files\Microsoft Visual Studio\2022' -Directory -ErrorAction SilentlyContinue
+          if (-not $vsRoots) { throw "Visual Studio 2022 not found on runner." }
+          foreach ($vs in $vsRoots) {
+            $dest = Join-Path $vs.FullName 'MSBuild\Microsoft\VC\v170\BuildCustomizations'
+            New-Item -ItemType Directory -Force -Path $dest | Out-Null
+            Copy-Item -Path (Join-Path $msBuildExt '*') -Destination $dest -Force -Recurse
+            Write-Host "Installed CUDA MSBuild integration into $dest"
+          }
+
       - name: "NVCC checks"
         run: nvcc -V
 
-      # Rust build environment setup
+      # ---- Rust toolchain + cached kernel-builder build ----
       - uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af # v1.0.7
         with:
           toolchain: stable
           profile: minimal
           override: true
 
+      # Caches the workspace target/ plus ~/.cargo/{registry,git}. Keys on
+      # Cargo.lock so a dependency-graph change invalidates the artifact cache
+      # while unrelated edits reuse it incrementally. Cuts the kernel-builder
+      # build from ~8 min cold to ~30s warm.
+      #
+      # `workspaces` must point at the actual workspace root (root Cargo.toml
+      # has `[workspace] members = [..., "kernel-builder", ...]`). Cargo
+      # writes target/ at the workspace root by default, so caching
+      # ./kernel-builder/target would restore to a path cargo never reads.
+      - name: Cache cargo + kernel-builder target
+        uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1
+        with:
+          workspaces: .
+          shared-key: kernel-builder-${{ matrix.os }}
+
       - name: Build kernel-builder
         run: ( cd kernel-builder && cargo build --release )