From 9f77fa31701b973e6aab9fcdf3c8a64f9bf52602 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Wed, 27 May 2026 14:06:56 +0530 Subject: [PATCH 1/7] try to optimize windows builder ci --- .github/workflows/build_kernel_windows.yaml | 62 ++++++++++++++++----- 1 file changed, 49 insertions(+), 13 deletions(-) diff --git a/.github/workflows/build_kernel_windows.yaml b/.github/workflows/build_kernel_windows.yaml index 64a265db..68784ae0 100644 --- a/.github/workflows/build_kernel_windows.yaml +++ b/.github/workflows/build_kernel_windows.yaml @@ -26,31 +26,67 @@ jobs: runs-on: ${{ matrix.os }} steps: - - uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 - with: - key: cuda-toolkit-v${{ matrix.cuda }}-${{ matrix.os }} - path: | - C:\Program Files\NVIDIA GPU Computing Toolkit - ~/.cargo/registry - ~/.cargo/git - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - # CUDA environment setup - - uses: huggingface/cuda-toolkit@714c97b32958862237b96401fb253a4261453c3b # v0.1.0 - id: setup-cuda-toolkit + # ---- CUDA toolkit (cache + skip installer on hit) ---- + # On a cache hit we restore C:\Program Files\NVIDIA GPU Computing Toolkit + # and skip the cuda-toolkit action entirely (which otherwise spends ~7 + # min running the MSI even when the files are already on disk). We then + # replicate the small bit of env setup the action would have done — see + # the next step. 
+ - name: Cache CUDA toolkit + id: cuda-cache + uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 + with: + path: C:\Program Files\NVIDIA GPU Computing Toolkit + # Key bumps: + # - matrix.torch.cuda — different CUDA versions get separate caches + # - 714c97b3 — pinned SHA of huggingface/cuda-toolkit; bump when the + # action changes so we re-download instead of reusing a stale tree + key: cuda-toolkit-${{ matrix.torch.cuda }}-714c97b3-${{ matrix.os }} + + - name: Install CUDA toolkit + if: steps.cuda-cache.outputs.cache-hit != 'true' + uses: huggingface/cuda-toolkit@714c97b32958862237b96401fb253a4261453c3b # v0.1.0 with: - cuda: ${{ matrix.torch.cuda }} # TODO(mfuntowicz): How can we test multiple CUDA versions than align with torch? + cuda: ${{ matrix.torch.cuda }} + + - name: Restore CUDA env vars (cache hit only) + # huggingface/cuda-toolkit's updatePath sets CUDA_PATH, CUDA_PATH_VX_Y, + # and prepends \bin to PATH. When we skip the action above, + # those env mutations don't happen — replicate them here so nvcc and + # the downstream builds find the toolkit. + if: steps.cuda-cache.outputs.cache-hit == 'true' + shell: pwsh + run: | + $parts = "${{ matrix.torch.cuda }}".Split('.') + $major = $parts[0] + $minor = $parts[1] + $cudaPath = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v$major.$minor" + "CUDA_PATH=$cudaPath" | Out-File $env:GITHUB_ENV -Append -Encoding utf8 + "CUDA_PATH_V${major}_${minor}=$cudaPath" | Out-File $env:GITHUB_ENV -Append -Encoding utf8 + "$cudaPath\bin" | Out-File $env:GITHUB_PATH -Append -Encoding utf8 + - name: "NVCC checks" run: nvcc -V - # Rust build environment setup + # ---- Rust toolchain + cached kernel-builder build ---- - uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af # v1.0.7 with: toolchain: stable profile: minimal override: true + # Caches kernel-builder/target plus ~/.cargo/{registry,git}. 
Keys on + # Cargo.lock so a clean dep-graph change invalidates the artifact cache + # but unrelated edits reuse it incrementally. Cuts the kernel-builder + # build from ~8 min cold to ~30s warm. + - name: Cache cargo + kernel-builder target + uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1 + with: + workspaces: kernel-builder + shared-key: kernel-builder-${{ matrix.os }} + - name: Build kernel-builder run: ( cd kernel-builder && cargo build --release ) From 6b0fe1c1ce71209033aed028dce0fde5b17b5633 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Wed, 27 May 2026 14:49:26 +0530 Subject: [PATCH 2/7] trigger cache-hit validation run From 0c1a5ab51ad1d7291f32b315fb367d04bd378585 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Wed, 27 May 2026 15:19:03 +0530 Subject: [PATCH 3/7] fix --- .github/workflows/build_kernel_windows.yaml | 22 +++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/.github/workflows/build_kernel_windows.yaml b/.github/workflows/build_kernel_windows.yaml index 68784ae0..debb830c 100644 --- a/.github/workflows/build_kernel_windows.yaml +++ b/.github/workflows/build_kernel_windows.yaml @@ -56,6 +56,13 @@ jobs: # and prepends \bin to PATH. When we skip the action above, # those env mutations don't happen — replicate them here so nvcc and # the downstream builds find the toolkit. + # + # Also re-install the MSBuild integration: the CUDA installer normally + # copies CUDA X.Y.{props,targets,xml} from the toolkit's + # extras\visual_studio_integration\MSBuildExtensions\ into the VS + # BuildCustomizations dir. Without that, CMake's CUDA language detection + # fails with "No CUDA toolset found". Cache only restores the toolkit + # tree, so we copy the props in by hand on cache hits.
if: steps.cuda-cache.outputs.cache-hit == 'true' shell: pwsh run: | @@ -67,6 +74,21 @@ jobs: "CUDA_PATH_V${major}_${minor}=$cudaPath" | Out-File $env:GITHUB_ENV -Append -Encoding utf8 "$cudaPath\bin" | Out-File $env:GITHUB_PATH -Append -Encoding utf8 + $msBuildExt = Join-Path $cudaPath 'extras\visual_studio_integration\MSBuildExtensions' + if (-not (Test-Path $msBuildExt)) { + throw "MSBuild integration not found in cached toolkit at $msBuildExt — cache may be incomplete." + } + # GitHub-hosted windows-2022 ships VS 2022 Enterprise; glob anyway so + # we don't silently break if the image switches edition. + $vsRoots = Get-ChildItem 'C:\Program Files\Microsoft Visual Studio\2022' -Directory -ErrorAction SilentlyContinue + if (-not $vsRoots) { throw "Visual Studio 2022 not found on runner." } + foreach ($vs in $vsRoots) { + $dest = Join-Path $vs.FullName 'MSBuild\Microsoft\VC\v170\BuildCustomizations' + New-Item -ItemType Directory -Force -Path $dest | Out-Null + Copy-Item -Path (Join-Path $msBuildExt '*') -Destination $dest -Force -Recurse + Write-Host "Installed CUDA MSBuild integration into $dest" + } + - name: "NVCC checks" run: nvcc -V From 8314d46e1eeba83b71427485200e5553c16f3beb Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Wed, 27 May 2026 16:10:53 +0530 Subject: [PATCH 4/7] verbose to debug why kernel-builder isn't picking up cache --- .github/workflows/build_kernel_windows.yaml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_kernel_windows.yaml b/.github/workflows/build_kernel_windows.yaml index debb830c..86aec23c 100644 --- a/.github/workflows/build_kernel_windows.yaml +++ b/.github/workflows/build_kernel_windows.yaml @@ -110,7 +110,15 @@ jobs: shared-key: kernel-builder-${{ matrix.os }} - name: Build kernel-builder - run: ( cd kernel-builder && cargo build --release ) + # TEMP DIAGNOSTIC: -v + CARGO_LOG=fingerprint=info makes cargo print + # the exact reason each crate is being rebuilt ("stale: 
missing", + # "stale: mtime newer than ...", etc.). Used once to find out why + # rust-cache restores target/ successfully but cargo still rebuilds + # everything. Revert this step before merging. + shell: pwsh + env: + CARGO_LOG: cargo::core::compiler::fingerprint=info + run: ( cd kernel-builder ; cargo build --release -v ) # Python environment setup - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 From 6e6fdfe87acc38decb9ff02740fcba93de5feb77 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Wed, 27 May 2026 16:24:27 +0530 Subject: [PATCH 5/7] fix expression --- .github/workflows/build_kernel_windows.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_kernel_windows.yaml b/.github/workflows/build_kernel_windows.yaml index 86aec23c..bb6fbec3 100644 --- a/.github/workflows/build_kernel_windows.yaml +++ b/.github/workflows/build_kernel_windows.yaml @@ -118,7 +118,7 @@ jobs: shell: pwsh env: CARGO_LOG: cargo::core::compiler::fingerprint=info - run: ( cd kernel-builder ; cargo build --release -v ) + run: ( cd kernel-builder && cargo build --release -v ) # Python environment setup - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 From 0891c97f4e84f82bb938a6581944f220ef9950f3 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Wed, 27 May 2026 16:55:39 +0530 Subject: [PATCH 6/7] fix caching in the builder --- .github/workflows/build_kernel_windows.yaml | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/.github/workflows/build_kernel_windows.yaml b/.github/workflows/build_kernel_windows.yaml index bb6fbec3..6082f7ad 100644 --- a/.github/workflows/build_kernel_windows.yaml +++ b/.github/workflows/build_kernel_windows.yaml @@ -99,26 +99,23 @@ jobs: profile: minimal override: true - # Caches kernel-builder/target plus ~/.cargo/{registry,git}. Keys on + # Caches the workspace target/ plus ~/.cargo/{registry,git}. 
Keys on # Cargo.lock so a clean dep-graph change invalidates the artifact cache # but unrelated edits reuse it incrementally. Cuts the kernel-builder # build from ~8 min cold to ~30s warm. + # + # workspaces must point at the actual workspace root (root Cargo.toml + # has `[workspace] members = [..., "kernel-builder", ...]`). Cargo + # always writes target/ at the workspace root, so caching + # ./kernel-builder/target would restore to a path cargo never reads. - name: Cache cargo + kernel-builder target uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1 with: - workspaces: kernel-builder + workspaces: . shared-key: kernel-builder-${{ matrix.os }} - name: Build kernel-builder - # TEMP DIAGNOSTIC: -v + CARGO_LOG=fingerprint=info makes cargo print - # the exact reason each crate is being rebuilt ("stale: missing", - # "stale: mtime newer than ...", etc.). Used once to find out why - # rust-cache restores target/ successfully but cargo still rebuilds - # everything. Revert this step before merging. - shell: pwsh - env: - CARGO_LOG: cargo::core::compiler::fingerprint=info - run: ( cd kernel-builder && cargo build --release -v ) + run: ( cd kernel-builder && cargo build --release ) # Python environment setup - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 From ec8fb1fe31fb8e4bba7b473e58243a575fda0d80 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Wed, 27 May 2026 18:05:11 +0530 Subject: [PATCH 7/7] trigger warm cache run