huggingface · danieldk · Oct 16, 2025 · Oct 9, 2025 · Oct 9, 2025 · Oct 9, 2025
diff --git a/.github/workflows/build_kernel_windows.yaml b/.github/workflows/build_kernel_windows.yaml
@@ -0,0 +1,86 @@
+name: "Build and test kernel - Windows"
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+    types: [opened, synchronize, reopened] # trigger on PRs
+  workflow_dispatch:
+
+jobs:
+  build:
+    strategy:
+      matrix:
+        os: [ windows-2022 ]
+        python: [ '3.12', '3.13' ]
+        torch: [
+          { version: '2.8', cuda: '12.9.1', wheel: '129' }
+        ]
+
+    name: Build kernel
+    runs-on: ${{ matrix.os }}
+
+    steps:
+      - uses: actions/cache@v4
+        with:
+          key: cuda-toolkit-v${{ matrix.torch.cuda }}-${{ matrix.os }}  # CUDA version lives under the torch matrix entry
+          path: |
+            C:\Program Files\NVIDIA GPU Computing Toolkit
+            ~/.cargo/registry
+            ~/.cargo/git
+
+      - uses: actions/checkout@v5
+
+      # CUDA environment setup
+      - uses: N-Storm/cuda-toolkit@v0.2.28
+        id: setup-cuda-toolkit
+        with:
+          cuda: ${{ matrix.torch.cuda }}  # TODO(mfuntowicz): How can we test multiple CUDA versions that align with torch?
+      - name: "NVCC checks"
+        run: nvcc -V
+
+      # Rust build environment setup
+      - uses: actions-rs/toolchain@v1
+        with:
+          toolchain: stable
+          profile: minimal
+          override: true
+
+      - name: Build build2cmake
+        run: ( cd build2cmake && cargo build --release )
+
+      # Python environment setup
+      - uses: actions/setup-python@v6
+        with:
+          python-version: ${{ matrix.python }}
+          cache: 'pip'
+      # Install the torch release and CUDA wheel flavour selected by the matrix entry
+      - name: Install PyTorch
+        run: pip install "torch==${{ matrix.torch.version }}.*" --index-url https://download.pytorch.org/whl/cu${{ matrix.torch.wheel }}
+
+      - name: Build activation kernel
+        run: ( scripts\windows\builder.ps1 -SourceFolder examples/activation -BuildConfig Release -Backend cuda -Build -Force )
+#      - name: Copy activation kernel
+#        run: cp -rL examples/activation/build activation-kernel
+
+      - name: Build cutlass GEMM kernel
+        run: ( scripts\windows\builder.ps1 -SourceFolder examples/cutlass-gemm -BuildConfig Release -Backend cuda -Build -Force )
+#      - name: Copy cutlass GEMM kernel
+#        run: cp -rL examples/cutlass-gemm/result cutlass-gemm-kernel
+
+      - name: Build relu kernel
+        run: ( scripts\windows\builder.ps1 -SourceFolder examples/relu -BuildConfig Release -Backend cuda -Build -Force )
+#      - name: Copy relu kernel
+#        run: cp -rL examples/relu/result relu-kernel
+
+      - name: Build relu-backprop-compile kernel
+        run: ( scripts\windows\builder.ps1 -SourceFolder examples/relu-backprop-compile -BuildConfig Release -Backend cuda -Build -Force  )
+#      - name: Copy relu-backprop-compile kernel
+#        run: cp -rL examples/relu-backprop-compile/result relu-backprop-compile-kernel
+
+      # Just test that we build with the extra torchVersions argument.
+#      - name: Build relu kernel (specific Torch version)
+#        run: ( cd examples/relu-specific-torch && nix build . )
+
+      - name: Build silu-and-mul-universal kernel
+        run: ( scripts\windows\builder.ps1 -SourceFolder examples/silu-and-mul-universal -BuildConfig Release -Build -Force)
diff --git a/build2cmake/src/templates/cuda/preamble.cmake b/build2cmake/src/templates/cuda/preamble.cmake
@@ -98,3 +98,23 @@ else()
     ${GPU_LANG}
     "${${GPU_LANG}_SUPPORTED_ARCHS}")
 endif()
+
+
+message(STATUS "Rendered for platform {{ platform }}")
+{% if platform == 'windows' %}
+include(${CMAKE_CURRENT_LIST_DIR}/cmake/windows.cmake)
+
+# Generate standardized build name from the installed torch version, its C++11 ABI flag and the compute framework version
+run_python(TORCH_VERSION "import torch; print(torch.__version__.split('+')[0])" "Failed to get Torch version")
+run_python(CXX11_ABI_VALUE "import torch; print('TRUE' if torch._C._GLIBCXX_USE_CXX11_ABI else 'FALSE')" "Failed to get CXX11 ABI")
+cmake_host_system_information(RESULT HOST_ARCH QUERY OS_PLATFORM)
+
+set(SYSTEM_STRING "${HOST_ARCH}-windows")
+# NOTE(review): generate_build_name() in cmake/windows.cmake declares five parameters, so the trailing SYSTEM_STRING argument below is currently ignored
+if(GPU_LANG STREQUAL "CUDA")
+  generate_build_name(BUILD_VARIANT_NAME "${TORCH_VERSION}" ${CXX11_ABI_VALUE} "cuda" "${CUDA_VERSION}" "${SYSTEM_STRING}")
+elseif(GPU_LANG STREQUAL "HIP")
+  run_python(ROCM_VERSION "import torch.version; print(torch.version.hip.split('.')[0] + '.' + torch.version.hip.split('.')[1])" "Failed to get ROCm version")
+  generate_build_name(BUILD_VARIANT_NAME "${TORCH_VERSION}" ${CXX11_ABI_VALUE} "rocm" "${ROCM_VERSION}" "${SYSTEM_STRING}")
+endif()
+{% endif %}
diff --git a/build2cmake/src/templates/cuda/torch-extension.cmake b/build2cmake/src/templates/cuda/torch-extension.cmake
@@ -9,5 +9,17 @@ define_gpu_extension_target(
   USE_SABI 3
   WITH_SOABI)
 
-target_link_options({{ ops_name }} PRIVATE -static-libstdc++)
+if( NOT MSVC)  # the libstdc++ static-link flag is only passed when not building with MSVC
+    target_link_options({{ ops_name }} PRIVATE -static-libstdc++)
+endif()
 
+{% if platform == 'windows' %}
+# The functions called below are defined in cmake/windows.cmake, which preamble.cmake includes on Windows.
+{# NOTE(review): this section only renders when `platform` is in the template context; render_extension() as shown in this change passes only name and ops_name - confirm. #}
+# Add kernels_install target for huggingface/kernels library layout
+add_kernels_install_target({{ ops_name }} "{{ name }}" "${BUILD_VARIANT_NAME}")
+
+# Add local_install target for local development with get_local_kernel()
+add_local_install_target({{ ops_name }} "{{ name }}" "${BUILD_VARIANT_NAME}")
+
+{% endif %}
diff --git a/build2cmake/src/templates/windows.cmake b/build2cmake/src/templates/windows.cmake
@@ -0,0 +1,176 @@
+# Internal helper: flatten a dotted version string to "<major><minor>".
+#
+# Arguments:
+#   OUT_VAR - Name of the variable (in the caller's scope) that receives the result
+#   VERSION - Dotted version string (e.g., "12.4" -> "124", "2.7.1" -> "27")
+#
+# A version without a minor component gets "0" appended ("12" -> "120"); an
+# empty version string is a fatal error.
+function(_flatten_major_minor OUT_VAR VERSION)
+    string(REPLACE "." ";" VERSION_PARTS "${VERSION}")
+    list(LENGTH VERSION_PARTS VERSION_PART_COUNT)
+    if(VERSION_PART_COUNT LESS 1)
+        message(FATAL_ERROR "Cannot flatten an empty version string")
+    endif()
+
+    list(GET VERSION_PARTS 0 VERSION_MAJOR)
+    if(VERSION_PART_COUNT GREATER_EQUAL 2)
+        list(GET VERSION_PARTS 1 VERSION_MINOR)
+    else()
+        set(VERSION_MINOR "0")
+    endif()
+
+    set(${OUT_VAR} "${VERSION_MAJOR}${VERSION_MINOR}" PARENT_SCOPE)
+endfunction()
+
+# Generate a standardized build variant name following the pattern:
+#   torch<MAJOR><MINOR>[-cxx11]-<COMPUTE>-windows
+#
+# Arguments:
+#   OUT_BUILD_NAME - Output variable name (set in the caller's scope)
+#   TORCH_VERSION - PyTorch version (e.g., "2.7.1"); only major and minor are used
+#   CXX11_ABI - Whether C++11 ABI is enabled (TRUE/FALSE); TRUE adds a "cxx11"
+#               segment, FALSE adds no ABI segment
+#   COMPUTE_FRAMEWORK - One of: cuda, rocm, xpu (anything else is a fatal error)
+#   COMPUTE_VERSION - Version of compute framework (e.g., "12.4" for CUDA, "6.0" for ROCm)
+#
+# Arguments beyond these five are accepted but ignored.
+#
+# Example outputs: torch27-cu124-windows, torch27-cxx11-rocm60-windows
+#
+function(generate_build_name OUT_BUILD_NAME TORCH_VERSION CXX11_ABI COMPUTE_FRAMEWORK COMPUTE_VERSION)
+    # Prefix that introduces the compute segment of the name
+    if(COMPUTE_FRAMEWORK STREQUAL "cuda")
+        set(COMPUTE_PREFIX "cu")
+    elseif(COMPUTE_FRAMEWORK STREQUAL "rocm")
+        set(COMPUTE_PREFIX "rocm")
+    elseif(COMPUTE_FRAMEWORK STREQUAL "xpu")
+        set(COMPUTE_PREFIX "xpu")
+    else()
+        message(FATAL_ERROR "Unknown compute framework: ${COMPUTE_FRAMEWORK}")
+    endif()
+
+    # Flatten both versions to <major><minor> (e.g., "2.7.1" -> "27", "2025.2" -> "20252")
+    _flatten_major_minor(FLATTENED_TORCH "${TORCH_VERSION}")
+    _flatten_major_minor(FLATTENED_COMPUTE "${COMPUTE_VERSION}")
+    set(COMPUTE_STRING "${COMPUTE_PREFIX}${FLATTENED_COMPUTE}")
+
+    # ABI_STRING must be set explicitly in this scope: if() treats an undefined
+    # name as the literal text "ABI_STRING", so the empty check below would never
+    # match and the name would contain a doubled dash ("torch28--cu129-windows").
+    # TRUE maps to "cxx11"; FALSE contributes no ABI segment.
+    if(CXX11_ABI)
+        set(ABI_STRING "cxx11")
+    else()
+        set(ABI_STRING "")
+    endif()
+
+    # Assemble the final build name, omitting the ABI segment when it is empty
+    if(ABI_STRING STREQUAL "")
+        set(BUILD_NAME "torch${FLATTENED_TORCH}-${COMPUTE_STRING}-windows")
+    else()
+        set(BUILD_NAME "torch${FLATTENED_TORCH}-${ABI_STRING}-${COMPUTE_STRING}-windows")
+    endif()
+
+    set(${OUT_BUILD_NAME} "${BUILD_NAME}" PARENT_SCOPE)
+    message(STATUS "Generated build name: ${BUILD_NAME}")
+endfunction()
+
+#
+# Register build targets that stage an extension in the huggingface/kernels hub layout:
+#   <PREFIX>/<BUILD_VARIANT_NAME>/<PACKAGE_NAME>/
+#
+# Both targets are part of the default build (ALL):
+#   kernels_install              - umbrella target, created on first use
+#   <TARGET_NAME>_kernel_install - copies the built file and torch-ext/<PACKAGE_NAME>
+#
+# Arguments:
+#   TARGET_NAME - Name of the target whose output file is staged
+#   PACKAGE_NAME - Python package name (e.g., "activation")
+#   BUILD_VARIANT_NAME - Build variant name (e.g., the output of generate_build_name)
+#   INSTALL_PREFIX <dir> - Optional keyword; base directory (defaults to CMAKE_INSTALL_PREFIX)
+#
+function(add_kernels_install_target TARGET_NAME PACKAGE_NAME BUILD_VARIANT_NAME)
+    cmake_parse_arguments(ARG "" "INSTALL_PREFIX" "" ${ARGN})
+
+    # Resolve the base directory, preferring the caller-supplied prefix
+    if(ARG_INSTALL_PREFIX)
+        set(STAGING_ROOT "${ARG_INSTALL_PREFIX}")
+    else()
+        set(STAGING_ROOT "${CMAKE_INSTALL_PREFIX}")
+    endif()
+    set(STAGING_DIR "${STAGING_ROOT}/${BUILD_VARIANT_NAME}/${PACKAGE_NAME}")
+    set(STAGING_TARGET "${TARGET_NAME}_kernel_install")
+
+    # Per-kernel target: create the directory, then copy binary and Python sources
+    add_custom_target(${STAGING_TARGET} ALL
+            COMMAND ${CMAKE_COMMAND} -E make_directory "${STAGING_DIR}"
+            COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:${TARGET_NAME}> "${STAGING_DIR}/"
+            COMMAND ${CMAKE_COMMAND} -E copy_directory
+            "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}" "${STAGING_DIR}/"
+            DEPENDS ${TARGET_NAME}
+            COMMENT "Installing ${TARGET_NAME} to ${STAGING_DIR}"
+            VERBATIM)
+    if(MSVC OR XCODE)
+        # Group the target under "Install" in IDE project trees
+        set_target_properties(${STAGING_TARGET} PROPERTIES FOLDER "Install")
+    endif()
+
+    # Hook the per-kernel target into the shared umbrella target
+    if(NOT TARGET kernels_install)
+        add_custom_target(kernels_install ALL
+                COMMENT "Installing all kernels to hub-compatible layout"
+                VERBATIM)
+    endif()
+    add_dependencies(kernels_install ${STAGING_TARGET})
+
+    message(STATUS "Added kernels_install target for ${TARGET_NAME} -> ${BUILD_VARIANT_NAME}/${PACKAGE_NAME}")
+endfunction()
+
+#
+# Add a "local_install" custom target for local development with huggingface/kernels.
+# Building it copies the extension into the layout expected by get_local_kernel():
+#   ${CMAKE_SOURCE_DIR}/build/<BUILD_VARIANT_NAME>/<PACKAGE_NAME>/
+# so that locally built kernels can be loaded without publishing to the hub.
+#
+# This is not a CMake install() rule and not part of the default build; run it
+# explicitly, e.g.: cmake --build <build-dir> --target local_install
+#
+# Arguments:
+#   TARGET_NAME - Name of the target whose output file is copied
+#   PACKAGE_NAME - Python package name (e.g., "activation")
+#   BUILD_VARIANT_NAME - Build variant name (e.g., the output of generate_build_name)
+#
+function(add_local_install_target TARGET_NAME PACKAGE_NAME BUILD_VARIANT_NAME)
+    set(LOCAL_INSTALL_DIR "${CMAKE_SOURCE_DIR}/build/${BUILD_VARIANT_NAME}/${PACKAGE_NAME}")
+    file(MAKE_DIRECTORY "${LOCAL_INSTALL_DIR}")
+
+    # Glob Python files at configure time (re-run CMake to pick up new files)
+    file(GLOB PYTHON_FILES "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}/*.py")
+
+    # Create the target once; a second add_custom_target(local_install) would be
+    # a configure error, so repeated calls append their copy steps instead
+    if(NOT TARGET local_install)
+        add_custom_target(local_install COMMENT "Installing files to local directory...")
+    endif()
+
+    # Copy the shared library
+    add_custom_command(TARGET local_install POST_BUILD
+            COMMAND ${CMAKE_COMMAND} -E copy_if_different
+            $<TARGET_FILE:${TARGET_NAME}>
+            "${LOCAL_INSTALL_DIR}/"
+            COMMENT "Copying shared library and Python files to ${LOCAL_INSTALL_DIR}"
+            VERBATIM)
+
+    # Copy the Python files, unless the glob matched nothing: copy_if_different
+    # fails when it is given a destination but no source files
+    if(PYTHON_FILES)
+        add_custom_command(TARGET local_install POST_BUILD
+                COMMAND ${CMAKE_COMMAND} -E copy_if_different
+                ${PYTHON_FILES}
+                "${LOCAL_INSTALL_DIR}/"
+                VERBATIM)
+    endif()
+
+    message(STATUS "Added install rules for ${TARGET_NAME} -> build/${BUILD_VARIANT_NAME}/${PACKAGE_NAME}")
+endfunction()
diff --git a/build2cmake/src/torch/cuda.rs b/build2cmake/src/torch/cuda.rs
@@ -1,4 +1,5 @@
 use std::collections::HashSet;
+use std::env;
 use std::io::Write;
 use std::path::PathBuf;
 
@@ -12,6 +13,7 @@ use crate::version::Version;
 use crate::FileSet;
 
 static CMAKE_UTILS: &str = include_str!("../templates/utils.cmake");
+static WINDOWS_UTILS: &str = include_str!("../templates/windows.cmake");
 static REGISTRATION_H: &str = include_str!("../templates/registration.h");
 static HIPIFY: &str = include_str!("../templates/cuda/hipify.py");
 static CUDA_SUPPORTED_ARCHS_JSON: &str = include_str!("../cuda_supported_archs.json");
@@ -155,6 +157,13 @@ fn write_cmake(
         .entry(utils_path.clone())
         .extend_from_slice(CMAKE_UTILS.as_bytes());
 
+    let mut windows_utils_path = PathBuf::new();
+    windows_utils_path.push("cmake");
+    windows_utils_path.push("windows.cmake");
+    file_set
+        .entry(windows_utils_path.clone())
+        .extend_from_slice(WINDOWS_UTILS.as_bytes());
+
     let mut hipify_path = PathBuf::new();
     hipify_path.push("cmake");
     hipify_path.push("hipify.py");
@@ -184,7 +193,7 @@ fn write_cmake(
         render_kernel(env, kernel_name, kernel, cmake_writer)?;
     }
 
-    render_extension(env, ops_name, cmake_writer)?;
+    render_extension(env, name, ops_name, cmake_writer)?;
 
     Ok(())
 }
@@ -351,11 +360,17 @@ pub fn render_kernel(
     Ok(())
 }
 
-pub fn render_extension(env: &Environment, ops_name: &str, write: &mut impl Write) -> Result<()> {
+pub fn render_extension(
+    env: &Environment,
+    name: &str,
+    ops_name: &str,
+    write: &mut impl Write,
+) -> Result<()> {
     env.get_template("cuda/torch-extension.cmake")
         .wrap_err("Cannot get Torch extension template")?
         .render_to_write(
             context! {
+                name => name,
                 ops_name => ops_name,
             },
             &mut *write,
@@ -382,7 +397,7 @@ pub fn render_preamble(
                 cuda_minver => cuda_minver.map(|v| v.to_string()),
                 cuda_maxver => cuda_maxver.map(|v| v.to_string()),
                 cuda_supported_archs => cuda_supported_archs(),
-
+                platform => env::consts::OS
             },
             &mut *write,
         )