Skip to content
This repository was archived by the owner on Apr 6, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
f57d466
feat(builder): create build name directly in CMake for portability (d…
mfuntowicz Oct 9, 2025
89e1bb9
feat(builder): automatically generate installation structure for loca…
mfuntowicz Oct 9, 2025
9995a28
feat(builder): expose name of the operation when populating template
mfuntowicz Oct 9, 2025
a396f92
feat(builder): add missing utils.cmake functions - oops
mfuntowicz Oct 9, 2025
90eb189
feat(builder): introduce Windows, PowerShell based, kbuilder.ps1 to b…
mfuntowicz Oct 9, 2025
2f8969f
feat(builder): attempt to not rely on intel_pytorch_extension to gath…
mfuntowicz Oct 9, 2025
4db1bc6
feat(builder): missing function call on get_device_capability ...
mfuntowicz Oct 9, 2025
93ad509
feat(gha): attempt to enable building windows based kernels
mfuntowicz Oct 13, 2025
b0acd2f
feat(gha): do not specify cuda version for now
mfuntowicz Oct 13, 2025
ae7ab63
feat(gha): disable local cache
mfuntowicz Oct 13, 2025
61c0287
feat(gha): target windows-2022
mfuntowicz Oct 13, 2025
4cbaaca
feat(gha): attempt to downgrade the cuda-toolkit action to a one "wor…
mfuntowicz Oct 13, 2025
acf273b
feat(gha): once more?
mfuntowicz Oct 13, 2025
1b643e6
feat(gha): use 13.0.0 with this one
mfuntowicz Oct 13, 2025
91c29a2
feat(gha): use correct path towards kbuilder.ps1
mfuntowicz Oct 13, 2025
360f1eb
feat(gha): build build2cmake and specify path to it for kbuilder
mfuntowicz Oct 13, 2025
ce3d2b9
feat(gha): again
mfuntowicz Oct 13, 2025
51b061c
feat(gha): let kbuilder discover build2cmake path from root
mfuntowicz Oct 13, 2025
a54c4c8
feat(gha): setup python
mfuntowicz Oct 13, 2025
5847050
feat(gha): ...
mfuntowicz Oct 13, 2025
a23ce20
misc(fmt): rustfmt
mfuntowicz Oct 13, 2025
515edac
feat(gha): update key for caching
mfuntowicz Oct 13, 2025
bd80677
feat(xpu): use cmake Intel Compiler version to generate build name
mfuntowicz Oct 13, 2025
7493b6e
feat(gha): enable some more tests for windows
mfuntowicz Oct 14, 2025
dbd8997
feat(gha): disable universal kernels, need investigation
mfuntowicz Oct 14, 2025
26bb852
misc(gha): remove double yml extension
mfuntowicz Oct 14, 2025
9753e49
misc(builder): refactored some duplicated fragments
mfuntowicz Oct 14, 2025
c824402
misc(builder): update gha workflow with new paths
mfuntowicz Oct 14, 2025
b8ef512
feat(gha): update to latest version of cuda-toolkit
mfuntowicz Oct 14, 2025
07500fe
misc(builder): update build2cmake default location
mfuntowicz Oct 15, 2025
57c2e9f
misc(builder): rename args variable to kwargs to not override builtin…
mfuntowicz Oct 15, 2025
5f6c3b1
misc(builder): do not attempt to build if there is no CMakeLists.txt
mfuntowicz Oct 15, 2025
2b9ac6c
misc(builder): setup for multiple torch and cuda versions
mfuntowicz Oct 15, 2025
53b383c
misc(builder): move windows only logic to specific windows.cmake and …
mfuntowicz Oct 16, 2025
76b0ba6
misc(builder): remove remaining SYSTEM_STRING override
mfuntowicz Oct 16, 2025
482422f
misc(builder): remove unneeded changes for unsupported platforms
mfuntowicz Oct 16, 2025
579aa92
misc(builder): conditionally include add_kernels_install_targets only…
mfuntowicz Oct 16, 2025
b4a4cfe
misc(builder): make sure unintended changes are not in the diff
mfuntowicz Oct 16, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 86 additions & 0 deletions .github/workflows/build_kernel_windows.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
name: "Build and test kernel - Windows"
Comment thread
mfuntowicz marked this conversation as resolved.
# Workflow triggers: pushes to main, pull requests targeting main, and manual dispatch.
on:
push:
branches: [main]
pull_request:
branches: [main]
types: [opened, synchronize, reopened] # run when a PR is opened, receives new commits, or is reopened
# Allows the workflow to be started manually.
workflow_dispatch:

jobs:
  build:
    strategy:
      matrix:
        os: [ windows-2022 ]
        python: [ '3.12', '3.13' ]
        # Each entry couples a Torch release with the CUDA toolkit it is built
        # against and the matching PyTorch wheel index suffix (cu<wheel>).
        torch: [
          { version: '2.8', cuda: '12.9.1', wheel: '129' }
        ]

    name: Build kernel
    runs-on: ${{ matrix.os }}

    steps:
      - uses: actions/cache@v4
        with:
          # The CUDA version lives under matrix.torch.cuda; matrix.cuda does not
          # exist and would expand to an empty string, so the key would never
          # change when the toolkit version does.
          key: cuda-toolkit-v${{ matrix.torch.cuda }}-${{ matrix.os }}
          path: |
            C:\Program Files\NVIDIA GPU Computing Toolkit
            ~/.cargo/registry
            ~/.cargo/git

      - uses: actions/checkout@v5

      # CUDA environment setup
      - uses: N-Storm/cuda-toolkit@v0.2.28
        id: setup-cuda-toolkit
        with:
          cuda: ${{ matrix.torch.cuda }} # TODO(mfuntowicz): How can we test multiple CUDA versions than align with torch?
      - name: "NVCC checks"
        run: nvcc -V

      # Rust build environment setup
      - uses: actions-rs/toolchain@v1
        with:
          toolchain: stable
          profile: minimal
          override: true

      - name: Build build2cmake
        run: ( cd build2cmake && cargo build --release )

      # Python environment setup
      - uses: actions/setup-python@v6
        with:
          python-version: ${{ matrix.python }}
          cache: 'pip'

      # Install the Torch release and wheel flavour selected by the matrix so the
      # installed build matches the CUDA toolkit set up above.
      - name: Install PyTorch
        run: pip install "torch==${{ matrix.torch.version }}.*" --index-url https://download.pytorch.org/whl/cu${{ matrix.torch.wheel }}

      - name: Build activation kernel
        run: ( scripts\windows\builder.ps1 -SourceFolder examples/activation -BuildConfig Release -Backend cuda -Build -Force )
      # - name: Copy activation kernel
      #   run: cp -rL examples/activation/build activation-kernel

      - name: Build cutlass GEMM kernel
        run: ( scripts\windows\builder.ps1 -SourceFolder examples/cutlass-gemm -BuildConfig Release -Backend cuda -Build -Force )
      # - name: Copy cutlass GEMM kernel
      #   run: cp -rL examples/cutlass-gemm/result cutlass-gemm-kernel

      - name: Build relu kernel
        run: ( scripts\windows\builder.ps1 -SourceFolder examples/relu -BuildConfig Release -Backend cuda -Build -Force )
      # - name: Copy relu kernel
      #   run: cp -rL examples/relu/result relu-kernel

      - name: Build relu-backprop-compile kernel
        run: ( scripts\windows\builder.ps1 -SourceFolder examples/relu-backprop-compile -BuildConfig Release -Backend cuda -Build -Force )
      # - name: Copy relu-backprop-compile kernel
      #   run: cp -rL examples/relu-backprop-compile/result relu-backprop-compile-kernel

      # Just test that we build with the extra torchVersions argument.
      # - name: Build relu kernel (specific Torch version)
      #   run: ( cd examples/relu-specific-torch && nix build . )

      # No -Backend argument is passed for the universal kernel.
      - name: Build silu-and-mul-universal kernel
        run: ( scripts\windows\builder.ps1 -SourceFolder examples/silu-and-mul-universal -BuildConfig Release -Build -Force)
20 changes: 20 additions & 0 deletions build2cmake/src/templates/cuda/preamble.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -98,3 +98,23 @@ else()
${GPU_LANG}
"${${GPU_LANG}_SUPPORTED_ARCHS}")
endif()


message(STATUS "Rendered for platform {{ platform }}")
{% if platform == 'windows' %}
include(${CMAKE_CURRENT_LIST_DIR}/cmake/windows.cmake)

# Generate standardized build name
Comment thread
mfuntowicz marked this conversation as resolved.
run_python(TORCH_VERSION "import torch; print(torch.__version__.split('+')[0])" "Failed to get Torch version")
run_python(CXX11_ABI_VALUE "import torch; print('TRUE' if torch._C._GLIBCXX_USE_CXX11_ABI else 'FALSE')" "Failed to get CXX11 ABI")
cmake_host_system_information(RESULT HOST_ARCH QUERY OS_PLATFORM)

set(SYSTEM_STRING "${HOST_ARCH}-windows")

if(GPU_LANG STREQUAL "CUDA")
generate_build_name(BUILD_VARIANT_NAME "${TORCH_VERSION}" ${CXX11_ABI_VALUE} "cuda" "${CUDA_VERSION}" "${SYSTEM_STRING}")
elseif(GPU_LANG STREQUAL "HIP")
run_python(ROCM_VERSION "import torch.version; print(torch.version.hip.split('.')[0] + '.' + torch.version.hip.split('.')[1])" "Failed to get ROCm version")
generate_build_name(BUILD_VARIANT_NAME "${TORCH_VERSION}" ${CXX11_ABI_VALUE} "rocm" "${ROCM_VERSION}" "${SYSTEM_STRING}")
endif()
{% endif %}
14 changes: 13 additions & 1 deletion build2cmake/src/templates/cuda/torch-extension.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,17 @@ define_gpu_extension_target(
USE_SABI 3
WITH_SOABI)

target_link_options({{ ops_name }} PRIVATE -static-libstdc++)
if( NOT MSVC)
target_link_options({{ ops_name }} PRIVATE -static-libstdc++)
endif()

{% if platform == 'windows' %}
# These methods below should be included from preamble.cmake on windows platform.

# Add kernels_install target for huggingface/kernels library layout
add_kernels_install_target({{ ops_name }} "{{ name }}" "${BUILD_VARIANT_NAME}")

# Add local_install target for local development with get_local_kernel()
add_local_install_target({{ ops_name }} "{{ name }}" "${BUILD_VARIANT_NAME}")

{% endif %}
176 changes: 176 additions & 0 deletions build2cmake/src/templates/windows.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
# Generate a standardized build variant name following the pattern:
#   torch<MAJOR><MINOR>[-<ABI>]-<COMPUTE>-windows
#
# Arguments:
#   OUT_BUILD_NAME    - Name of the variable set in the caller's scope
#   TORCH_VERSION     - PyTorch version (e.g., "2.7.1"); only major and minor are kept
#   CXX11_ABI         - Whether the C++11 ABI is enabled (TRUE/FALSE)
#   COMPUTE_FRAMEWORK - One of: cuda, rocm, xpu (anything else is a fatal error)
#   COMPUTE_VERSION   - Version of the compute framework (e.g., "12.4" for CUDA, "6.0" for ROCm)
#
# Any additional arguments are accepted and ignored.
#
# Example: generate_build_name(NAME "2.7.1" TRUE "cuda" "12.4")
#          sets NAME to "torch27-cxx11-cu124-windows"
#
function(generate_build_name OUT_BUILD_NAME TORCH_VERSION CXX11_ABI COMPUTE_FRAMEWORK COMPUTE_VERSION)
    # Flatten the Torch version to <major><minor>, padding a missing minor with 0
    string(REPLACE "." ";" VERSION_LIST "${TORCH_VERSION}")
    list(LENGTH VERSION_LIST VERSION_COMPONENTS)
    if(VERSION_COMPONENTS EQUAL 0)
        message(FATAL_ERROR "generate_build_name: TORCH_VERSION must not be empty")
    endif()
    if(VERSION_COMPONENTS LESS 2)
        list(APPEND VERSION_LIST "0")
    endif()
    list(GET VERSION_LIST 0 MAJOR)
    list(GET VERSION_LIST 1 MINOR)
    set(FLATTENED_TORCH "${MAJOR}${MINOR}")

    # Derive the ABI tag from CXX11_ABI. Previously ABI_STRING was never set, so
    # the assembled name always contained an empty segment ("torch28--cu129-windows").
    # NOTE(review): when CXX11_ABI is false the segment is omitted; confirm whether
    # consumers of the build name expect an explicit tag (e.g. "cxx98") instead.
    if(CXX11_ABI)
        set(ABI_STRING "cxx11")
    else()
        set(ABI_STRING "")
    endif()

    # Map the compute framework to the prefix used in the build name
    if(COMPUTE_FRAMEWORK STREQUAL "cuda")
        set(COMPUTE_PREFIX "cu")
    elseif(COMPUTE_FRAMEWORK STREQUAL "rocm")
        set(COMPUTE_PREFIX "rocm")
    elseif(COMPUTE_FRAMEWORK STREQUAL "xpu")
        set(COMPUTE_PREFIX "xpu")
    else()
        message(FATAL_ERROR "Unknown compute framework: ${COMPUTE_FRAMEWORK}")
    endif()

    # Flatten the compute version to <major><minor>
    # (e.g., "12.4" -> "124", "6.0" -> "60", "2025.2" -> "20252", "13" -> "130")
    string(REPLACE "." ";" COMPUTE_VERSION_LIST "${COMPUTE_VERSION}")
    list(LENGTH COMPUTE_VERSION_LIST COMPUTE_COMPONENTS)
    if(COMPUTE_COMPONENTS EQUAL 0)
        message(FATAL_ERROR "generate_build_name: COMPUTE_VERSION must not be empty for ${COMPUTE_FRAMEWORK}")
    endif()
    list(GET COMPUTE_VERSION_LIST 0 COMPUTE_MAJOR)
    if(COMPUTE_COMPONENTS GREATER_EQUAL 2)
        list(GET COMPUTE_VERSION_LIST 1 COMPUTE_MINOR)
    else()
        set(COMPUTE_MINOR "0")
    endif()
    set(COMPUTE_STRING "${COMPUTE_PREFIX}${COMPUTE_MAJOR}${COMPUTE_MINOR}")

    # Assemble the final build name; the ABI segment is skipped when empty.
    # The variable is quoted so an empty value is compared as a string.
    if("${ABI_STRING}" STREQUAL "")
        set(BUILD_NAME "torch${FLATTENED_TORCH}-${COMPUTE_STRING}-windows")
    else()
        set(BUILD_NAME "torch${FLATTENED_TORCH}-${ABI_STRING}-${COMPUTE_STRING}-windows")
    endif()

    set(${OUT_BUILD_NAME} "${BUILD_NAME}" PARENT_SCOPE)
    message(STATUS "Generated build name: ${BUILD_NAME}")
endfunction()

#
# Register build-time targets that stage an extension in the huggingface/kernels
# library layout, i.e. a directory tree suitable for kernel hub discovery:
#   <PREFIX>/<BUILD_VARIANT_NAME>/<PACKAGE_NAME>/
#
# Two targets are involved, both part of the default build (ALL):
#   <TARGET_NAME>_kernel_install - copies the built extension file and the contents
#                                  of torch-ext/<PACKAGE_NAME> into the directory above
#   kernels_install              - umbrella target (created on first use) that
#                                  depends on every per-kernel install target
#
# Arguments:
#   TARGET_NAME        - Name of the target to create the install rule for
#   PACKAGE_NAME       - Python package name (e.g., "activation")
#   BUILD_VARIANT_NAME - Build variant name used as the first directory level
#   INSTALL_PREFIX     - Optional keyword argument: base installation directory
#                        (defaults to CMAKE_INSTALL_PREFIX)
#
function(add_kernels_install_target TARGET_NAME PACKAGE_NAME BUILD_VARIANT_NAME)
    cmake_parse_arguments(ARG "" "INSTALL_PREFIX" "" ${ARGN})

    # Resolve the destination root: explicit INSTALL_PREFIX wins, else the CMake default
    if(ARG_INSTALL_PREFIX)
        set(DEST_ROOT "${ARG_INSTALL_PREFIX}")
    else()
        set(DEST_ROOT "${CMAKE_INSTALL_PREFIX}")
    endif()
    set(DEST_DIR "${DEST_ROOT}/${BUILD_VARIANT_NAME}/${PACKAGE_NAME}")
    set(PER_KERNEL_TARGET "${TARGET_NAME}_kernel_install")

    # Per-kernel staging target: create the directory, then copy the binary
    # followed by the Python sources shipped in torch-ext/<PACKAGE_NAME>
    add_custom_target(${PER_KERNEL_TARGET} ALL
        COMMAND ${CMAKE_COMMAND} -E make_directory "${DEST_DIR}"
        COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:${TARGET_NAME}> "${DEST_DIR}/"
        COMMAND ${CMAKE_COMMAND} -E copy_directory
            "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}"
            "${DEST_DIR}/"
        DEPENDS ${TARGET_NAME}
        COMMENT "Installing ${TARGET_NAME} to ${DEST_DIR}"
        VERBATIM)

    # Group the target under "Install" in IDE generators
    if(MSVC OR XCODE)
        set_target_properties(${PER_KERNEL_TARGET} PROPERTIES FOLDER "Install")
    endif()

    # Lazily create the umbrella target, then hook this kernel into it
    if(NOT TARGET kernels_install)
        add_custom_target(kernels_install ALL
            COMMENT "Installing all kernels to hub-compatible layout"
            VERBATIM)
    endif()
    add_dependencies(kernels_install ${PER_KERNEL_TARGET})

    message(STATUS "Added kernels_install target for ${TARGET_NAME} -> ${BUILD_VARIANT_NAME}/${PACKAGE_NAME}")
endfunction()

#
# Add a `local_install` build target for local development with huggingface/kernels.
# Building it copies the extension into the layout expected by get_local_kernel():
#   ${CMAKE_SOURCE_DIR}/build/<BUILD_VARIANT_NAME>/<PACKAGE_NAME>/
#
# This allows developers to use get_local_kernel() from the kernels library to load
# locally built kernels without needing to publish to the hub.
#
# This registers a custom target; it does NOT add rules to CMake's built-in
# "install" target, and it is not part of the default build. Run it explicitly:
#   cmake --build <build-dir> --target local_install
#
# Only the top-level *.py files of torch-ext/<PACKAGE_NAME> are copied, and the
# list is globbed at configure time (re-run CMake after adding files).
#
# The function may be called for several targets; each call appends its copy
# steps to the same `local_install` target.
#
# Arguments:
#   TARGET_NAME        - Name of the target whose built file is copied
#   PACKAGE_NAME       - Python package name (e.g., "activation")
#   BUILD_VARIANT_NAME - Build variant name used as the first directory level
#
function(add_local_install_target TARGET_NAME PACKAGE_NAME BUILD_VARIANT_NAME)
    # Define your local, folder based, installation directory
    set(LOCAL_INSTALL_DIR "${CMAKE_SOURCE_DIR}/build/${BUILD_VARIANT_NAME}/${PACKAGE_NAME}")

    # Glob Python files at configure time
    file(GLOB PYTHON_FILES "${CMAKE_SOURCE_DIR}/torch-ext/${PACKAGE_NAME}/*.py")

    # Create the shared target only once so a second call does not fail with a
    # duplicate-target error
    if(NOT TARGET local_install)
        add_custom_target(local_install
            COMMENT "Installing files to local directory..."
        )
    endif()

    # Make sure the extension is built before its file is copied
    add_dependencies(local_install ${TARGET_NAME})

    # Copy the shared library; the directory is (re)created at build time in case
    # it was removed after configuration
    add_custom_command(TARGET local_install POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E make_directory "${LOCAL_INSTALL_DIR}"
        COMMAND ${CMAKE_COMMAND} -E copy_if_different
            $<TARGET_FILE:${TARGET_NAME}>
            "${LOCAL_INSTALL_DIR}/"
        COMMENT "Copying shared library and Python files to ${LOCAL_INSTALL_DIR}"
        VERBATIM
    )

    # Copy the Python files. Skipped when the glob matched nothing: an empty list
    # would leave copy_if_different with only a destination argument and fail.
    if(PYTHON_FILES)
        add_custom_command(TARGET local_install POST_BUILD
            COMMAND ${CMAKE_COMMAND} -E copy_if_different
                ${PYTHON_FILES}
                "${LOCAL_INSTALL_DIR}/"
            COMMAND_EXPAND_LISTS
            VERBATIM
        )
    endif()

    file(MAKE_DIRECTORY "${LOCAL_INSTALL_DIR}")
    message(STATUS "Added install rules for ${TARGET_NAME} -> build/${BUILD_VARIANT_NAME}/${PACKAGE_NAME}")
endfunction()
21 changes: 18 additions & 3 deletions build2cmake/src/torch/cuda.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use std::collections::HashSet;
use std::env;
use std::io::Write;
use std::path::PathBuf;

Expand All @@ -12,6 +13,7 @@ use crate::version::Version;
use crate::FileSet;

static CMAKE_UTILS: &str = include_str!("../templates/utils.cmake");
static WINDOWS_UTILS: &str = include_str!("../templates/windows.cmake");
static REGISTRATION_H: &str = include_str!("../templates/registration.h");
static HIPIFY: &str = include_str!("../templates/cuda/hipify.py");
static CUDA_SUPPORTED_ARCHS_JSON: &str = include_str!("../cuda_supported_archs.json");
Expand Down Expand Up @@ -155,6 +157,13 @@ fn write_cmake(
.entry(utils_path.clone())
.extend_from_slice(CMAKE_UTILS.as_bytes());

let mut windows_utils_path = PathBuf::new();
windows_utils_path.push("cmake");
windows_utils_path.push("windows.cmake");
file_set
.entry(windows_utils_path.clone())
.extend_from_slice(WINDOWS_UTILS.as_bytes());

let mut hipify_path = PathBuf::new();
hipify_path.push("cmake");
hipify_path.push("hipify.py");
Expand Down Expand Up @@ -184,7 +193,7 @@ fn write_cmake(
render_kernel(env, kernel_name, kernel, cmake_writer)?;
}

render_extension(env, ops_name, cmake_writer)?;
render_extension(env, name, ops_name, cmake_writer)?;

Ok(())
}
Expand Down Expand Up @@ -351,11 +360,17 @@ pub fn render_kernel(
Ok(())
}

pub fn render_extension(env: &Environment, ops_name: &str, write: &mut impl Write) -> Result<()> {
pub fn render_extension(
env: &Environment,
name: &str,
ops_name: &str,
write: &mut impl Write,
) -> Result<()> {
env.get_template("cuda/torch-extension.cmake")
.wrap_err("Cannot get Torch extension template")?
.render_to_write(
context! {
name => name,
ops_name => ops_name,
},
&mut *write,
Expand All @@ -382,7 +397,7 @@ pub fn render_preamble(
cuda_minver => cuda_minver.map(|v| v.to_string()),
cuda_maxver => cuda_maxver.map(|v| v.to_string()),
cuda_supported_archs => cuda_supported_archs(),

platform => env::consts::OS
},
&mut *write,
)
Expand Down
Loading
Loading