From 7f7fca00ec19952071086baac084df3926751a9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 27 May 2026 09:53:47 +0000 Subject: [PATCH 1/2] kernel-builder: reject empty capabilities/archs list We compute a kernel component's capabilities by intersecting the capabilities that are specified for the kernel and the capabilities that are supported by CUDA/ROCm. Before this change, we would silently set an empty list if this intersection was empty. This resulted in CMake falling back to an old capability. This change fixes that by erroring out when when the capability list is empty. --- examples/kernels/flake.nix | 19 ++++++- .../kernels/relu-invalid-capability/CARD.md | 56 +++++++++++++++++++ .../relu-invalid-capability/build.toml | 32 +++++++++++ .../kernels/relu-invalid-capability/flake.nix | 17 ++++++ .../relu-invalid-capability/relu_cuda/relu.cu | 47 ++++++++++++++++ .../relu_invalid_capability/__init__.py | 15 +++++ .../torch-ext/torch_binding.cpp | 19 +++++++ .../torch-ext/torch_binding.h | 5 ++ .../src/pyproject/templates/kernel.cmake | 6 ++ 9 files changed, 215 insertions(+), 1 deletion(-) create mode 100644 examples/kernels/relu-invalid-capability/CARD.md create mode 100644 examples/kernels/relu-invalid-capability/build.toml create mode 100644 examples/kernels/relu-invalid-capability/flake.nix create mode 100644 examples/kernels/relu-invalid-capability/relu_cuda/relu.cu create mode 100644 examples/kernels/relu-invalid-capability/torch-ext/relu_invalid_capability/__init__.py create mode 100644 examples/kernels/relu-invalid-capability/torch-ext/torch_binding.cpp create mode 100644 examples/kernels/relu-invalid-capability/torch-ext/torch_binding.h diff --git a/examples/kernels/flake.nix b/examples/kernels/flake.nix index ac8b377b..fa072d58 100644 --- a/examples/kernels/flake.nix +++ b/examples/kernels/flake.nix @@ -102,6 +102,13 @@ path = ./relu-compiler-flags; drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${cudaVersion}-${sys}"}; } + { + name = "relu-invalid-capability"; + path = ./relu-invalid-capability; + drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${cudaVersion}-${sys}"}; + assertFail = true; + assertFailLogs = [ "empty set of capabilities" ]; + } { # Check that we can build an arch dev shell. name = "relu-dev-shell"; @@ -164,7 +171,17 @@ resolvedKernels = map (kernel: { inherit (kernel) name; - drv = kernel.drv system kernel.outputs; + drv = + let + baseDrv = kernel.drv system kernel.outputs; + in + if kernel.assertFail or false then + pkgs.testers.testBuildFailure' { + drv = baseDrv; + expectedBuilderLogEntries = kernel.assertFailLogs or [ ]; + } + else + baseDrv; }) ciKernelOutputs; ci-build = pkgs.linkFarm "ci-kernels" ( diff --git a/examples/kernels/relu-invalid-capability/CARD.md b/examples/kernels/relu-invalid-capability/CARD.md new file mode 100644 index 00000000..c7c70715 --- /dev/null +++ b/examples/kernels/relu-invalid-capability/CARD.md @@ -0,0 +1,56 @@ +--- +library_name: kernels +{% if license %}license: {{ license }} +{% endif %}--- + +This is the repository card of {{ repo_id }} that has been pushed on the Hub. It was built to be used with the [`kernels` library](https://github.com/huggingface/kernels). This card was automatically generated. + +## How to use +{% if functions %} + +```python +# make sure `kernels` is installed: `pip install -U kernels` +from kernels import get_kernel + +kernel_module = get_kernel("{{ repo_id }}", version={{ version }}) +{{ functions[0] }} = kernel_module.{{ functions[0] }} + +{{ functions[0] }}(...) +``` +{% else %} + +Usage example not available. +{% endif %} + +## Available functions +{% if functions %} +{% for func in functions %} +- `{{ func }}` +{% endfor %} +{% else %} + +Function list not available. +{% endif %} +{% if layers %} + +## Available layers +{% for layer in layers %} +- `{{ layer }}` +{% endfor %} +{% endif %} + +## Benchmarks +{% if has_benchmark %} + +Benchmarking script is available for this kernel. Run `kernels benchmark {{ repo_id }} --version {{ version }}`. +{% else %} + +No benchmark available yet. +{% endif %} +{% if upstream %} + +## Source code + +Source code of this kernel originally comes from {{ upstream }} and it was repurposed for compatibility with `kernels`. +{% endif %} + diff --git a/examples/kernels/relu-invalid-capability/build.toml b/examples/kernels/relu-invalid-capability/build.toml new file mode 100644 index 00000000..9de4c0d1 --- /dev/null +++ b/examples/kernels/relu-invalid-capability/build.toml @@ -0,0 +1,32 @@ +[general] +name = "relu-invalid-capability" +version = 1 +license = "Apache-2.0" +backends = [ + "cpu", + "cuda", + "metal", + "rocm", + "xpu", +] + +[general.hub] +repo-id = "kernels-test/relu-invalid-capability" + +[torch] +src = [ + "torch-ext/torch_binding.cpp", + "torch-ext/torch_binding.h", +] + +[kernel.relu_rocm] +backend = "rocm" +depends = ["torch"] +rocm-archs = [ "gfx99999" ] +src = ["relu_cuda/relu.cu"] + +[kernel.relu] +backend = "cuda" +depends = ["torch"] +cuda-capabilities = [ "99999.0" ] +src = ["relu_cuda/relu.cu"] diff --git a/examples/kernels/relu-invalid-capability/flake.nix b/examples/kernels/relu-invalid-capability/flake.nix new file mode 100644 index 00000000..41e51eef --- /dev/null +++ b/examples/kernels/relu-invalid-capability/flake.nix @@ -0,0 +1,17 @@ +{ + description = "Flake for ReLU kernel"; + + inputs = { + kernel-builder.url = "path:../../.."; + }; + + outputs = + { + self, + kernel-builder, + }: + kernel-builder.lib.genKernelFlakeOutputs { + inherit self; + path = ./.; + }; +} diff --git a/examples/kernels/relu-invalid-capability/relu_cuda/relu.cu b/examples/kernels/relu-invalid-capability/relu_cuda/relu.cu new file mode 100644 index 00000000..50b49f91 --- /dev/null +++ b/examples/kernels/relu-invalid-capability/relu_cuda/relu.cu @@ -0,0 +1,47 @@ +#include +#include +#include + +#include + +__global__ void relu_kernel(float *__restrict__ out, + float const *__restrict__ input, const int d) { + const int64_t token_idx = blockIdx.x; + for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { + auto x = input[token_idx * d + idx]; + out[token_idx * d + idx] = x > 0.0f ? x : 0.0f; + } +} + +void relu(torch::Tensor &out, torch::Tensor const &input) { + TORCH_CHECK(input.device().is_cuda(), "input must be a CUDA tensor"); + TORCH_CHECK(input.is_contiguous(), "input must be contiguous"); + TORCH_CHECK(input.scalar_type() == at::ScalarType::Float && + input.scalar_type() == at::ScalarType::Float, + "relu_kernel only supports float32"); + + TORCH_CHECK(input.sizes() == out.sizes(), + "Tensors must have the same shape. Got input shape: ", + input.sizes(), " and output shape: ", out.sizes()); + + TORCH_CHECK(input.scalar_type() == out.scalar_type(), + "Tensors must have the same data type. Got input dtype: ", + input.scalar_type(), " and output dtype: ", out.scalar_type()); + + TORCH_CHECK(input.device() == out.device(), + "Tensors must be on the same device. Got input device: ", + input.device(), " and output device: ", out.device()); + + if (input.numel() == 0) { + return; + } + + int d = input.size(-1); + int64_t num_tokens = input.numel() / d; + dim3 grid(num_tokens); + dim3 block(std::min(d, 1024)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + relu_kernel<<>>(out.data_ptr(), + input.data_ptr(), d); +} diff --git a/examples/kernels/relu-invalid-capability/torch-ext/relu_invalid_capability/__init__.py b/examples/kernels/relu-invalid-capability/torch-ext/relu_invalid_capability/__init__.py new file mode 100644 index 00000000..a844aae8 --- /dev/null +++ b/examples/kernels/relu-invalid-capability/torch-ext/relu_invalid_capability/__init__.py @@ -0,0 +1,15 @@ +from typing import Optional + +import torch + +from ._ops import ops + + +def relu(x: torch.Tensor, out: Optional[torch.Tensor] = None) -> torch.Tensor: + if out is None: + out = torch.empty_like(x) + ops.relu(out, x) + return out + + +__all__ = ["relu"] diff --git a/examples/kernels/relu-invalid-capability/torch-ext/torch_binding.cpp b/examples/kernels/relu-invalid-capability/torch-ext/torch_binding.cpp new file mode 100644 index 00000000..1765d92d --- /dev/null +++ b/examples/kernels/relu-invalid-capability/torch-ext/torch_binding.cpp @@ -0,0 +1,19 @@ +#include + +#include "registration.h" +#include "torch_binding.h" + +TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { + ops.def("relu(Tensor! out, Tensor input) -> ()"); +#if defined(CPU_KERNEL) + ops.impl("relu", torch::kCPU, &relu); +#elif defined(CUDA_KERNEL) || defined(ROCM_KERNEL) + ops.impl("relu", torch::kCUDA, &relu); +#elif defined(METAL_KERNEL) + ops.impl("relu", torch::kMPS, relu); +#elif defined(XPU_KERNEL) + ops.impl("relu", torch::kXPU, &relu); +#endif +} + +REGISTER_EXTENSION(TORCH_EXTENSION_NAME) diff --git a/examples/kernels/relu-invalid-capability/torch-ext/torch_binding.h b/examples/kernels/relu-invalid-capability/torch-ext/torch_binding.h new file mode 100644 index 00000000..3bcf2904 --- /dev/null +++ b/examples/kernels/relu-invalid-capability/torch-ext/torch_binding.h @@ -0,0 +1,5 @@ +#pragma once + +#include + +void relu(torch::Tensor &out, torch::Tensor const &input); \ No newline at end of file diff --git a/kernel-builder/src/pyproject/templates/kernel.cmake b/kernel-builder/src/pyproject/templates/kernel.cmake index 4d4ffa64..be5a736c 100644 --- a/kernel-builder/src/pyproject/templates/kernel.cmake +++ b/kernel-builder/src/pyproject/templates/kernel.cmake @@ -34,6 +34,9 @@ function(cuda_kernel_component SRC_VAR) # Determine CUDA architectures if(KERNEL_CUDA_CAPABILITIES) cuda_archs_loose_intersection(_KERNEL_ARCHS "${KERNEL_CUDA_CAPABILITIES}" "${CUDA_ARCHS}") + if(NOT _KERNEL_ARCHS) + message(FATAL_ERROR "CUDA kernel: ${KERNEL_NAME}, empty set of capabilities after intersection (kernel: ${KERNEL_CUDA_CAPABILITIES}, supported: ${CUDA_ARCHS})") + endif() else() set(_KERNEL_ARCHS "${CUDA_KERNEL_ARCHS}") endif() @@ -115,6 +118,9 @@ function(hip_kernel_component SRC_VAR) # Determine ROCm architectures if(KERNEL_ROCM_ARCHS) hip_archs_loose_intersection(_KERNEL_ARCHS "${KERNEL_ROCM_ARCHS}" "${ROCM_ARCHS}") + if(NOT _KERNEL_ARCHS) + message(FATAL_ERROR "ROCm kernel: ${KERNEL_NAME}, empty set of architectures after intersection (kernel: ${KERNEL_ROCM_ARCHS}, supported: ${ROCM_ARCHS})") + endif() else() set(_KERNEL_ARCHS "${ROCM_ARCHS}") endif() From ae44554a3c4fe517e7c7ca92e83fb3706f89dece Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 27 May 2026 10:19:02 +0000 Subject: [PATCH 2/2] Hook up ROCm test and make it concurrent --- .github/workflows/build_kernel.yaml | 2 +- .github/workflows/build_kernel_rocm.yaml | 9 +-- examples/kernels/flake.nix | 96 ++++++++++++++++-------- 3 files changed, 69 insertions(+), 38 deletions(-) diff --git a/.github/workflows/build_kernel.yaml b/.github/workflows/build_kernel.yaml index 18e4e7d7..9a17e059 100644 --- a/.github/workflows/build_kernel.yaml +++ b/.github/workflows/build_kernel.yaml @@ -40,7 +40,7 @@ jobs: run: nix-shell -p nix-info --run "nix-info -m" - name: Build all example kernels - run: nix build -L ./examples/kernels#ci-build + run: nix build -L ./examples/kernels#ci-build-cuda - name: Copy kernel artifacts run: cp -rL result/* . diff --git a/.github/workflows/build_kernel_rocm.yaml b/.github/workflows/build_kernel_rocm.yaml index 6d090477..4b6205b1 100644 --- a/.github/workflows/build_kernel_rocm.yaml +++ b/.github/workflows/build_kernel_rocm.yaml @@ -12,7 +12,7 @@ on: jobs: build: - name: Build kernel + name: Build kernels (ROCm) runs-on: group: aws-highmemory-32-plus-nix steps: @@ -33,8 +33,5 @@ jobs: run: nix-shell -p nix-info --run "nix-info -m" # For now we only test that there are no regressions in building ROCm # kernels. Also run tests once we have a ROCm runner. - - name: Build relu kernel - run: ( cd examples/kernels/relu && nix build .\#redistributable.torch211-rocm71-x86_64-linux -L ) - - - name: Build relu kernel (compiler flags) - run: ( cd examples/kernels/relu-compiler-flags && nix build .\#redistributable.torch211-rocm71-x86_64-linux ) + - name: Build all ROCm example kernels + run: nix build -L ./examples/kernels#ci-build-rocm diff --git a/examples/kernels/flake.nix b/examples/kernels/flake.nix index fa072d58..b26ba054 100644 --- a/examples/kernels/flake.nix +++ b/examples/kernels/flake.nix @@ -15,6 +15,7 @@ inherit (kernel-builder.inputs.nixpkgs) lib; cudaVersion = "cu126"; + rocmVersion = "rocm71"; torchVersion = "211"; tvmFfiVersion = "01"; @@ -136,6 +137,27 @@ } ]; + # ROCm kernels to build in CI. + ciRocmKernels = [ + { + name = "relu-invalid-capability"; + path = ./relu-invalid-capability; + drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${rocmVersion}-${sys}"}; + assertFail = true; + assertFailLogs = [ "empty set of architectures" ]; + } + { + name = "relu-kernel"; + path = ./relu; + drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${rocmVersion}-${sys}"}; + } + { + name = "relu-compiler-flags"; + path = ./relu-compiler-flags; + drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${rocmVersion}-${sys}"}; + } + ]; + mkKernelOutputs = { path, @@ -148,16 +170,21 @@ // lib.optionalAttrs (torchVersions != null) { inherit torchVersions; } ); - ciKernelOutputs = map ( - kernel: - kernel - // { - outputs = mkKernelOutputs { - inherit (kernel) path; - torchVersions = kernel.torchVersions or null; - }; - } - ) ciKernels; + mkKernelOutputs' = + kernels: + map ( + kernel: + kernel + // { + outputs = mkKernelOutputs { + inherit (kernel) path; + torchVersions = kernel.torchVersions or null; + }; + } + ) kernels; + + ciKernelOutputs = mkKernelOutputs' ciKernels; + ciRocmKernelOutputs = mkKernelOutputs' ciRocmKernels; in flake-utils.lib.eachSystem [ @@ -169,32 +196,39 @@ let pkgs = nixpkgs.legacyPackages.${system}; - resolvedKernels = map (kernel: { - inherit (kernel) name; - drv = - let - baseDrv = kernel.drv system kernel.outputs; - in - if kernel.assertFail or false then - pkgs.testers.testBuildFailure' { - drv = baseDrv; - expectedBuilderLogEntries = kernel.assertFailLogs or [ ]; - } - else - baseDrv; - }) ciKernelOutputs; - - ci-build = pkgs.linkFarm "ci-kernels" ( + resolveKernels = + kernelOutputsList: map (kernel: { inherit (kernel) name; - path = kernel.drv; - }) resolvedKernels - ); + drv = + let + baseDrv = kernel.drv system kernel.outputs; + in + if kernel.assertFail or false then + pkgs.testers.testBuildFailure' { + drv = baseDrv; + expectedBuilderLogEntries = kernel.assertFailLogs or [ ]; + } + else + baseDrv; + }) kernelOutputsList; + + mkCiBuild = + name: kernelOutputsList: + pkgs.linkFarm name ( + map (kernel: { + inherit (kernel) name; + path = kernel.drv; + }) (resolveKernels kernelOutputsList) + ); + + ci-build-cuda = mkCiBuild "ci-kernels-cuda" ciKernelOutputs; + ci-build-rocm = mkCiBuild "ci-kernels-rocm" ciRocmKernelOutputs; in { packages = { - inherit ci-build; - default = ci-build; + inherit ci-build-cuda ci-build-rocm; + default = ci-build-cuda; }; } );