diff --git a/.github/workflows/build_kernel.yaml b/.github/workflows/build_kernel.yaml index 18e4e7d7..9a17e059 100644 --- a/.github/workflows/build_kernel.yaml +++ b/.github/workflows/build_kernel.yaml @@ -40,7 +40,7 @@ jobs: run: nix-shell -p nix-info --run "nix-info -m" - name: Build all example kernels - run: nix build -L ./examples/kernels#ci-build + run: nix build -L ./examples/kernels#ci-build-cuda - name: Copy kernel artifacts run: cp -rL result/* . diff --git a/.github/workflows/build_kernel_rocm.yaml b/.github/workflows/build_kernel_rocm.yaml index 6d090477..4b6205b1 100644 --- a/.github/workflows/build_kernel_rocm.yaml +++ b/.github/workflows/build_kernel_rocm.yaml @@ -12,7 +12,7 @@ on: jobs: build: - name: Build kernel + name: Build kernels (ROCm) runs-on: group: aws-highmemory-32-plus-nix steps: @@ -33,8 +33,5 @@ jobs: run: nix-shell -p nix-info --run "nix-info -m" # For now we only test that there are no regressions in building ROCm # kernels. Also run tests once we have a ROCm runner. - - name: Build relu kernel - run: ( cd examples/kernels/relu && nix build .\#redistributable.torch211-rocm71-x86_64-linux -L ) - - - name: Build relu kernel (compiler flags) - run: ( cd examples/kernels/relu-compiler-flags && nix build .\#redistributable.torch211-rocm71-x86_64-linux ) + - name: Build all ROCm example kernels + run: nix build -L ./examples/kernels#ci-build-rocm diff --git a/examples/kernels/flake.nix b/examples/kernels/flake.nix index ac8b377b..b26ba054 100644 --- a/examples/kernels/flake.nix +++ b/examples/kernels/flake.nix @@ -15,6 +15,7 @@ inherit (kernel-builder.inputs.nixpkgs) lib; cudaVersion = "cu126"; + rocmVersion = "rocm71"; torchVersion = "211"; tvmFfiVersion = "01"; @@ -102,6 +103,13 @@ path = ./relu-compiler-flags; drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${cudaVersion}-${sys}"}; } + { + name = "relu-invalid-capability"; + path = ./relu-invalid-capability; + drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${cudaVersion}-${sys}"}; + assertFail = true; + assertFailLogs = [ "empty set of capabilities" ]; + } { # Check that we can build an arch dev shell. name = "relu-dev-shell"; @@ -129,6 +137,27 @@ } ]; + # ROCm kernels to build in CI. + ciRocmKernels = [ + { + name = "relu-invalid-capability"; + path = ./relu-invalid-capability; + drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${rocmVersion}-${sys}"}; + assertFail = true; + assertFailLogs = [ "empty set of architectures" ]; + } + { + name = "relu-kernel"; + path = ./relu; + drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${rocmVersion}-${sys}"}; + } + { + name = "relu-compiler-flags"; + path = ./relu-compiler-flags; + drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${rocmVersion}-${sys}"}; + } + ]; + mkKernelOutputs = { path, @@ -141,16 +170,21 @@ // lib.optionalAttrs (torchVersions != null) { inherit torchVersions; } ); - ciKernelOutputs = map ( - kernel: - kernel - // { - outputs = mkKernelOutputs { - inherit (kernel) path; - torchVersions = kernel.torchVersions or null; - }; - } - ) ciKernels; + mkKernelOutputs' = + kernels: + map ( + kernel: + kernel + // { + outputs = mkKernelOutputs { + inherit (kernel) path; + torchVersions = kernel.torchVersions or null; + }; + } + ) kernels; + + ciKernelOutputs = mkKernelOutputs' ciKernels; + ciRocmKernelOutputs = mkKernelOutputs' ciRocmKernels; in flake-utils.lib.eachSystem [ @@ -162,22 +196,39 @@ let pkgs = nixpkgs.legacyPackages.${system}; - resolvedKernels = map (kernel: { - inherit (kernel) name; - drv = kernel.drv system kernel.outputs; - }) ciKernelOutputs; - - ci-build = pkgs.linkFarm "ci-kernels" ( + resolveKernels = + kernelOutputsList: map (kernel: { inherit (kernel) name; - path = kernel.drv; - }) resolvedKernels - ); + drv = + let + baseDrv = kernel.drv system kernel.outputs; + in + if kernel.assertFail or false then + pkgs.testers.testBuildFailure' { + drv = baseDrv; + expectedBuilderLogEntries = kernel.assertFailLogs or [ ]; + } + else + baseDrv; + }) kernelOutputsList; + + mkCiBuild = + name: kernelOutputsList: + pkgs.linkFarm name ( + map (kernel: { + inherit (kernel) name; + path = kernel.drv; + }) (resolveKernels kernelOutputsList) + ); + + ci-build-cuda = mkCiBuild "ci-kernels-cuda" ciKernelOutputs; + ci-build-rocm = mkCiBuild "ci-kernels-rocm" ciRocmKernelOutputs; in { packages = { - inherit ci-build; - default = ci-build; + inherit ci-build-cuda ci-build-rocm; + default = ci-build-cuda; }; } ); diff --git a/examples/kernels/relu-invalid-capability/CARD.md b/examples/kernels/relu-invalid-capability/CARD.md new file mode 100644 index 00000000..c7c70715 --- /dev/null +++ b/examples/kernels/relu-invalid-capability/CARD.md @@ -0,0 +1,56 @@ +--- +library_name: kernels +{% if license %}license: {{ license }} +{% endif %}--- + +This is the repository card of {{ repo_id }} that has been pushed on the Hub. It was built to be used with the [`kernels` library](https://github.com/huggingface/kernels). This card was automatically generated. + +## How to use +{% if functions %} + +```python +# make sure `kernels` is installed: `pip install -U kernels` +from kernels import get_kernel + +kernel_module = get_kernel("{{ repo_id }}", version={{ version }}) +{{ functions[0] }} = kernel_module.{{ functions[0] }} + +{{ functions[0] }}(...) +``` +{% else %} + +Usage example not available. +{% endif %} + +## Available functions +{% if functions %} +{% for func in functions %} +- `{{ func }}` +{% endfor %} +{% else %} + +Function list not available. +{% endif %} +{% if layers %} + +## Available layers +{% for layer in layers %} +- `{{ layer }}` +{% endfor %} +{% endif %} + +## Benchmarks +{% if has_benchmark %} + +Benchmarking script is available for this kernel. Run `kernels benchmark {{ repo_id }} --version {{ version }}`. +{% else %} + +No benchmark available yet. +{% endif %} +{% if upstream %} + +## Source code + +Source code of this kernel originally comes from {{ upstream }} and it was repurposed for compatibility with `kernels`. +{% endif %} + diff --git a/examples/kernels/relu-invalid-capability/build.toml b/examples/kernels/relu-invalid-capability/build.toml new file mode 100644 index 00000000..9de4c0d1 --- /dev/null +++ b/examples/kernels/relu-invalid-capability/build.toml @@ -0,0 +1,32 @@ +[general] +name = "relu-invalid-capability" +version = 1 +license = "Apache-2.0" +backends = [ + "cpu", + "cuda", + "metal", + "rocm", + "xpu", +] + +[general.hub] +repo-id = "kernels-test/relu-invalid-capability" + +[torch] +src = [ + "torch-ext/torch_binding.cpp", + "torch-ext/torch_binding.h", +] + +[kernel.relu_rocm] +backend = "rocm" +depends = ["torch"] +rocm-archs = [ "gfx99999" ] +src = ["relu_cuda/relu.cu"] + +[kernel.relu] +backend = "cuda" +depends = ["torch"] +cuda-capabilities = [ "99999.0" ] +src = ["relu_cuda/relu.cu"] diff --git a/examples/kernels/relu-invalid-capability/flake.nix b/examples/kernels/relu-invalid-capability/flake.nix new file mode 100644 index 00000000..41e51eef --- /dev/null +++ b/examples/kernels/relu-invalid-capability/flake.nix @@ -0,0 +1,17 @@ +{ + description = "Flake for ReLU kernel"; + + inputs = { + kernel-builder.url = "path:../../.."; + }; + + outputs = + { + self, + kernel-builder, + }: + kernel-builder.lib.genKernelFlakeOutputs { + inherit self; + path = ./.; + }; +} diff --git a/examples/kernels/relu-invalid-capability/relu_cuda/relu.cu b/examples/kernels/relu-invalid-capability/relu_cuda/relu.cu new file mode 100644 index 00000000..50b49f91 --- /dev/null +++ b/examples/kernels/relu-invalid-capability/relu_cuda/relu.cu @@ -0,0 +1,47 @@ +#include +#include +#include + +#include + +__global__ void relu_kernel(float *__restrict__ out, + float const *__restrict__ input, const int d) { + const int64_t token_idx = blockIdx.x; + for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { + auto x = input[token_idx * d + idx]; + out[token_idx * d + idx] = x > 0.0f ? x : 0.0f; + } +} + +void relu(torch::Tensor &out, torch::Tensor const &input) { + TORCH_CHECK(input.device().is_cuda(), "input must be a CUDA tensor"); + TORCH_CHECK(input.is_contiguous(), "input must be contiguous"); + TORCH_CHECK(input.scalar_type() == at::ScalarType::Float && + input.scalar_type() == at::ScalarType::Float, + "relu_kernel only supports float32"); + + TORCH_CHECK(input.sizes() == out.sizes(), + "Tensors must have the same shape. Got input shape: ", + input.sizes(), " and output shape: ", out.sizes()); + + TORCH_CHECK(input.scalar_type() == out.scalar_type(), + "Tensors must have the same data type. Got input dtype: ", + input.scalar_type(), " and output dtype: ", out.scalar_type()); + + TORCH_CHECK(input.device() == out.device(), + "Tensors must be on the same device. Got input device: ", + input.device(), " and output device: ", out.device()); + + if (input.numel() == 0) { + return; + } + + int d = input.size(-1); + int64_t num_tokens = input.numel() / d; + dim3 grid(num_tokens); + dim3 block(std::min(d, 1024)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + relu_kernel<<>>(out.data_ptr(), + input.data_ptr(), d); +} diff --git a/examples/kernels/relu-invalid-capability/torch-ext/relu_invalid_capability/__init__.py b/examples/kernels/relu-invalid-capability/torch-ext/relu_invalid_capability/__init__.py new file mode 100644 index 00000000..a844aae8 --- /dev/null +++ b/examples/kernels/relu-invalid-capability/torch-ext/relu_invalid_capability/__init__.py @@ -0,0 +1,15 @@ +from typing import Optional + +import torch + +from ._ops import ops + + +def relu(x: torch.Tensor, out: Optional[torch.Tensor] = None) -> torch.Tensor: + if out is None: + out = torch.empty_like(x) + ops.relu(out, x) + return out + + +__all__ = ["relu"] diff --git a/examples/kernels/relu-invalid-capability/torch-ext/torch_binding.cpp b/examples/kernels/relu-invalid-capability/torch-ext/torch_binding.cpp new file mode 100644 index 00000000..1765d92d --- /dev/null +++ b/examples/kernels/relu-invalid-capability/torch-ext/torch_binding.cpp @@ -0,0 +1,19 @@ +#include + +#include "registration.h" +#include "torch_binding.h" + +TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { + ops.def("relu(Tensor! out, Tensor input) -> ()"); +#if defined(CPU_KERNEL) + ops.impl("relu", torch::kCPU, &relu); +#elif defined(CUDA_KERNEL) || defined(ROCM_KERNEL) + ops.impl("relu", torch::kCUDA, &relu); +#elif defined(METAL_KERNEL) + ops.impl("relu", torch::kMPS, relu); +#elif defined(XPU_KERNEL) + ops.impl("relu", torch::kXPU, &relu); +#endif +} + +REGISTER_EXTENSION(TORCH_EXTENSION_NAME) diff --git a/examples/kernels/relu-invalid-capability/torch-ext/torch_binding.h b/examples/kernels/relu-invalid-capability/torch-ext/torch_binding.h new file mode 100644 index 00000000..3bcf2904 --- /dev/null +++ b/examples/kernels/relu-invalid-capability/torch-ext/torch_binding.h @@ -0,0 +1,5 @@ +#pragma once + +#include + +void relu(torch::Tensor &out, torch::Tensor const &input); \ No newline at end of file diff --git a/kernel-builder/src/pyproject/templates/kernel.cmake b/kernel-builder/src/pyproject/templates/kernel.cmake index 4d4ffa64..be5a736c 100644 --- a/kernel-builder/src/pyproject/templates/kernel.cmake +++ b/kernel-builder/src/pyproject/templates/kernel.cmake @@ -34,6 +34,9 @@ function(cuda_kernel_component SRC_VAR) # Determine CUDA architectures if(KERNEL_CUDA_CAPABILITIES) cuda_archs_loose_intersection(_KERNEL_ARCHS "${KERNEL_CUDA_CAPABILITIES}" "${CUDA_ARCHS}") + if(NOT _KERNEL_ARCHS) + message(FATAL_ERROR "CUDA kernel: ${KERNEL_NAME}, empty set of capabilities after intersection (kernel: ${KERNEL_CUDA_CAPABILITIES}, supported: ${CUDA_ARCHS})") + endif() else() set(_KERNEL_ARCHS "${CUDA_KERNEL_ARCHS}") endif() @@ -115,6 +118,9 @@ function(hip_kernel_component SRC_VAR) # Determine ROCm architectures if(KERNEL_ROCM_ARCHS) hip_archs_loose_intersection(_KERNEL_ARCHS "${KERNEL_ROCM_ARCHS}" "${ROCM_ARCHS}") + if(NOT _KERNEL_ARCHS) + message(FATAL_ERROR "ROCm kernel: ${KERNEL_NAME}, empty set of architectures after intersection (kernel: ${KERNEL_ROCM_ARCHS}, supported: ${ROCM_ARCHS})") + endif() else() set(_KERNEL_ARCHS "${ROCM_ARCHS}") endif()