Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/build_kernel.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ jobs:
run: nix-shell -p nix-info --run "nix-info -m"

- name: Build all example kernels
run: nix build -L ./examples/kernels#ci-build
run: nix build -L ./examples/kernels#ci-build-cuda
- name: Copy kernel artifacts
run: cp -rL result/* .

Expand Down
9 changes: 3 additions & 6 deletions .github/workflows/build_kernel_rocm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ on:

jobs:
build:
name: Build kernel
name: Build kernels (ROCm)
runs-on:
group: aws-highmemory-32-plus-nix
steps:
Expand All @@ -33,8 +33,5 @@ jobs:
run: nix-shell -p nix-info --run "nix-info -m"
# For now we only test that there are no regressions in building ROCm
# kernels. Also run tests once we have a ROCm runner.
- name: Build relu kernel
run: ( cd examples/kernels/relu && nix build .\#redistributable.torch211-rocm71-x86_64-linux -L )

- name: Build relu kernel (compiler flags)
run: ( cd examples/kernels/relu-compiler-flags && nix build .\#redistributable.torch211-rocm71-x86_64-linux )
- name: Build all ROCm example kernels
run: nix build -L ./examples/kernels#ci-build-rocm
93 changes: 72 additions & 21 deletions examples/kernels/flake.nix
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
inherit (kernel-builder.inputs.nixpkgs) lib;

cudaVersion = "cu126";
rocmVersion = "rocm71";
torchVersion = "211";
tvmFfiVersion = "01";

Expand Down Expand Up @@ -102,6 +103,13 @@
path = ./relu-compiler-flags;
drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${cudaVersion}-${sys}"};
}
{
name = "relu-invalid-capability";
path = ./relu-invalid-capability;
drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${cudaVersion}-${sys}"};
assertFail = true;
assertFailLogs = [ "empty set of capabilities" ];
}
{
# Check that we can build an arch dev shell.
name = "relu-dev-shell";
Expand Down Expand Up @@ -129,6 +137,27 @@
}
];

# ROCm kernels to build in CI.
ciRocmKernels = [
{
name = "relu-invalid-capability";
path = ./relu-invalid-capability;
drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${rocmVersion}-${sys}"};
assertFail = true;
assertFailLogs = [ "empty set of architectures" ];
}
{
name = "relu-kernel";
path = ./relu;
drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${rocmVersion}-${sys}"};
}
{
name = "relu-compiler-flags";
path = ./relu-compiler-flags;
drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${rocmVersion}-${sys}"};
}
];

mkKernelOutputs =
{
path,
Expand All @@ -141,16 +170,21 @@
// lib.optionalAttrs (torchVersions != null) { inherit torchVersions; }
);

ciKernelOutputs = map (
kernel:
kernel
// {
outputs = mkKernelOutputs {
inherit (kernel) path;
torchVersions = kernel.torchVersions or null;
};
}
) ciKernels;
mkKernelOutputs' =
kernels:
map (
kernel:
kernel
// {
outputs = mkKernelOutputs {
inherit (kernel) path;
torchVersions = kernel.torchVersions or null;
};
}
) kernels;

ciKernelOutputs = mkKernelOutputs' ciKernels;
ciRocmKernelOutputs = mkKernelOutputs' ciRocmKernels;
in
flake-utils.lib.eachSystem
[
Expand All @@ -162,22 +196,39 @@
let
pkgs = nixpkgs.legacyPackages.${system};

resolvedKernels = map (kernel: {
inherit (kernel) name;
drv = kernel.drv system kernel.outputs;
}) ciKernelOutputs;

ci-build = pkgs.linkFarm "ci-kernels" (
resolveKernels =
kernelOutputsList:
map (kernel: {
inherit (kernel) name;
path = kernel.drv;
}) resolvedKernels
);
drv =
let
baseDrv = kernel.drv system kernel.outputs;
in
if kernel.assertFail or false then
pkgs.testers.testBuildFailure' {
drv = baseDrv;
expectedBuilderLogEntries = kernel.assertFailLogs or [ ];
}
else
baseDrv;
}) kernelOutputsList;

mkCiBuild =
name: kernelOutputsList:
pkgs.linkFarm name (
map (kernel: {
inherit (kernel) name;
path = kernel.drv;
}) (resolveKernels kernelOutputsList)
);

ci-build-cuda = mkCiBuild "ci-kernels-cuda" ciKernelOutputs;
ci-build-rocm = mkCiBuild "ci-kernels-rocm" ciRocmKernelOutputs;
in
{
packages = {
inherit ci-build;
default = ci-build;
inherit ci-build-cuda ci-build-rocm;
default = ci-build-cuda;
};
}
);
Expand Down
56 changes: 56 additions & 0 deletions examples/kernels/relu-invalid-capability/CARD.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
---
library_name: kernels
{% if license %}license: {{ license }}
{% endif %}---

This is the repository card of {{ repo_id }} that has been pushed on the Hub. It was built to be used with the [`kernels` library](https://github.com/huggingface/kernels). This card was automatically generated.

## How to use
{% if functions %}

```python
# make sure `kernels` is installed: `pip install -U kernels`
from kernels import get_kernel

kernel_module = get_kernel("{{ repo_id }}", version={{ version }})
{{ functions[0] }} = kernel_module.{{ functions[0] }}

{{ functions[0] }}(...)
```
{% else %}

Usage example not available.
{% endif %}

## Available functions
{% if functions %}
{% for func in functions %}
- `{{ func }}`
{% endfor %}
{% else %}

Function list not available.
{% endif %}
{% if layers %}

## Available layers
{% for layer in layers %}
- `{{ layer }}`
{% endfor %}
{% endif %}

## Benchmarks
{% if has_benchmark %}

Benchmarking script is available for this kernel. Run `kernels benchmark {{ repo_id }} --version {{ version }}`.
{% else %}

No benchmark available yet.
{% endif %}
{% if upstream %}

## Source code

Source code of this kernel originally comes from {{ upstream }} and it was repurposed for compatibility with `kernels`.
{% endif %}

32 changes: 32 additions & 0 deletions examples/kernels/relu-invalid-capability/build.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
[general]
name = "relu-invalid-capability"
version = 1
license = "Apache-2.0"
backends = [
"cpu",
"cuda",
"metal",
"rocm",
"xpu",
]

[general.hub]
repo-id = "kernels-test/relu-invalid-capability"

[torch]
src = [
"torch-ext/torch_binding.cpp",
"torch-ext/torch_binding.h",
]

[kernel.relu_rocm]
backend = "rocm"
depends = ["torch"]
rocm-archs = [ "gfx99999" ]
src = ["relu_cuda/relu.cu"]

[kernel.relu]
backend = "cuda"
depends = ["torch"]
cuda-capabilities = [ "99999.0" ]
src = ["relu_cuda/relu.cu"]
17 changes: 17 additions & 0 deletions examples/kernels/relu-invalid-capability/flake.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
description = "Flake for ReLU kernel";

inputs = {
kernel-builder.url = "path:../../..";
};

outputs =
{
self,
kernel-builder,
}:
kernel-builder.lib.genKernelFlakeOutputs {
inherit self;
path = ./.;
};
}
47 changes: 47 additions & 0 deletions examples/kernels/relu-invalid-capability/relu_cuda/relu.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <torch/all.h>

#include <cmath>

__global__ void relu_kernel(float *__restrict__ out,
float const *__restrict__ input, const int d) {
const int64_t token_idx = blockIdx.x;
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
auto x = input[token_idx * d + idx];
out[token_idx * d + idx] = x > 0.0f ? x : 0.0f;
}
}

void relu(torch::Tensor &out, torch::Tensor const &input) {
TORCH_CHECK(input.device().is_cuda(), "input must be a CUDA tensor");
TORCH_CHECK(input.is_contiguous(), "input must be contiguous");
TORCH_CHECK(input.scalar_type() == at::ScalarType::Float &&
input.scalar_type() == at::ScalarType::Float,
"relu_kernel only supports float32");

TORCH_CHECK(input.sizes() == out.sizes(),
"Tensors must have the same shape. Got input shape: ",
input.sizes(), " and output shape: ", out.sizes());

TORCH_CHECK(input.scalar_type() == out.scalar_type(),
"Tensors must have the same data type. Got input dtype: ",
input.scalar_type(), " and output dtype: ", out.scalar_type());

TORCH_CHECK(input.device() == out.device(),
"Tensors must be on the same device. Got input device: ",
input.device(), " and output device: ", out.device());

if (input.numel() == 0) {
return;
}

int d = input.size(-1);
int64_t num_tokens = input.numel() / d;
dim3 grid(num_tokens);
dim3 block(std::min(d, 1024));
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
relu_kernel<<<grid, block, 0, stream>>>(out.data_ptr<float>(),
input.data_ptr<float>(), d);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from typing import Optional

import torch

from ._ops import ops


def relu(x: torch.Tensor, out: Optional[torch.Tensor] = None) -> torch.Tensor:
if out is None:
out = torch.empty_like(x)
ops.relu(out, x)
return out


__all__ = ["relu"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#include <torch/library.h>

#include "registration.h"
#include "torch_binding.h"

TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.def("relu(Tensor! out, Tensor input) -> ()");
#if defined(CPU_KERNEL)
ops.impl("relu", torch::kCPU, &relu);
#elif defined(CUDA_KERNEL) || defined(ROCM_KERNEL)
ops.impl("relu", torch::kCUDA, &relu);
#elif defined(METAL_KERNEL)
ops.impl("relu", torch::kMPS, relu);
#elif defined(XPU_KERNEL)
ops.impl("relu", torch::kXPU, &relu);
#endif
}

REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#pragma once

#include <torch/torch.h>

void relu(torch::Tensor &out, torch::Tensor const &input);
6 changes: 6 additions & 0 deletions kernel-builder/src/pyproject/templates/kernel.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ function(cuda_kernel_component SRC_VAR)
# Determine CUDA architectures
if(KERNEL_CUDA_CAPABILITIES)
cuda_archs_loose_intersection(_KERNEL_ARCHS "${KERNEL_CUDA_CAPABILITIES}" "${CUDA_ARCHS}")
if(NOT _KERNEL_ARCHS)
message(FATAL_ERROR "CUDA kernel: ${KERNEL_NAME}, empty set of capabilities after intersection (kernel: ${KERNEL_CUDA_CAPABILITIES}, supported: ${CUDA_ARCHS})")
endif()
else()
set(_KERNEL_ARCHS "${CUDA_KERNEL_ARCHS}")
endif()
Expand Down Expand Up @@ -115,6 +118,9 @@ function(hip_kernel_component SRC_VAR)
# Determine ROCm architectures
if(KERNEL_ROCM_ARCHS)
hip_archs_loose_intersection(_KERNEL_ARCHS "${KERNEL_ROCM_ARCHS}" "${ROCM_ARCHS}")
if(NOT _KERNEL_ARCHS)
message(FATAL_ERROR "ROCm kernel: ${KERNEL_NAME}, empty set of architectures after intersection (kernel: ${KERNEL_ROCM_ARCHS}, supported: ${ROCM_ARCHS})")
endif()
else()
set(_KERNEL_ARCHS "${ROCM_ARCHS}")
endif()
Expand Down
Loading