diff --git a/.github/workflows/build_kernel.yaml b/.github/workflows/build_kernel.yaml
index 18e4e7d7..9a17e059 100644
--- a/.github/workflows/build_kernel.yaml
+++ b/.github/workflows/build_kernel.yaml
@@ -40,7 +40,7 @@ jobs:
         run: nix-shell -p nix-info --run "nix-info -m"
 
       - name: Build all example kernels
-        run: nix build -L ./examples/kernels#ci-build
+        run: nix build -L ./examples/kernels#ci-build-cuda
       - name: Copy kernel artifacts
         run: cp -rL result/* .
 
diff --git a/.github/workflows/build_kernel_rocm.yaml b/.github/workflows/build_kernel_rocm.yaml
index 6d090477..4b6205b1 100644
--- a/.github/workflows/build_kernel_rocm.yaml
+++ b/.github/workflows/build_kernel_rocm.yaml
@@ -12,7 +12,7 @@ on:
 
 jobs:
   build:
-    name: Build kernel
+    name: Build kernels (ROCm)
     runs-on:
       group: aws-highmemory-32-plus-nix
     steps:
@@ -33,8 +33,5 @@ jobs:
         run: nix-shell -p nix-info --run "nix-info -m"
       # For now we only test that there are no regressions in building ROCm
       # kernels. Also run tests once we have a ROCm runner.
-      - name: Build relu kernel
-        run: ( cd examples/kernels/relu && nix build .\#redistributable.torch211-rocm71-x86_64-linux -L )
-
-      - name: Build relu kernel (compiler flags)
-        run: ( cd examples/kernels/relu-compiler-flags && nix build .\#redistributable.torch211-rocm71-x86_64-linux )
+      - name: Build all ROCm example kernels
+        run: nix build -L ./examples/kernels#ci-build-rocm
diff --git a/examples/kernels/flake.nix b/examples/kernels/flake.nix
index ac8b377b..b26ba054 100644
--- a/examples/kernels/flake.nix
+++ b/examples/kernels/flake.nix
@@ -15,6 +15,7 @@
       inherit (kernel-builder.inputs.nixpkgs) lib;
 
       cudaVersion = "cu126";
+      rocmVersion = "rocm71";
       torchVersion = "211";
       tvmFfiVersion = "01";
 
@@ -102,6 +103,13 @@
           path = ./relu-compiler-flags;
           drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${cudaVersion}-${sys}"};
         }
+        {
+          name = "relu-invalid-capability";
+          path = ./relu-invalid-capability;
+          drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${cudaVersion}-${sys}"};
+          assertFail = true;
+          assertFailLogs = [ "empty set of capabilities" ];
+        }
         {
           # Check that we can build an arch dev shell.
           name = "relu-dev-shell";
@@ -129,6 +137,27 @@
         }
       ];
 
+      # ROCm kernels to build in CI.
+      ciRocmKernels = [
+        {
+          name = "relu-invalid-capability";
+          path = ./relu-invalid-capability;
+          drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${rocmVersion}-${sys}"};
+          assertFail = true;
+          assertFailLogs = [ "empty set of architectures" ];
+        }
+        {
+          name = "relu-kernel";
+          path = ./relu;
+          drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${rocmVersion}-${sys}"};
+        }
+        {
+          name = "relu-compiler-flags";
+          path = ./relu-compiler-flags;
+          drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-${rocmVersion}-${sys}"};
+        }
+      ];
+
       mkKernelOutputs =
         {
           path,
@@ -141,16 +170,21 @@
           // lib.optionalAttrs (torchVersions != null) { inherit torchVersions; }
         );
 
-      ciKernelOutputs = map (
-        kernel:
-        kernel
-        // {
-          outputs = mkKernelOutputs {
-            inherit (kernel) path;
-            torchVersions = kernel.torchVersions or null;
-          };
-        }
-      ) ciKernels;
+      mkKernelOutputs' =
+        kernels:
+        map (
+          kernel:
+          kernel
+          // {
+            outputs = mkKernelOutputs {
+              inherit (kernel) path;
+              torchVersions = kernel.torchVersions or null;
+            };
+          }
+        ) kernels;
+
+      ciKernelOutputs = mkKernelOutputs' ciKernels;
+      ciRocmKernelOutputs = mkKernelOutputs' ciRocmKernels;
     in
     flake-utils.lib.eachSystem
       [
@@ -162,22 +196,39 @@
         let
           pkgs = nixpkgs.legacyPackages.${system};
 
-          resolvedKernels = map (kernel: {
-            inherit (kernel) name;
-            drv = kernel.drv system kernel.outputs;
-          }) ciKernelOutputs;
-
-          ci-build = pkgs.linkFarm "ci-kernels" (
+          resolveKernels =
+            kernelOutputsList:
             map (kernel: {
               inherit (kernel) name;
-              path = kernel.drv;
-            }) resolvedKernels
-          );
+              drv =
+                let
+                  baseDrv = kernel.drv system kernel.outputs;
+                in
+                if kernel.assertFail or false then
+                  pkgs.testers.testBuildFailure' {
+                    drv = baseDrv;
+                    expectedBuilderLogEntries = kernel.assertFailLogs or [ ];
+                  }
+                else
+                  baseDrv;
+            }) kernelOutputsList;
+
+          mkCiBuild =
+            name: kernelOutputsList:
+            pkgs.linkFarm name (
+              map (kernel: {
+                inherit (kernel) name;
+                path = kernel.drv;
+              }) (resolveKernels kernelOutputsList)
+            );
+
+          ci-build-cuda = mkCiBuild "ci-kernels-cuda" ciKernelOutputs;
+          ci-build-rocm = mkCiBuild "ci-kernels-rocm" ciRocmKernelOutputs;
         in
         {
           packages = {
-            inherit ci-build;
-            default = ci-build;
+            inherit ci-build-cuda ci-build-rocm;
+            default = ci-build-cuda;
           };
         }
       );
diff --git a/examples/kernels/relu-invalid-capability/CARD.md b/examples/kernels/relu-invalid-capability/CARD.md
new file mode 100644
index 00000000..c7c70715
--- /dev/null
+++ b/examples/kernels/relu-invalid-capability/CARD.md
@@ -0,0 +1,56 @@
+---
+library_name: kernels
+{% if license %}license: {{ license }}
+{% endif %}---
+
+This is the repository card of {{ repo_id }} that has been pushed on the Hub. It was built to be used with the [`kernels` library](https://github.com/huggingface/kernels). This card was automatically generated.
+
+## How to use
+{% if functions %}
+
+```python
+# make sure `kernels` is installed: `pip install -U kernels`
+from kernels import get_kernel
+
+kernel_module = get_kernel("{{ repo_id }}", version={{ version }})
+{{ functions[0] }} = kernel_module.{{ functions[0] }}
+
+{{ functions[0] }}(...)
+```
+{% else %}
+
+Usage example not available.
+{% endif %}
+
+## Available functions
+{% if functions %}
+{% for func in functions %}
+- `{{ func }}`
+{% endfor %}
+{% else %}
+
+Function list not available.
+{% endif %}
+{% if layers %}
+
+## Available layers
+{% for layer in layers %}
+- `{{ layer }}`
+{% endfor %}
+{% endif %}
+
+## Benchmarks
+{% if has_benchmark %}
+
+Benchmarking script is available for this kernel. Run `kernels benchmark {{ repo_id }} --version {{ version }}`.
+{% else %}
+
+No benchmark available yet.
+{% endif %}
+{% if upstream %}
+
+## Source code
+
+Source code of this kernel originally comes from {{ upstream }} and it was repurposed for compatibility with `kernels`.
+{% endif %}
+
diff --git a/examples/kernels/relu-invalid-capability/build.toml b/examples/kernels/relu-invalid-capability/build.toml
new file mode 100644
index 00000000..9de4c0d1
--- /dev/null
+++ b/examples/kernels/relu-invalid-capability/build.toml
@@ -0,0 +1,32 @@
+[general]
+name = "relu-invalid-capability"
+version = 1
+license = "Apache-2.0"
+backends = [
+    "cpu",
+    "cuda",
+    "metal",
+    "rocm",
+    "xpu",
+]
+
+[general.hub]
+repo-id = "kernels-test/relu-invalid-capability"
+
+[torch]
+src = [
+    "torch-ext/torch_binding.cpp",
+    "torch-ext/torch_binding.h",
+]
+
+[kernel.relu_rocm]
+backend = "rocm"
+depends = ["torch"]
+rocm-archs = [ "gfx99999" ]
+src = ["relu_cuda/relu.cu"]
+
+[kernel.relu]
+backend = "cuda"
+depends = ["torch"]
+cuda-capabilities = [ "99999.0" ]
+src = ["relu_cuda/relu.cu"]
diff --git a/examples/kernels/relu-invalid-capability/flake.nix b/examples/kernels/relu-invalid-capability/flake.nix
new file mode 100644
index 00000000..41e51eef
--- /dev/null
+++ b/examples/kernels/relu-invalid-capability/flake.nix
@@ -0,0 +1,17 @@
+{
+  description = "Flake for ReLU kernel";
+
+  inputs = {
+    kernel-builder.url = "path:../../..";
+  };
+
+  outputs =
+    {
+      self,
+      kernel-builder,
+    }:
+    kernel-builder.lib.genKernelFlakeOutputs {
+      inherit self;
+      path = ./.;
+    };
+}
diff --git a/examples/kernels/relu-invalid-capability/relu_cuda/relu.cu b/examples/kernels/relu-invalid-capability/relu_cuda/relu.cu
new file mode 100644
index 00000000..50b49f91
--- /dev/null
+++ b/examples/kernels/relu-invalid-capability/relu_cuda/relu.cu
@@ -0,0 +1,47 @@
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/all.h>
+
+#include <cmath>
+
+__global__ void relu_kernel(float *__restrict__ out,
+                            float const *__restrict__ input, const int d) {
+  const int64_t token_idx = blockIdx.x;
+  for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+    auto x = input[token_idx * d + idx];
+    out[token_idx * d + idx] = x > 0.0f ? x : 0.0f;
+  }
+}
+
+void relu(torch::Tensor &out, torch::Tensor const &input) {
+  TORCH_CHECK(input.device().is_cuda(), "input must be a CUDA tensor");
+  TORCH_CHECK(input.is_contiguous(), "input must be contiguous");
+  TORCH_CHECK(input.scalar_type() == at::ScalarType::Float &&
+                  input.scalar_type() == at::ScalarType::Float,
+              "relu_kernel only supports float32");
+
+  TORCH_CHECK(input.sizes() == out.sizes(),
+              "Tensors must have the same shape. Got input shape: ",
+              input.sizes(), " and output shape: ", out.sizes());
+
+  TORCH_CHECK(input.scalar_type() == out.scalar_type(),
+              "Tensors must have the same data type. Got input dtype: ",
+              input.scalar_type(), " and output dtype: ", out.scalar_type());
+
+  TORCH_CHECK(input.device() == out.device(),
+              "Tensors must be on the same device. Got input device: ",
+              input.device(), " and output device: ", out.device());
+
+  if (input.numel() == 0) {
+    return;
+  }
+
+  int d = input.size(-1);
+  int64_t num_tokens = input.numel() / d;
+  dim3 grid(num_tokens);
+  dim3 block(std::min(d, 1024));
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  relu_kernel<<<grid, block, 0, stream>>>(out.data_ptr<float>(),
+                                          input.data_ptr<float>(), d);
+}
diff --git a/examples/kernels/relu-invalid-capability/torch-ext/relu_invalid_capability/__init__.py b/examples/kernels/relu-invalid-capability/torch-ext/relu_invalid_capability/__init__.py
new file mode 100644
index 00000000..a844aae8
--- /dev/null
+++ b/examples/kernels/relu-invalid-capability/torch-ext/relu_invalid_capability/__init__.py
@@ -0,0 +1,15 @@
+from typing import Optional
+
+import torch
+
+from ._ops import ops
+
+
+def relu(x: torch.Tensor, out: Optional[torch.Tensor] = None) -> torch.Tensor:
+    if out is None:
+        out = torch.empty_like(x)
+    ops.relu(out, x)
+    return out
+
+
+__all__ = ["relu"]
diff --git a/examples/kernels/relu-invalid-capability/torch-ext/torch_binding.cpp b/examples/kernels/relu-invalid-capability/torch-ext/torch_binding.cpp
new file mode 100644
index 00000000..1765d92d
--- /dev/null
+++ b/examples/kernels/relu-invalid-capability/torch-ext/torch_binding.cpp
@@ -0,0 +1,19 @@
+#include <torch/library.h>
+
+#include "registration.h"
+#include "torch_binding.h"
+
+TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
+  ops.def("relu(Tensor! out, Tensor input) -> ()");
+#if defined(CPU_KERNEL)
+  ops.impl("relu", torch::kCPU, &relu);
+#elif defined(CUDA_KERNEL) || defined(ROCM_KERNEL)
+  ops.impl("relu", torch::kCUDA, &relu);
+#elif defined(METAL_KERNEL)
+  ops.impl("relu", torch::kMPS, relu);
+#elif defined(XPU_KERNEL)
+  ops.impl("relu", torch::kXPU, &relu);
+#endif
+}
+
+REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
diff --git a/examples/kernels/relu-invalid-capability/torch-ext/torch_binding.h b/examples/kernels/relu-invalid-capability/torch-ext/torch_binding.h
new file mode 100644
index 00000000..3bcf2904
--- /dev/null
+++ b/examples/kernels/relu-invalid-capability/torch-ext/torch_binding.h
@@ -0,0 +1,5 @@
+#pragma once
+
+#include <torch/torch.h>
+
+void relu(torch::Tensor &out, torch::Tensor const &input);
\ No newline at end of file
diff --git a/kernel-builder/src/pyproject/templates/kernel.cmake b/kernel-builder/src/pyproject/templates/kernel.cmake
index 4d4ffa64..be5a736c 100644
--- a/kernel-builder/src/pyproject/templates/kernel.cmake
+++ b/kernel-builder/src/pyproject/templates/kernel.cmake
@@ -34,6 +34,9 @@ function(cuda_kernel_component SRC_VAR)
     # Determine CUDA architectures
     if(KERNEL_CUDA_CAPABILITIES)
         cuda_archs_loose_intersection(_KERNEL_ARCHS "${KERNEL_CUDA_CAPABILITIES}" "${CUDA_ARCHS}")
+        if(NOT _KERNEL_ARCHS)
+            message(FATAL_ERROR "CUDA kernel: ${KERNEL_NAME}, empty set of capabilities after intersection (kernel: ${KERNEL_CUDA_CAPABILITIES}, supported: ${CUDA_ARCHS})")
+        endif()
     else()
         set(_KERNEL_ARCHS "${CUDA_KERNEL_ARCHS}")
     endif()
@@ -115,6 +118,9 @@ function(hip_kernel_component SRC_VAR)
     # Determine ROCm architectures
     if(KERNEL_ROCM_ARCHS)
         hip_archs_loose_intersection(_KERNEL_ARCHS "${KERNEL_ROCM_ARCHS}" "${ROCM_ARCHS}")
+        if(NOT _KERNEL_ARCHS)
+            message(FATAL_ERROR "ROCm kernel: ${KERNEL_NAME}, empty set of architectures after intersection (kernel: ${KERNEL_ROCM_ARCHS}, supported: ${ROCM_ARCHS})")
+        endif()
     else()
         set(_KERNEL_ARCHS "${ROCM_ARCHS}")
     endif()