kubeflow · abdullahpathan22 · Mar 27, 2026 · Mar 27, 2026 · Mar 27, 2026 · Mar 28, 2026
diff --git a/.github/workflows/model_registry_test.yaml b/.github/workflows/model_registry_test.yaml
@@ -54,75 +54,5 @@ jobs:
       run: |
         kustomize build applications/model-registry/upstream/options/ui/overlays/istio | kubectl apply -n kubeflow -f -
 
-    - name: Test KF Model Registry deployment
-      run: |
-        echo "Waiting for all Model Registry Pods to become ready..."
-        if ! kubectl wait --for=condition=available -n kubeflow deployment/model-registry-db --timeout=60s ; then
-            kubectl events -A
-            kubectl describe deployment/model-registry-db -n kubeflow
-            kubectl logs deployment/model-registry-db -n kubeflow
-            exit 1
-        fi
-        kubectl wait --for=condition=available -n kubeflow deployment/model-registry-deployment --timeout=60s
-    - name: Test KF Model Registry UI deployment
-      run: |
-        echo "Waiting for all Model Registry UI Pods to become ready..."
-        if ! kubectl wait --for=condition=available -n kubeflow deployment/model-registry-ui --timeout=60s ; then
-            kubectl events -A
-            kubectl describe deployment/model-registry-ui -n kubeflow
-            kubectl logs deployment/model-registry-ui -n kubeflow
-            exit 1
-        fi
-    - name: Dry-run KF Model Registry API directly
-      run: |
-        echo "Dry-run KF Model Registry API directly..."
-        nohup kubectl port-forward svc/model-registry-service -n kubeflow 8081:8080 &
-        while ! curl localhost:8081; do echo "waiting for port-forwarding 8081"; sleep 1; done; echo "port-forwarding 8181 ready"
-        curl -v -X 'GET' \
-          'http://localhost:8081/api/model_registry/v1alpha3/registered_models?pageSize=100&orderBy=ID&sortOrder=DESC' \
-          -H 'accept: application/json'
-
-    # for these steps below ensure same steps as kserve (ie: Istio with external authentication, cert-manager, knative) so to achieve same setup
-    - name: Port forward Istio gateway
-      run: |
-        INGRESS_GATEWAY_SERVICE=$(kubectl get svc --namespace istio-system --selector="app=istio-ingressgateway" --output jsonpath='{.items[0].metadata.name}')
-        nohup kubectl port-forward --namespace istio-system svc/${INGRESS_GATEWAY_SERVICE} 8080:80 &
-        while ! curl localhost:8080; do echo waiting for port-forwarding; sleep 1; done; echo port-forwarding ready
-
-    - name: Dry-run KF Model Registry REST API
-      run: |
-        echo "Dry-run KF Model Registry REST API..."
-        export KF_TOKEN="$(kubectl -n default create token default)"
-        curl -v -H "Authorization: Bearer "$KF_TOKEN http://localhost:8080/api/model_registry/v1alpha3/registered_models
-
-    - name: Dry-run KF Model Registry REST API UI
-      run: |
-        echo "Dry-run KF Model Registry REST API..."
-        export KF_PROFILE=kubeflow-user-example-com
-        export KF_TOKEN="$(kubectl -n $KF_PROFILE create token default-editor)"
-
-        STATUS_CODE=$(curl -v \
-            --silent --output /dev/stderr --write-out "%{http_code}" \
-            "localhost:8080/model-registry/api/v1/model_registry?namespace=${KF_PROFILE}" \
-            -H "Authorization: Bearer ${KF_TOKEN}")
-
-        if test $STATUS_CODE -ne 200; then
-            echo "Error, this call should be authorized to list model registries in namespace ${KF_PROFILE}."
-            exit 1
-        fi
-
-    - name: Dry-run KF Model Registry REST API UI with unauthorized SA Token
-      run: |
-        echo "Dry-run KF Model Registry REST API..."
-        export KF_PROFILE=kubeflow-user-example-com
-        export KF_TOKEN="$(kubectl -n default create token default)"
-
-        STATUS_CODE=$(curl -v \
-            --silent --output /dev/stderr --write-out "%{http_code}" \
-            "localhost:8080/model-registry/api/v1/model_registry?namespace=${KF_PROFILE}" \
-            -H "Authorization: Bearer ${KF_TOKEN}")
-
-        if test $STATUS_CODE -ne 403; then
-            echo "Error, this call should fail to list model registry resources in namespace ${KF_PROFILE}."
-            exit 1
-        fi
+    - name: Run Model Registry tests
+      run: ./tests/model_registry_test.sh
diff --git a/.github/workflows/observability-kustomize-build.yaml b/.github/workflows/observability-kustomize-build.yaml
@@ -0,0 +1,29 @@
+# Renders every observability kustomization on pull requests so that broken
+# manifests are caught before merge. The build steps only run `kustomize build`.
+name: observability-kustomize-build
+
+on:
+  pull_request:
+    paths:
+      - 'common/observability/**'
+      # Re-run when the workflow itself or the tooling script it calls changes.
+      - '.github/workflows/observability-kustomize-build.yaml'
+      - 'tests/install_KinD_create_KinD_cluster_install_kustomize.sh'
+
+# The job only reads the repository, so keep the GITHUB_TOKEN read-only.
+permissions:
+  contents: read
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v5
+      - name: Install KinD, kustomize, and kubectl
+        run: ./tests/install_KinD_create_KinD_cluster_install_kustomize.sh
+      - name: Build base
+        run: kustomize build common/observability/base
+      - name: Build overlay
+        run: kustomize build common/observability/overlays/kubeflow
+      - name: Build kepler component
+        run: kustomize build common/observability/components/kepler
diff --git a/README.md b/README.md
@@ -75,7 +75,7 @@ This repository periodically synchronizes all official Kubeflow components from
 | Volumes Web Application | applications/volumes-web-app/upstream | [v1.10.0](https://github.com/kubeflow/kubeflow/tree/v1.10.0/components/crud-web-apps/volumes/manifests) | 4m | 226Mi | 0GB |
 | Katib | applications/katib/upstream | [v0.19.0](https://github.com/kubeflow/katib/tree/v0.19.0/manifests/v1beta1) | 13m | 476Mi | 10GB |
 | KServe | applications/kserve/kserve | [v0.16.0](https://github.com/kserve/kserve/releases/tag/v0.16.0/install/v0.16.0) | 600m | 1200Mi | 0GB |
-| KServe Models Web Application | applications/kserve/models-web-app | [c71ee4309f0335159d9fdfd4559a538b5c782c92](https://github.com/kserve/models-web-app/tree/c71ee4309f0335159d9fdfd4559a538b5c782c92/manifests/kustomize) | 6m | 259Mi  | 0GB |
+| KServe Models Web Application | applications/kserve/models-web-app | [c71ee4309f0335159d9fdfd4559a538b5c782c92](https://github.com/kserve/models-web-app/tree/c71ee4309f0335159d9fdfd4559a538b5c782c92/manifests/kustomize) | 6m | 259Mi  | 0GB |
 | Kubeflow Pipelines | applications/pipeline/upstream | [2.16.0](https://github.com/kubeflow/pipelines/tree/2.16.0/manifests/kustomize) | 970m | 3552Mi | 35GB |
 | Kubeflow Model Registry | applications/model-registry/upstream | [v0.3.7](https://github.com/kubeflow/model-registry/tree/v0.3.7/manifests/kustomize) | 510m | 2112Mi | 20GB |
 | Spark Operator	|	applications/spark/spark-operator	|	[2.5.0](https://github.com/kubeflow/spark-operator/tree/v2.5.0) | 9m | 41Mi | 0GB |
@@ -84,6 +84,7 @@ This repository periodically synchronizes all official Kubeflow components from
 | Cert Manager | common/cert-manager | [1.19.4](https://github.com/cert-manager/cert-manager/releases/tag/v1.19.4) | 3m | 128Mi | 0GB |
 | Dex | common/dex | [2.45.0](https://github.com/dexidp/dex/releases/tag/v2.45.0) | 3m | 27Mi | 0GB |
 | OAuth2-Proxy | common/oauth2-proxy | [7.14.3](https://github.com/oauth2-proxy/oauth2-proxy/releases/tag/v7.14.3) | 3m | 27Mi | 0GB |
+| Observability | common/observability | [3426](https://github.com/kubeflow/manifests/issues/3426) | - | - | 0GB |
 | **Total** | | | **4380m** | **12341Mi** | **65GB** |
 
 
@@ -111,7 +112,7 @@ The `example` directory contains an example kustomization for the single command
 ### ARM64 / aarch64 note
 
 Kubeflow on ARM64/aarch64 may not be fully supported yet because some OCI images might not be available for `linux/arm64`.
-If you hit image pull errors such as “no matching manifest for linux/arm64”, please track/report details in kubeflow/manifests#2745 and take a look at the [Google Summer of Code project for Kubeflow on ARM64](https://www.kubeflow.org/events/upcoming-events/gsoc-2026/#project--end-to-end-arm64-support--validation-on-kubeflow).
+If you hit image pull errors such as "no matching manifest for linux/arm64", please track/report details in kubeflow/manifests#2745 and take a look at the [Google Summer of Code project for Kubeflow on ARM64](https://www.kubeflow.org/events/upcoming-events/gsoc-2026/#project--end-to-end-arm64-support--validation-on-kubeflow).
 
 ---
 **NOTE**
@@ -182,6 +183,24 @@ Install the Kubeflow namespace:
 kustomize build common/kubeflow-namespace/base | kubectl apply -f -
 ```
 
+#### Observability Stack (Optional)
+
+This component provides an optional monitoring stack for GPU metrics (NVIDIA/AMD) along with Grafana dashboards. It includes Prometheus and Grafana operators deployed in the `kubeflow-monitoring-system` namespace. Energy consumption metrics via Kepler are available as a separate opt-in component and are NOT installed by default — see the Kepler section below.
+
+Install the observability base component (GPU metrics, Prometheus, and Grafana, without Kepler):
+
+```sh
+./tests/observability_install.sh
+```
+
+To opt into Kepler for energy metrics:
+
+> **Note on container runtimes:** The Kepler component mounts the host's `/var/run` directory so that it can find the container runtime socket automatically; this works with containerd, CRI-O, and other standard runtimes.
+
+```sh
+kustomize build common/observability/components/kepler | kubectl apply --server-side -f -
+```
+
 #### Cert-manager
 
 Cert-manager is used by many Kubeflow components to provide certificates for admission webhooks.

diff --git a/common/observability/OWNERS b/common/observability/OWNERS
@@ -0,0 +1,7 @@
+approvers:
+  - juliusvonkohout
+  - kimwnasptd
+reviewers:
+  - juliusvonkohout
+  - kimwnasptd
+  - tarekabouzeid
diff --git a/common/observability/README.md b/common/observability/README.md
@@ -0,0 +1,71 @@
+## Overview
+This is an opt-in kustomize component that provides a complete monitoring foundation for Kubeflow clusters. The base component focuses on GPU workloads (NVIDIA DCGM + AMD ROCm) and installs Prometheus Operator, Grafana Operator, GPU ServiceMonitors, and three Grafana dashboards. Energy metrics via **Kepler** are available as a separate, optional sub-component and are not part of the default installation.
+
+> **Note:** All ServiceMonitor resources are created in the `kubeflow-monitoring-system` namespace (forced by the base kustomization). The `spec.namespaceSelector` field on each ServiceMonitor controls which target namespaces are scraped. If the target namespace (e.g. `gpu-operator`) does not exist, Prometheus will simply find no matching endpoints — no error is raised.
+
+## Prerequisites
+| Prerequisite | Required for | Notes |
+|---|---|---|
+| Kubernetes 1.27+ | Everything | |
+| kustomize v5+ | Installation | |
+| NVIDIA GPU Operator | NVIDIA ServiceMonitor | Runs in `gpu-operator` ns — ServiceMonitor scrapes it via `spec.namespaceSelector`; silent if absent |
+| AMD GPU Operator | AMD ServiceMonitor | Runs in `kube-amd-gpu` ns — ServiceMonitor scrapes it via `spec.namespaceSelector`; silent if absent |
+| kube-state-metrics | GPU Namespace Usage + Availability dashboards | **Without it 2/3 dashboards render blank with no error** — install via kube-prometheus-stack or standalone |
+
+### Architecture Support
+| Component | x86_64 | ARM64 |
+|---|:---:|:---:|
+| Core Stack (Prometheus, Grafana) | ✅ | ✅ |
+| NVIDIA DCGM Exporter | ✅ | ⚠️ (Requires specific image) |
+| AMD GPU Exporter | ✅ | ❌ |
+| Kepler | ✅ | ✅ |
+
+## Installation
+```sh
+# Main stack (Prometheus + Grafana + GPU ServiceMonitors + dashboards)
+kustomize build common/observability/overlays/kubeflow | kubectl apply --server-side -f -
+
+# Or via script
+./tests/observability_install.sh
+
+# Kepler energy metrics (OPTIONAL — separate step)
+# Note: Kepler requires privileged access. See section below.
+kustomize build common/observability/components/kepler | kubectl apply --server-side -f -
+```
+> `--server-side` is required — CRD bundles exceed client-side annotation size limits.
+
+## What gets installed
+| Resource | Namespace | Purpose |
+|---|---|---|
+| Prometheus Operator | kubeflow-monitoring-system | Manages Prometheus CR |
+| Prometheus CR | kubeflow-monitoring-system | Scrapes all ServiceMonitors across all namespaces |
+| Grafana Operator | kubeflow-monitoring-system | Manages Grafana, GrafanaDatasource, GrafanaDashboard CRs |
+| Grafana CR | kubeflow-monitoring-system | Grafana instance |
+| GrafanaDatasource | kubeflow-monitoring-system | Prometheus datasource, uid: prometheus |
+| NVIDIA DCGM ServiceMonitor | kubeflow-monitoring-system | Scrapes DCGM exporter in `gpu-operator` ns via `spec.namespaceSelector` |
+| AMD ROCm ServiceMonitor | kubeflow-monitoring-system | Scrapes device-metrics-exporter in `kube-amd-gpu` ns via `spec.namespaceSelector` |
+| 3x GrafanaDashboard CRs | kubeflow-monitoring-system | GPU dashboards |
+| Kepler DaemonSet (opt-in) | kepler | Per-pod energy/power draw metrics |
+| Kepler ServiceMonitor (opt-in) | kubeflow-monitoring-system | Scrapes Kepler in the `kepler` ns via `spec.namespaceSelector` |
+
+## Dashboards
+| Dashboard | What it shows | Dependencies |
+|---|---|---|
+| GPU Cluster Usage | Cluster-wide GPU utilization, memory, count per node | DCGM or ROCm metrics |
+| GPU Namespace Usage | Per-namespace GPU allocation and utilization | kube-state-metrics + DCGM |
+| GPU Availability & Allocation | Allocation ratios, pending GPU sessions | kube-state-metrics |
+
+## Accessing Grafana
+```sh
+kubectl port-forward svc/grafana-service -n kubeflow-monitoring-system 3000:3000
+```
+Open http://localhost:3000 — default credentials are `admin` / `admin` (managed via `grafana-admin-credentials` Secret).
+
+> **Security warning:** These default credentials are provided for ease of initial access and must be rotated immediately for production use by updating the `grafana-admin-credentials` Secret or via the Grafana UI.
+
+## Kepler (opt-in)
+Kepler deploys to its own `kepler` namespace (Pod Security Standards level: privileged) so that `kubeflow-monitoring-system` can keep its restricted Pod Security Standards (PSS) posture. Kepler requires `privileged: true` to access `/proc`, `/sys`, and the container runtime socket for energy metrics.
+
+## Reference
+- CERN architecture: https://architecture.cncf.io/architectures/cern-scientific-computing/
+- Issue: https://github.com/kubeflow/manifests/issues/3426
diff --git a/common/observability/base/dashboards/gpu-availability-allocation-dashboard.yaml b/common/observability/base/dashboards/gpu-availability-allocation-dashboard.yaml
@@ -0,0 +1,48 @@
+# GrafanaDashboard "GPU Availability & Allocation": one stat panel counting
+# Pending pods that request NVIDIA GPUs, applied to Grafana instances
+# labelled dashboards=grafana.
+# Query notes: kube_pod_status_phase is a 0/1 gauge per phase, hence "== 1";
+# kube-state-metrics reports nvidia.com/gpu as resource="nvidia_com_gpu";
+# "max by (pod, namespace)" keeps multi-container pods from breaking the join.
+apiVersion: grafana.integreatly.org/v1beta1
+kind: GrafanaDashboard
+metadata:
+  name: gpu-availability-allocation-dashboard
+  namespace: kubeflow-monitoring-system
+  labels:
+    app.kubernetes.io/part-of: kubeflow-observability
+spec:
+  instanceSelector:
+    matchLabels:
+      dashboards: "grafana"
+  json: |
+    {
+      "__requires": [
+        { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "9.0.0" },
+        { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" }
+      ],
+      "templating": {
+        "list": [
+          {
+            "name": "datasource",
+            "type": "datasource",
+            "query": "prometheus",
+            "label": "Datasource",
+            "hide": 0
+          }
+        ]
+      },
+      "title": "GPU Availability & Allocation",
+      "description": "Requires kube-state-metrics for GPU allocation and availability metrics.",
+      "panels": [
+        {
+          "title": "Pending GPU workloads",
+          "description": "Requires kube-state-metrics being scraped by the Prometheus datasource.",
+          "type": "stat",
+          "datasource": { "type": "prometheus", "uid": "prometheus" },
+          "targets": [
+            { "expr": "count((kube_pod_status_phase{phase=\"Pending\"} == 1) * on(pod, namespace) group_left() max by (pod, namespace) (kube_pod_container_resource_requests{resource=\"nvidia_com_gpu\"}))", "legendFormat": "Pending NVIDIA GPU Pods" }
+          ]
+        }
+      ]
+    }
diff --git a/common/observability/base/dashboards/gpu-cluster-usage-dashboard.yaml b/common/observability/base/dashboards/gpu-cluster-usage-dashboard.yaml
@@ -0,0 +1,53 @@
+# GrafanaDashboard "GPU Cluster Usage": cluster-wide GPU utilization and
+# per-node GPU memory (used vs. total), queried from the Prometheus
+# datasource with uid "prometheus".
+# Applied to Grafana instances labelled dashboards=grafana.
+apiVersion: grafana.integreatly.org/v1beta1
+kind: GrafanaDashboard
+metadata:
+  name: gpu-cluster-usage-dashboard
+  namespace: kubeflow-monitoring-system
+  labels:
+    app.kubernetes.io/part-of: kubeflow-observability
+spec:
+  instanceSelector:
+    matchLabels:
+      dashboards: grafana
+  json: |
+    {
+      "__requires": [
+        { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "9.0.0" },
+        { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" }
+      ],
+      "templating": {
+        "list": [
+          {
+            "name": "datasource",
+            "type": "datasource",
+            "query": "prometheus",
+            "label": "Datasource",
+            "hide": 0
+          }
+        ]
+      },
+      "title": "GPU Cluster Usage",
+      "panels": [
+        {
+          "title": "Cluster-wide GPU Utilization %",
+          "type": "timeseries",
+          "datasource": { "type": "prometheus", "uid": "prometheus" },
+          "targets": [
+            { "expr": "avg(DCGM_FI_DEV_GPU_UTIL) or avg(amd_gpu_utilization)", "legendFormat": "GPU Utilization" }
+          ]
+        },
+        {
+          "title": "GPU Memory Used vs Total per Node",
+          "type": "timeseries",
+          "datasource": { "type": "prometheus", "uid": "prometheus" },
+          "targets": [
+            { "expr": "sum(DCGM_FI_DEV_FB_USED) by (node)", "legendFormat": "{{node}} Used" },
+            { "expr": "sum(DCGM_FI_DEV_FB_FREE + DCGM_FI_DEV_FB_USED) by (node)", "legendFormat": "{{node}} Total" }
+          ]
+        }
+      ]
+    }
diff --git a/common/observability/base/dashboards/gpu-namespace-usage-dashboard.yaml b/common/observability/base/dashboards/gpu-namespace-usage-dashboard.yaml
@@ -0,0 +1,53 @@
+# GrafanaDashboard "GPU Namespace Usage": GPU utilization summed per workload
+# namespace by joining DCGM_FI_DEV_GPU_UTIL with kube_pod_info, so it needs
+# kube-state-metrics. Applied to Grafana instances labelled dashboards=grafana.
+apiVersion: grafana.integreatly.org/v1beta1
+kind: GrafanaDashboard
+metadata:
+  name: gpu-namespace-usage-dashboard
+  namespace: kubeflow-monitoring-system
+  labels:
+    app.kubernetes.io/part-of: kubeflow-observability
+spec:
+  instanceSelector:
+    matchLabels:
+      dashboards: "grafana"
+  json: |
+    {
+      "__requires": [
+        { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "9.0.0" },
+        { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" }
+      ],
+      "templating": {
+        "list": [
+          {
+            "name": "datasource",
+            "type": "datasource",
+            "query": "prometheus",
+            "label": "Datasource",
+            "hide": 0
+          },
+          {
+            "name": "namespace",
+            "type": "query",
+            "datasource": { "uid": "prometheus" },
+            "query": "label_values(kube_pod_info, namespace)",
+            "label": "Namespace",
+            "hide": 0
+          }
+        ]
+      },
+      "title": "GPU Namespace Usage",
+      "description": "Requires kube-state-metrics for namespace-level GPU attribution via kube_pod_info.",
+      "panels": [
+        {
+          "title": "Per-Workload-Namespace GPU Utilization over time",
+          "description": "Requires kube-state-metrics with kube_pod_info being scraped by the Prometheus datasource.",
+          "type": "timeseries",
+          "datasource": { "type": "prometheus", "uid": "prometheus" },
+          "targets": [
+            { "expr": "sum by (namespace) (DCGM_FI_DEV_GPU_UTIL * on (pod) group_left(namespace) kube_pod_info)", "legendFormat": "{{namespace}}" }
+          ]
+        }
+      ]
+    }
diff --git a/common/observability/base/dashboards/kustomization.yaml b/common/observability/base/dashboards/kustomization.yaml
@@ -0,0 +1,7 @@
+# Bundles the three GPU GrafanaDashboard manifests in this directory.
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+- gpu-cluster-usage-dashboard.yaml
+- gpu-namespace-usage-dashboard.yaml
+- gpu-availability-allocation-dashboard.yaml
diff --git a/common/observability/base/kepler/clusterrole.yaml b/common/observability/base/kepler/clusterrole.yaml
@@ -0,0 +1,23 @@
+# Read-only cluster-wide access for Kepler: get/list/watch on nodes, pods and
+# namespaces, plus get on endpoints (all in the core API group).
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: kubeflow-kepler-role
+rules:
+- apiGroups:
+  - ""
+  resources:
+  - nodes
+  - pods
+  - namespaces
+  verbs:
+  - get
+  - list
+  - watch
+- apiGroups:
+  - ""
+  resources:
+  - endpoints
+  verbs:
+  - get