kubeflow
diff --git a/‎README.md‎
Lines changed: 19 additions & 3 deletions b/‎README.md‎
Lines changed: 19 additions & 3 deletions
diff --git a/‎common/observability/base/dashboards/gpu-availability-allocation-dashboard-cr.yaml‎
Lines changed: 12 additions & 0 deletions b/‎common/observability/base/dashboards/gpu-availability-allocation-dashboard-cr.yaml‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎common/observability/base/dashboards/gpu-availability-allocation-dashboard.yaml‎
Lines changed: 22 additions & 0 deletions b/‎common/observability/base/dashboards/gpu-availability-allocation-dashboard.yaml‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎common/observability/base/dashboards/gpu-cluster-usage-dashboard-cr.yaml‎
Lines changed: 12 additions & 0 deletions b/‎common/observability/base/dashboards/gpu-cluster-usage-dashboard-cr.yaml‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎common/observability/base/dashboards/gpu-cluster-usage-dashboard.yaml‎
Lines changed: 30 additions & 0 deletions b/‎common/observability/base/dashboards/gpu-cluster-usage-dashboard.yaml‎
Lines changed: 30 additions & 0 deletions
diff --git a/‎common/observability/base/dashboards/gpu-namespace-usage-dashboard-cr.yaml‎
Lines changed: 12 additions & 0 deletions b/‎common/observability/base/dashboards/gpu-namespace-usage-dashboard-cr.yaml‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎common/observability/base/dashboards/gpu-namespace-usage-dashboard.yaml‎
Lines changed: 22 additions & 0 deletions b/‎common/observability/base/dashboards/gpu-namespace-usage-dashboard.yaml‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎common/observability/base/dashboards/kustomization.yaml‎
Lines changed: 9 additions & 0 deletions b/‎common/observability/base/dashboards/kustomization.yaml‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎common/observability/base/kepler/clusterrole.yaml‎
Lines changed: 11 additions & 0 deletions b/‎common/observability/base/kepler/clusterrole.yaml‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎common/observability/base/kepler/clusterrolebinding.yaml‎
Lines changed: 12 additions & 0 deletions b/‎common/observability/base/kepler/clusterrolebinding.yaml‎
Lines changed: 12 additions & 0 deletions
@@ -75,7 +75,7 @@ This repository periodically synchronizes all official Kubeflow components from
 | Volumes Web Application | applications/volumes-web-app/upstream | [v1.10.0](https://github.com/kubeflow/kubeflow/tree/v1.10.0/components/crud-web-apps/volumes/manifests) | 4m | 226Mi | 0GB |
 | Katib | applications/katib/upstream | [v0.19.0](https://github.com/kubeflow/katib/tree/v0.19.0/manifests/v1beta1) | 13m | 476Mi | 10GB |
 | KServe | applications/kserve/kserve | [v0.16.0](https://github.com/kserve/kserve/releases/tag/v0.16.0/install/v0.16.0) | 600m | 1200Mi | 0GB |
-| KServe Models Web Application | applications/kserve/models-web-app | [c71ee4309f0335159d9fdfd4559a538b5c782c92](https://github.com/kserve/models-web-app/tree/c71ee4309f0335159d9fdfd4559a538b5c782c92/manifests/kustomize) | 6m | 259Mi  | 0GB |
+| KServe Models Web Application | applications/kserve/models-web-app | [v0.16.1](https://github.com/kserve/models-web-app/tree/v0.16.1/config) | 6m | 259Mi  | 0GB |
 | Kubeflow Pipelines | applications/pipeline/upstream | [2.16.0](https://github.com/kubeflow/pipelines/tree/2.16.0/manifests/kustomize) | 970m | 3552Mi | 35GB |
 | Kubeflow Model Registry | applications/model-registry/upstream | [v0.3.7](https://github.com/kubeflow/model-registry/tree/v0.3.7/manifests/kustomize) | 510m | 2112Mi | 20GB |
 | Spark Operator	|	applications/spark/spark-operator	|	[2.5.0](https://github.com/kubeflow/spark-operator/tree/v2.5.0) | 9m | 41Mi | 0GB |
@@ -84,6 +84,7 @@ This repository periodically synchronizes all official Kubeflow components from
 | Cert Manager | common/cert-manager | [1.19.4](https://github.com/cert-manager/cert-manager/releases/tag/v1.19.4) | 3m | 128Mi | 0GB |
 | Dex | common/dex | [2.45.0](https://github.com/dexidp/dex/releases/tag/v2.45.0) | 3m | 27Mi | 0GB |
 | OAuth2-Proxy | common/oauth2-proxy | [7.14.3](https://github.com/oauth2-proxy/oauth2-proxy/releases/tag/v7.14.3) | 3m | 27Mi | 0GB |
+| Observability | common/observability | [3426](https://github.com/kubeflow/manifests/issues/3426) | - | - | 0GB |
 | **Total** | | | **4380m** | **12341Mi** | **65GB** |
 
 
@@ -111,7 +112,7 @@ The `example` directory contains an example kustomization for the single command
 ### ARM64 / aarch64 note
 
 Kubeflow on ARM64/aarch64 may not be fully supported yet because some OCI images might not be available for `linux/arm64`.
-If you hit image pull errors such as “no matching manifest for linux/arm64”, please track/report details in kubeflow/manifests#2745 and take a look at the [Google Summer of Code project for Kubeflow on ARM64](https://www.kubeflow.org/events/upcoming-events/gsoc-2026/#project--end-to-end-arm64-support--validation-on-kubeflow).
+If you hit image pull errors such as "no matching manifest for linux/arm64", please track/report details in kubeflow/manifests#2745 and take a look at the [Google Summer of Code project for Kubeflow on ARM64](https://www.kubeflow.org/events/upcoming-events/gsoc-2026/#project--end-to-end-arm64-support--validation-on-kubeflow).
 
 ---
 **NOTE**
@@ -182,6 +183,22 @@ Install the Kubeflow namespace:
 kustomize build common/kubeflow-namespace/base | kubectl apply -f -
 ```
 
+#### Observability Stack (Optional)
+
+This optional component provides a monitoring stack for GPU metrics (NVIDIA/AMD) and energy consumption (Kepler), along with Grafana dashboards. It includes the Prometheus and Grafana operators and is deployed in the `kubeflow-monitoring-system` namespace.
+
+Install the observability base component:
+
+```sh
+./tests/observability_install.sh
+```
+
+To opt into Kepler for energy metrics:
+
+```sh
+kustomize build common/observability/components/kepler | kubectl apply -f -
+```
+
 #### Cert-manager
 
 Cert-manager is used by many Kubeflow components to provide certificates for admission webhooks.
@@ -448,7 +465,7 @@ kustomize build applications/tensorboard/tensorboard-controller/upstream/overlay
 
 ```sh
 ./tests/spark_install.sh
 ```
 
 #### User Namespaces
 
 
@@ -0,0 +1,12 @@
+apiVersion: grafana.integreatly.org/v1beta1
+kind: GrafanaDashboard  # Registers the "GPU Availability & Allocation" dashboard from a ConfigMap.
+metadata:
+  namespace: kubeflow-monitoring-system
+  name: gpu-availability-allocation
+spec:
+  instanceSelector:  # Selects Grafana instances by label.
+    matchLabels:
+      dashboards: 'grafana'
+  configMapRef:  # ConfigMap entry that stores the dashboard JSON.
+    key: gpu-availability-allocation.json
+    name: gpu-availability-allocation-dashboard
@@ -0,0 +1,22 @@
+apiVersion: v1
+kind: ConfigMap  # Dashboard JSON loaded by the gpu-availability-allocation GrafanaDashboard.
+metadata:
+  name: gpu-availability-allocation-dashboard
+  namespace: kubeflow-monitoring-system
+  labels:
+    grafana_dashboard: "1"
+data:  # Query counts pods whose Pending gauge is 1 and that request GPUs (label value nvidia_com_gpu).
+  gpu-availability-allocation.json: |
+    {
+      "title": "GPU Availability & Allocation",
+      "panels": [
+        {
+          "title": "Pending GPU workloads",
+          "type": "stat",
+          "targets": [
+            { "expr": "count((kube_pod_status_phase{phase=\"Pending\"} == 1) and on(namespace, pod) kube_pod_container_resource_requests{resource=\"nvidia_com_gpu\"})", "legendFormat": "Pending NVIDIA GPU Pods" }
+          ]
+        }
+      ],
+      "datasource": { "uid": "prometheus" }
+    }
@@ -0,0 +1,12 @@
+apiVersion: grafana.integreatly.org/v1beta1
+kind: GrafanaDashboard  # Registers the "GPU Cluster Usage" dashboard from a ConfigMap.
+metadata:
+  namespace: kubeflow-monitoring-system
+  name: gpu-cluster-usage
+spec:
+  instanceSelector:  # Selects Grafana instances by label.
+    matchLabels:
+      dashboards: 'grafana'
+  configMapRef:  # ConfigMap entry that stores the dashboard JSON.
+    key: gpu-cluster-usage.json
+    name: gpu-cluster-usage-dashboard
@@ -0,0 +1,30 @@
+apiVersion: v1
+kind: ConfigMap  # Dashboard JSON loaded by the gpu-cluster-usage GrafanaDashboard.
+metadata:
+  name: gpu-cluster-usage-dashboard
+  namespace: kubeflow-monitoring-system
+  labels:
+    grafana_dashboard: "1"
+data:  # NOTE(review): `avg(..) or avg(..)` shows the AMD value only when no DCGM data exists; `by (node)` assumes a node label - confirm both.
+  gpu-cluster-usage.json: |
+    {
+      "title": "GPU Cluster Usage",
+      "panels": [
+        {
+          "title": "Cluster-wide GPU Utilization %",
+          "type": "timeseries",
+          "targets": [
+            { "expr": "avg(DCGM_FI_DEV_GPU_UTIL) or avg(amd_gpu_utilization)", "legendFormat": "GPU Utilization" }
+          ]
+        },
+        {
+          "title": "GPU Memory Used vs Total per Node",
+          "type": "timeseries",
+          "targets": [
+            { "expr": "sum(DCGM_FI_DEV_FB_USED) by (node)", "legendFormat": "{{node}} Used" },
+            { "expr": "sum(DCGM_FI_DEV_FB_FREE + DCGM_FI_DEV_FB_USED) by (node)", "legendFormat": "{{node}} Total" }
+          ]
+        }
+      ],
+      "datasource": { "uid": "prometheus" }
+    }
@@ -0,0 +1,12 @@
+apiVersion: grafana.integreatly.org/v1beta1
+kind: GrafanaDashboard  # Registers the "GPU Namespace Usage" dashboard from a ConfigMap.
+metadata:
+  namespace: kubeflow-monitoring-system
+  name: gpu-namespace-usage
+spec:
+  instanceSelector:  # Selects Grafana instances by label.
+    matchLabels:
+      dashboards: 'grafana'
+  configMapRef:  # ConfigMap entry that stores the dashboard JSON.
+    key: gpu-namespace-usage.json
+    name: gpu-namespace-usage-dashboard
@@ -0,0 +1,22 @@
+apiVersion: v1
+kind: ConfigMap  # Dashboard JSON loaded by the gpu-namespace-usage GrafanaDashboard.
+metadata:
+  name: gpu-namespace-usage-dashboard
+  namespace: kubeflow-monitoring-system
+  labels:
+    grafana_dashboard: "1"
+data:  # NOTE(review): the join matches on pod name only; same-named pods in different namespaces would collide - confirm.
+  gpu-namespace-usage.json: |
+    {
+      "title": "GPU Namespace Usage",
+      "panels": [
+        {
+          "title": "Per-Workload-Namespace GPU Utilization over time",
+          "type": "timeseries",
+          "targets": [
+            { "expr": "sum by (exported_namespace) (label_replace(DCGM_FI_DEV_GPU_UTIL * on(pod) group_left(namespace) kube_pod_info{}, \"exported_namespace\", \"$1\", \"namespace\", \"(.*)\"))", "legendFormat": "{{exported_namespace}}" }
+          ]
+        }
+      ],
+      "datasource": { "uid": "prometheus" }
+    }
@@ -0,0 +1,9 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization  # Bundles the GPU dashboards: three ConfigMaps and their GrafanaDashboard CRs.
+resources:
+  - gpu-cluster-usage-dashboard.yaml  # ConfigMaps holding the dashboard JSON
+  - gpu-namespace-usage-dashboard.yaml
+  - gpu-availability-allocation-dashboard.yaml
+  - gpu-cluster-usage-dashboard-cr.yaml  # GrafanaDashboard resources that reference them
+  - gpu-namespace-usage-dashboard-cr.yaml
+  - gpu-availability-allocation-dashboard-cr.yaml
@@ -0,0 +1,11 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole  # Read-only permissions; bound to the kepler-sa ServiceAccount by kepler-role-binding.
+metadata:
+  name: kepler-role
+rules:
+  - apiGroups: ['']  # '' denotes the core API group
+    resources: ['nodes', 'pods', 'namespaces']
+    verbs: ['get', 'list', 'watch']
+  - apiGroups: ['']
+    resources: ['endpoints']
+    verbs: ['get']  # endpoints: get only, no list/watch
@@ -0,0 +1,12 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding  # Grants kepler-role cluster-wide to the kepler-sa ServiceAccount.
+metadata:
+  name: kepler-role-binding
+subjects:
+  - kind: ServiceAccount
+    namespace: kubeflow-monitoring-system
+    name: kepler-sa
+roleRef:
+  kind: ClusterRole
+  name: kepler-role
+  apiGroup: rbac.authorization.k8s.io