metrics added

mblos · mblos · commit e6df6fda16e8 · 2026-03-24T10:27:05.000+01:00
diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml
@@ -368,3 +368,100 @@ groups:
         to be scheduled. Affected commitment changes are rolled back and Limes
         will see them as failed. Consider investigating the scheduler performance
         or increasing the timeout configuration.
+
+  # Committed Resource Usage API Alerts
+  - alert: CortexNovaCommittedResourceUsageHttpRequest400sTooHigh
+    expr: rate(cortex_committed_resource_usage_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1
+    for: 5m
+    labels:
+      context: committed-resource-api
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+    annotations:
+      summary: "Committed Resource usage API HTTP 400 errors too high"
+      description: >
+        The committed resource usage API (Limes LIQUID integration) is responding
+        with HTTP 4xx errors. This may indicate invalid project IDs or malformed
+        requests from Limes. Limes will typically retry these requests.
+
+  - alert: CortexNovaCommittedResourceUsageHttpRequest500sTooHigh
+    expr: rate(cortex_committed_resource_usage_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1
+    for: 5m
+    labels:
+      context: committed-resource-api
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+    annotations:
+      summary: "Committed Resource usage API HTTP 500 errors too high"
+      description: >
+        The committed resource usage API (Limes LIQUID integration) is responding
+        with HTTP 5xx errors. This indicates internal problems fetching reservations
+        or Nova server data. Limes may receive stale or incomplete usage data.
+
+  - alert: CortexNovaCommittedResourceUsageLatencyTooHigh
+    expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_usage_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 5
+    for: 5m
+    labels:
+      context: committed-resource-api
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+    annotations:
+      summary: "Committed Resource usage API latency too high"
+      description: >
+        The committed resource usage API (Limes LIQUID integration) is experiencing
+        high latency (p95 > 5s). This may indicate slow Nova API responses or
+        database queries. Limes scrapes may time out, affecting quota reporting.
+
+  # Committed Resource Capacity API Alerts
+  - alert: CortexNovaCommittedResourceCapacityHttpRequest400sTooHigh
+    expr: rate(cortex_committed_resource_capacity_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1
+    for: 5m
+    labels:
+      context: committed-resource-api
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+    annotations:
+      summary: "Committed Resource capacity API HTTP 400 errors too high"
+      description: >
+        The committed resource capacity API (Limes LIQUID integration) is responding
+        with HTTP 4xx errors. This may indicate malformed requests from Limes.
+
+  - alert: CortexNovaCommittedResourceCapacityHttpRequest500sTooHigh
+    expr: rate(cortex_committed_resource_capacity_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1
+    for: 5m
+    labels:
+      context: committed-resource-api
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+    annotations:
+      summary: "Committed Resource capacity API HTTP 500 errors too high"
+      description: >
+        The committed resource capacity API (Limes LIQUID integration) is responding
+        with HTTP 5xx errors. This indicates internal problems calculating cluster
+        capacity. Limes may receive stale or incomplete capacity data.
+
+  - alert: CortexNovaCommittedResourceCapacityLatencyTooHigh
+    expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_capacity_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 5
+    for: 5m
+    labels:
+      context: committed-resource-api
+      dashboard: cortex/cortex
+      service: cortex
+      severity: warning
+      support_group: workload-management
+    annotations:
+      summary: "Committed Resource capacity API latency too high"
+      description: >
+        The committed resource capacity API (Limes LIQUID integration) is experiencing
+        high latency (p95 > 5s). This may indicate slow database queries or knowledge
+        CRD retrieval. Limes scrapes may time out, affecting capacity reporting.
diff --git a/internal/scheduling/reservations/commitments/api.go b/internal/scheduling/reservations/commitments/api.go
@@ -21,10 +21,12 @@ type UsageNovaClient interface {
 
 // HTTPAPI implements Limes LIQUID commitment validation endpoints.
 type HTTPAPI struct {
-	client     client.Client
-	config     Config
-	novaClient UsageNovaClient
-	monitor    ChangeCommitmentsAPIMonitor
+	client          client.Client
+	config          Config
+	novaClient      UsageNovaClient
+	monitor         ChangeCommitmentsAPIMonitor
+	usageMonitor    ReportUsageAPIMonitor
+	capacityMonitor ReportCapacityAPIMonitor
 	// Mutex to serialize change-commitments requests
 	changeMutex sync.Mutex
 }
@@ -35,17 +37,21 @@ func NewAPI(client client.Client) *HTTPAPI {
 
 func NewAPIWithConfig(client client.Client, config Config, novaClient UsageNovaClient) *HTTPAPI {
 	return &HTTPAPI{
-		client:     client,
-		config:     config,
-		novaClient: novaClient,
-		monitor:    NewChangeCommitmentsAPIMonitor(),
+		client:          client,
+		config:          config,
+		novaClient:      novaClient,
+		monitor:         NewChangeCommitmentsAPIMonitor(),
+		usageMonitor:    NewReportUsageAPIMonitor(),
+		capacityMonitor: NewReportCapacityAPIMonitor(),
 	}
 }
 
 func (api *HTTPAPI) Init(mux *http.ServeMux, registry prometheus.Registerer) {
 	registry.MustRegister(&api.monitor)
+	registry.MustRegister(&api.usageMonitor)
+	registry.MustRegister(&api.capacityMonitor)
 	mux.HandleFunc("/v1/commitments/change-commitments", api.HandleChangeCommitments)
-	// mux.HandleFunc("/v1/report-capacity", api.HandleReportCapacity)
+	mux.HandleFunc("/v1/commitments/report-capacity", api.HandleReportCapacity)
 	mux.HandleFunc("/v1/commitments/info", api.HandleInfo)
 	mux.HandleFunc("/v1/commitments/projects/", api.HandleReportUsage) // matches /v1/commitments/projects/:project_id/report-usage
 }
diff --git a/internal/scheduling/reservations/commitments/api_report_capacity.go b/internal/scheduling/reservations/commitments/api_report_capacity.go
@@ -6,21 +6,28 @@ package commitments
 import (
 	"encoding/json"
 	"net/http"
+	"strconv"
+	"time"
 
 	"github.com/sapcc/go-api-declarations/liquid"
 )
 
-// handles POST /v1/report-capacity requests from Limes:
+// HandleReportCapacity handles POST /v1/commitments/report-capacity requests from Limes:
 // See: https://github.com/sapcc/go-api-declarations/blob/main/liquid/commitment.go
 // See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid
 // Reports available capacity across all flavor group resources. Note, unit is specified in the Info API response with multiple of the smallest memory resource unit within a flavor group.
 func (api *HTTPAPI) HandleReportCapacity(w http.ResponseWriter, r *http.Request) {
+	startTime := time.Now()
+	statusCode := http.StatusOK
+
 	ctx := WithNewGlobalRequestID(r.Context())
-	logger := LoggerFromContext(ctx).WithValues("component", "api", "endpoint", "/v1/report-capacity")
+	logger := LoggerFromContext(ctx).WithValues("component", "api", "endpoint", "/v1/commitments/report-capacity")
 
 	// Only accept POST method
 	if r.Method != http.MethodPost {
-		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+		statusCode = http.StatusMethodNotAllowed
+		http.Error(w, "Method not allowed", statusCode)
+		api.recordCapacityMetrics(statusCode, startTime)
 		return
 	}
 
@@ -38,18 +45,27 @@ func (api *HTTPAPI) HandleReportCapacity(w http.ResponseWriter, r *http.Request)
 	report, err := calculator.CalculateCapacity(ctx)
 	if err != nil {
 		logger.Error(err, "failed to calculate capacity")
-		http.Error(w, "Failed to calculate capacity: "+err.Error(),
-			http.StatusInternalServerError)
+		statusCode = http.StatusInternalServerError
+		http.Error(w, "Failed to calculate capacity: "+err.Error(), statusCode)
+		api.recordCapacityMetrics(statusCode, startTime)
 		return
 	}
 
 	logger.Info("calculated capacity report", "resourceCount", len(report.Resources))
 
 	// Return response
 	w.Header().Set("Content-Type", "application/json")
-	w.WriteHeader(http.StatusOK)
+	w.WriteHeader(statusCode)
 	if err := json.NewEncoder(w).Encode(report); err != nil {
 		logger.Error(err, "failed to encode capacity report")
-		return
 	}
+	api.recordCapacityMetrics(statusCode, startTime)
+}
+
+// recordCapacityMetrics records Prometheus metrics for a report-capacity request.
+func (api *HTTPAPI) recordCapacityMetrics(statusCode int, startTime time.Time) {
+	duration := time.Since(startTime).Seconds()
+	statusCodeStr := strconv.Itoa(statusCode)
+	api.capacityMonitor.requestCounter.WithLabelValues(statusCodeStr).Inc()
+	api.capacityMonitor.requestDuration.WithLabelValues(statusCodeStr).Observe(duration)
 }
diff --git a/internal/scheduling/reservations/commitments/api_report_capacity_monitor.go b/internal/scheduling/reservations/commitments/api_report_capacity_monitor.go
@@ -0,0 +1,41 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package commitments
+
+import (
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+// ReportCapacityAPIMonitor provides Prometheus metrics for the committed resource (CR) report-capacity API.
+type ReportCapacityAPIMonitor struct {
+	requestCounter  *prometheus.CounterVec
+	requestDuration *prometheus.HistogramVec
+}
+
+// NewReportCapacityAPIMonitor creates a new monitor with Prometheus metrics.
+func NewReportCapacityAPIMonitor() ReportCapacityAPIMonitor {
+	return ReportCapacityAPIMonitor{
+		requestCounter: prometheus.NewCounterVec(prometheus.CounterOpts{
+			Name: "cortex_committed_resource_capacity_api_requests_total",
+			Help: "Total number of committed resource capacity API requests by HTTP status code",
+		}, []string{"status_code"}),
+		requestDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{
+			Name:    "cortex_committed_resource_capacity_api_request_duration_seconds",
+			Help:    "Duration of committed resource capacity API requests in seconds by HTTP status code",
+			Buckets: []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10},
+		}, []string{"status_code"}),
+	}
+}
+
+// Describe implements prometheus.Collector.
+func (m *ReportCapacityAPIMonitor) Describe(ch chan<- *prometheus.Desc) {
+	m.requestCounter.Describe(ch)
+	m.requestDuration.Describe(ch)
+}
+
+// Collect implements prometheus.Collector.
+func (m *ReportCapacityAPIMonitor) Collect(ch chan<- prometheus.Metric) {
+	m.requestCounter.Collect(ch)
+	m.requestDuration.Collect(ch)
+}
diff --git a/internal/scheduling/reservations/commitments/api_report_usage.go b/internal/scheduling/reservations/commitments/api_report_usage.go
@@ -7,6 +7,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"net/http"
+	"strconv"
 	"strings"
 	"time"
 
@@ -20,14 +21,19 @@ import (
 // This endpoint reports usage information for a specific project's committed resources,
 // including per-AZ usage, physical usage, and detailed VM subresources.
 func (api *HTTPAPI) HandleReportUsage(w http.ResponseWriter, r *http.Request) {
+	startTime := time.Now()
+	statusCode := http.StatusOK
+
 	requestID := r.Header.Get("X-Request-ID")
 	if requestID == "" {
 		requestID = fmt.Sprintf("req-%d", time.Now().UnixNano())
 	}
 	log := baseLog.WithValues("requestID", requestID, "endpoint", "report-usage")
 
 	if r.Method != http.MethodPost {
-		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
+		statusCode = http.StatusMethodNotAllowed
+		http.Error(w, "Method not allowed", statusCode)
+		api.recordUsageMetrics(statusCode, startTime)
 		return
 	}
 
@@ -36,15 +42,19 @@ func (api *HTTPAPI) HandleReportUsage(w http.ResponseWriter, r *http.Request) {
 	projectID, err := extractProjectIDFromPath(r.URL.Path)
 	if err != nil {
 		log.Error(err, "failed to extract project ID from path")
-		http.Error(w, "Invalid URL path: "+err.Error(), http.StatusBadRequest)
+		statusCode = http.StatusBadRequest
+		http.Error(w, "Invalid URL path: "+err.Error(), statusCode)
+		api.recordUsageMetrics(statusCode, startTime)
 		return
 	}
 
 	// Parse request body
 	var req liquid.ServiceUsageRequest
 	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
 		log.Error(err, "failed to decode request body")
-		http.Error(w, "Invalid request body: "+err.Error(), http.StatusBadRequest)
+		statusCode = http.StatusBadRequest
+		http.Error(w, "Invalid request body: "+err.Error(), statusCode)
+		api.recordUsageMetrics(statusCode, startTime)
 		return
 	}
 
@@ -53,15 +63,26 @@ func (api *HTTPAPI) HandleReportUsage(w http.ResponseWriter, r *http.Request) {
 	report, err := calculator.CalculateUsage(r.Context(), log, projectID, req.AllAZs)
 	if err != nil {
 		log.Error(err, "failed to calculate usage report", "projectID", projectID)
-		http.Error(w, "Failed to generate usage report: "+err.Error(), http.StatusInternalServerError)
+		statusCode = http.StatusInternalServerError
+		http.Error(w, "Failed to generate usage report: "+err.Error(), statusCode)
+		api.recordUsageMetrics(statusCode, startTime)
 		return
 	}
 
 	w.Header().Set("Content-Type", "application/json")
-	w.WriteHeader(http.StatusOK)
+	w.WriteHeader(statusCode)
 	if err := json.NewEncoder(w).Encode(report); err != nil {
 		log.Error(err, "failed to encode usage report")
 	}
+	api.recordUsageMetrics(statusCode, startTime)
+}
+
+// recordUsageMetrics records Prometheus metrics for a report-usage request.
+func (api *HTTPAPI) recordUsageMetrics(statusCode int, startTime time.Time) {
+	duration := time.Since(startTime).Seconds()
+	statusCodeStr := strconv.Itoa(statusCode)
+	api.usageMonitor.requestCounter.WithLabelValues(statusCodeStr).Inc()
+	api.usageMonitor.requestDuration.WithLabelValues(statusCodeStr).Observe(duration)
 }
 
 // extractProjectIDFromPath extracts the project UUID from the URL path.
diff --git a/internal/scheduling/reservations/commitments/api_report_usage_monitor.go b/internal/scheduling/reservations/commitments/api_report_usage_monitor.go
@@ -0,0 +1,41 @@
+// Copyright SAP SE
+// SPDX-License-Identifier: Apache-2.0
+
+package commitments
+
+import (
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+// ReportUsageAPIMonitor provides Prometheus metrics for the committed resource (CR) report-usage API.
+type ReportUsageAPIMonitor struct {
+	requestCounter  *prometheus.CounterVec
+	requestDuration *prometheus.HistogramVec
+}
+
+// NewReportUsageAPIMonitor creates a new monitor with Prometheus metrics.
+func NewReportUsageAPIMonitor() ReportUsageAPIMonitor {
+	return ReportUsageAPIMonitor{
+		requestCounter: prometheus.NewCounterVec(prometheus.CounterOpts{
+			Name: "cortex_committed_resource_usage_api_requests_total",
+			Help: "Total number of committed resource usage API requests by HTTP status code",
+		}, []string{"status_code"}),
+		requestDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{
+			Name:    "cortex_committed_resource_usage_api_request_duration_seconds",
+			Help:    "Duration of committed resource usage API requests in seconds by HTTP status code",
+			Buckets: []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10},
+		}, []string{"status_code"}),
+	}
+}
+
+// Describe implements prometheus.Collector.
+func (m *ReportUsageAPIMonitor) Describe(ch chan<- *prometheus.Desc) {
+	m.requestCounter.Describe(ch)
+	m.requestDuration.Describe(ch)
+}
+
+// Collect implements prometheus.Collector.
+func (m *ReportUsageAPIMonitor) Collect(ch chan<- prometheus.Metric) {
+	m.requestCounter.Collect(ch)
+	m.requestDuration.Collect(ch)
+}