Skip to content

Commit e6df6fd

Browse files
committed
metrics added
1 parent d5e79e2 commit e6df6fd

6 files changed

Lines changed: 243 additions & 21 deletions

File tree

helm/bundles/cortex-nova/alerts/nova.alerts.yaml

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -368,3 +368,100 @@ groups:
368368
to be scheduled. Affected commitment changes are rolled back and Limes
369369
will see them as failed. Consider investigating the scheduler performance
370370
or increasing the timeout configuration.
371+
372+
# Committed Resource Usage API Alerts
373+
- alert: CortexNovaCommittedResourceUsageHttpRequest400sTooHigh
374+
expr: rate(cortex_committed_resource_usage_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1
375+
for: 5m
376+
labels:
377+
context: committed-resource-api
378+
dashboard: cortex/cortex
379+
service: cortex
380+
severity: warning
381+
support_group: workload-management
382+
annotations:
383+
summary: "Committed Resource usage API HTTP 4xx errors too high"
384+
description: >
385+
The committed resource usage API (Limes LIQUID integration) is responding
386+
with HTTP 4xx errors. This may indicate invalid project IDs or malformed
387+
requests from Limes. Limes will typically retry these requests.
388+
389+
- alert: CortexNovaCommittedResourceUsageHttpRequest500sTooHigh
390+
expr: rate(cortex_committed_resource_usage_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1
391+
for: 5m
392+
labels:
393+
context: committed-resource-api
394+
dashboard: cortex/cortex
395+
service: cortex
396+
severity: warning
397+
support_group: workload-management
398+
annotations:
399+
summary: "Committed Resource usage API HTTP 5xx errors too high"
400+
description: >
401+
The committed resource usage API (Limes LIQUID integration) is responding
402+
with HTTP 5xx errors. This indicates internal problems fetching reservations
403+
or Nova server data. Limes may receive stale or incomplete usage data.
404+
405+
- alert: CortexNovaCommittedResourceUsageLatencyTooHigh
406+
expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_usage_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 5
407+
for: 5m
408+
labels:
409+
context: committed-resource-api
410+
dashboard: cortex/cortex
411+
service: cortex
412+
severity: warning
413+
support_group: workload-management
414+
annotations:
415+
summary: "Committed Resource usage API latency too high"
416+
description: >
417+
The committed resource usage API (Limes LIQUID integration) is experiencing
418+
high latency (p95 > 5s). This may indicate slow Nova API responses or
419+
database queries. Limes scrapes may time out, affecting quota reporting.
420+
421+
# Committed Resource Capacity API Alerts
422+
- alert: CortexNovaCommittedResourceCapacityHttpRequest400sTooHigh
423+
expr: rate(cortex_committed_resource_capacity_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1
424+
for: 5m
425+
labels:
426+
context: committed-resource-api
427+
dashboard: cortex/cortex
428+
service: cortex
429+
severity: warning
430+
support_group: workload-management
431+
annotations:
432+
summary: "Committed Resource capacity API HTTP 4xx errors too high"
433+
description: >
434+
The committed resource capacity API (Limes LIQUID integration) is responding
435+
with HTTP 4xx errors. This may indicate malformed requests from Limes.
436+
437+
- alert: CortexNovaCommittedResourceCapacityHttpRequest500sTooHigh
438+
expr: rate(cortex_committed_resource_capacity_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1
439+
for: 5m
440+
labels:
441+
context: committed-resource-api
442+
dashboard: cortex/cortex
443+
service: cortex
444+
severity: warning
445+
support_group: workload-management
446+
annotations:
447+
summary: "Committed Resource capacity API HTTP 5xx errors too high"
448+
description: >
449+
The committed resource capacity API (Limes LIQUID integration) is responding
450+
with HTTP 5xx errors. This indicates internal problems calculating cluster
451+
capacity. Limes may receive stale or incomplete capacity data.
452+
453+
- alert: CortexNovaCommittedResourceCapacityLatencyTooHigh
454+
expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_capacity_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 5
455+
for: 5m
456+
labels:
457+
context: committed-resource-api
458+
dashboard: cortex/cortex
459+
service: cortex
460+
severity: warning
461+
support_group: workload-management
462+
annotations:
463+
summary: "Committed Resource capacity API latency too high"
464+
description: >
465+
The committed resource capacity API (Limes LIQUID integration) is experiencing
466+
high latency (p95 > 5s). This may indicate slow database queries or knowledge
467+
CRD retrieval. Limes scrapes may time out, affecting capacity reporting.

internal/scheduling/reservations/commitments/api.go

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,12 @@ type UsageNovaClient interface {
2121

2222
// HTTPAPI implements Limes LIQUID commitment validation endpoints.
// It also holds one Prometheus monitor per instrumented endpoint.
2323
type HTTPAPI struct {
24-
client client.Client
25-
config Config
26-
novaClient UsageNovaClient
27-
monitor ChangeCommitmentsAPIMonitor
24+
client client.Client
25+
config Config
26+
novaClient UsageNovaClient
27+
monitor ChangeCommitmentsAPIMonitor
28+
// usageMonitor holds the request counter and duration histogram for report-usage requests.
usageMonitor ReportUsageAPIMonitor
29+
// capacityMonitor holds the request counter and duration histogram for report-capacity requests.
capacityMonitor ReportCapacityAPIMonitor
2830
// Mutex to serialize change-commitments requests
2931
changeMutex sync.Mutex
3032
}
@@ -35,17 +37,21 @@ func NewAPI(client client.Client) *HTTPAPI {
3537

3638
// NewAPIWithConfig builds an HTTPAPI from the given client, config and Nova
// client. It creates a fresh monitor for each of the change-commitments,
// report-usage and report-capacity endpoints; none of them is registered here.
func NewAPIWithConfig(client client.Client, config Config, novaClient UsageNovaClient) *HTTPAPI {
3739
return &HTTPAPI{
38-
client: client,
39-
config: config,
40-
novaClient: novaClient,
41-
monitor: NewChangeCommitmentsAPIMonitor(),
40+
client: client,
41+
config: config,
42+
novaClient: novaClient,
43+
monitor: NewChangeCommitmentsAPIMonitor(),
44+
usageMonitor: NewReportUsageAPIMonitor(),
45+
capacityMonitor: NewReportCapacityAPIMonitor(),
4246
}
4347
}
4448

4549
// Init registers the API's three monitors with the given Prometheus registry
// and attaches the commitments HTTP handlers to mux.
func (api *HTTPAPI) Init(mux *http.ServeMux, registry prometheus.Registerer) {
4650
registry.MustRegister(&api.monitor)
51+
registry.MustRegister(&api.usageMonitor)
52+
registry.MustRegister(&api.capacityMonitor)
4753
mux.HandleFunc("/v1/commitments/change-commitments", api.HandleChangeCommitments)
48-
// mux.HandleFunc("/v1/report-capacity", api.HandleReportCapacity)
54+
mux.HandleFunc("/v1/commitments/report-capacity", api.HandleReportCapacity)
4955
mux.HandleFunc("/v1/commitments/info", api.HandleInfo)
5056
mux.HandleFunc("/v1/commitments/projects/", api.HandleReportUsage) // matches /v1/commitments/projects/:project_id/report-usage
5157
}

internal/scheduling/reservations/commitments/api_report_capacity.go

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,21 +6,28 @@ package commitments
66
import (
77
"encoding/json"
88
"net/http"
9+
"strconv"
10+
"time"
911

1012
"github.com/sapcc/go-api-declarations/liquid"
1113
)
1214

13-
// handles POST /v1/report-capacity requests from Limes:
15+
// HandleReportCapacity handles POST /v1/commitments/report-capacity requests from Limes.
1416
// See: https://github.com/sapcc/go-api-declarations/blob/main/liquid/commitment.go
1517
// See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid
1618
// Reports available capacity across all flavor group resources. Note: the unit is specified in the Info API response as a multiple of the smallest memory resource unit within a flavor group.
1719
func (api *HTTPAPI) HandleReportCapacity(w http.ResponseWriter, r *http.Request) {
20+
startTime := time.Now()
21+
statusCode := http.StatusOK
22+
1823
ctx := WithNewGlobalRequestID(r.Context())
19-
logger := LoggerFromContext(ctx).WithValues("component", "api", "endpoint", "/v1/report-capacity")
24+
logger := LoggerFromContext(ctx).WithValues("component", "api", "endpoint", "/v1/commitments/report-capacity")
2025

2126
// Only accept POST method
2227
if r.Method != http.MethodPost {
23-
http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
28+
statusCode = http.StatusMethodNotAllowed
29+
http.Error(w, "Method not allowed", statusCode)
30+
api.recordCapacityMetrics(statusCode, startTime)
2431
return
2532
}
2633

@@ -38,18 +45,27 @@ func (api *HTTPAPI) HandleReportCapacity(w http.ResponseWriter, r *http.Request)
3845
report, err := calculator.CalculateCapacity(ctx)
3946
if err != nil {
4047
logger.Error(err, "failed to calculate capacity")
41-
http.Error(w, "Failed to calculate capacity: "+err.Error(),
42-
http.StatusInternalServerError)
48+
statusCode = http.StatusInternalServerError
49+
http.Error(w, "Failed to calculate capacity: "+err.Error(), statusCode)
50+
api.recordCapacityMetrics(statusCode, startTime)
4351
return
4452
}
4553

4654
logger.Info("calculated capacity report", "resourceCount", len(report.Resources))
4755

4856
// Return response
4957
w.Header().Set("Content-Type", "application/json")
50-
w.WriteHeader(http.StatusOK)
58+
w.WriteHeader(statusCode)
5159
if err := json.NewEncoder(w).Encode(report); err != nil {
5260
logger.Error(err, "failed to encode capacity report")
53-
return
5461
}
62+
api.recordCapacityMetrics(statusCode, startTime)
63+
}
64+
65+
// recordCapacityMetrics records Prometheus metrics for a report-capacity request.
// It increments the request counter and observes the time elapsed since
// startTime in the duration histogram, both labeled with the status code.
66+
func (api *HTTPAPI) recordCapacityMetrics(statusCode int, startTime time.Time) {
67+
// Elapsed time since startTime, in seconds.
duration := time.Since(startTime).Seconds()
68+
// The label value is the decimal string form of the status code.
statusCodeStr := strconv.Itoa(statusCode)
69+
api.capacityMonitor.requestCounter.WithLabelValues(statusCodeStr).Inc()
70+
api.capacityMonitor.requestDuration.WithLabelValues(statusCodeStr).Observe(duration)
5571
}
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
// Copyright SAP SE
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package commitments
5+
6+
import (
7+
"github.com/prometheus/client_golang/prometheus"
8+
)
9+
10+
// ReportCapacityAPIMonitor provides metrics for the committed resource (CR)
// report-capacity API.
11+
type ReportCapacityAPIMonitor struct {
12+
// requestCounter counts requests, labeled by status code.
requestCounter *prometheus.CounterVec
13+
// requestDuration observes request durations in seconds, labeled by status code.
requestDuration *prometheus.HistogramVec
14+
}
15+
16+
// NewReportCapacityAPIMonitor creates a new monitor with Prometheus metrics:
// a request counter and a request-duration histogram, each with a single
// "status_code" label. The metrics are only constructed here, not registered.
17+
func NewReportCapacityAPIMonitor() ReportCapacityAPIMonitor {
18+
return ReportCapacityAPIMonitor{
19+
requestCounter: prometheus.NewCounterVec(prometheus.CounterOpts{
20+
Name: "cortex_committed_resource_capacity_api_requests_total",
21+
Help: "Total number of committed resource capacity API requests by HTTP status code",
22+
}, []string{"status_code"}),
23+
requestDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{
24+
Name: "cortex_committed_resource_capacity_api_request_duration_seconds",
25+
Help: "Duration of committed resource capacity API requests in seconds by HTTP status code",
26+
// Bucket upper bounds in seconds, from 0.1 up to 10.
Buckets: []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10},
27+
}, []string{"status_code"}),
28+
}
29+
}
30+
31+
// Describe implements prometheus.Collector by sending the descriptors of the
// request counter and the duration histogram to ch.
32+
func (m *ReportCapacityAPIMonitor) Describe(ch chan<- *prometheus.Desc) {
33+
m.requestCounter.Describe(ch)
34+
m.requestDuration.Describe(ch)
35+
}
36+
37+
// Collect implements prometheus.Collector by sending the current metrics of
// the request counter and the duration histogram to ch.
38+
func (m *ReportCapacityAPIMonitor) Collect(ch chan<- prometheus.Metric) {
39+
m.requestCounter.Collect(ch)
40+
m.requestDuration.Collect(ch)
41+
}

internal/scheduling/reservations/commitments/api_report_usage.go

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"encoding/json"
88
"fmt"
99
"net/http"
10+
"strconv"
1011
"strings"
1112
"time"
1213

@@ -20,14 +21,19 @@ import (
2021
// This endpoint reports usage information for a specific project's committed resources,
2122
// including per-AZ usage, physical usage, and detailed VM subresources.
2223
func (api *HTTPAPI) HandleReportUsage(w http.ResponseWriter, r *http.Request) {
24+
startTime := time.Now()
25+
statusCode := http.StatusOK
26+
2327
requestID := r.Header.Get("X-Request-ID")
2428
if requestID == "" {
2529
requestID = fmt.Sprintf("req-%d", time.Now().UnixNano())
2630
}
2731
log := baseLog.WithValues("requestID", requestID, "endpoint", "report-usage")
2832

2933
if r.Method != http.MethodPost {
30-
http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
34+
statusCode = http.StatusMethodNotAllowed
35+
http.Error(w, "Method not allowed", statusCode)
36+
api.recordUsageMetrics(statusCode, startTime)
3137
return
3238
}
3339

@@ -36,15 +42,19 @@ func (api *HTTPAPI) HandleReportUsage(w http.ResponseWriter, r *http.Request) {
3642
projectID, err := extractProjectIDFromPath(r.URL.Path)
3743
if err != nil {
3844
log.Error(err, "failed to extract project ID from path")
39-
http.Error(w, "Invalid URL path: "+err.Error(), http.StatusBadRequest)
45+
statusCode = http.StatusBadRequest
46+
http.Error(w, "Invalid URL path: "+err.Error(), statusCode)
47+
api.recordUsageMetrics(statusCode, startTime)
4048
return
4149
}
4250

4351
// Parse request body
4452
var req liquid.ServiceUsageRequest
4553
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
4654
log.Error(err, "failed to decode request body")
47-
http.Error(w, "Invalid request body: "+err.Error(), http.StatusBadRequest)
55+
statusCode = http.StatusBadRequest
56+
http.Error(w, "Invalid request body: "+err.Error(), statusCode)
57+
api.recordUsageMetrics(statusCode, startTime)
4858
return
4959
}
5060

@@ -53,15 +63,26 @@ func (api *HTTPAPI) HandleReportUsage(w http.ResponseWriter, r *http.Request) {
5363
report, err := calculator.CalculateUsage(r.Context(), log, projectID, req.AllAZs)
5464
if err != nil {
5565
log.Error(err, "failed to calculate usage report", "projectID", projectID)
56-
http.Error(w, "Failed to generate usage report: "+err.Error(), http.StatusInternalServerError)
66+
statusCode = http.StatusInternalServerError
67+
http.Error(w, "Failed to generate usage report: "+err.Error(), statusCode)
68+
api.recordUsageMetrics(statusCode, startTime)
5769
return
5870
}
5971

6072
w.Header().Set("Content-Type", "application/json")
61-
w.WriteHeader(http.StatusOK)
73+
w.WriteHeader(statusCode)
6274
if err := json.NewEncoder(w).Encode(report); err != nil {
6375
log.Error(err, "failed to encode usage report")
6476
}
77+
api.recordUsageMetrics(statusCode, startTime)
78+
}
79+
80+
// recordUsageMetrics records Prometheus metrics for a report-usage request.
// It increments the request counter and observes the time elapsed since
// startTime in the duration histogram, both labeled with the status code.
81+
func (api *HTTPAPI) recordUsageMetrics(statusCode int, startTime time.Time) {
82+
// Elapsed time since startTime, in seconds.
duration := time.Since(startTime).Seconds()
83+
// The label value is the decimal string form of the status code.
statusCodeStr := strconv.Itoa(statusCode)
84+
api.usageMonitor.requestCounter.WithLabelValues(statusCodeStr).Inc()
85+
api.usageMonitor.requestDuration.WithLabelValues(statusCodeStr).Observe(duration)
6586
}
6687

6788
// extractProjectIDFromPath extracts the project UUID from the URL path.
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
// Copyright SAP SE
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package commitments
5+
6+
import (
7+
"github.com/prometheus/client_golang/prometheus"
8+
)
9+
10+
// ReportUsageAPIMonitor provides metrics for the committed resource (CR)
// report-usage API.
11+
type ReportUsageAPIMonitor struct {
12+
// requestCounter counts requests, labeled by status code.
requestCounter *prometheus.CounterVec
13+
// requestDuration observes request durations in seconds, labeled by status code.
requestDuration *prometheus.HistogramVec
14+
}
15+
16+
// NewReportUsageAPIMonitor creates a new monitor with Prometheus metrics:
// a request counter and a request-duration histogram, each with a single
// "status_code" label. The metrics are only constructed here, not registered.
17+
func NewReportUsageAPIMonitor() ReportUsageAPIMonitor {
18+
return ReportUsageAPIMonitor{
19+
requestCounter: prometheus.NewCounterVec(prometheus.CounterOpts{
20+
Name: "cortex_committed_resource_usage_api_requests_total",
21+
Help: "Total number of committed resource usage API requests by HTTP status code",
22+
}, []string{"status_code"}),
23+
requestDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{
24+
Name: "cortex_committed_resource_usage_api_request_duration_seconds",
25+
Help: "Duration of committed resource usage API requests in seconds by HTTP status code",
26+
// Bucket upper bounds in seconds, from 0.1 up to 10.
Buckets: []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10},
27+
}, []string{"status_code"}),
28+
}
29+
}
30+
31+
// Describe implements prometheus.Collector by sending the descriptors of the
// request counter and the duration histogram to ch.
32+
func (m *ReportUsageAPIMonitor) Describe(ch chan<- *prometheus.Desc) {
33+
m.requestCounter.Describe(ch)
34+
m.requestDuration.Describe(ch)
35+
}
36+
37+
// Collect implements prometheus.Collector by sending the current metrics of
// the request counter and the duration histogram to ch.
38+
func (m *ReportUsageAPIMonitor) Collect(ch chan<- prometheus.Metric) {
39+
m.requestCounter.Collect(ch)
40+
m.requestDuration.Collect(ch)
41+
}

0 commit comments

Comments
 (0)