# nova.alerts.yaml
groups:
  - name: cortex-nova-alerts
    rules:
      - alert: CortexNovaSchedulingDown
        expr: |
          up{pod=~"cortex-nova-scheduling-.*"} != 1 or
          absent(up{pod=~"cortex-nova-scheduling-.*"})
        for: 5m
        labels:
          context: liveness
          dashboard: cortex/cortex
          service: cortex
          severity: critical
          support_group: workload-management
          playbook: docs/support/playbook/cortex/down
        annotations:
          summary: "Cortex Scheduling for Nova is down"
          description: >
            The Cortex scheduling service is down. Scheduling requests from
            Nova will not be served. This is non-critical for VMware virtual
            machines, but it blocks KVM virtual machines from being scheduled.
            It is therefore recommended to investigate and resolve the issue
            immediately.
      - alert: CortexNovaKnowledgeDown
        expr: |
          up{pod=~"cortex-nova-knowledge-.*"} != 1 or
          absent(up{pod=~"cortex-nova-knowledge-.*"})
        for: 5m
        labels:
          context: liveness
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
          playbook: docs/support/playbook/cortex/down
        annotations:
          summary: "Cortex Knowledge for Nova is down"
          description: >
            The Cortex Knowledge service is down. This is not an immediate
            problem, since Cortex is still able to process requests, but the
            quality of its responses may be affected.
      - alert: CortexNovaDeschedulerPipelineErroring
        expr: delta(cortex_detector_pipeline_run_duration_seconds_count{component="nova-scheduling", error="true"}[2m]) > 0
        for: 5m
        labels:
          context: descheduler
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "Descheduler pipeline is erroring."
          description: >
            The Cortex descheduler pipeline is encountering errors during its
            execution. This may indicate issues with the descheduling logic or
            the underlying infrastructure. It is recommended to investigate
            the descheduler logs and the state of the VMs being processed.
      - alert: CortexNovaHttpRequest400sTooHigh
        expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-nova-metrics", status=~"4.+"}[5m]) > 0.1
        for: 5m
        labels:
          context: api
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "Nova Scheduler HTTP request 4xx errors too high"
          description: >
            Cortex is responding to Nova scheduler placement requests with
            HTTP 4xx errors. This is expected when the scheduling request
            cannot be served by Cortex. However, it could also indicate that
            the request format has changed and Cortex is unable to parse it.
      - alert: CortexNovaSchedulingHttpRequest500sTooHigh
        expr: rate(cortex_scheduler_api_request_duration_seconds_count{service="cortex-nova-metrics", status=~"5.+"}[5m]) > 0.1
        for: 5m
        labels:
          context: api
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "Nova Scheduler HTTP request 5xx errors too high"
          description: >
            Cortex is responding to Nova scheduler placement requests with
            HTTP 5xx errors. This is not expected and indicates that Cortex
            has an internal problem. Nova will continue to place new VMs, but
            the placement will be less desirable. Thus, no immediate action is
            needed.
      - alert: CortexNovaHighMemoryUsage
        expr: process_resident_memory_bytes{service="cortex-nova-metrics"} > 6000 * 1024 * 1024
        for: 5m
        labels:
          context: memory
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "`{{$labels.component}}` uses too much memory"
          description: >
            `{{$labels.component}}` should not be using more than 6000 MiB of
            memory. Usually it should use much less, so there may be a memory
            leak or other changes that are causing the memory usage to
            increase significantly.
      - alert: CortexNovaHighCPUUsage
        expr: rate(process_cpu_seconds_total{service="cortex-nova-metrics"}[1m]) > 0.5
        for: 5m
        labels:
          context: cpu
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "`{{$labels.component}}` uses too much CPU"
          description: >
            `{{$labels.component}}` should not be using more than 50% of a
            single CPU core. Usually it should use much less, so there may be
            a regression or other change that is causing the CPU usage to
            increase significantly.
      - alert: CortexNovaTooManyDBConnectionAttempts
        expr: rate(cortex_db_connection_attempts_total{service="cortex-nova-metrics"}[5m]) > 0.1
        for: 5m
        labels:
          context: db
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "`{{$labels.component}}` is trying to connect to the database too often"
          description: >
            `{{$labels.component}}` is trying to connect to the database too
            often. This may happen when the database is down or the connection
            parameters are misconfigured.
      - alert: CortexNovaSyncNotSuccessful
        expr: cortex_sync_request_processed_total{service="cortex-nova-metrics"} - cortex_sync_request_duration_seconds_count{service="cortex-nova-metrics"} > 0
        for: 5m
        labels:
          context: syncstatus
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "`{{$labels.component}}` Sync not successful"
          description: >
            `{{$labels.component}}` experienced an issue syncing data from the
            datasource `{{$labels.datasource}}`. This may happen when the
            datasource (OpenStack, Prometheus, etc.) is down or the sync
            module is misconfigured. No immediate action is needed, since the
            sync module will retry the sync operation and the currently synced
            data will be kept. However, if this problem persists for a longer
            time, the service will have an increasingly stale view of the
            datacenter.
      - alert: CortexNovaSyncObjectsDroppedToZero
        expr: cortex_sync_objects{service="cortex-nova-metrics", datasource!="openstack_migrations"} == 0
        for: 60m
        labels:
          context: syncobjects
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "`{{$labels.component}}` is not syncing any new data from `{{$labels.datasource}}`"
          description: >
            `{{$labels.component}}` is not syncing any objects from the
            datasource `{{$labels.datasource}}`. This may happen when the
            datasource (OpenStack, Prometheus, etc.) is down or the sync
            module is misconfigured. No immediate action is needed, since the
            sync module will retry the sync operation and the currently synced
            data will be kept. However, if this problem persists for a longer
            time, the service will have an increasingly stale view of the
            datacenter.
      - alert: CortexNovaDatasourceUnready
        expr: cortex_datasource_state{domain="nova",state!="ready"} != 0
        for: 60m
        labels:
          context: datasources
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "Datasource `{{$labels.datasource}}` is in `{{$labels.state}}` state"
          description: >
            This may indicate issues with datasource connectivity or
            configuration. It is recommended to investigate the datasource
            status and logs for more details.
      - alert: CortexNovaKnowledgeUnready
        expr: cortex_knowledge_state{domain="nova",state!="ready"} != 0
        for: 60m
        labels:
          context: knowledge
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "Knowledge `{{$labels.knowledge}}` is in `{{$labels.state}}` state"
          description: >
            This may indicate issues with the knowledge configuration. It is
            recommended to investigate the knowledge status and logs for more
            details.
      - alert: CortexNovaDecisionsWithErrors
        expr: cortex_decision_state{domain="nova",state="error"} > 0
        for: 5m
        labels:
          context: decisions
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "Some decisions are in error state for operator `{{$labels.operator}}`"
          description: >
            The Cortex scheduling pipeline generated decisions that are in
            error state. This may indicate issues with the decision logic or
            the underlying infrastructure. It is recommended to investigate
            the decision logs and the state of the VMs being processed.
      - alert: CortexNovaTooManyDecisionsWaiting
        expr: cortex_decision_state{domain="nova",state="waiting"} > 10
        for: 5m
        labels:
          context: decisions
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "Too many decisions are in waiting state for operator `{{$labels.operator}}`"
          description: >
            The Cortex scheduling pipeline has a high number of decisions for
            which no target host has been assigned yet. This may indicate a
            backlog in processing or issues with the decision logic. It is
            recommended to investigate the decision logs and the state of the
            VMs being processed.
      - alert: CortexNovaKPIUnready
        expr: |
          cortex_kpi_state{domain="nova",state!="ready"} != 0
        for: 60m
        labels:
          context: kpis
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "KPI `{{$labels.kpi}}` is in `{{$labels.state}}` state"
          description: >
            This may indicate issues with the KPI configuration. It is
            recommended to investigate the KPI status and logs for more
            details.
      - alert: CortexNovaPipelineUnready
        expr: cortex_pipeline_state{domain="nova",state!="ready"} != 0
        for: 5m
        labels:
          context: pipelines
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "Pipeline `{{$labels.pipeline}}` is in `{{$labels.state}}` state"
          description: >
            This may indicate issues with the pipeline configuration. It is
            recommended to investigate the pipeline status and logs for more
            details.
      # Committed Resource Info API Alerts
      - alert: CortexNovaCommittedResourceInfoHttpRequest500sTooHigh
        expr: rate(cortex_committed_resource_info_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1
        for: 5m
        labels:
          context: committed-resource-api
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "Committed Resource info API HTTP 5xx errors too high"
          description: >
            The committed resource info API (Limes LIQUID integration) is
            responding with HTTP 5xx errors. This indicates internal problems
            building service info, such as invalid flavor group data. Limes
            will not be able to discover available resources until the issue
            is resolved.
      # Committed Resource Change API Alerts
      - alert: CortexNovaCommittedResourceHttpRequest400sTooHigh
        expr: rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1
        for: 5m
        labels:
          context: committed-resource-api
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "Committed Resource change API HTTP 4xx errors too high"
          description: >
            The committed resource change API (Limes LIQUID integration) is
            responding with HTTP 4xx errors. This may happen when Limes sends
            a request with an outdated info version (409), the API is
            temporarily unavailable, or the request format is invalid. Limes
            will typically retry these requests, so no immediate action is
            needed unless the errors persist.
      - alert: CortexNovaCommittedResourceHttpRequest500sTooHigh
        expr: rate(cortex_committed_resource_change_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1
        for: 5m
        labels:
          context: committed-resource-api
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "Committed Resource change API HTTP 5xx errors too high"
          description: >
            The committed resource change API (Limes LIQUID integration) is
            responding with HTTP 5xx errors. This is not expected and
            indicates that Cortex is having an internal problem processing
            commitment changes. Limes will continue to retry, but new
            commitments may not be fulfilled until the issue is resolved.
      - alert: CortexNovaCommittedResourceLatencyTooHigh
        expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_change_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 30
        for: 5m
        labels:
          context: committed-resource-api
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "Committed Resource change API latency too high"
          description: >
            The committed resource change API (Limes LIQUID integration) is
            experiencing high latency (p95 > 30s). This may indicate that the
            scheduling pipeline is under heavy load or that reservation
            scheduling is taking longer than expected. Limes requests may time
            out, causing commitment changes to fail.
      - alert: CortexNovaCommittedResourceRejectionRateTooHigh
        expr: |
          sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", result="rejected"}[5m]))
          / sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics"}[5m])) > 0.5
        for: 5m
        labels:
          context: committed-resource-api
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "Committed Resource rejection rate too high"
          description: >
            More than 50% of commitment change requests are being rejected.
            This may indicate insufficient capacity in the datacenter to
            fulfill new commitments, or issues with the commitment scheduling
            logic. Rejected commitments are rolled back, so Limes will see
            them as failed and may retry or report the failure to users.
      - alert: CortexNovaCommittedResourceTimeoutsTooHigh
        expr: increase(cortex_committed_resource_change_api_timeouts_total{service="cortex-nova-metrics"}[5m]) > 0
        for: 5m
        labels:
          context: committed-resource-api
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "Committed Resource change API timeouts too high"
          description: >
            The committed resource change API (Limes LIQUID integration)
            timed out while waiting for reservations to become ready. This
            indicates that the scheduling pipeline is overloaded or
            reservations are taking too long to be scheduled. Affected
            commitment changes are rolled back and Limes will see them as
            failed. Consider investigating the scheduler performance or
            increasing the timeout configuration.
      # Committed Resource Usage API Alerts
      - alert: CortexNovaCommittedResourceUsageHttpRequest400sTooHigh
        expr: rate(cortex_committed_resource_usage_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1
        for: 5m
        labels:
          context: committed-resource-api
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "Committed Resource usage API HTTP 4xx errors too high"
          description: >
            The committed resource usage API (Limes LIQUID integration) is
            responding with HTTP 4xx errors. This may indicate invalid
            project IDs or malformed requests from Limes. Limes will
            typically retry these requests.
      - alert: CortexNovaCommittedResourceUsageHttpRequest500sTooHigh
        expr: rate(cortex_committed_resource_usage_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1
        for: 5m
        labels:
          context: committed-resource-api
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "Committed Resource usage API HTTP 5xx errors too high"
          description: >
            The committed resource usage API (Limes LIQUID integration) is
            responding with HTTP 5xx errors. This indicates internal problems
            fetching reservations or Nova server data. Limes may receive
            stale or incomplete usage data.
      - alert: CortexNovaCommittedResourceUsageLatencyTooHigh
        expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_usage_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 5
        for: 5m
        labels:
          context: committed-resource-api
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "Committed Resource usage API latency too high"
          description: >
            The committed resource usage API (Limes LIQUID integration) is
            experiencing high latency (p95 > 5s). This may indicate slow Nova
            API responses or database queries. Limes scrapes may time out,
            affecting quota reporting.
      # Committed Resource Capacity API Alerts
      - alert: CortexNovaCommittedResourceCapacityHttpRequest400sTooHigh
        expr: rate(cortex_committed_resource_capacity_api_requests_total{service="cortex-nova-metrics", status_code=~"4.."}[5m]) > 0.1
        for: 5m
        labels:
          context: committed-resource-api
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "Committed Resource capacity API HTTP 4xx errors too high"
          description: >
            The committed resource capacity API (Limes LIQUID integration) is
            responding with HTTP 4xx errors. This may indicate malformed
            requests from Limes.
      - alert: CortexNovaCommittedResourceCapacityHttpRequest500sTooHigh
        expr: rate(cortex_committed_resource_capacity_api_requests_total{service="cortex-nova-metrics", status_code=~"5.."}[5m]) > 0.1
        for: 5m
        labels:
          context: committed-resource-api
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "Committed Resource capacity API HTTP 5xx errors too high"
          description: >
            The committed resource capacity API (Limes LIQUID integration) is
            responding with HTTP 5xx errors. This indicates internal problems
            calculating cluster capacity. Limes may receive stale or
            incomplete capacity data.
      - alert: CortexNovaCommittedResourceCapacityLatencyTooHigh
        expr: histogram_quantile(0.95, sum(rate(cortex_committed_resource_capacity_api_request_duration_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (le)) > 5
        for: 5m
        labels:
          context: committed-resource-api
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "Committed Resource capacity API latency too high"
          description: >
            The committed resource capacity API (Limes LIQUID integration) is
            experiencing high latency (p95 > 5s). This may indicate slow
            database queries or knowledge CRD retrieval. Limes scrapes may
            time out, affecting capacity reporting.
      # Committed Resource Syncer Alerts
      - alert: CortexNovaCommittedResourceSyncerErrorsHigh
        expr: increase(cortex_committed_resource_syncer_errors_total{service="cortex-nova-metrics"}[1h]) > 3
        for: 5m
        labels:
          context: committed-resource-syncer
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "Committed Resource syncer experiencing errors"
          description: >
            The committed resource syncer has encountered multiple errors in
            the last hour. This may indicate connectivity issues with Limes.
            Check the syncer logs for error details.
      - alert: CortexNovaCommittedResourceSyncerUnitMismatchRateHigh
        expr: |
          (
            sum(rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unit_mismatch"}[1h]))
            / sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h]))
          ) > 0.05
          and on() sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) > 0
        for: 15m
        labels:
          context: committed-resource-syncer
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "Committed Resource syncer unit mismatch rate >5%"
          description: >
            More than 5% of commitments are being skipped due to unit
            mismatches between Limes and Cortex flavor groups. This happens
            when Limes has not yet been updated to use the new unit format
            after a flavor group change. The affected commitments will keep
            their existing reservations until Limes notices the update. Check
            the logs if this error persists for a longer time.
      - alert: CortexNovaCommittedResourceSyncerUnknownFlavorGroupRateHigh
        expr: |
          (
            sum(rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unknown_flavor_group"}[1h]))
            / sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h]))
          ) > 0
          and on() sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) > 0
        for: 15m
        labels:
          context: committed-resource-syncer
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "Committed Resource syncer unknown flavor group rate >0%"
          description: >
            Some commitments reference flavor groups that no longer exist in
            Cortex Knowledge. This may indicate that flavor group
            configuration is out of sync between Limes and Cortex, or that
            Knowledge extraction is failing. Check the flavor group Knowledge
            CRD and its history to see what changed.
      - alert: CortexNovaCommittedResourceSyncerLocalChangeRateHigh
        expr: |
          (
            (
              rate(cortex_committed_resource_syncer_reservations_created_total{service="cortex-nova-metrics"}[1h]) +
              rate(cortex_committed_resource_syncer_reservations_deleted_total{service="cortex-nova-metrics"}[1h]) +
              rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h])
            ) / rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h])
          ) > 0.01
          and on() rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0
        for: 15m
        labels:
          context: committed-resource-syncer
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "Committed Resource syncer local change rate >1%"
          description: >
            More than 1% of synced commitments require reservation changes
            (creates, deletes, or repairs). This is higher than expected for
            steady-state operation and may indicate data inconsistencies,
            external modifications to reservations, or issues with the CRDs.
            Check the Cortex logs for details.
      - alert: CortexNovaCommittedResourceSyncerRepairRateHigh
        expr: |
          (
            rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h])
            / rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h])
          ) > 0
          and on() rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0
        for: 15m
        labels:
          context: committed-resource-syncer
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "Committed Resource syncer repair rate >0%"
          description: >
            Some commitments have reservations that needed repair (wrong
            metadata such as project ID or flavor group). This may indicate
            data corruption, bugs in reservation creation, or external
            modifications. Reservations are repaired automatically, but the
            root cause should be investigated if this alert persists.
      - alert: CortexNovaDoesntFindValidKVMHosts
        expr: sum by (az, hvtype) (cortex_vm_faults{hvtype=~"CH|QEMU",faultmsg=~".*No valid host was found.*"}) > 0
        for: 5m
        labels:
          context: scheduling
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "Nova scheduling cannot find valid KVM hosts"
          description: >
            Cortex is seeing faulty VMs in `{{$labels.az}}` where Nova
            scheduling failed to find a valid `{{$labels.hvtype}}` host. This
            may indicate capacity issues, misconfigured filters, or resource
            constraints in the datacenter. Investigate the affected VMs and
            hypervisor availability.
      - alert: CortexNovaNewDatasourcesNotReconciling
        expr: count by(datasource) (cortex_datasource_seconds_until_reconcile{queued="false",domain="nova"}) > 0
        for: 60m
        labels:
          context: datasources
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "New datasource `{{$labels.datasource}}` has not reconciled"
          description: >
            A new datasource `{{$labels.datasource}}` has been added but has
            not completed its first reconciliation yet. This may indicate
            that the datasource controller's workqueue is over-prioritizing
            other datasources.
      - alert: CortexNovaExistingDatasourcesLackingBehind
        expr: |
          sum by(datasource) (cortex_datasource_seconds_until_reconcile{queued="true",domain="nova"}) < -600
          and on(datasource) cortex_datasource_state{state="ready",domain="nova"} == 1
        for: 10m
        labels:
          context: datasources
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "Existing datasource `{{$labels.datasource}}` is lagging behind"
          description: >
            An existing datasource `{{$labels.datasource}}` has been queued
            for reconciliation for more than 10 minutes. This may indicate
            issues with the datasource controller's workqueue, or that this
            or another datasource is taking an unusually long time to
            reconcile.
      - alert: CortexNovaReconcileErrorsHigh
        expr: |
          (sum by (controller) (rate(controller_runtime_reconcile_errors_total{service="cortex-nova-metrics"}[5m])))
          / (sum by (controller) (rate(controller_runtime_reconcile_total{service="cortex-nova-metrics"}[5m]))) > 0.1
        for: 15m
        labels:
          context: controller-errors
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "Controller reconcile error rate >10%"
          description: >
            More than 10% of controller reconciles are resulting in errors.
            This may indicate issues with the controller logic, connectivity
            problems, or external factors causing failures. Check the
            controller logs for error details and investigate the affected
            resources.
      - alert: CortexNovaReconcileDurationHigher10Min
        expr: |
          (sum by (controller) (rate(controller_runtime_reconcile_time_seconds_sum{service="cortex-nova-metrics"}[5m])))
          / (sum by (controller) (rate(controller_runtime_reconcile_time_seconds_count{service="cortex-nova-metrics"}[5m]))) > 600
        for: 15m
        labels:
          context: controller-duration
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "Controller reconciliation takes longer than 10m ({{ $value | humanizeDuration }})"
          description: "The average reconcile duration for controller {{ $labels.controller }} is higher than 10m."
      - alert: CortexNovaWorkqueueNotDrained
        expr: |
          sum by (name) (workqueue_depth{service="cortex-nova-metrics"}) > 0
        for: 60m
        labels:
          context: controller-workqueue
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "Controller {{ $labels.name }}'s backlog is not being drained."
          description: >
            The workqueue for controller {{ $labels.name }} has a backlog
            that is not being drained. This may indicate that the controller
            is overwhelmed with work or is stuck on certain resources. Check
            the controller logs and the state of the resources it manages for
            more details.
      - alert: CortexNovaWebhookLatencyHigh
        expr: |
          histogram_quantile(0.9, avg(rate(controller_runtime_webhook_latency_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (webhook, le)) > 0.2
        for: 15m
        labels:
          context: controller-webhook
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "Controller webhook {{ $labels.webhook }} latency is high"
          description: >
            The latency for webhook {{ $labels.webhook }} is higher than
            expected (p90 > 200ms). This may indicate performance issues with
            the webhook server or the logic it executes. Check the webhook
            server logs and monitor its resource usage for more insights.
      - alert: CortexNovaWebhookErrorsHigh
        expr: |
          (sum by (webhook) (rate(controller_runtime_webhook_requests_total{code!="200", service="cortex-nova-metrics"}[5m])))
          / (sum by (webhook) (rate(controller_runtime_webhook_requests_total{service="cortex-nova-metrics"}[5m]))) > 0.1
        for: 15m
        labels:
          context: controller-webhook
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "Controller webhook {{ $labels.webhook }} is experiencing errors"
          description: >
            More than 10% of requests to webhook {{ $labels.webhook }} are
            returning non-200 responses. This may indicate issues with the
            webhook logic, connectivity problems, or external factors causing
            failures. Check the webhook server logs for error details and
            investigate the affected resources.