Skip to content

Commit 5d598cd

Browse files
committed
Set -1 when Applies=Unknown
1 parent 9952662 commit 5d598cd

3 files changed

Lines changed: 20 additions & 16 deletions

File tree

install/0000_90_cluster-version-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,24 +3,24 @@ kind: PrometheusRule
33
metadata:
44
labels:
55
k8s-app: cluster-version-operator
6-
name: cluster-version-operator-tech-preview
6+
name: cluster-version-operator-accept-risks
77
namespace: openshift-cluster-version
88
annotations:
9-
kubernetes.io/description: Alerting rules for when cluster-version operator metrics call for administrator attention.
9+
kubernetes.io/description: Alerting rules for the feature gate ClusterUpdateAcceptRisks.
1010
exclude.release.openshift.io/internal-openshift-hosted: "true"
1111
include.release.openshift.io/self-managed-high-availability: "true"
12-
release.openshift.io/feature-set: TechPreviewNoUpgrade
12+
release.openshift.io/feature-gate: "ClusterUpdateAcceptRisks"
1313
spec:
1414
groups:
1515
- name: cluster-version-tech-preview
1616
rules:
17-
- alert: OpenShiftUpdateRiskApplies
17+
- alert: OpenShiftUpdateRiskMightApply
1818
annotations:
19-
summary: The cluster has been exposed to the conditional update risk for 10 minutes.
20-
description: The conditional update risk {{ "{{ $labels.risk }}" }} applies to the cluster, and the cluster update to a version exposed to the risk is not recommended. For more information refer to 'oc adm upgrade'.
21-
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/OpenShiftUpdateRiskApplies.md
19+
summary: The cluster might have been exposed to the conditional update risk for 10 minutes.
20+
description: The conditional update risk {{ "{{ $labels.risk }}" }} might apply to the cluster because of {{ "{{ $labels.reason }}" }}, and the cluster update to a version exposed to the risk is not recommended. For more information refer to 'oc adm upgrade'.
21+
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/OpenShiftUpdateRiskMightApply.md
2222
expr: |
23-
max by (namespace, risk) (last_over_time(cluster_version_risk_conditions{job="cluster-version-operator", condition="Applies"}[5m]) == 1)
23+
max by (namespace, risk, reason) (abs(last_over_time(cluster_version_risk_conditions{job="cluster-version-operator", condition="Applies"}[5m])) == 1)
2424
for: 10m
2525
labels:
2626
severity: warning

pkg/cvo/metrics.go

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -106,8 +106,8 @@ penultimate completed version for 'completed'.
106106
}, []string{"name", "condition", "reason"}),
107107
clusterVersionRiskConditions: prometheus.NewGaugeVec(prometheus.GaugeOpts{
108108
Name: "cluster_version_risk_conditions",
109-
Help: "Report the risk conditions for the cluster version. 0 is False and 1 is True.",
110-
}, []string{"condition", "risk"}),
109+
Help: "Report the risk conditions for the cluster version. -1 is Unknown, 0 is False and 1 is True.",
110+
}, []string{"condition", "risk", "reason"}),
111111
clusterOperatorConditionTransitions: prometheus.NewGaugeVec(prometheus.GaugeOpts{
112112
Name: "cluster_operator_condition_transitions",
113113
Help: "Reports the number of times that a condition on a cluster operator changes status",
@@ -493,7 +493,7 @@ func (m *operatorMetrics) Describe(ch chan<- *prometheus.Desc) {
493493
ch <- m.capability.WithLabelValues("").Desc()
494494
ch <- m.clusterOperatorUp.WithLabelValues("", "", "").Desc()
495495
ch <- m.clusterOperatorConditions.WithLabelValues("", "", "").Desc()
496-
ch <- m.clusterVersionRiskConditions.WithLabelValues("", "").Desc()
496+
ch <- m.clusterVersionRiskConditions.WithLabelValues("", "", "").Desc()
497497
ch <- m.clusterOperatorConditionTransitions.WithLabelValues("", "").Desc()
498498
ch <- m.clusterInstaller.WithLabelValues("", "", "").Desc()
499499
ch <- m.clusterVersionOperatorUpdateRetrievalTimestampSeconds.WithLabelValues("").Desc()
@@ -522,9 +522,12 @@ func (m *operatorMetrics) collectConditionalUpdateRisks(ch chan<- prometheus.Met
522522
continue
523523
}
524524

525-
g := m.clusterVersionRiskConditions.WithLabelValues(condition.Type, risk.Name)
526-
if condition.Status == metav1.ConditionTrue {
525+
g := m.clusterVersionRiskConditions.WithLabelValues(condition.Type, risk.Name, condition.Reason)
526+
switch condition.Status {
527+
case metav1.ConditionTrue:
527528
g.Set(1)
529+
case metav1.ConditionUnknown:
530+
g.Set(-1)
528531
}
529532
// A False status needs no explicit g.Set(0): the gauge is already 0 when it is initialized
530533
ch <- g

pkg/cvo/metrics_test.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1017,7 +1017,7 @@ func Test_collectConditionalUpdateRisks(t *testing.T) {
10171017
},
10181018
},
10191019
expected: []valueWithLabels{{
1020-
labels: map[string]string{"condition": "Applies", "risk": "RiskX"},
1020+
labels: map[string]string{"condition": "Applies", "risk": "RiskX", "reason": "ReasonA"},
10211021
}},
10221022
},
10231023
{
@@ -1035,7 +1035,7 @@ func Test_collectConditionalUpdateRisks(t *testing.T) {
10351035
},
10361036
expected: []valueWithLabels{{
10371037
value: 1,
1038-
labels: map[string]string{"condition": "Applies", "risk": "RiskX"},
1038+
labels: map[string]string{"condition": "Applies", "risk": "RiskX", "reason": "ReasonA"},
10391039
}},
10401040
},
10411041
{
@@ -1052,7 +1052,8 @@ func Test_collectConditionalUpdateRisks(t *testing.T) {
10521052
},
10531053
},
10541054
expected: []valueWithLabels{{
1055-
labels: map[string]string{"condition": "Applies", "risk": "RiskX"},
1055+
value: -1,
1056+
labels: map[string]string{"condition": "Applies", "risk": "RiskX", "reason": "ReasonA"},
10561057
}},
10571058
},
10581059
}

0 commit comments

Comments
 (0)