Skip to content

Commit 6780b9d

Browse files
committed
Set -1 when Applies=Unknown
1 parent aca2e31 commit 6780b9d

3 files changed

Lines changed: 20 additions & 16 deletions

File tree

install/0000_90_cluster-version-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,24 +3,24 @@ kind: PrometheusRule
33
metadata:
44
labels:
55
k8s-app: cluster-version-operator
6-
name: cluster-version-operator-tech-preview
6+
name: cluster-version-operator-accept-risks
77
namespace: openshift-cluster-version
88
annotations:
9-
kubernetes.io/description: Alerting rules for when cluster-version operator metrics call for administrator attention.
9+
kubernetes.io/description: Alerting rules for the feature gate ClusterUpdateAcceptRisks.
1010
exclude.release.openshift.io/internal-openshift-hosted: "true"
1111
include.release.openshift.io/self-managed-high-availability: "true"
12-
release.openshift.io/feature-set: TechPreviewNoUpgrade
12+
release.openshift.io/feature-gate: "ClusterUpdateAcceptRisks"
1313
spec:
1414
groups:
1515
- name: cluster-version-tech-preview
1616
rules:
17-
- alert: OpenShiftUpdateRiskApplies
17+
- alert: OpenShiftUpdateRiskMightApply
1818
annotations:
19-
summary: The cluster has been exposed to the conditional update risk for 10 minutes.
20-
description: The conditional update risk {{ "{{ $labels.risk }}" }} applies to the cluster, and the cluster update to a version exposed to the risk is not recommended. For more information refer to 'oc adm upgrade'.
21-
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/OpenShiftUpdateRiskApplies.md
19+
summary: The cluster might have been exposed to the conditional update risk for 10 minutes.
20+
description: The conditional update risk {{ "{{ $labels.risk }}" }} might apply to the cluster because of {{ "{{ $labels.reason }}" }}, and the cluster update to a version exposed to the risk is not recommended. For more information refer to 'oc adm upgrade'.
21+
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/OpenShiftUpdateRiskMightApply.md
2222
expr: |
23-
max by (namespace, risk) (last_over_time(cluster_version_risk_conditions{job="cluster-version-operator", condition="Applies"}[5m]) == 1)
23+
max by (namespace, risk, reason) (abs(last_over_time(cluster_version_risk_conditions{job="cluster-version-operator", condition="Applies"}[5m])) == 1)
2424
for: 10m
2525
labels:
2626
severity: warning

pkg/cvo/metrics.go

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -108,8 +108,8 @@ penultimate completed version for 'completed'.
108108
}, []string{"name", "condition", "reason"}),
109109
clusterVersionRiskConditions: prometheus.NewGaugeVec(prometheus.GaugeOpts{
110110
Name: "cluster_version_risk_conditions",
111-
Help: "Report the risk conditions for the cluster version. 0 is False and 1 is True.",
112-
}, []string{"condition", "risk"}),
111+
Help: "Report the risk conditions for the cluster version. -1 is Unknown, 0 is False and 1 is True.",
112+
}, []string{"condition", "risk", "reason"}),
113113
clusterOperatorConditionTransitions: prometheus.NewGaugeVec(prometheus.GaugeOpts{
114114
Name: "cluster_operator_condition_transitions",
115115
Help: "Reports the number of times that a condition on a cluster operator changes status",
@@ -495,7 +495,7 @@ func (m *operatorMetrics) Describe(ch chan<- *prometheus.Desc) {
495495
ch <- m.capability.WithLabelValues("").Desc()
496496
ch <- m.clusterOperatorUp.WithLabelValues("", "", "").Desc()
497497
ch <- m.clusterOperatorConditions.WithLabelValues("", "", "").Desc()
498-
ch <- m.clusterVersionRiskConditions.WithLabelValues("", "").Desc()
498+
ch <- m.clusterVersionRiskConditions.WithLabelValues("", "", "").Desc()
499499
ch <- m.clusterOperatorConditionTransitions.WithLabelValues("", "").Desc()
500500
ch <- m.clusterInstaller.WithLabelValues("", "", "").Desc()
501501
ch <- m.clusterVersionOperatorUpdateRetrievalTimestampSeconds.WithLabelValues("").Desc()
@@ -524,9 +524,12 @@ func (m *operatorMetrics) collectConditionalUpdateRisks(ch chan<- prometheus.Met
524524
continue
525525
}
526526

527-
g := m.clusterVersionRiskConditions.WithLabelValues(condition.Type, risk.Name)
528-
if condition.Status == metav1.ConditionTrue {
527+
g := m.clusterVersionRiskConditions.WithLabelValues(condition.Type, risk.Name, condition.Reason)
528+
switch condition.Status {
529+
case metav1.ConditionTrue:
529530
g.Set(1)
531+
case metav1.ConditionUnknown:
532+
g.Set(-1)
530533
}
531534
// We do not need to do g.Set(0) as it is done when g is initialized
532535
ch <- g

pkg/cvo/metrics_test.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1019,7 +1019,7 @@ func Test_collectConditionalUpdateRisks(t *testing.T) {
10191019
},
10201020
},
10211021
expected: []valueWithLabels{{
1022-
labels: map[string]string{"condition": "Applies", "risk": "RiskX"},
1022+
labels: map[string]string{"condition": "Applies", "risk": "RiskX", "reason": "ReasonA"},
10231023
}},
10241024
},
10251025
{
@@ -1037,7 +1037,7 @@ func Test_collectConditionalUpdateRisks(t *testing.T) {
10371037
},
10381038
expected: []valueWithLabels{{
10391039
value: 1,
1040-
labels: map[string]string{"condition": "Applies", "risk": "RiskX"},
1040+
labels: map[string]string{"condition": "Applies", "risk": "RiskX", "reason": "ReasonA"},
10411041
}},
10421042
},
10431043
{
@@ -1054,7 +1054,8 @@ func Test_collectConditionalUpdateRisks(t *testing.T) {
10541054
},
10551055
},
10561056
expected: []valueWithLabels{{
1057-
labels: map[string]string{"condition": "Applies", "risk": "RiskX"},
1057+
value: -1,
1058+
labels: map[string]string{"condition": "Applies", "risk": "RiskX", "reason": "ReasonA"},
10581059
}},
10591060
},
10601061
}

0 commit comments

Comments
 (0)