Skip to content

Commit 8a9e0f8

Browse files
committed
Introduce a new metric cluster_version_risk_conditions
Follow up [1]. The samples for `cluster_version_risk_conditions` will be collected only when its operator `shouldReconcileAcceptRisks`. It means, e.g., on a TechPreview disabled cluster the metric is still defined but has no samples. [1]. #1284 (comment)
1 parent 54dc4ef commit 8a9e0f8

2 files changed

Lines changed: 140 additions & 0 deletions

File tree

pkg/cvo/metrics.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515

1616
corev1 "k8s.io/api/core/v1"
1717
apierrors "k8s.io/apimachinery/pkg/api/errors"
18+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1819
"k8s.io/apimachinery/pkg/labels"
1920
"k8s.io/apimachinery/pkg/util/sets"
2021
"k8s.io/apiserver/pkg/server/dynamiccertificates"
@@ -54,6 +55,7 @@ type operatorMetrics struct {
5455
capability *prometheus.GaugeVec
5556
clusterOperatorUp *prometheus.GaugeVec
5657
clusterOperatorConditions *prometheus.GaugeVec
58+
clusterVersionRiskConditions *prometheus.GaugeVec
5759
clusterOperatorConditionTransitions *prometheus.GaugeVec
5860
clusterInstaller *prometheus.GaugeVec
5961
clusterVersionOperatorUpdateRetrievalTimestampSeconds *prometheus.GaugeVec
@@ -104,6 +106,10 @@ penultimate completed version for 'completed'.
104106
Name: "cluster_operator_conditions",
105107
Help: "Report the conditions for active cluster operators. 0 is False and 1 is True.",
106108
}, []string{"name", "condition", "reason"}),
109+
clusterVersionRiskConditions: prometheus.NewGaugeVec(prometheus.GaugeOpts{
110+
Name: "cluster_version_risk_conditions",
111+
Help: "Report the risk conditions for cluster versions. 0 is False and 1 is True.",
112+
}, []string{"name", "condition", "risk"}),
107113
clusterOperatorConditionTransitions: prometheus.NewGaugeVec(prometheus.GaugeOpts{
108114
Name: "cluster_operator_condition_transitions",
109115
Help: "Reports the number of times that a condition on a cluster operator changes status",
@@ -489,6 +495,7 @@ func (m *operatorMetrics) Describe(ch chan<- *prometheus.Desc) {
489495
ch <- m.capability.WithLabelValues("").Desc()
490496
ch <- m.clusterOperatorUp.WithLabelValues("", "", "").Desc()
491497
ch <- m.clusterOperatorConditions.WithLabelValues("", "", "").Desc()
498+
ch <- m.clusterVersionRiskConditions.WithLabelValues("", "", "").Desc()
492499
ch <- m.clusterOperatorConditionTransitions.WithLabelValues("", "").Desc()
493500
ch <- m.clusterInstaller.WithLabelValues("", "", "").Desc()
494501
ch <- m.clusterVersionOperatorUpdateRetrievalTimestampSeconds.WithLabelValues("").Desc()
@@ -510,6 +517,24 @@ func (m *operatorMetrics) collectConditionalUpdates(ch chan<- prometheus.Metric,
510517
}
511518
}
512519

520+
func (m *operatorMetrics) collectConditionalUpdateRisks(ch chan<- prometheus.Metric, risks []configv1.ConditionalUpdateRisk) {
521+
for _, risk := range risks {
522+
for _, condition := range risk.Conditions {
523+
if condition.Type != internal.ConditionalUpdateRiskConditionTypeApplies {
524+
continue
525+
}
526+
527+
g := m.clusterVersionRiskConditions.WithLabelValues("version", condition.Type, risk.Name)
528+
if condition.Status == metav1.ConditionTrue {
529+
g.Set(1)
530+
} else {
531+
g.Set(0)
532+
}
533+
ch <- g
534+
}
535+
}
536+
}
537+
513538
// Collect collects metrics from the operator into the channel ch
514539
func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
515540
current := m.optr.currentVersion()
@@ -655,6 +680,9 @@ func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
655680
}
656681

657682
m.collectConditionalUpdates(ch, cv.Status.ConditionalUpdates)
683+
if m.optr.shouldReconcileAcceptRisks() {
684+
m.collectConditionalUpdateRisks(ch, cv.Status.ConditionalUpdateRisks)
685+
}
658686
}
659687

660688
g := m.version.WithLabelValues("current", current.Version, current.Image, completed.Version)

pkg/cvo/metrics_test.go

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
configv1 "github.com/openshift/api/config/v1"
2626
"github.com/openshift/library-go/pkg/crypto"
2727

28+
"github.com/openshift/cluster-version-operator/pkg/featuregates"
2829
"github.com/openshift/cluster-version-operator/pkg/internal"
2930
)
3031

@@ -669,6 +670,7 @@ func Test_operatorMetrics_Collect(t *testing.T) {
669670
}
670671
for _, tt := range tests {
671672
t.Run(tt.name, func(t *testing.T) {
673+
tt.optr.enabledCVOFeatureGates = featuregates.DefaultCvoGates("version")
672674
tt.optr.eventRecorder = record.NewFakeRecorder(100)
673675
if tt.optr.cvLister == nil {
674676
tt.optr.cvLister = &cvLister{}
@@ -975,6 +977,116 @@ func TestCollectUnknownConditionalUpdates(t *testing.T) {
975977
}
976978
}
977979

980+
func Test_collectConditionalUpdateRisks(t *testing.T) {
981+
type valueWithLabels struct {
982+
value float64
983+
labels map[string]string
984+
}
985+
testCases := []struct {
986+
name string
987+
risks []configv1.ConditionalUpdateRisk
988+
expected []valueWithLabels
989+
}{
990+
{
991+
name: "no conditional updates",
992+
expected: []valueWithLabels{},
993+
},
994+
{
995+
name: "unknown type",
996+
risks: []configv1.ConditionalUpdateRisk{
997+
{
998+
Name: "RiskX",
999+
Conditions: []metav1.Condition{{
1000+
Type: internal.ConditionalUpdateConditionTypeRecommended,
1001+
Status: metav1.ConditionFalse,
1002+
Reason: "ReasonA",
1003+
Message: "Risk does not apply",
1004+
}},
1005+
},
1006+
},
1007+
},
1008+
{
1009+
name: "apply false",
1010+
risks: []configv1.ConditionalUpdateRisk{
1011+
{
1012+
Name: "RiskX",
1013+
Conditions: []metav1.Condition{{
1014+
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
1015+
Status: metav1.ConditionFalse,
1016+
Reason: "ReasonA",
1017+
Message: "Risk does not apply",
1018+
}},
1019+
},
1020+
},
1021+
expected: []valueWithLabels{{
1022+
labels: map[string]string{"name": "version", "condition": "Applies", "risk": "RiskX"},
1023+
}},
1024+
},
1025+
{
1026+
name: "apply true",
1027+
risks: []configv1.ConditionalUpdateRisk{
1028+
{
1029+
Name: "RiskX",
1030+
Conditions: []metav1.Condition{{
1031+
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
1032+
Status: metav1.ConditionTrue,
1033+
Reason: "ReasonA",
1034+
Message: "Risk does not apply",
1035+
}},
1036+
},
1037+
},
1038+
expected: []valueWithLabels{{
1039+
value: 1,
1040+
labels: map[string]string{"name": "version", "condition": "Applies", "risk": "RiskX"},
1041+
}},
1042+
},
1043+
{
1044+
name: "apply unknown",
1045+
risks: []configv1.ConditionalUpdateRisk{
1046+
{
1047+
Name: "RiskX",
1048+
Conditions: []metav1.Condition{{
1049+
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
1050+
Status: metav1.ConditionUnknown,
1051+
Reason: "ReasonA",
1052+
Message: "Risk does not apply",
1053+
}},
1054+
},
1055+
},
1056+
expected: []valueWithLabels{{
1057+
labels: map[string]string{"name": "version", "condition": "Applies", "risk": "RiskX"},
1058+
}},
1059+
},
1060+
}
1061+
1062+
for _, tc := range testCases {
1063+
tc := tc
1064+
t.Run(tc.name, func(t *testing.T) {
1065+
optr := &Operator{}
1066+
m := newOperatorMetrics(optr)
1067+
ch := make(chan prometheus.Metric)
1068+
1069+
go func() {
1070+
m.collectConditionalUpdateRisks(ch, tc.risks)
1071+
close(ch)
1072+
}()
1073+
1074+
var collected []prometheus.Metric
1075+
for item := range ch {
1076+
collected = append(collected, item)
1077+
}
1078+
1079+
if lenC, lenE := len(collected), len(tc.expected); lenC != lenE {
1080+
1081+
t.Fatalf("Expected %d metrics, got %d metrics\nGot metrics: %s", lenE, lenC, spew.Sdump(collected))
1082+
}
1083+
for i := range tc.expected {
1084+
expectMetric(t, collected[i], tc.expected[i].value, tc.expected[i].labels)
1085+
}
1086+
})
1087+
}
1088+
}
1089+
9781090
func expectMetric(t *testing.T, metric prometheus.Metric, value float64, labels map[string]string) {
9791091
t.Helper()
9801092
var d dto.Metric

0 commit comments

Comments
 (0)