Skip to content

Commit a4e2f8a

Browse files
committed
Introduce a new metric cluster_version_risk_conditions
Follow-up to [1]. Samples for `cluster_version_risk_conditions` are collected only when the operator's `shouldReconcileAcceptRisks` check returns true. This means that, for example, on a cluster with TechPreview disabled the metric is still defined but has no samples. [1] #1284 (comment)
1 parent 9a401d5 commit a4e2f8a

2 files changed

Lines changed: 140 additions & 0 deletions

File tree

pkg/cvo/metrics.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414
"github.com/prometheus/client_golang/prometheus/promhttp"
1515
corev1 "k8s.io/api/core/v1"
1616
apierrors "k8s.io/apimachinery/pkg/api/errors"
17+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1718
"k8s.io/apimachinery/pkg/labels"
1819
"k8s.io/apimachinery/pkg/util/sets"
1920
"k8s.io/apiserver/pkg/server/dynamiccertificates"
@@ -52,6 +53,7 @@ type operatorMetrics struct {
5253
capability *prometheus.GaugeVec
5354
clusterOperatorUp *prometheus.GaugeVec
5455
clusterOperatorConditions *prometheus.GaugeVec
56+
clusterVersionRiskConditions *prometheus.GaugeVec
5557
clusterOperatorConditionTransitions *prometheus.GaugeVec
5658
clusterInstaller *prometheus.GaugeVec
5759
clusterVersionOperatorUpdateRetrievalTimestampSeconds *prometheus.GaugeVec
@@ -102,6 +104,10 @@ penultimate completed version for 'completed'.
102104
Name: "cluster_operator_conditions",
103105
Help: "Report the conditions for active cluster operators. 0 is False and 1 is True.",
104106
}, []string{"name", "condition", "reason"}),
107+
clusterVersionRiskConditions: prometheus.NewGaugeVec(prometheus.GaugeOpts{
108+
Name: "cluster_version_risk_conditions",
109+
Help: "Report the risk conditions for cluster versions. 0 is False and 1 is True.",
110+
}, []string{"name", "condition", "risk"}),
105111
clusterOperatorConditionTransitions: prometheus.NewGaugeVec(prometheus.GaugeOpts{
106112
Name: "cluster_operator_condition_transitions",
107113
Help: "Reports the number of times that a condition on a cluster operator changes status",
@@ -487,6 +493,7 @@ func (m *operatorMetrics) Describe(ch chan<- *prometheus.Desc) {
487493
ch <- m.capability.WithLabelValues("").Desc()
488494
ch <- m.clusterOperatorUp.WithLabelValues("", "", "").Desc()
489495
ch <- m.clusterOperatorConditions.WithLabelValues("", "", "").Desc()
496+
ch <- m.clusterVersionRiskConditions.WithLabelValues("", "", "").Desc()
490497
ch <- m.clusterOperatorConditionTransitions.WithLabelValues("", "").Desc()
491498
ch <- m.clusterInstaller.WithLabelValues("", "", "").Desc()
492499
ch <- m.clusterVersionOperatorUpdateRetrievalTimestampSeconds.WithLabelValues("").Desc()
@@ -508,6 +515,24 @@ func (m *operatorMetrics) collectConditionalUpdates(ch chan<- prometheus.Metric,
508515
}
509516
}
510517

518+
func (m *operatorMetrics) collectConditionalUpdateRisks(ch chan<- prometheus.Metric, risks []configv1.ConditionalUpdateRisk) {
519+
for _, risk := range risks {
520+
for _, condition := range risk.Conditions {
521+
if condition.Type != internal.ConditionalUpdateRiskConditionTypeApplies {
522+
continue
523+
}
524+
525+
g := m.clusterVersionRiskConditions.WithLabelValues("version", condition.Type, risk.Name)
526+
if condition.Status == metav1.ConditionTrue {
527+
g.Set(1)
528+
} else {
529+
g.Set(0)
530+
}
531+
ch <- g
532+
}
533+
}
534+
}
535+
511536
// Collect collects metrics from the operator into the channel ch
512537
func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
513538
current := m.optr.currentVersion()
@@ -653,6 +678,9 @@ func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
653678
}
654679

655680
m.collectConditionalUpdates(ch, cv.Status.ConditionalUpdates)
681+
if m.optr.shouldReconcileAcceptRisks() {
682+
m.collectConditionalUpdateRisks(ch, cv.Status.ConditionalUpdateRisks)
683+
}
656684
}
657685

658686
g := m.version.WithLabelValues("current", current.Version, current.Image, completed.Version)

pkg/cvo/metrics_test.go

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"k8s.io/apiserver/pkg/server/dynamiccertificates"
2424
"k8s.io/client-go/tools/record"
2525

26+
"github.com/openshift/cluster-version-operator/pkg/featuregates"
2627
"github.com/openshift/cluster-version-operator/pkg/internal"
2728
)
2829

@@ -667,6 +668,7 @@ func Test_operatorMetrics_Collect(t *testing.T) {
667668
}
668669
for _, tt := range tests {
669670
t.Run(tt.name, func(t *testing.T) {
671+
tt.optr.enabledCVOFeatureGates = featuregates.DefaultCvoGates("version")
670672
tt.optr.eventRecorder = record.NewFakeRecorder(100)
671673
if tt.optr.cvLister == nil {
672674
tt.optr.cvLister = &cvLister{}
@@ -973,6 +975,116 @@ func TestCollectUnknownConditionalUpdates(t *testing.T) {
973975
}
974976
}
975977

978+
func Test_collectConditionalUpdateRisks(t *testing.T) {
979+
type valueWithLabels struct {
980+
value float64
981+
labels map[string]string
982+
}
983+
testCases := []struct {
984+
name string
985+
risks []configv1.ConditionalUpdateRisk
986+
expected []valueWithLabels
987+
}{
988+
{
989+
name: "no conditional updates",
990+
expected: []valueWithLabels{},
991+
},
992+
{
993+
name: "unknown type",
994+
risks: []configv1.ConditionalUpdateRisk{
995+
{
996+
Name: "RiskX",
997+
Conditions: []metav1.Condition{{
998+
Type: internal.ConditionalUpdateConditionTypeRecommended,
999+
Status: metav1.ConditionFalse,
1000+
Reason: "ReasonA",
1001+
Message: "Risk does not apply",
1002+
}},
1003+
},
1004+
},
1005+
},
1006+
{
1007+
name: "apply false",
1008+
risks: []configv1.ConditionalUpdateRisk{
1009+
{
1010+
Name: "RiskX",
1011+
Conditions: []metav1.Condition{{
1012+
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
1013+
Status: metav1.ConditionFalse,
1014+
Reason: "ReasonA",
1015+
Message: "Risk does not apply",
1016+
}},
1017+
},
1018+
},
1019+
expected: []valueWithLabels{{
1020+
labels: map[string]string{"name": "version", "condition": "Applies", "risk": "RiskX"},
1021+
}},
1022+
},
1023+
{
1024+
name: "apply true",
1025+
risks: []configv1.ConditionalUpdateRisk{
1026+
{
1027+
Name: "RiskX",
1028+
Conditions: []metav1.Condition{{
1029+
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
1030+
Status: metav1.ConditionTrue,
1031+
Reason: "ReasonA",
1032+
Message: "Risk does not apply",
1033+
}},
1034+
},
1035+
},
1036+
expected: []valueWithLabels{{
1037+
value: 1,
1038+
labels: map[string]string{"name": "version", "condition": "Applies", "risk": "RiskX"},
1039+
}},
1040+
},
1041+
{
1042+
name: "apply unknown",
1043+
risks: []configv1.ConditionalUpdateRisk{
1044+
{
1045+
Name: "RiskX",
1046+
Conditions: []metav1.Condition{{
1047+
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
1048+
Status: metav1.ConditionUnknown,
1049+
Reason: "ReasonA",
1050+
Message: "Risk does not apply",
1051+
}},
1052+
},
1053+
},
1054+
expected: []valueWithLabels{{
1055+
labels: map[string]string{"name": "version", "condition": "Applies", "risk": "RiskX"},
1056+
}},
1057+
},
1058+
}
1059+
1060+
for _, tc := range testCases {
1061+
tc := tc
1062+
t.Run(tc.name, func(t *testing.T) {
1063+
optr := &Operator{}
1064+
m := newOperatorMetrics(optr)
1065+
ch := make(chan prometheus.Metric)
1066+
1067+
go func() {
1068+
m.collectConditionalUpdateRisks(ch, tc.risks)
1069+
close(ch)
1070+
}()
1071+
1072+
var collected []prometheus.Metric
1073+
for item := range ch {
1074+
collected = append(collected, item)
1075+
}
1076+
1077+
if lenC, lenE := len(collected), len(tc.expected); lenC != lenE {
1078+
1079+
t.Fatalf("Expected %d metrics, got %d metrics\nGot metrics: %s", lenE, lenC, spew.Sdump(collected))
1080+
}
1081+
for i := range tc.expected {
1082+
expectMetric(t, collected[i], tc.expected[i].value, tc.expected[i].labels)
1083+
}
1084+
})
1085+
}
1086+
}
1087+
9761088
func expectMetric(t *testing.T, metric prometheus.Metric, value float64, labels map[string]string) {
9771089
t.Helper()
9781090
var d dto.Metric

0 commit comments

Comments
 (0)