Skip to content

Commit cae24c3

Browse files
committed
Introduce a new metric cluster_version_risk_conditions
Follow-up to [1]. Samples for `cluster_version_risk_conditions` are collected only when the operator's `shouldReconcileAcceptRisks()` returns true. This means that, e.g., on a cluster with TechPreview disabled, the metric is still defined but has no samples. [1] #1284 (comment)
1 parent 0797b28 commit cae24c3

2 files changed

Lines changed: 160 additions & 0 deletions

File tree

pkg/cvo/metrics.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ type operatorMetrics struct {
5858
capability *prometheus.GaugeVec
5959
clusterOperatorUp *prometheus.GaugeVec
6060
clusterOperatorConditions *prometheus.GaugeVec
61+
clusterVersionRiskConditions *prometheus.GaugeVec
6162
clusterOperatorConditionTransitions *prometheus.GaugeVec
6263
clusterInstaller *prometheus.GaugeVec
6364
clusterVersionOperatorUpdateRetrievalTimestampSeconds *prometheus.GaugeVec
@@ -108,6 +109,10 @@ penultimate completed version for 'completed'.
108109
Name: "cluster_operator_conditions",
109110
Help: "Report the conditions for active cluster operators. 0 is False and 1 is True.",
110111
}, []string{"name", "condition", "reason"}),
112+
clusterVersionRiskConditions: prometheus.NewGaugeVec(prometheus.GaugeOpts{
113+
Name: "cluster_version_risk_conditions",
114+
Help: "Report the risk conditions for cluster versions. 0 is False and 1 is True.",
115+
}, []string{"name", "condition", "risk"}),
111116
clusterOperatorConditionTransitions: prometheus.NewGaugeVec(prometheus.GaugeOpts{
112117
Name: "cluster_operator_condition_transitions",
113118
Help: "Reports the number of times that a condition on a cluster operator changes status",
@@ -436,6 +441,7 @@ func (m *operatorMetrics) Describe(ch chan<- *prometheus.Desc) {
436441
ch <- m.capability.WithLabelValues("").Desc()
437442
ch <- m.clusterOperatorUp.WithLabelValues("", "", "").Desc()
438443
ch <- m.clusterOperatorConditions.WithLabelValues("", "", "").Desc()
444+
ch <- m.clusterVersionRiskConditions.WithLabelValues("", "", "").Desc()
439445
ch <- m.clusterOperatorConditionTransitions.WithLabelValues("", "").Desc()
440446
ch <- m.clusterInstaller.WithLabelValues("", "", "").Desc()
441447
ch <- m.clusterVersionOperatorUpdateRetrievalTimestampSeconds.WithLabelValues("").Desc()
@@ -457,6 +463,24 @@ func (m *operatorMetrics) collectConditionalUpdates(ch chan<- prometheus.Metric,
457463
}
458464
}
459465

466+
func (m *operatorMetrics) collectConditionalUpdateRisks(ch chan<- prometheus.Metric, risks []configv1.ConditionalUpdateRisk) {
467+
for _, risk := range risks {
468+
for _, condition := range risk.Conditions {
469+
if condition.Type != internal.ConditionalUpdateRiskConditionTypeApplies {
470+
continue
471+
}
472+
473+
g := m.clusterVersionRiskConditions.WithLabelValues("version", condition.Type, risk.Name)
474+
if condition.Status == metav1.ConditionTrue {
475+
g.Set(1)
476+
} else {
477+
g.Set(0)
478+
}
479+
ch <- g
480+
}
481+
}
482+
}
483+
460484
// Collect collects metrics from the operator into the channel ch
461485
func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
462486
current := m.optr.currentVersion()
@@ -602,6 +626,9 @@ func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
602626
}
603627

604628
m.collectConditionalUpdates(ch, cv.Status.ConditionalUpdates)
629+
if m.optr.shouldReconcileAcceptRisks() {
630+
m.collectConditionalUpdateRisks(ch, cv.Status.ConditionalUpdateRisks)
631+
}
605632
}
606633

607634
g := m.version.WithLabelValues("current", current.Version, current.Image, completed.Version)

pkg/cvo/metrics_test.go

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,28 @@ import (
2626
"github.com/openshift/cluster-version-operator/pkg/internal"
2727
)
2828

29+
// fakeCvoGateChecker is a test double that returns the zero value from every
// method: an empty desired version and false for each boolean query.
// NOTE(review): presumably it satisfies the interface behind
// Operator.enabledCVOFeatureGates, with false meaning "gate disabled" — confirm
// against that interface's definition.
type fakeCvoGateChecker struct{}

// DesiredVersion always returns the empty string.
func (*fakeCvoGateChecker) DesiredVersion() string { return "" }

// UnknownVersion always returns false.
func (*fakeCvoGateChecker) UnknownVersion() bool { return false }

// StatusReleaseArchitecture always returns false.
func (*fakeCvoGateChecker) StatusReleaseArchitecture() bool { return false }

// CVOConfiguration always returns false.
func (*fakeCvoGateChecker) CVOConfiguration() bool { return false }

// AcceptRisks always returns false.
func (*fakeCvoGateChecker) AcceptRisks() bool { return false }
50+
2951
func Test_operatorMetrics_Collect(t *testing.T) {
3052
tests := []struct {
3153
name string
@@ -667,6 +689,7 @@ func Test_operatorMetrics_Collect(t *testing.T) {
667689
}
668690
for _, tt := range tests {
669691
t.Run(tt.name, func(t *testing.T) {
692+
tt.optr.enabledCVOFeatureGates = &fakeCvoGateChecker{}
670693
tt.optr.eventRecorder = record.NewFakeRecorder(100)
671694
if tt.optr.cvLister == nil {
672695
tt.optr.cvLister = &cvLister{}
@@ -973,6 +996,116 @@ func TestCollectUnknownConditionalUpdates(t *testing.T) {
973996
}
974997
}
975998

999+
func Test_collectConditionalUpdateRisks(t *testing.T) {
1000+
type valueWithLabels struct {
1001+
value float64
1002+
labels map[string]string
1003+
}
1004+
testCases := []struct {
1005+
name string
1006+
risks []configv1.ConditionalUpdateRisk
1007+
expected []valueWithLabels
1008+
}{
1009+
{
1010+
name: "no conditional updates",
1011+
expected: []valueWithLabels{},
1012+
},
1013+
{
1014+
name: "unknown type",
1015+
risks: []configv1.ConditionalUpdateRisk{
1016+
{
1017+
Name: "RiskX",
1018+
Conditions: []metav1.Condition{{
1019+
Type: internal.ConditionalUpdateConditionTypeRecommended,
1020+
Status: metav1.ConditionFalse,
1021+
Reason: "ReasonA",
1022+
Message: "Risk does not apply",
1023+
}},
1024+
},
1025+
},
1026+
},
1027+
{
1028+
name: "apply false",
1029+
risks: []configv1.ConditionalUpdateRisk{
1030+
{
1031+
Name: "RiskX",
1032+
Conditions: []metav1.Condition{{
1033+
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
1034+
Status: metav1.ConditionFalse,
1035+
Reason: "ReasonA",
1036+
Message: "Risk does not apply",
1037+
}},
1038+
},
1039+
},
1040+
expected: []valueWithLabels{{
1041+
labels: map[string]string{"name": "version", "condition": "Applies", "risk": "RiskX"},
1042+
}},
1043+
},
1044+
{
1045+
name: "apply true",
1046+
risks: []configv1.ConditionalUpdateRisk{
1047+
{
1048+
Name: "RiskX",
1049+
Conditions: []metav1.Condition{{
1050+
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
1051+
Status: metav1.ConditionTrue,
1052+
Reason: "ReasonA",
1053+
Message: "Risk does not apply",
1054+
}},
1055+
},
1056+
},
1057+
expected: []valueWithLabels{{
1058+
value: 1,
1059+
labels: map[string]string{"name": "version", "condition": "Applies", "risk": "RiskX"},
1060+
}},
1061+
},
1062+
{
1063+
name: "apply unknown",
1064+
risks: []configv1.ConditionalUpdateRisk{
1065+
{
1066+
Name: "RiskX",
1067+
Conditions: []metav1.Condition{{
1068+
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
1069+
Status: metav1.ConditionUnknown,
1070+
Reason: "ReasonA",
1071+
Message: "Risk does not apply",
1072+
}},
1073+
},
1074+
},
1075+
expected: []valueWithLabels{{
1076+
labels: map[string]string{"name": "version", "condition": "Applies", "risk": "RiskX"},
1077+
}},
1078+
},
1079+
}
1080+
1081+
for _, tc := range testCases {
1082+
tc := tc
1083+
t.Run(tc.name, func(t *testing.T) {
1084+
optr := &Operator{}
1085+
m := newOperatorMetrics(optr)
1086+
ch := make(chan prometheus.Metric)
1087+
1088+
go func() {
1089+
m.collectConditionalUpdateRisks(ch, tc.risks)
1090+
close(ch)
1091+
}()
1092+
1093+
var collected []prometheus.Metric
1094+
for item := range ch {
1095+
collected = append(collected, item)
1096+
}
1097+
1098+
if lenC, lenE := len(collected), len(tc.expected); lenC != lenE {
1099+
1100+
t.Fatalf("Expected %d metrics, got %d metrics\nGot metrics: %s", lenE, lenC, spew.Sdump(collected))
1101+
}
1102+
for i := range tc.expected {
1103+
expectMetric(t, collected[i], tc.expected[i].value, tc.expected[i].labels)
1104+
}
1105+
})
1106+
}
1107+
}
1108+
9761109
func expectMetric(t *testing.T, metric prometheus.Metric, value float64, labels map[string]string) {
9771110
t.Helper()
9781111
var d dto.Metric

0 commit comments

Comments
 (0)