Skip to content

Commit 83be4bd

Browse files
committed
NO-ISSUE: instrument PromQL cluster condition
1 parent 8324378 commit 83be4bd

1 file changed

Lines changed: 56 additions & 0 deletions

File tree

pkg/clusterconditions/promql/promql.go

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import (
1717
configv1 "github.com/openshift/api/config/v1"
1818
"github.com/prometheus/client_golang/api"
1919
prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1"
20+
"github.com/prometheus/client_golang/prometheus"
2021
"github.com/prometheus/common/config"
2122
"github.com/prometheus/common/model"
2223
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -28,6 +29,50 @@ import (
2829
"github.com/openshift/cluster-version-operator/pkg/clusterconditions/cache"
2930
)
3031

32+
var (
33+
promQLEvaluations = prometheus.NewCounter(prometheus.CounterOpts{
34+
Name: "cluster_operator_promql_evaluations_total",
35+
Help: "Report the total number of PromQL evaluations being processed.",
36+
})
37+
38+
promQLEvaluationErrors = prometheus.NewCounterVec(prometheus.CounterOpts{
39+
Name: "cluster_operator_promql_failed_evaluations_total",
40+
Help: "Report the total number of failed PromQL evaluations by reason.",
41+
}, []string{"reason"})
42+
43+
promQLEvaluationWarnings = prometheus.NewCounter(prometheus.CounterOpts{
44+
Name: "cluster_operator_promql_evaluation_warnings_total",
45+
Help: "Report the total number of PromQL warnings.",
46+
})
47+
)
48+
49+
const (
50+
hostMissingReason = "host missing"
51+
apiErrorReason = "API error"
52+
invalidResultTypeReason = "invalid result type"
53+
invalidResultLengthReason = "invalid result length"
54+
invalidResultValueReason = "invalid result value"
55+
internalReason = "internal"
56+
)
57+
58+
func init() {
59+
for _, r := range []string{
60+
hostMissingReason,
61+
apiErrorReason,
62+
invalidResultTypeReason,
63+
invalidResultLengthReason,
64+
invalidResultValueReason,
65+
internalReason,
66+
} {
67+
promQLEvaluationErrors.WithLabelValues(r)
68+
}
69+
prometheus.MustRegister(
70+
promQLEvaluations,
71+
promQLEvaluationErrors,
72+
promQLEvaluationWarnings,
73+
)
74+
}
75+
3176
// statusCodeNotImplementedForPostClient returns an empty response containing the status
3277
// code 501 (Not Implemented) for POST requests.
3378
//
@@ -134,10 +179,13 @@ func (p *PromQL) Valid(ctx context.Context, condition *configv1.ClusterCondition
134179
// false when the PromQL evaluates to 0, and an error if the PromQL
135180
// returns no time series or returns a value besides 0 or 1.
136181
func (p *PromQL) Match(ctx context.Context, condition *configv1.ClusterCondition) (bool, error) {
182+
promQLEvaluations.Inc()
183+
137184
// Lookup the address every attempt in case the service IP changes. This can happen when the thanos service is
138185
// deleted and recreated.
139186
host, err := p.Host(ctx)
140187
if err != nil {
188+
promQLEvaluationErrors.WithLabelValues(hostMissingReason).Inc()
141189
return false, fmt.Errorf("failure determine thanos IP: %w", err)
142190
}
143191
p.url.Host = host
@@ -146,11 +194,13 @@ func (p *PromQL) Match(ctx context.Context, condition *configv1.ClusterCondition
146194
if roundTripper, err := config.NewRoundTripperFromConfig(p.HTTPClientConfig, "cluster-conditions"); err == nil {
147195
clientConfig.RoundTripper = roundTripper
148196
} else {
197+
promQLEvaluationErrors.WithLabelValues(internalReason).Inc()
149198
return false, fmt.Errorf("creating PromQL round-tripper: %w", err)
150199
}
151200

152201
promqlClient, err := api.NewClient(clientConfig)
153202
if err != nil {
203+
promQLEvaluationErrors.WithLabelValues(internalReason).Inc()
154204
return false, fmt.Errorf("creating PromQL client: %w", err)
155205
}
156206

@@ -170,23 +220,28 @@ func (p *PromQL) Match(ctx context.Context, condition *configv1.ClusterCondition
170220
klog.V(2).Infof("evaluate %s cluster condition: %q", condition.Type, condition.PromQL.PromQL)
171221
result, warnings, err := v1api.Query(queryContext, condition.PromQL.PromQL, time.Now())
172222
if err != nil {
223+
promQLEvaluationErrors.WithLabelValues(apiErrorReason).Inc()
173224
return false, fmt.Errorf("executing PromQL query: %w", err)
174225
}
175226

227+
promQLEvaluations.Add(float64(len(warnings)))
176228
for _, warning := range warnings {
177229
klog.Warning(warning)
178230
}
179231

180232
if result.Type() != model.ValVector {
233+
promQLEvaluationErrors.WithLabelValues(invalidResultTypeReason).Inc()
181234
return false, fmt.Errorf("invalid PromQL result type is %s, not vector", result.Type())
182235
}
183236

184237
vector, ok := result.(model.Vector)
185238
if !ok {
239+
promQLEvaluationErrors.WithLabelValues(invalidResultTypeReason).Inc()
186240
return false, fmt.Errorf("invalid PromQL result type is nominally %s, but fails Vector cast", result.Type())
187241
}
188242

189243
if vector.Len() != 1 {
244+
promQLEvaluationErrors.WithLabelValues(invalidResultLengthReason).Inc()
190245
return false, fmt.Errorf("invalid PromQL result length must be one, but is %d", vector.Len())
191246
}
192247

@@ -196,5 +251,6 @@ func (p *PromQL) Match(ctx context.Context, condition *configv1.ClusterCondition
196251
} else if sample.Value == 1 {
197252
return true, nil
198253
}
254+
promQLEvaluationErrors.WithLabelValues(invalidResultValueReason).Inc()
199255
return false, fmt.Errorf("invalid PromQL result (must be 0 or 1): %v", sample.Value)
200256
}

0 commit comments

Comments
 (0)