@@ -17,6 +17,7 @@ import (
1717 configv1 "github.com/openshift/api/config/v1"
1818 "github.com/prometheus/client_golang/api"
1919 prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1"
20+ "github.com/prometheus/client_golang/prometheus"
2021 "github.com/prometheus/common/config"
2122 "github.com/prometheus/common/model"
2223 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -28,6 +29,50 @@ import (
2829 "github.com/openshift/cluster-version-operator/pkg/clusterconditions/cache"
2930)
3031
32+ var (
33+ promQLEvaluations = prometheus .NewCounter (prometheus.CounterOpts {
34+ Name : "cluster_operator_promql_evaluations_total" ,
35+ Help : "Report the total number of PromQL evaluations being processed." ,
36+ })
37+
38+ promQLEvaluationErrors = prometheus .NewCounterVec (prometheus.CounterOpts {
39+ Name : "cluster_operator_promql_failed_evaluations_total" ,
40+ Help : "Report the total number of failed PromQL evaluations by reason." ,
41+ }, []string {"reason" })
42+
43+ promQLEvaluationWarnings = prometheus .NewCounter (prometheus.CounterOpts {
44+ Name : "cluster_operator_promql_evaluation_warnings_total" ,
45+ Help : "Report the total number of PromQL warnings." ,
46+ })
47+ )
48+
// Values used for the "reason" label of promQLEvaluationErrors, listed in
// the order the corresponding failures can occur during an evaluation.
const (
	// hostMissingReason: the query host could not be determined.
	hostMissingReason = "host missing"

	// internalReason: the round-tripper or API client could not be created.
	internalReason = "internal"

	// apiErrorReason: the query itself returned an error.
	apiErrorReason = "API error"

	// invalidResultTypeReason: the result was not a usable vector.
	invalidResultTypeReason = "invalid result type"

	// invalidResultLengthReason: the vector did not hold exactly one sample.
	invalidResultLengthReason = "invalid result length"

	// invalidResultValueReason: the sample value was neither 0 nor 1.
	invalidResultValueReason = "invalid result value"
)
57+
58+ func init () {
59+ for _ , r := range []string {
60+ hostMissingReason ,
61+ apiErrorReason ,
62+ invalidResultTypeReason ,
63+ invalidResultLengthReason ,
64+ invalidResultValueReason ,
65+ internalReason ,
66+ } {
67+ promQLEvaluationErrors .WithLabelValues (r )
68+ }
69+ prometheus .MustRegister (
70+ promQLEvaluations ,
71+ promQLEvaluationErrors ,
72+ promQLEvaluationWarnings ,
73+ )
74+ }
75+
3176// statusCodeNotImplementedForPostClient returns an empty response containing the status
3277// code 501 (Not Implemented) for POST requests.
3378//
@@ -134,10 +179,13 @@ func (p *PromQL) Valid(ctx context.Context, condition *configv1.ClusterCondition
134179// false when the PromQL evaluates to 0, and an error if the PromQL
135180// returns no time series or returns a value besides 0 or 1.
136181func (p * PromQL ) Match (ctx context.Context , condition * configv1.ClusterCondition ) (bool , error ) {
182+ promQLEvaluations .Inc ()
183+
137184 // Lookup the address every attempt in case the service IP changes. This can happen when the thanos service is
138185 // deleted and recreated.
139186 host , err := p .Host (ctx )
140187 if err != nil {
188+ promQLEvaluationErrors .WithLabelValues (hostMissingReason ).Inc ()
141189 return false , fmt .Errorf ("failure determine thanos IP: %w" , err )
142190 }
143191 p .url .Host = host
@@ -146,11 +194,13 @@ func (p *PromQL) Match(ctx context.Context, condition *configv1.ClusterCondition
146194 if roundTripper , err := config .NewRoundTripperFromConfig (p .HTTPClientConfig , "cluster-conditions" ); err == nil {
147195 clientConfig .RoundTripper = roundTripper
148196 } else {
197+ promQLEvaluationErrors .WithLabelValues (internalReason ).Inc ()
149198 return false , fmt .Errorf ("creating PromQL round-tripper: %w" , err )
150199 }
151200
152201 promqlClient , err := api .NewClient (clientConfig )
153202 if err != nil {
203+ promQLEvaluationErrors .WithLabelValues (internalReason ).Inc ()
154204 return false , fmt .Errorf ("creating PromQL client: %w" , err )
155205 }
156206
@@ -170,23 +220,28 @@ func (p *PromQL) Match(ctx context.Context, condition *configv1.ClusterCondition
170220 klog .V (2 ).Infof ("evaluate %s cluster condition: %q" , condition .Type , condition .PromQL .PromQL )
171221 result , warnings , err := v1api .Query (queryContext , condition .PromQL .PromQL , time .Now ())
172222 if err != nil {
223+ promQLEvaluationErrors .WithLabelValues (apiErrorReason ).Inc ()
173224 return false , fmt .Errorf ("executing PromQL query: %w" , err )
174225 }
175226
227+ promQLEvaluations .Add (float64 (len (warnings )))
176228 for _ , warning := range warnings {
177229 klog .Warning (warning )
178230 }
179231
180232 if result .Type () != model .ValVector {
233+ promQLEvaluationErrors .WithLabelValues (invalidResultTypeReason ).Inc ()
181234 return false , fmt .Errorf ("invalid PromQL result type is %s, not vector" , result .Type ())
182235 }
183236
184237 vector , ok := result .(model.Vector )
185238 if ! ok {
239+ promQLEvaluationErrors .WithLabelValues (invalidResultTypeReason ).Inc ()
186240 return false , fmt .Errorf ("invalid PromQL result type is nominally %s, but fails Vector cast" , result .Type ())
187241 }
188242
189243 if vector .Len () != 1 {
244+ promQLEvaluationErrors .WithLabelValues (invalidResultLengthReason ).Inc ()
190245 return false , fmt .Errorf ("invalid PromQL result length must be one, but is %d" , vector .Len ())
191246 }
192247
@@ -196,5 +251,6 @@ func (p *PromQL) Match(ctx context.Context, condition *configv1.ClusterCondition
196251 } else if sample .Value == 1 {
197252 return true , nil
198253 }
254+ promQLEvaluationErrors .WithLabelValues (invalidResultValueReason ).Inc ()
199255 return false , fmt .Errorf ("invalid PromQL result (must be 0 or 1): %v" , sample .Value )
200256}
0 commit comments