Skip to content

Commit 2a432a6

Browse files
DavidHurtaclaude
andcommitted
pkg/cvo/metrics: Utilize dynamiccertificates package for certificate updates
This commit's goal is to prepare the existing code for mTLS support. In OpenShift, core operators SHOULD require authentication, and they SHOULD support TLS client certificate authentication [1]. They also SHOULD support local authorization and SHOULD allow the well-known metrics scraping identity [1]. To achieve this, an operator must be able to verify a client's certificate. To do this, the certificate can be verified using the certificate authority (CA) bundle located in a ConfigMap in the kube-system namespace [2]. This would entail an implementation of a new controller to watch the ConfigMap for changes. To avoid such implementation to avoid potential bugs and future maintenance, my goal is to utilize the `k8s.io/apiserver/pkg/server/dynamiccertificates` package for this goal as the package provides a functionality for this specific use case. While doing so, we can also rework the existing, a bit complex, implementation and utilize the package for existing use cases as well to simplify the logic and use an existing, well-tested library. [1]: https://github.com/openshift/enhancements/blob/master/CONVENTIONS.md#metrics [2]: https://rhobs-handbook.netlify.app/products/openshiftmonitoring/collecting_metrics.md/#exposing-metrics-for-prometheus Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
1 parent f77bc1e commit 2a432a6

1 file changed

Lines changed: 62 additions & 199 deletions

File tree

pkg/cvo/metrics.go

Lines changed: 62 additions & 199 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,12 @@
11
package cvo
22

33
import (
4-
"bytes"
54
"context"
6-
"crypto/sha256"
75
"crypto/tls"
86
"errors"
97
"fmt"
10-
"io"
118
"net"
129
"net/http"
13-
"os"
14-
"path/filepath"
1510
"strings"
1611
"time"
1712

@@ -23,6 +18,7 @@ import (
2318
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2419
"k8s.io/apimachinery/pkg/labels"
2520
"k8s.io/apimachinery/pkg/util/sets"
21+
"k8s.io/apiserver/pkg/server/dynamiccertificates"
2622
authenticationclientsetv1 "k8s.io/client-go/kubernetes/typed/authentication/v1"
2723
"k8s.io/client-go/rest"
2824
"k8s.io/client-go/tools/cache"
@@ -32,8 +28,6 @@ import (
3228
"github.com/openshift/cluster-version-operator/lib/resourcemerge"
3329
"github.com/openshift/cluster-version-operator/pkg/internal"
3430
"github.com/openshift/library-go/pkg/crypto"
35-
36-
"gopkg.in/fsnotify.v1"
3731
)
3832

3933
// RegisterMetrics initializes metrics and registers them with the
@@ -210,15 +204,6 @@ func (a *authHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
210204
a.downstream.ServeHTTP(w, r)
211205
}
212206

213-
func shutdownHttpServer(parentCtx context.Context, svr *http.Server) {
214-
ctx, cancel := context.WithTimeout(parentCtx, 5*time.Second)
215-
defer cancel()
216-
klog.Info("Shutting down metrics server so it can be recreated with updated TLS configuration.")
217-
if err := svr.Shutdown(ctx); err != nil {
218-
klog.Errorf("Failed to gracefully shut down metrics server during restart: %v", err)
219-
}
220-
}
221-
222207
func startListening(svr *http.Server, tlsConfig *tls.Config, lAddr string, resultChannel chan asyncResult) {
223208
tcpListener, err := net.Listen("tcp", lAddr)
224209
if err != nil {
@@ -248,73 +233,65 @@ func handleServerResult(result asyncResult, lastLoopError error) error {
248233
}
249234

250235
// RunMetrics launches a server bound to listenAddress serving
251-
// Prometheus metrics at /metrics over HTTPS. Continues serving
236+
// Prometheus metrics at /metrics over HTTPS. Continues serving
252237
// until runContext.Done() and then attempts a clean shutdown
253-
// limited by shutdownContext.Done(). Assumes runContext.Done()
238+
// limited by shutdownContext.Done(). Assumes runContext.Done()
254239
// occurs before or simultaneously with shutdownContext.Done().
255-
// Also detects changes to metrics certificate files upon which
256-
// the metrics HTTP server is shutdown and recreated with a new
257-
// TLS configuration.
240+
// The TLS configuration automatically reloads certificates when
241+
// they change on disk using dynamiccertificates.
258242
func RunMetrics(runContext context.Context, shutdownContext context.Context, listenAddress, certFile, keyFile string, restConfig *rest.Config, disableMetricsAuth bool) error {
259-
var tlsConfig *tls.Config
260-
if listenAddress != "" {
261-
var err error
262-
tlsConfig, err = makeTLSConfig(certFile, keyFile)
263-
if err != nil {
264-
return fmt.Errorf("failed to create TLS config: %w", err)
265-
}
266-
} else {
243+
if listenAddress == "" {
267244
return errors.New("TLS configuration is required to serve metrics")
268245
}
269246

270-
client, err := authenticationclientsetv1.NewForConfig(restConfig)
271-
if err != nil {
272-
return fmt.Errorf("failed to create config: %w", err)
273-
}
274-
275-
server := createHttpServer(runContext, client, disableMetricsAuth)
247+
// Prepare synchronization for to-be created go routines
248+
metricsContext, metricsContextCancel := context.WithCancel(runContext)
249+
defer metricsContextCancel()
276250

277251
resultChannel := make(chan asyncResult, 1)
278-
resultChannelCount := 1
252+
resultChannelCount := 0
279253

280-
go startListening(server, tlsConfig, listenAddress, resultChannel)
254+
// Create a dynamic serving cert/key controller to watch for serving certificate changes from files.
255+
servingCertController, err := dynamiccertificates.NewDynamicServingContentFromFiles("metrics-serving-cert", certFile, keyFile)
256+
if err != nil {
257+
return fmt.Errorf("failed to create serving certificate controller: %w", err)
258+
}
259+
if err := servingCertController.RunOnce(metricsContext); err != nil {
260+
return fmt.Errorf("failed to initialize serving content controller: %w", err)
261+
}
281262

282-
certDir := filepath.Dir(certFile)
283-
keyDir := filepath.Dir(keyFile)
263+
// Start the serving cert controller to begin watching the cert and key files
264+
resultChannelCount++
265+
go func() {
266+
servingCertController.Run(metricsContext, 1)
267+
resultChannel <- asyncResult{name: "serving content controller"}
268+
}()
284269

285-
origCertChecksum, err := checksumFile(certFile)
270+
// Create TLS config using the controllers. The config uses callbacks to dynamically
271+
// fetch the latest certificates and CA bundles on each connection, so no server
272+
// restart is needed when certificates change.
273+
tlsConfig, err := makeTLSConfig(servingCertController)
286274
if err != nil {
287-
return fmt.Errorf("failed to initialize certificate file checksum: %w", err)
275+
return fmt.Errorf("failed to create TLS config: %w", err)
288276
}
289-
origKeyChecksum, err := checksumFile(keyFile)
277+
278+
client, err := authenticationclientsetv1.NewForConfig(restConfig)
290279
if err != nil {
291-
return fmt.Errorf("failed to initialize key file checksum: %w", err)
280+
return fmt.Errorf("failed to create config: %w", err)
292281
}
293282

294-
// Set up and start the file watcher.
295-
watcher, err := fsnotify.NewWatcher()
296-
if watcher == nil || err != nil {
297-
return fmt.Errorf("failed to create file watcher for certificate and key rotation: %w", err)
298-
} else {
299-
defer func() {
300-
if err := watcher.Close(); err != nil {
301-
klog.Errorf("Failed to close file watcher: %v", err)
302-
}
303-
}()
304-
if err := watcher.Add(certDir); err != nil {
305-
return fmt.Errorf("failed to add %v to watcher: %w", certDir, err)
306-
}
307-
if certDir != keyDir {
308-
if err := watcher.Add(keyDir); err != nil {
309-
return fmt.Errorf("failed to add %v to watcher: %w", keyDir, err)
310-
}
311-
}
312-
}
283+
server := createHttpServer(metricsContext, client, disableMetricsAuth)
284+
285+
resultChannelCount++
286+
go func() {
287+
startListening(server, tlsConfig, listenAddress, resultChannel)
288+
}()
313289

290+
// Wait for server to exit or shutdown signal
314291
shutdown := false
315-
restartServer := false
316292
var loopError error
317293
for resultChannelCount > 0 {
294+
klog.Infof("Waiting on %d outstanding goroutines.", resultChannelCount)
318295
if shutdown {
319296
select {
320297
case result := <-resultChannel:
@@ -326,63 +303,24 @@ func RunMetrics(runContext context.Context, shutdownContext context.Context, lis
326303
}
327304
} else {
328305
select {
329-
case <-runContext.Done(): // clean shutdown
330-
case result := <-resultChannel: // crashed before a shutdown was requested or metrics server recreated
331-
if restartServer {
332-
klog.Info("Creating metrics server with updated TLS configuration.")
333-
server = createHttpServer(runContext, client, disableMetricsAuth)
334-
go startListening(server, tlsConfig, listenAddress, resultChannel)
335-
restartServer = false
336-
continue
337-
}
306+
case <-metricsContext.Done():
307+
klog.Infof("Clean metrics shutdown requested: %v", metricsContext.Err())
308+
case result := <-resultChannel:
338309
resultChannelCount--
339310
loopError = handleServerResult(result, loopError)
340-
case event := <-watcher.Events:
341-
if event.Op != fsnotify.Chmod && event.Op != fsnotify.Remove {
342-
if changed, err := certsChanged(origCertChecksum, origKeyChecksum, certFile, keyFile); changed {
343-
344-
// Update file checksums with latest files.
345-
//
346-
if origCertChecksum, err = checksumFile(certFile); err != nil {
347-
klog.Errorf("Failed to update certificate file checksum: %v", err)
348-
loopError = err
349-
break
350-
}
351-
if origKeyChecksum, err = checksumFile(keyFile); err != nil {
352-
klog.Errorf("Failed to update key file checksum: %v", err)
353-
loopError = err
354-
break
355-
}
356-
357-
tlsConfig, err = makeTLSConfig(certFile, keyFile)
358-
if err == nil {
359-
restartServer = true
360-
shutdownHttpServer(shutdownContext, server)
361-
continue
362-
} else {
363-
klog.Errorf("Failed to create TLS configuration with updated configuration: %v", err)
364-
loopError = err
365-
}
366-
} else if err != nil {
367-
klog.Errorf("%v", err)
368-
loopError = err
369-
} else {
370-
continue
371-
}
372-
} else {
373-
continue
374-
}
375-
case err = <-watcher.Errors:
376-
klog.Errorf("Error from metrics server certificate file watcher: %v", err)
377-
loopError = err
378311
}
379312
shutdown = true
380313
shutdownError := server.Shutdown(shutdownContext)
314+
if shutdownError != nil { // log the error we are discarding
315+
klog.Errorf("Failed to gracefully shut down metrics server: %v", shutdownError)
316+
}
317+
381318
if loopError == nil {
382319
loopError = shutdownError
383-
} else if shutdownError != nil { // log the error we are discarding
384-
klog.Errorf("Failed to gracefully shut down metrics server: %v", shutdownError)
385320
}
321+
322+
// Request remaining go routines to shut down
323+
metricsContextCancel()
386324
}
387325
}
388326

@@ -722,95 +660,20 @@ func mostRecentTimestamp(cv *configv1.ClusterVersion) int64 {
722660
return latest.Unix()
723661
}
724662

725-
// Determine if the certificates have changed and need to be updated.
726-
// If no errors occur, returns true if both files have changed and
727-
// neither is an empty file. Otherwise returns false and any error.
728-
func certsChanged(origCertChecksum []byte, origKeyChecksum []byte, certFile, keyFile string) (bool, error) {
729-
// Check if both files exist.
730-
certNotEmpty, err := fileExistsAndNotEmpty(certFile)
663+
func makeTLSConfig(servingCertController dynamiccertificates.CertKeyContentProvider) (*tls.Config, error) {
664+
_, err := tls.X509KeyPair(servingCertController.CurrentCertKeyContent())
731665
if err != nil {
732-
return false, fmt.Errorf("Error checking if changed TLS cert file empty/exists: %w", err)
666+
return nil, fmt.Errorf("failed to create X509 key pair: %w", err)
733667
}
734-
keyNotEmpty, err := fileExistsAndNotEmpty(keyFile)
735-
if err != nil {
736-
return false, fmt.Errorf("error checking if changed TLS key file empty/exists: %w", err)
737-
}
738-
if !certNotEmpty || !keyNotEmpty {
739-
// One of the files is missing despite some file event.
740-
return false, fmt.Errorf("certificate or key is missing or empty, certificates will not be rotated")
741-
}
742-
743-
currentCertChecksum, err := checksumFile(certFile)
744-
if err != nil {
745-
return false, fmt.Errorf("error checking certificate file checksum: %w", err)
746-
}
747-
748-
currentKeyChecksum, err := checksumFile(keyFile)
749-
if err != nil {
750-
return false, fmt.Errorf("error checking key file checksum: %w", err)
751-
}
752-
753-
// Check if the non-empty certificate/key files have actually changed.
754-
if !bytes.Equal(origCertChecksum, currentCertChecksum) && !bytes.Equal(origKeyChecksum, currentKeyChecksum) {
755-
klog.V(2).Info("Certificate and key changed. Will recreate metrics server with updated TLS configuration.")
756-
return true, nil
757-
}
758-
759-
return false, nil
760-
}
761-
762-
func makeTLSConfig(servingCertFile, servingKeyFile string) (*tls.Config, error) {
763-
// Load the initial certificate contents.
764-
certBytes, err := os.ReadFile(servingCertFile)
765-
if err != nil {
766-
return nil, err
767-
}
768-
keyBytes, err := os.ReadFile(servingKeyFile)
769-
if err != nil {
770-
return nil, err
771-
}
772-
certificate, err := tls.X509KeyPair(certBytes, keyBytes)
773-
if err != nil {
774-
return nil, err
775-
}
776-
777-
return crypto.SecureTLSConfig(&tls.Config{
668+
tlsConfig := crypto.SecureTLSConfig(&tls.Config{
778669
GetCertificate: func(_ *tls.ClientHelloInfo) (*tls.Certificate, error) {
779-
return &certificate, nil
670+
cert, err := tls.X509KeyPair(servingCertController.CurrentCertKeyContent())
671+
if err != nil {
672+
klog.Errorf("Failed to load current serving certificate, rejecting connection: %v", err)
673+
return nil, fmt.Errorf("invalid serving certificate: %w", err)
674+
}
675+
return &cert, nil
780676
},
781-
}), nil
782-
}
783-
784-
// Compute the sha256 checksum for file 'fName' returning any error.
785-
func checksumFile(fName string) ([]byte, error) {
786-
file, err := os.Open(fName)
787-
if err != nil {
788-
return nil, fmt.Errorf("failed to open file %v for checksum: %w", fName, err)
789-
}
790-
defer func() {
791-
if err := file.Close(); err != nil {
792-
klog.Errorf("failed to close file %s: %v", fName, err)
793-
}
794-
}()
795-
796-
hash := sha256.New()
797-
798-
if _, err = io.Copy(hash, file); err != nil {
799-
return nil, fmt.Errorf("failed to compute checksum for file %v: %w", fName, err)
800-
}
801-
802-
return hash.Sum(nil), nil
803-
}
804-
805-
// Check if a file exists and has file.Size() not equal to 0.
806-
// Returns any error returned by os.Stat other than os.ErrNotExist.
807-
func fileExistsAndNotEmpty(fName string) (bool, error) {
808-
if fi, err := os.Stat(fName); err == nil {
809-
return (fi.Size() != 0), nil
810-
} else if errors.Is(err, os.ErrNotExist) {
811-
return false, nil
812-
} else {
813-
// Some other error, file may not exist.
814-
return false, err
815-
}
677+
})
678+
return tlsConfig, nil
816679
}

0 commit comments

Comments
 (0)