From 6fe4c18194af078dd14b70261726ebe062002103 Mon Sep 17 00:00:00 2001 From: Yama Date: Sat, 16 May 2026 13:49:53 +0200 Subject: [PATCH 01/13] feat: shard rule-ConfigMaps to avoid etcd size limit --- charts/operator/templates/role.yaml | 21 +- charts/operator/templates/rolebinding.yaml | 17 + charts/operator/templates/rule-evaluator.yaml | 7 +- cmd/config-reloader/README.md | 4 + cmd/config-reloader/internal/syncer.go | 166 +++++++ cmd/config-reloader/internal/syncer_test.go | 362 ++++++++++++++ cmd/config-reloader/main.go | 66 ++- e2e/ruler_test.go | 246 +++++++++- manifests/operator.yaml | 35 +- pkg/operator/rules.go | 217 +++++++-- pkg/operator/rules_test.go | 441 ++++++++++++++++++ 11 files changed, 1514 insertions(+), 68 deletions(-) create mode 100644 cmd/config-reloader/internal/syncer.go create mode 100644 cmd/config-reloader/internal/syncer_test.go diff --git a/charts/operator/templates/role.yaml b/charts/operator/templates/role.yaml index 21541f6a13..373cd67b98 100644 --- a/charts/operator/templates/role.yaml +++ b/charts/operator/templates/role.yaml @@ -68,7 +68,7 @@ rules: - resources: - configmaps apiGroups: [""] - verbs: ["list", "watch", "create"] + verbs: ["list", "watch", "create", "get", "patch", "update", "delete"] - resources: - configmaps apiGroups: [""] @@ -185,3 +185,22 @@ rules: - gmp-operator verbs: ["delete"] {{- end -}} +{{- if .Values.collector.rbac.create -}} + {{- if .Values.operator.rbac.create }} +--- + {{- end }} +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: collector + namespace: {{.Values.namespace.system}} + {{- if .Values.commonLabels }} + labels: + {{- include "prometheus-engine.labels" . | nindent 4 }} + {{- end }} +rules: +- resources: + - configmaps + apiGroups: [""] + verbs: ["get", "list", "watch"] +{{- end -}} diff --git a/charts/operator/templates/rolebinding.yaml b/charts/operator/templates/rolebinding.yaml index 768da24a6c..a0e932639a 100644 --- a/charts/operator/templates/rolebinding.yaml +++ b/charts/operator/templates/rolebinding.yaml @@ -103,4 +103,21 @@ subjects: - name: collector namespace: {{.Values.namespace.system}} kind: ServiceAccount +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: collector + namespace: {{.Values.namespace.system}} + {{- if .Values.commonLabels }} + labels: + {{- include "prometheus-engine.labels" . | nindent 4 }} + {{- end }} +roleRef: + name: collector + kind: Role + apiGroup: rbac.authorization.k8s.io +subjects: +- name: collector + kind: ServiceAccount {{- end -}} diff --git a/charts/operator/templates/rule-evaluator.yaml b/charts/operator/templates/rule-evaluator.yaml index ae6d8350f9..27473f1a52 100644 --- a/charts/operator/templates/rule-evaluator.yaml +++ b/charts/operator/templates/rule-evaluator.yaml @@ -99,6 +99,8 @@ spec: - --config-file=/prometheus/config/config.yaml - --config-file-output=/prometheus/config_out/config.yaml - --config-dir=/etc/rules + - --config-dir-from-configmap-selector=monitoring.googleapis.com/rules-shard=true + - --config-dir-from-configmap-namespace={{.Values.namespace.system}} - --config-dir-output=/prometheus/rules_out - --watched-dir=/etc/secrets - --reload-url=http://127.0.0.1:19092/-/reload @@ -115,7 +117,6 @@ spec: - name: config-out mountPath: /prometheus/config_out - name: rules - readOnly: true mountPath: /etc/rules - name: rules-out mountPath: /prometheus/rules_out @@ -137,9 +138,7 @@ spec: - name: config-out emptyDir: {} - name: rules - configMap: - name: rules-generated - defaultMode: 420 + emptyDir: {} - name: rules-out emptyDir: {} - name: rules-secret diff --git a/cmd/config-reloader/README.md b/cmd/config-reloader/README.md index 289a5a45f0..51a805c836 100644 --- a/cmd/config-reloader/README.md +++ b/cmd/config-reloader/README.md @@ -9,6 +9,10 @@ Meant to be run as a sidecar. Usage of config-reloader: -config-dir string config directory to watch for changes + -config-dir-from-configmap-namespace string + namespace to list ConfigMaps from (required when --config-dir-from-configmap-selector is set) + -config-dir-from-configmap-selector string + label selector to discover rule ConfigMaps via K8s API (e.g. monitoring.googleapis.com/rules-shard=true). When set, replaces --config-dir for rule file discovery. -config-dir-output string config directory to write with interpolated environment variables -config-file string diff --git a/cmd/config-reloader/internal/syncer.go b/cmd/config-reloader/internal/syncer.go new file mode 100644 index 0000000000..be3c988acf --- /dev/null +++ b/cmd/config-reloader/internal/syncer.go @@ -0,0 +1,166 @@ +// Copyright 2024 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package internal + +import ( + "context" + "crypto/sha256" + "fmt" + "os" + "path/filepath" + "sort" + "time" + + "github.com/go-kit/log" + "github.com/go-kit/log/level" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" +) + +type ConfigMapSyncer struct { + client kubernetes.Interface + namespace string + selector string + outputDir string + logger log.Logger + interval time.Duration + + lastHash string +} + +func NewConfigMapSyncer(namespace, selector, outputDir string, interval time.Duration, logger log.Logger) (*ConfigMapSyncer, error) { + cfg, err := rest.InClusterConfig() + if err != nil { + return nil, err + } + client, err := kubernetes.NewForConfig(cfg) + if err != nil { + return nil, err + } + return newConfigMapSyncerWithClient(client, namespace, selector, outputDir, interval, logger), nil +} + +func newConfigMapSyncerWithClient(client kubernetes.Interface, namespace, selector, outputDir string, interval time.Duration, logger log.Logger) *ConfigMapSyncer { + return &ConfigMapSyncer{ + client: client, + namespace: namespace, + selector: selector, + outputDir: outputDir, + interval: interval, + logger: logger, + } +} + +func (s *ConfigMapSyncer) Sync(ctx context.Context) (bool, error) { + cmList, err := s.client.CoreV1().ConfigMaps(s.namespace).List(ctx, metav1.ListOptions{ + LabelSelector: s.selector, + }) + if err != nil { + return false, fmt.Errorf("list configmaps: %w", err) + } + + files := make(map[string][]byte) + for i := range cmList.Items { + cm := &cmList.Items[i] + for k, v := range cm.Data { + files[cm.Name+"__"+k] = []byte(v) + } + for k, v := range cm.BinaryData { + files[cm.Name+"__"+k] = v + } + } + + hash := hashFiles(files) + if hash == s.lastHash { + return false, nil + } + + if err := s.writeFiles(files); err != nil { + return false, err + } + + s.lastHash = hash + //nolint:errcheck + level.Info(s.logger).Log("msg", "synced configmap rules", "configmaps", len(cmList.Items), "files", len(files)) + return true, nil +} + +func (s *ConfigMapSyncer) Run(ctx context.Context) error { + // Best-effort initial sync; the reloader will pick up files on its next poll cycle. + if _, err := s.Sync(ctx); err != nil { + //nolint:errcheck + level.Warn(s.logger).Log("msg", "initial configmap sync failed", "err", err) + } + ticker := time.NewTicker(s.interval) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return nil + case <-ticker.C: + if _, err := s.Sync(ctx); err != nil { + //nolint:errcheck + level.Warn(s.logger).Log("msg", "configmap sync failed", "err", err) + } + } + } +} + +func (s *ConfigMapSyncer) writeFiles(files map[string][]byte) error { + if err := os.MkdirAll(s.outputDir, 0o755); err != nil { + return err + } + + for name, data := range files { + if err := os.WriteFile(filepath.Join(s.outputDir, name), data, 0o644); err != nil { + return err + } + } + + entries, err := os.ReadDir(s.outputDir) + if err != nil { + return err + } + for _, e := range entries { + if e.IsDir() { + continue + } + if _, ok := files[e.Name()]; !ok { + if err := os.Remove(filepath.Join(s.outputDir, e.Name())); err != nil { + return err + } + } + } + + return nil +} + +func hashFiles(files map[string][]byte) string { + h := sha256.New() + + keys := make([]string, 0, len(files)) + for k := range files { + keys = append(keys, k) + } + sort.Strings(keys) + + for _, k := range keys { + fmt.Fprintf(h, "%s\x00", k) + _, _ = h.Write(files[k]) + _, _ = h.Write([]byte{0}) + } + return fmt.Sprintf("%x", h.Sum(nil)) +} diff --git a/cmd/config-reloader/internal/syncer_test.go b/cmd/config-reloader/internal/syncer_test.go new file mode 100644 index 0000000000..65c83518da --- /dev/null +++ b/cmd/config-reloader/internal/syncer_test.go @@ -0,0 +1,362 @@ +// Copyright 2024 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package internal + +import ( + "os" + "path/filepath" + "testing" + "time" + + "github.com/go-kit/log" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/fake" +) + +func TestConfigMapSyncer_BasicSync(t *testing.T) { + outputDir := t.TempDir() + + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "rules-generated-0", + Namespace: "gmp-system", + Labels: map[string]string{ + "monitoring.googleapis.com/rules-shard": "true", + }, + }, + Data: map[string]string{ + "rules__default__test.yaml": "groups:\n- name: test\n rules: []\n", + }, + } + + client := fake.NewSimpleClientset(cm) + syncer := newConfigMapSyncerWithClient(client, "gmp-system", "monitoring.googleapis.com/rules-shard=true", outputDir, time.Second, log.NewNopLogger()) + + changed, err := syncer.Sync(t.Context()) + if err != nil { + t.Fatal(err) + } + if !changed { + t.Fatal("expected changed=true on first sync") + } + + data, err := os.ReadFile(filepath.Join(outputDir, "rules-generated-0__rules__default__test.yaml")) + if err != nil { + t.Fatal(err) + } + if string(data) != "groups:\n- name: test\n rules: []\n" { + t.Errorf("unexpected file content: %q", data) + } +} + +func TestConfigMapSyncer_NoChangeOnSecondSync(t *testing.T) { + outputDir := t.TempDir() + + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "rules-generated-0", + Namespace: "gmp-system", + Labels: map[string]string{ + "monitoring.googleapis.com/rules-shard": "true", + }, + }, + Data: map[string]string{ + "test.yaml": "groups: []\n", + }, + } + + client := fake.NewSimpleClientset(cm) + syncer := newConfigMapSyncerWithClient(client, "gmp-system", "monitoring.googleapis.com/rules-shard=true", outputDir, time.Second, log.NewNopLogger()) + + if _, err := syncer.Sync(t.Context()); err != nil { + t.Fatalf("first sync: %v", err) + } + + changed, err := syncer.Sync(t.Context()) + if err != nil { + t.Fatalf("second sync: %v", err) + } + if changed { + t.Fatal("expected changed=false when content unchanged") + } +} + +func TestConfigMapSyncer_StaleFileRemoval(t *testing.T) { + outputDir := t.TempDir() + + staleFile := filepath.Join(outputDir, "old-shard__stale.yaml") + if err := os.WriteFile(staleFile, []byte("stale"), 0o644); err != nil { + t.Fatal(err) + } + + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "rules-generated-0", + Namespace: "gmp-system", + Labels: map[string]string{ + "monitoring.googleapis.com/rules-shard": "true", + }, + }, + Data: map[string]string{ + "current.yaml": "groups: []\n", + }, + } + + client := fake.NewSimpleClientset(cm) + syncer := newConfigMapSyncerWithClient(client, "gmp-system", "monitoring.googleapis.com/rules-shard=true", outputDir, time.Second, log.NewNopLogger()) + + if _, err := syncer.Sync(t.Context()); err != nil { + t.Fatal(err) + } + + if _, err := os.Stat(staleFile); !os.IsNotExist(err) { + t.Error("stale file was not removed") + } + if _, err := os.Stat(filepath.Join(outputDir, "rules-generated-0__current.yaml")); err != nil { + t.Errorf("current file missing: %v", err) + } +} + +func TestConfigMapSyncer_MultipleConfigMaps(t *testing.T) { + outputDir := t.TempDir() + + cm0 := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "rules-generated-0", + Namespace: "gmp-system", + Labels: map[string]string{ + "monitoring.googleapis.com/rules-shard": "true", + }, + }, + Data: map[string]string{"rules1.yaml": "shard0-rules1"}, + } + cm1 := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "rules-generated-1", + Namespace: "gmp-system", + Labels: map[string]string{ + "monitoring.googleapis.com/rules-shard": "true", + }, + }, + Data: map[string]string{"rules2.yaml": "shard1-rules2"}, + } + + client := fake.NewSimpleClientset(cm0, cm1) + syncer := newConfigMapSyncerWithClient(client, "gmp-system", "monitoring.googleapis.com/rules-shard=true", outputDir, time.Second, log.NewNopLogger()) + + changed, err := syncer.Sync(t.Context()) + if err != nil { + t.Fatal(err) + } + if !changed { + t.Fatal("expected changed=true") + } + + entries, err := os.ReadDir(outputDir) + if err != nil { + t.Fatal(err) + } + if len(entries) != 2 { + t.Errorf("expected 2 files, got %d", len(entries)) + } +} + +func TestConfigMapSyncer_ContentUpdateDetection(t *testing.T) { + outputDir := t.TempDir() + + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "rules-generated-0", + Namespace: "gmp-system", + Labels: map[string]string{ + "monitoring.googleapis.com/rules-shard": "true", + }, + }, + Data: map[string]string{"rules.yaml": "version: 1"}, + } + + client := fake.NewSimpleClientset(cm) + syncer := newConfigMapSyncerWithClient(client, "gmp-system", "monitoring.googleapis.com/rules-shard=true", outputDir, time.Second, log.NewNopLogger()) + + if _, err := syncer.Sync(t.Context()); err != nil { + t.Fatalf("first sync: %v", err) + } + + cm.Data["rules.yaml"] = "version: 2" + if _, err := client.CoreV1().ConfigMaps("gmp-system").Update(t.Context(), cm, metav1.UpdateOptions{}); err != nil { + t.Fatal(err) + } + + changed, err := syncer.Sync(t.Context()) + if err != nil { + t.Fatalf("second sync: %v", err) + } + if !changed { + t.Fatal("expected changed=true after content update") + } + + data, err := os.ReadFile(filepath.Join(outputDir, "rules-generated-0__rules.yaml")) + if err != nil { + t.Fatal(err) + } + if string(data) != "version: 2" { + t.Errorf("expected updated content, got %q", data) + } +} + +func TestConfigMapSyncer_MixedDataAndBinaryData(t *testing.T) { + outputDir := t.TempDir() + + gzipContent := []byte{0x1f, 0x8b, 0x08, 0x00} + + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "rules-generated-0", + Namespace: "gmp-system", + Labels: map[string]string{ + "monitoring.googleapis.com/rules-shard": "true", + }, + }, + Data: map[string]string{"uncompressed.yaml": "groups: []\n"}, + BinaryData: map[string][]byte{"compressed.yaml": gzipContent}, + } + + client := fake.NewSimpleClientset(cm) + syncer := newConfigMapSyncerWithClient(client, "gmp-system", "monitoring.googleapis.com/rules-shard=true", outputDir, time.Second, log.NewNopLogger()) + + if _, err := syncer.Sync(t.Context()); err != nil { + t.Fatal(err) + } + + entries, err := os.ReadDir(outputDir) + if err != nil { + t.Fatal(err) + } + if len(entries) != 2 { + t.Fatalf("expected 2 files, got %d", len(entries)) + } + + textData, err := os.ReadFile(filepath.Join(outputDir, "rules-generated-0__uncompressed.yaml")) + if err != nil { + t.Fatal(err) + } + if string(textData) != "groups: []\n" { + t.Errorf("unexpected text content: %q", textData) + } + + binData, err := os.ReadFile(filepath.Join(outputDir, "rules-generated-0__compressed.yaml")) + if err != nil { + t.Fatal(err) + } + if len(binData) != len(gzipContent) { + t.Errorf("expected %d binary bytes, got %d", len(gzipContent), len(binData)) + } +} + +func TestConfigMapSyncer_SelectorFiltering(t *testing.T) { + outputDir := t.TempDir() + + matching := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "rules-generated-0", + Namespace: "gmp-system", + Labels: map[string]string{"monitoring.googleapis.com/rules-shard": "true"}, + }, + Data: map[string]string{"rules.yaml": "matched"}, + } + nonMatching := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "rule-evaluator", + Namespace: "gmp-system", + Labels: map[string]string{"app.kubernetes.io/name": "rule-evaluator"}, + }, + Data: map[string]string{"config.yaml": "should-not-appear"}, + } + wrongNamespace := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "rules-generated-0", + Namespace: "other-ns", + Labels: map[string]string{"monitoring.googleapis.com/rules-shard": "true"}, + }, + Data: map[string]string{"rules.yaml": "wrong-namespace"}, + } + + client := fake.NewSimpleClientset(matching, nonMatching, wrongNamespace) + syncer := newConfigMapSyncerWithClient(client, "gmp-system", "monitoring.googleapis.com/rules-shard=true", outputDir, time.Second, log.NewNopLogger()) + + if _, err := syncer.Sync(t.Context()); err != nil { + t.Fatal(err) + } + + entries, err := os.ReadDir(outputDir) + if err != nil { + t.Fatal(err) + } + if len(entries) != 1 { + t.Fatalf("expected 1 file, got %d", len(entries)) + } + + data, err := os.ReadFile(filepath.Join(outputDir, "rules-generated-0__rules.yaml")) + if err != nil { + t.Fatal(err) + } + if string(data) != "matched" { + t.Errorf("unexpected content: %q", data) + } +} + +func TestConfigMapSyncer_ConfigMapRemoved(t *testing.T) { + outputDir := t.TempDir() + + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "rules-generated-0", + Namespace: "gmp-system", + Labels: map[string]string{"monitoring.googleapis.com/rules-shard": "true"}, + }, + Data: map[string]string{"rules.yaml": "data"}, + } + + client := fake.NewSimpleClientset(cm) + syncer := newConfigMapSyncerWithClient(client, "gmp-system", "monitoring.googleapis.com/rules-shard=true", outputDir, time.Second, log.NewNopLogger()) + + if _, err := syncer.Sync(t.Context()); err != nil { + t.Fatal(err) + } + if _, err := os.Stat(filepath.Join(outputDir, "rules-generated-0__rules.yaml")); err != nil { + t.Fatal("file should exist after first sync") + } + + if err := client.CoreV1().ConfigMaps("gmp-system").Delete(t.Context(), "rules-generated-0", metav1.DeleteOptions{}); err != nil { + t.Fatal(err) + } + + changed, err := syncer.Sync(t.Context()) + if err != nil { + t.Fatal(err) + } + if !changed { + t.Fatal("expected changed=true after ConfigMap deletion") + } + + entries, err := os.ReadDir(outputDir) + if err != nil { + t.Fatal(err) + } + if len(entries) != 0 { + t.Errorf("expected 0 files after ConfigMap removed, got %d", len(entries)) + } +} diff --git a/cmd/config-reloader/main.go b/cmd/config-reloader/main.go index da5eaf1cfb..a8c1e98b83 100644 --- a/cmd/config-reloader/main.go +++ b/cmd/config-reloader/main.go @@ -16,6 +16,7 @@ package main import ( "context" + "errors" "flag" "net/http" "net/url" @@ -28,6 +29,8 @@ import ( "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/oklog/run" + + crinternal "github.com/GoogleCloudPlatform/prometheus-engine/cmd/config-reloader/internal" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/collectors" versioninfo "github.com/prometheus/client_golang/prometheus/collectors/version" @@ -42,6 +45,10 @@ func main() { configFileOutput = flag.String("config-file-output", "", "config file to write with interpolated environment variables") configDir = flag.String("config-dir", "", "config directory to watch for changes") configDirOutput = flag.String("config-dir-output", "", "config directory to write with interpolated environment variables") + + configMapSelector = flag.String("config-dir-from-configmap-selector", "", "label selector to discover rule ConfigMaps via K8s API (e.g. monitoring.googleapis.com/rules-shard=true). When set, replaces --config-dir for rule file discovery.") + configMapNamespace = flag.String("config-dir-from-configmap-namespace", "", "namespace to list ConfigMaps from (required when --config-dir-from-configmap-selector is set)") + // Ready and reload endpoints should be compatible with Prometheus-style // management APIs, e.g. // https://prometheus.io/docs/prometheus/latest/management_api/ @@ -135,12 +142,13 @@ func main() { }() <-done - var cfgDirs []reloader.CfgDirOption - if *configDir != "" { - cfgDirs = append(cfgDirs, reloader.CfgDirOption{ - Dir: *configDir, - OutputDir: *configDirOutput, - }) + watchInterval := 10 * time.Second + + cfgDirs, syncer, err := setupCfgDirs(*configMapSelector, *configMapNamespace, *configDir, *configDirOutput, watchInterval, logger) + if err != nil { + //nolint:errcheck + level.Error(logger).Log("msg", "setting up config dirs", "err", err) + os.Exit(1) } rel := reloader.New( @@ -156,7 +164,7 @@ func main() { // Configure a very aggress refresh for now. The reloader will only send reload signals // to Prometheus if the contents actually changed. So this should not have any practical // drawbacks. - WatchInterval: 10 * time.Second, + WatchInterval: watchInterval, RetryInterval: 5 * time.Second, DelayInterval: 3 * time.Second, }, @@ -171,6 +179,16 @@ func main() { cancel() }) } + + if syncer != nil { + ctx, cancel := context.WithCancel(context.Background()) + g.Add(func() error { + return syncer.Run(ctx) + }, func(error) { + cancel() + }) + } + { cancel := make(chan struct{}) g.Add( @@ -213,6 +231,40 @@ func main() { } } +func setupCfgDirs(configMapSelector, configMapNamespace, configDir, configDirOutput string, interval time.Duration, logger log.Logger) ([]reloader.CfgDirOption, *crinternal.ConfigMapSyncer, error) { + if configMapSelector == "" { + var dirs []reloader.CfgDirOption + if configDir != "" { + dirs = append(dirs, reloader.CfgDirOption{ + Dir: configDir, + OutputDir: configDirOutput, + }) + } + return dirs, nil, nil + } + + if configMapNamespace == "" { + return nil, nil, errors.New("--config-dir-from-configmap-namespace required when using --config-dir-from-configmap-selector") + } + if configDir == "" { + return nil, nil, errors.New("--config-dir required when using --config-dir-from-configmap-selector") + } + if configDirOutput == "" { + return nil, nil, errors.New("--config-dir-output required when using --config-dir-from-configmap-selector") + } + + syncer, err := crinternal.NewConfigMapSyncer(configMapNamespace, configMapSelector, configDir, interval, logger) + if err != nil { + return nil, nil, err + } + + dirs := []reloader.CfgDirOption{{ + Dir: configDir, + OutputDir: configDirOutput, + }} + return dirs, syncer, nil +} + type stringSlice []string func (ss *stringSlice) String() string { diff --git a/e2e/ruler_test.go b/e2e/ruler_test.go index f8c4e57cc1..977f586861 100644 --- a/e2e/ruler_test.go +++ b/e2e/ruler_test.go @@ -18,6 +18,7 @@ import ( "bytes" "compress/gzip" "context" + "crypto/sha256" "encoding/json" "errors" "fmt" @@ -562,7 +563,7 @@ func testCreateRules( err := wait.PollUntilContextTimeout(ctx, 3*time.Second, 3*time.Minute, true, func(ctx context.Context) (bool, error) { var cm corev1.ConfigMap - if err := kubeClient.Get(ctx, client.ObjectKey{Namespace: systemNamespace, Name: "rules-generated"}, &cm); err != nil { + if err := kubeClient.Get(ctx, client.ObjectKey{Namespace: systemNamespace, Name: "rules-generated-0"}, &cm); err != nil { if apierrors.IsNotFound(err) { return false, nil } @@ -853,3 +854,246 @@ func logsError(logs string) (string, error) { } return "", nil } + +func TestRuleEvaluatorMultiShardNoCompression(t *testing.T) { + testRuleEvaluatorMultiShard(t, monitoringv1.OperatorFeatures{ + Config: monitoringv1.ConfigSpec{ + Compression: monitoringv1.CompressionNone, + }, + }) +} + +func TestRuleEvaluatorMultiShardGzipCompression(t *testing.T) { + testRuleEvaluatorMultiShard(t, monitoringv1.OperatorFeatures{ + Config: monitoringv1.ConfigSpec{ + Compression: monitoringv1.CompressionGzip, + }, + }) +} + +func testRuleEvaluatorMultiShard(t *testing.T, features monitoringv1.OperatorFeatures) { + ctx := contextWithDeadline(t) + kubeClient, restConfig, err := setupCluster(ctx, t) + if err != nil { + t.Fatalf("error instantiating clients. err: %s", err) + } + + t.Run("rule-evaluator-deployed", testRuleEvaluatorDeployed(ctx, kubeClient)) + t.Run("rule-evaluator-operatorconfig", testRuleEvaluatorOperatorConfig(ctx, kubeClient, features)) + t.Run("rules-multi-shard", testCreateMultiShardRules(ctx, restConfig, kubeClient, operator.DefaultOperatorNamespace, metav1.NamespaceDefault, features)) +} + +func testCreateMultiShardRules( + ctx context.Context, + restConfig *rest.Config, + kubeClient client.Client, + systemNamespace, + userNamespace string, + features monitoringv1.OperatorFeatures, +) func(*testing.T) { + return func(t *testing.T) { + t.Log("creating large rules to trigger multi-shard") + + timeStart := time.Now() + + type ruleSpec struct { + name string + groupName string + expr string + } + + var specs []ruleSpec + switch features.Config.Compression { + case monitoringv1.CompressionGzip: + // 900KiB hex expressions. Hex has 4 bits entropy/byte so gzip compresses + // to ≥50% of original → ≥450KiB per entry. Two entries ≥900KiB > 800KiB + // threshold, guaranteeing a shard split. + for i := range 3 { + specs = append(specs, ruleSpec{ + name: fmt.Sprintf("large-rules-%d", i), + groupName: fmt.Sprintf("group-%d", i), + expr: incompressibleMetricName(900 * 1024), + }) + } + default: + for i := range 2 { + specs = append(specs, ruleSpec{ + name: fmt.Sprintf("large-rules-%d", i), + groupName: fmt.Sprintf("group-%d", i), + expr: strings.Repeat("a", 500*1024), + }) + } + } + + wantGroupNames := make(map[string]bool, len(specs)) + for _, s := range specs { + if err := kubeClient.Create(ctx, largeRules(userNamespace, s.name, s.groupName, s.expr)); err != nil { + t.Fatal(err) + } + wantGroupNames[s.groupName] = true + } + + var shardCMs []corev1.ConfigMap + err := wait.PollUntilContextTimeout(ctx, 3*time.Second, 3*time.Minute, true, func(ctx context.Context) (bool, error) { + var cmList corev1.ConfigMapList + if err := kubeClient.List(ctx, &cmList, + client.InNamespace(systemNamespace), + client.MatchingLabels{"monitoring.googleapis.com/rules-shard": "true"}, + ); err != nil { + return false, fmt.Errorf("list shard ConfigMaps: %w", err) + } + if len(cmList.Items) < 2 { + return false, nil + } + shardCMs = cmList.Items + return true, nil + }) + if err != nil { + t.Fatalf("waiting for at least 2 shard ConfigMaps: %s", err) + } + t.Logf("found %d shard ConfigMaps", len(shardCMs)) + + allFiles := make(map[string]bool) + for _, cm := range shardCMs { + isShard0 := cm.Name == "rules-generated-0" + if isShard0 { + if _, ok := cm.Data["empty.yaml"]; !ok { + t.Error("shard 0 missing empty.yaml sentinel") + } + } else { + if _, ok := cm.Data["empty.yaml"]; ok { + t.Errorf("shard %s should not have empty.yaml sentinel", cm.Name) + } + } + for k := range cm.Data { + allFiles[k] = true + } + for k := range cm.BinaryData { + allFiles[k] = true + } + if features.Config.Compression == monitoringv1.CompressionGzip { + for k := range cm.Data { + if k != "empty.yaml" { + t.Errorf("shard %s: key %q should be in BinaryData with gzip, not Data", cm.Name, k) + } + } + } + } + + for _, s := range specs { + filename := fmt.Sprintf("rules__%s__%s.yaml", userNamespace, s.name) + if !allFiles[filename] { + t.Errorf("expected rule file %q not found in any shard", filename) + } + } + + var legacyCM corev1.ConfigMap + err = kubeClient.Get(ctx, client.ObjectKey{Namespace: systemNamespace, Name: "rules-generated"}, &legacyCM) + if err == nil { + t.Error("legacy rules-generated ConfigMap should not exist") + } else if !apierrors.IsNotFound(err) { + t.Fatalf("unexpected error checking legacy ConfigMap: %s", err) + } + + if err := kube.WaitForDeploymentReady(ctx, kubeClient, systemNamespace, operator.NameRuleEvaluator); err != nil { + t.Errorf("rule-evaluator is not ready: %s", err) + out := strings.Builder{} + if err := kube.Debug(t.Context(), restConfig, kubeClient, &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: systemNamespace, + Name: operator.NameRuleEvaluator, + }, + }, &out); err != nil { + t.Fatalf("unable to debug: %s", err) + } + t.Fatalf("debug:\n%s", out.String()) + } + + pod, err := ruleEvaluatorPod(ctx, kubeClient, systemNamespace) + if err != nil { + t.Fatalf("unable to get rule-evaluator pod: %s", err) + } + + httpClient, err := createPortForwardClient(t, restConfig, kubeClient) + if err != nil { + t.Fatalf("failed to create port forward client: %s", err) + } + + err = wait.PollUntilContextTimeout(ctx, 3*time.Second, 3*time.Minute, true, func(ctx context.Context) (bool, error) { + updated, err := isRuntimeInfoUpdatedSince(ctx, httpClient, pod, 19092, timeStart) + if err != nil { + t.Logf("unable to check rule-evaluator status: %s", err) + return false, nil + } + return updated, nil + }) + if err != nil { + t.Fatalf("failed waiting for rule-evaluator config reload: %s", err) + } + + err = wait.PollUntilContextTimeout(ctx, 3*time.Second, 3*time.Minute, true, func(ctx context.Context) (bool, error) { + rules, err := getRules(ctx, httpClient, pod.Status.PodIP, 19092) + if err != nil { + t.Logf("unable to get rules: %s", err) + return false, nil + } + gotGroups := make(map[string]bool, len(rules.Groups)) + for _, g := range rules.Groups { + gotGroups[g.Name] = true + } + for name := range wantGroupNames { + if !gotGroups[name] { + t.Logf("rule group %q not yet loaded", name) + return false, nil + } + } + return true, nil + }) + if err != nil { + t.Fatalf("not all rule groups appeared in evaluator API: %s", err) + } + + logs, err := kube.PodLogs(ctx, restConfig, pod.Namespace, pod.Name, configReloaderContainerName) + if err != nil { + t.Fatalf("unable to fetch config-reloader logs: %s", err) + } + line, err := logsError(logs) + if err != nil { + t.Fatalf("unable to read logs: %s", err) + } + if line != "" { + t.Fatalf("found error in config-reloader logs: %s", line) + } + } +} + +func largeRules(namespace, name, groupName, expr string) *monitoringv1.Rules { + return &monitoringv1.Rules{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + }, + Spec: monitoringv1.RulesSpec{ + Groups: []monitoringv1.RuleGroup{ + { + Name: groupName, + Rules: []monitoringv1.Rule{ + {Record: "shard_test_" + name, Expr: expr}, + }, + }, + }, + }, + } +} + +func incompressibleMetricName(size int) string { + var buf strings.Builder + buf.Grow(size) + _, _ = buf.WriteString("m_") + h := sha256.Sum256([]byte("seed")) + for buf.Len() < size { + _, _ = buf.WriteString(fmt.Sprintf("%x", h)) + h = sha256.Sum256(h[:]) + } + return buf.String()[:size] +} diff --git a/manifests/operator.yaml b/manifests/operator.yaml index da8d181b26..11120e850a 100644 --- a/manifests/operator.yaml +++ b/manifests/operator.yaml @@ -190,7 +190,7 @@ rules: - resources: - configmaps apiGroups: [""] - verbs: ["list", "watch", "create"] + verbs: ["list", "watch", "create", "get", "patch", "update", "delete"] - resources: - configmaps apiGroups: [""] @@ -247,6 +247,18 @@ rules: apiGroups: ["monitoring.googleapis.com"] verbs: ["get", "update", "list", "watch"] --- +# Source: operator/templates/role.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: collector + namespace: gmp-system +rules: +- resources: + - configmaps + apiGroups: [""] + verbs: ["get", "list", "watch"] +--- # Source: operator/templates/rolebinding.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding @@ -276,6 +288,20 @@ subjects: - name: operator kind: ServiceAccount --- +# Source: operator/templates/rolebinding.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: collector + namespace: gmp-system +roleRef: + name: collector + kind: Role + apiGroup: rbac.authorization.k8s.io +subjects: +- name: collector + kind: ServiceAccount +--- # Source: operator/templates/alertmanager.yaml apiVersion: v1 kind: Service @@ -727,6 +753,8 @@ spec: - --config-file=/prometheus/config/config.yaml - --config-file-output=/prometheus/config_out/config.yaml - --config-dir=/etc/rules + - --config-dir-from-configmap-selector=monitoring.googleapis.com/rules-shard=true + - --config-dir-from-configmap-namespace=gmp-system - --config-dir-output=/prometheus/rules_out - --watched-dir=/etc/secrets - --reload-url=http://127.0.0.1:19092/-/reload @@ -748,7 +776,6 @@ spec: - name: config-out mountPath: /prometheus/config_out - name: rules - readOnly: true mountPath: /etc/rules - name: rules-out mountPath: /prometheus/rules_out @@ -770,9 +797,7 @@ spec: - name: config-out emptyDir: {} - name: rules - configMap: - name: rules-generated - defaultMode: 420 + emptyDir: {} - name: rules-out emptyDir: {} - name: rules-secret diff --git a/pkg/operator/rules.go b/pkg/operator/rules.go index 283c544d1c..e5bb3d5dcf 100644 --- a/pkg/operator/rules.go +++ b/pkg/operator/rules.go @@ -29,6 +29,7 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/event" "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/controller-runtime/pkg/reconcile" @@ -36,7 +37,10 @@ import ( ) const ( - nameRulesGenerated = "rules-generated" + nameRulesGenerated = "rules-generated" + nameRulesGeneratedPrefix = "rules-generated-" + maxShardDataBytes = 800 * 1024 // headroom below 1MB etcd limit for metadata overhead + labelRulesShardType = "monitoring.googleapis.com/rules-shard" ) func setupRulesControllers(op *Operator) error { @@ -52,10 +56,10 @@ func setupRulesControllers(op *Operator) error { namespace: op.opts.PublicNamespace, name: NameOperatorConfig, } - // Rule-evaluator rules ConfigMap filter. - objFilterRulesGenerated := namespacedNamePredicate{ + // Rule-evaluator rules shard ConfigMap filter. + objFilterRuleShards := namespacedLabelPredicate{ namespace: op.opts.OperatorNamespace, - name: nameRulesGenerated, + labels: map[string]string{labelRulesShardType: "true"}, } // Reconcile the generated rules that are used by the rule-evaluator deployment. @@ -86,7 +90,7 @@ func setupRulesControllers(op *Operator) error { Watches( &corev1.ConfigMap{}, enqueueConst(objRequest), - builder.WithPredicates(objFilterRulesGenerated), + builder.WithPredicates(objFilterRuleShards), ). Complete(newRulesReconciler(op.manager.GetClient(), op.opts)) if err != nil { @@ -220,35 +224,65 @@ func hasGlobalRules(ctx context.Context, c client.Client) (bool, error) { return len(rules.Items) > 0, nil } -// ensureRuleConfigs updates the Prometheus Rules ConfigMap. -func (r *rulesReconciler) ensureRuleConfigs(ctx context.Context, projectID, location, cluster string, configCompression monitoringv1.CompressionType) error { - logger, _ := logr.FromContext(ctx) +type ruleEntry struct { + filename string + data string +} - // Re-generate the configmap that's loaded by the rule-evaluator. - cm := &corev1.ConfigMap{ - ObjectMeta: metav1.ObjectMeta{ - Namespace: r.opts.OperatorNamespace, - Name: nameRulesGenerated, - Labels: map[string]string{ - LabelAppName: NameRuleEvaluator, - }, - }, - // Ensure there's always at least an empty, uncompressed dummy file as the evaluator - // expects at least one match. - Data: map[string]string{ - "empty.yaml": "", - }, +func configMapDataSize(cm *corev1.ConfigMap) int { + var n int + for k, v := range cm.Data { + n += len(k) + len(v) + } + for k, v := range cm.BinaryData { + n += len(k) + len(v) + } + return n +} + +func deleteConfigMapKey(cm *corev1.ConfigMap, compression monitoringv1.CompressionType, key string) { + switch compression { + case monitoringv1.CompressionGzip: + delete(cm.BinaryData, key) + default: + delete(cm.Data, key) + } +} + +type namespacedLabelPredicate struct { + namespace string + labels map[string]string +} + +func (p namespacedLabelPredicate) Create(e event.CreateEvent) bool { + return p.matches(e.Object) +} +func (p namespacedLabelPredicate) Update(e event.UpdateEvent) bool { + return p.matches(e.ObjectNew) +} +func (p namespacedLabelPredicate) Delete(e event.DeleteEvent) bool { + return p.matches(e.Object) +} +func (p namespacedLabelPredicate) Generic(e event.GenericEvent) bool { + return p.matches(e.Object) +} + +func (p namespacedLabelPredicate) matches(obj client.Object) bool { + if obj.GetNamespace() != p.namespace { + return false + } + objLabels := obj.GetLabels() + for k, v := range p.labels { + if objLabels[k] != v { + return false + } } + return true +} + +func (r *rulesReconciler) ensureRuleConfigs(ctx context.Context, projectID, location, cluster string, configCompression monitoringv1.CompressionType) error { + logger, _ := logr.FromContext(ctx) - // Generate a final rule file for each Rules resource. - // - // Depending on the scope level (global, cluster, namespace) the rules will be generated - // so that queries are constrained to the appropriate project_id, cluster, and namespace - // labels and that they are preserved through query aggregations and appear on the - // output data. - // - // The location is not scoped as it's not a meaningful boundary for "human access" - // to data as clusters may span locations. var rulesList monitoringv1.RulesList if err := r.client.List(ctx, &rulesList); err != nil { return fmt.Errorf("list rules: %w", err) @@ -260,6 +294,7 @@ func (r *rulesReconciler) ensureRuleConfigs(ctx context.Context, projectID, loca Status: corev1.ConditionTrue, } var statusUpdates []monitoringv1.MonitoringCRD + var entries []ruleEntry for i := range rulesList.Items { rs := &rulesList.Items[i] @@ -278,10 +313,7 @@ func (r *rulesReconciler) ensureRuleConfigs(ctx context.Context, projectID, loca continue } filename := fmt.Sprintf("rules__%s__%s.yaml", rs.Namespace, rs.Name) - if err := setConfigMapData(cm, configCompression, filename, result); err != nil { - return err - } - + entries = append(entries, ruleEntry{filename: filename, data: result}) if rs.Status.SetMonitoringCondition(rs.GetGeneration(), now, conditionSuccess) { statusUpdates = append(statusUpdates, rs) } @@ -308,10 +340,7 @@ func (r *rulesReconciler) ensureRuleConfigs(ctx context.Context, projectID, loca continue } filename := fmt.Sprintf("clusterrules__%s.yaml", rs.Name) - if err := setConfigMapData(cm, configCompression, filename, result); err != nil { - return err - } - + entries = append(entries, ruleEntry{filename: filename, data: result}) if rs.Status.SetMonitoringCondition(rs.GetGeneration(), now, conditionSuccess) { statusUpdates = append(statusUpdates, rs) } @@ -338,22 +367,18 @@ func (r *rulesReconciler) ensureRuleConfigs(ctx context.Context, projectID, loca continue } filename := fmt.Sprintf("globalrules__%s.yaml", rs.Name) - if err := setConfigMapData(cm, configCompression, filename, result); err != nil { - return err - } - + entries = append(entries, ruleEntry{filename: filename, data: result}) if rs.Status.SetMonitoringCondition(rs.GetGeneration(), now, conditionSuccess) { statusUpdates = append(statusUpdates, rs) } } - // Create or update generated rule ConfigMap. - if err := r.client.Update(ctx, cm); apierrors.IsNotFound(err) { - if err := r.client.Create(ctx, cm); err != nil { - return fmt.Errorf("create generated rules: %w", err) - } - } else if err != nil { - return fmt.Errorf("update generated rules: %w", err) + numShards, err := r.upsertShards(ctx, entries, configCompression) + if err != nil { + return err + } + if err := r.deleteStaleShards(ctx, logger, numShards); err != nil { + return err } var errs []error @@ -365,3 +390,95 @@ func (r *rulesReconciler) ensureRuleConfigs(ctx context.Context, projectID, loca return errors.Join(errs...) } + +func (r *rulesReconciler) newShardConfigMap(idx int) *corev1.ConfigMap { + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: r.opts.OperatorNamespace, + Name: fmt.Sprintf("%s%d", nameRulesGeneratedPrefix, idx), + Labels: map[string]string{ + LabelAppName: NameRuleEvaluator, + labelRulesShardType: "true", + }, + }, + } + if idx == 0 { + cm.Data = map[string]string{"empty.yaml": ""} + } + return cm +} + +func (r *rulesReconciler) createOrUpdateConfigMap(ctx context.Context, cm *corev1.ConfigMap) error { + if err := r.client.Update(ctx, cm); apierrors.IsNotFound(err) { + if err := r.client.Create(ctx, cm); err != nil { + return fmt.Errorf("create shard %s: %w", cm.Name, err) + } + } else if err != nil { + return fmt.Errorf("update shard %s: %w", cm.Name, err) + } + return nil +} + +func (r *rulesReconciler) upsertShards(ctx context.Context, entries []ruleEntry, compression monitoringv1.CompressionType) (int, error) { + shardIdx := 0 + cm := r.newShardConfigMap(shardIdx) + entriesInShard := 0 + + for _, e := range entries { + if err := setConfigMapData(cm, compression, e.filename, e.data); err != nil { + return 0, err + } + entriesInShard++ + + if configMapDataSize(cm) > maxShardDataBytes && entriesInShard > 1 { + deleteConfigMapKey(cm, compression, e.filename) + if err := r.createOrUpdateConfigMap(ctx, cm); err != nil { + return 0, err + } + shardIdx++ + cm = r.newShardConfigMap(shardIdx) + if err := setConfigMapData(cm, compression, e.filename, e.data); err != nil { + return 0, err + } + entriesInShard = 1 + } + } + + if err := r.createOrUpdateConfigMap(ctx, cm); err != nil { + return 0, err + } + return shardIdx + 1, nil +} + +func (r *rulesReconciler) deleteStaleShards(ctx context.Context, logger logr.Logger, numShards int) error { + active := make(map[string]bool, numShards) + for i := range numShards { + active[fmt.Sprintf("%s%d", nameRulesGeneratedPrefix, i)] = true + } + + var cmList corev1.ConfigMapList + if err := r.client.List(ctx, &cmList, + client.InNamespace(r.opts.OperatorNamespace), + client.MatchingLabels{labelRulesShardType: "true"}, + ); err != nil { + return fmt.Errorf("list rule configmaps: %w", err) + } + + for i := range cmList.Items { + cm := &cmList.Items[i] + if !active[cm.Name] { + if err := r.client.Delete(ctx, cm); err != nil && !apierrors.IsNotFound(err) { + logger.Error(err, "delete stale shard", "name", cm.Name) + } + } + } + + // Clean up legacy single ConfigMap from before sharding was introduced. + var legacy corev1.ConfigMap + legacy.Namespace = r.opts.OperatorNamespace + legacy.Name = nameRulesGenerated + if err := r.client.Delete(ctx, &legacy); err != nil && !apierrors.IsNotFound(err) { + logger.Error(err, "delete legacy rules-generated") + } + return nil +} diff --git a/pkg/operator/rules_test.go b/pkg/operator/rules_test.go index 7da0e290f0..edda9e1672 100644 --- a/pkg/operator/rules_test.go +++ b/pkg/operator/rules_test.go @@ -18,6 +18,7 @@ import ( "context" "errors" "fmt" + "strings" "testing" "time" @@ -629,3 +630,443 @@ func applyScale(obj client.Object, scale *autoscalingv1.Scale) error { } return nil } + +func TestConfigMapDataSize(t *testing.T) { + cm := &corev1.ConfigMap{ + Data: map[string]string{ + "a.yaml": "hello", + "b.yaml": "world", + }, + BinaryData: map[string][]byte{ + "c.yaml": {0x1f, 0x8b}, + }, + } + got := configMapDataSize(cm) + want := len("a.yaml") + len("hello") + len("b.yaml") + len("world") + len("c.yaml") + 2 + if got != want { + t.Errorf("configMapDataSize = %d, want %d", got, want) + } +} + +func TestEnsureRuleConfigs_MultiShard(t *testing.T) { + largeExpr := strings.Repeat("x", 500*1024) + rules1 := &monitoringv1.Rules{ + ObjectMeta: metav1.ObjectMeta{ + Name: "large-rules-1", + Namespace: "default", + }, + Spec: monitoringv1.RulesSpec{ + Groups: []monitoringv1.RuleGroup{ + { + Name: "test-group-1", + Rules: []monitoringv1.Rule{ + {Record: "test_record", Expr: largeExpr}, + }, + }, + }, + }, + } + rules2 := &monitoringv1.Rules{ + ObjectMeta: metav1.ObjectMeta{ + Name: "large-rules-2", + Namespace: "default", + }, + Spec: monitoringv1.RulesSpec{ + Groups: []monitoringv1.RuleGroup{ + { + Name: "test-group-2", + Rules: []monitoringv1.Rule{ + {Record: "test_record", Expr: largeExpr}, + }, + }, + }, + }, + } + + kubeClient := newFakeClientBuilder(). + WithObjects(rules1, rules2). + Build() + + r := rulesReconciler{ + client: kubeClient, + opts: Options{OperatorNamespace: "gmp-system"}, + } + + if err := r.ensureRuleConfigs(t.Context(), "", "", "", monitoringv1.CompressionNone); err != nil { + t.Fatal("ensure rule configs:", err) + } + + var cmList corev1.ConfigMapList + if err := kubeClient.List(t.Context(), &cmList, client.InNamespace("gmp-system")); err != nil { + t.Fatal(err) + } + + shardCount := 0 + for _, cm := range cmList.Items { + if strings.HasPrefix(cm.Name, nameRulesGeneratedPrefix) { + shardCount++ + if cm.Labels[labelRulesShardType] != "true" { + t.Errorf("shard %s missing label %s", cm.Name, labelRulesShardType) + } + // empty.yaml sentinel should only be in shard 0. + _, hasEmpty := cm.Data["empty.yaml"] + if cm.Name == "rules-generated-0" && !hasEmpty { + t.Error("shard 0 should have empty.yaml sentinel") + } + if cm.Name != "rules-generated-0" && hasEmpty { + t.Errorf("shard %s should not have empty.yaml sentinel", cm.Name) + } + } + } + if shardCount < 2 { + t.Errorf("expected at least 2 shards, got %d", shardCount) + } +} + +func TestEnsureRuleConfigs_CleanupStaleShards(t *testing.T) { + staleShard := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "rules-generated-5", + Namespace: "gmp-system", + Labels: map[string]string{ + LabelAppName: NameRuleEvaluator, + labelRulesShardType: "true", + }, + }, + Data: map[string]string{"stale.yaml": "stale data"}, + } + + kubeClient := newFakeClientBuilder(). + WithObjects(staleShard). + Build() + + r := rulesReconciler{ + client: kubeClient, + opts: Options{OperatorNamespace: "gmp-system"}, + } + + if err := r.ensureRuleConfigs(t.Context(), "", "", "", monitoringv1.CompressionNone); err != nil { + t.Fatal("ensure rule configs:", err) + } + + var cm corev1.ConfigMap + err := kubeClient.Get(t.Context(), client.ObjectKey{Namespace: "gmp-system", Name: "rules-generated-5"}, &cm) + if !apierrors.IsNotFound(err) { + t.Errorf("expected stale shard to be deleted, got err: %v", err) + } + + err = kubeClient.Get(t.Context(), client.ObjectKey{Namespace: "gmp-system", Name: "rules-generated-0"}, &cm) + if err != nil { + t.Errorf("expected shard 0 to exist: %v", err) + } +} + +func TestEnsureRuleConfigs_CleanupLegacy(t *testing.T) { + legacyCM := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "rules-generated", + Namespace: "gmp-system", + Labels: map[string]string{ + LabelAppName: NameRuleEvaluator, + }, + }, + Data: map[string]string{"old.yaml": "old data"}, + } + + kubeClient := newFakeClientBuilder(). + WithObjects(legacyCM). + Build() + + r := rulesReconciler{ + client: kubeClient, + opts: Options{OperatorNamespace: "gmp-system"}, + } + + if err := r.ensureRuleConfigs(t.Context(), "", "", "", monitoringv1.CompressionNone); err != nil { + t.Fatal("ensure rule configs:", err) + } + + var cm corev1.ConfigMap + err := kubeClient.Get(t.Context(), client.ObjectKey{Namespace: "gmp-system", Name: "rules-generated"}, &cm) + if !apierrors.IsNotFound(err) { + t.Errorf("expected legacy rules-generated to be deleted, got err: %v", err) + } +} + +func TestEnsureRuleConfigs_SingleShard(t *testing.T) { + rules := &monitoringv1.Rules{ + ObjectMeta: metav1.ObjectMeta{ + Name: "small-rules", + Namespace: "default", + }, + Spec: monitoringv1.RulesSpec{ + Groups: []monitoringv1.RuleGroup{ + { + Name: "test-group", + Rules: []monitoringv1.Rule{ + {Record: "test_record", Expr: "test_expr"}, + }, + }, + }, + }, + } + + kubeClient := newFakeClientBuilder(). + WithObjects(rules). + Build() + + r := rulesReconciler{ + client: kubeClient, + opts: Options{OperatorNamespace: "gmp-system"}, + } + + if err := r.ensureRuleConfigs(t.Context(), "", "", "", monitoringv1.CompressionNone); err != nil { + t.Fatal("ensure rule configs:", err) + } + + var cm corev1.ConfigMap + if err := kubeClient.Get(t.Context(), client.ObjectKey{Namespace: "gmp-system", Name: "rules-generated-0"}, &cm); err != nil { + t.Fatalf("shard 0 should exist: %v", err) + } + + if cm.Labels[LabelAppName] != NameRuleEvaluator { + t.Error("missing app name label") + } + if cm.Labels[labelRulesShardType] != "true" { + t.Error("missing shard type label") + } + if cm.Data["empty.yaml"] != "" { + t.Error("shard 0 should have empty.yaml sentinel") + } + + found := false + for k := range cm.Data { + if strings.HasPrefix(k, "rules__") { + found = true + break + } + } + if !found { + t.Errorf("shard 0 should contain rule data, got keys: %v", keysOf(cm.Data)) + } + + err := kubeClient.Get(t.Context(), client.ObjectKey{Namespace: "gmp-system", Name: "rules-generated-1"}, &cm) + if !apierrors.IsNotFound(err) { + t.Errorf("expected no shard 1 for small rules, got err: %v", err) + } +} + +func TestEnsureRuleConfigs_ShardReduction(t *testing.T) { + shard0 := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "rules-generated-0", + Namespace: "gmp-system", + Labels: map[string]string{ + LabelAppName: NameRuleEvaluator, + labelRulesShardType: "true", + }, + }, + Data: map[string]string{"empty.yaml": "", "rules__default__old.yaml": "old"}, + } + shard1 := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "rules-generated-1", + Namespace: "gmp-system", + Labels: map[string]string{ + LabelAppName: NameRuleEvaluator, + labelRulesShardType: "true", + }, + }, + Data: map[string]string{"rules__default__extra1.yaml": "extra1"}, + } + shard2 := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "rules-generated-2", + Namespace: "gmp-system", + Labels: map[string]string{ + LabelAppName: NameRuleEvaluator, + labelRulesShardType: "true", + }, + }, + Data: map[string]string{"rules__default__extra2.yaml": "extra2"}, + } + + rules := &monitoringv1.Rules{ + ObjectMeta: metav1.ObjectMeta{ + Name: "small-rule", + Namespace: "default", + }, + Spec: monitoringv1.RulesSpec{ + Groups: []monitoringv1.RuleGroup{ + { + Name: "grp", + Rules: []monitoringv1.Rule{ + {Record: "r", Expr: "1"}, + }, + }, + }, + }, + } + + kubeClient := newFakeClientBuilder(). + WithObjects(shard0, shard1, shard2, rules). + Build() + + r := rulesReconciler{ + client: kubeClient, + opts: Options{OperatorNamespace: "gmp-system"}, + } + + if err := r.ensureRuleConfigs(t.Context(), "", "", "", monitoringv1.CompressionNone); err != nil { + t.Fatal("ensure rule configs:", err) + } + + var cm corev1.ConfigMap + if err := kubeClient.Get(t.Context(), client.ObjectKey{Namespace: "gmp-system", Name: "rules-generated-0"}, &cm); err != nil { + t.Fatalf("shard 0 should exist: %v", err) + } + + for _, name := range []string{"rules-generated-1", "rules-generated-2"} { + err := kubeClient.Get(t.Context(), client.ObjectKey{Namespace: "gmp-system", Name: name}, &cm) + if !apierrors.IsNotFound(err) { + t.Errorf("expected %s to be deleted, got err: %v", name, err) + } + } +} + +func TestEnsureRuleConfigs_AllRuleTypes(t *testing.T) { + rules := &monitoringv1.Rules{ + ObjectMeta: metav1.ObjectMeta{ + Name: "ns-rules", + Namespace: "app-ns", + }, + Spec: monitoringv1.RulesSpec{ + Groups: []monitoringv1.RuleGroup{ + {Name: "g1", Rules: []monitoringv1.Rule{{Record: "r1", Expr: "1"}}}, + }, + }, + } + clusterRules := &monitoringv1.ClusterRules{ + ObjectMeta: metav1.ObjectMeta{ + Name: "cluster-rules", + }, + Spec: monitoringv1.RulesSpec{ + Groups: []monitoringv1.RuleGroup{ + {Name: "g2", Rules: []monitoringv1.Rule{{Record: "r2", Expr: "2"}}}, + }, + }, + } + globalRules := &monitoringv1.GlobalRules{ + ObjectMeta: metav1.ObjectMeta{ + Name: "global-rules", + }, + Spec: monitoringv1.RulesSpec{ + Groups: []monitoringv1.RuleGroup{ + {Name: "g3", Rules: []monitoringv1.Rule{{Record: "r3", Expr: "3"}}}, + }, + }, + } + + kubeClient := newFakeClientBuilder(). + WithObjects(rules, clusterRules, globalRules). + Build() + + r := rulesReconciler{ + client: kubeClient, + opts: Options{OperatorNamespace: "gmp-system"}, + } + + if err := r.ensureRuleConfigs(t.Context(), "proj", "us-central1", "cluster-1", monitoringv1.CompressionNone); err != nil { + t.Fatal("ensure rule configs:", err) + } + + var cm corev1.ConfigMap + if err := kubeClient.Get(t.Context(), client.ObjectKey{Namespace: "gmp-system", Name: "rules-generated-0"}, &cm); err != nil { + t.Fatal(err) + } + + foundNS, foundCluster, foundGlobal := false, false, false + var allKeys []string + for k := range cm.Data { + allKeys = append(allKeys, k) + if strings.HasPrefix(k, "rules__") { + foundNS = true + } + if strings.HasPrefix(k, "clusterrules__") { + foundCluster = true + } + if strings.HasPrefix(k, "globalrules__") { + foundGlobal = true + } + } + if !foundNS { + t.Errorf("missing namespaced rules in keys: %v", allKeys) + } + if !foundCluster { + t.Errorf("missing cluster rules in keys: %v", allKeys) + } + if !foundGlobal { + t.Errorf("missing global rules in keys: %v", allKeys) + } +} + +func TestNamespacedLabelPredicate(t *testing.T) { + pred := namespacedLabelPredicate{ + namespace: "gmp-system", + labels: map[string]string{labelRulesShardType: "true"}, + } + + matchingCM := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "rules-generated-0", + Namespace: "gmp-system", + Labels: map[string]string{labelRulesShardType: "true"}, + }, + } + wrongNamespaceCM := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "rules-generated-0", + Namespace: "other", + Labels: map[string]string{labelRulesShardType: "true"}, + }, + } + wrongLabelCM := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "rule-evaluator", + Namespace: "gmp-system", + Labels: map[string]string{LabelAppName: NameRuleEvaluator}, + }, + } + noLabelsCM := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "some-cm", + Namespace: "gmp-system", + }, + } + + tests := map[string]struct { + obj client.Object + want bool + }{ + "matching": {obj: matchingCM, want: true}, + "wrong namespace": {obj: wrongNamespaceCM, want: false}, + "wrong label": {obj: wrongLabelCM, want: false}, + "no labels": {obj: noLabelsCM, want: false}, + } + + for name, tc := range tests { + t.Run(name, func(t *testing.T) { + if got := pred.matches(tc.obj); got != tc.want { + t.Errorf("matches() = %v, want %v", got, tc.want) + } + }) + } +} + +func keysOf(m map[string]string) []string { + keys := make([]string, 0, len(m)) + for k := range m { + keys = append(keys, k) + } + return keys +} From 5d0fb479507977b0c0dbf04c9bd69430f8585ad3 Mon Sep 17 00:00:00 2001 From: Yama Date: Sat, 16 May 2026 15:20:48 +0200 Subject: [PATCH 02/13] fix(config-reloader): add defensive file name check --- cmd/config-reloader/internal/syncer.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmd/config-reloader/internal/syncer.go b/cmd/config-reloader/internal/syncer.go index be3c988acf..06f4f7bb1f 100644 --- a/cmd/config-reloader/internal/syncer.go +++ b/cmd/config-reloader/internal/syncer.go @@ -125,6 +125,9 @@ func (s *ConfigMapSyncer) writeFiles(files map[string][]byte) error { } for name, data := range files { + if filepath.Base(name) != name { + continue + } if err := os.WriteFile(filepath.Join(s.outputDir, name), data, 0o644); err != nil { return err } From f970e1c5d67bbe3e5f13c9f382e30a061b360d2b Mon Sep 17 00:00:00 2001 From: Yama Date: Tue, 19 May 2026 11:41:32 +0200 Subject: [PATCH 03/13] build: bump conform image to v0.1.0-alpha.31 for arm64 support --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 9450b1dbcf..6b847d1e8c 100644 --- a/Makefile +++ b/Makefile @@ -82,7 +82,7 @@ clean: ## Clean build time resources, primarily, unused docker images. .PHONY: conform conform: - docker run --rm -v ${PWD}:/src -w /src ghcr.io/siderolabs/conform:v0.1.0-alpha.27 enforce + docker run --rm -v ${PWD}:/src -w /src ghcr.io/siderolabs/conform:v0.1.0-alpha.31 enforce .PHONY: lint lint: From 34f92626d59c23045ae5df68d9f5315450197416 Mon Sep 17 00:00:00 2001 From: Yama Date: Tue, 19 May 2026 11:42:10 +0200 Subject: [PATCH 04/13] fix(deps): bump golang.org/x/net from 0.48.0 to 0.53.0 (make govulncheck happy) --- go.mod | 14 +++++++------- go.sum | 32 ++++++++++++++++---------------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/go.mod b/go.mod index c0763b4088..c64763935e 100644 --- a/go.mod +++ b/go.mod @@ -26,7 +26,7 @@ require ( github.com/stretchr/testify v1.11.1 github.com/thanos-io/thanos v0.36.1 go.uber.org/zap v1.27.0 - golang.org/x/mod v0.30.0 + golang.org/x/mod v0.34.0 golang.org/x/oauth2 v0.34.0 golang.org/x/time v0.12.0 // indirect google.golang.org/api v0.248.0 @@ -160,13 +160,13 @@ require ( go.uber.org/multierr v1.11.0 // indirect go.yaml.in/yaml/v2 v2.4.2 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect - golang.org/x/crypto v0.46.0 // indirect + golang.org/x/crypto v0.50.0 // indirect golang.org/x/exp v0.0.0-20250819193227-8b4c13bb791b // indirect - golang.org/x/net v0.48.0 // indirect - golang.org/x/sync v0.19.0 // indirect - golang.org/x/sys v0.39.0 // indirect - golang.org/x/term v0.38.0 // indirect - golang.org/x/text v0.32.0 // indirect + golang.org/x/net v0.53.0 // indirect + golang.org/x/sync v0.20.0 // indirect + golang.org/x/sys v0.43.0 // indirect + golang.org/x/term v0.42.0 // indirect + golang.org/x/text v0.36.0 // indirect gomodules.xyz/jsonpatch/v2 v2.5.0 // indirect google.golang.org/genproto v0.0.0-20250825161204-c5933d9347a5 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 // indirect diff --git a/go.sum b/go.sum index 188eb1ab52..378d201a62 100644 --- a/go.sum +++ b/go.sum @@ -453,8 +453,8 @@ golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDf golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= golang.org/x/crypto v0.24.0/go.mod h1:Z1PMYSOR5nyMcyAVAIQSKCDwalqy85Aqn1x3Ws4L5DM= golang.org/x/crypto v0.30.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= -golang.org/x/crypto v0.46.0 h1:cKRW/pmt1pKAfetfu+RCEvjvZkA9RimPbh7bhFjGVBU= -golang.org/x/crypto v0.46.0/go.mod h1:Evb/oLKmMraqjZ2iQTwDwvCtJkczlDuTmdJXoZVzqU0= +golang.org/x/crypto v0.50.0 h1:zO47/JPrL6vsNkINmLoo/PH1gcxpls50DNogFvB5ZGI= +golang.org/x/crypto v0.50.0/go.mod h1:3muZ7vA7PBCE6xgPX7nkzzjiUq87kRItoJQM1Yo8S+Q= golang.org/x/exp v0.0.0-20250819193227-8b4c13bb791b h1:DXr+pvt3nC887026GRP39Ej11UATqWDmWuS99x26cD0= golang.org/x/exp v0.0.0-20250819193227-8b4c13bb791b/go.mod h1:4QTo5u+SEIbbKW1RacMZq1YEfOBqeXa19JeshGi+zc4= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= @@ -464,8 +464,8 @@ golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= -golang.org/x/mod v0.30.0 h1:fDEXFVZ/fmCKProc/yAXXUijritrDzahmwwefnjoPFk= -golang.org/x/mod v0.30.0/go.mod h1:lAsf5O2EvJeSFMiBxXDki7sCgAxEUcZHXoXMKT4GJKc= +golang.org/x/mod v0.34.0 h1:xIHgNUUnW6sYkcM5Jleh05DvLOtwc6RitGHbDk4akRI= +golang.org/x/mod v0.34.0/go.mod h1:ykgH52iCZe79kzLLMhyCUzhMci+nQj+0XkbXpNYtVjY= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= @@ -480,8 +480,8 @@ golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE= golang.org/x/net v0.32.0/go.mod h1:CwU0IoeOlnQQWJ6ioyFrfRuomB8GKF6KbYXZVyeXNfs= -golang.org/x/net v0.48.0 h1:zyQRTTrjc33Lhh0fBgT/H3oZq9WuvRR5gPC70xpDiQU= -golang.org/x/net v0.48.0/go.mod h1:+ndRgGjkh8FGtu1w1FGbEC31if4VrNVMuKTgcAAnQRY= +golang.org/x/net v0.53.0 h1:d+qAbo5L0orcWAr0a9JweQpjXF19LMXJE8Ey7hwOdUA= +golang.org/x/net v0.53.0/go.mod h1:JvMuJH7rrdiCfbeHoo3fCQU24Lf5JJwT9W3sJFulfgs= golang.org/x/oauth2 v0.21.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= golang.org/x/oauth2 v0.24.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= golang.org/x/oauth2 v0.34.0 h1:hqK/t4AKgbqWkdkcAeI8XLmbK+4m4G5YeQRrmiotGlw= @@ -497,8 +497,8 @@ golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= -golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= -golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= +golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= +golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -517,8 +517,8 @@ golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.39.0 h1:CvCKL8MeisomCi6qNZ+wbb0DN9E5AATixKsvNtMoMFk= -golang.org/x/sys v0.39.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/sys v0.43.0 h1:Rlag2XtaFTxp19wS8MXlJwTvoh8ArU6ezoyFsMyCTNI= +golang.org/x/sys v0.43.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= @@ -529,8 +529,8 @@ golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= golang.org/x/term v0.21.0/go.mod h1:ooXLefLobQVslOqselCNF4SxFAaoS6KujMbsGzSDmX0= golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM= -golang.org/x/term v0.38.0 h1:PQ5pkm/rLO6HnxFR7N2lJHOZX6Kez5Y1gDSJla6jo7Q= -golang.org/x/term v0.38.0/go.mod h1:bSEAKrOT1W+VSu9TSCMtoGEOUcKxOKgl3LE5QEF/xVg= +golang.org/x/term v0.42.0 h1:UiKe+zDFmJobeJ5ggPwOshJIVt6/Ft0rcfrXZDLWAWY= +golang.org/x/term v0.42.0/go.mod h1:Dq/D+snpsbazcBG5+F9Q1n2rXV8Ma+71xEjTRufARgY= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= @@ -542,8 +542,8 @@ golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= -golang.org/x/text v0.32.0 h1:ZD01bjUt1FQ9WJ0ClOL5vxgxOI/sVCNgX1YtKwcY0mU= -golang.org/x/text v0.32.0/go.mod h1:o/rUWzghvpD5TXrTIBuJU77MTaN0ljMWE47kxGJQ7jY= +golang.org/x/text v0.36.0 h1:JfKh3XmcRPqZPKevfXVpI1wXPTqbkE5f7JA92a55Yxg= +golang.org/x/text v0.36.0/go.mod h1:NIdBknypM8iqVmPiuco0Dh6P5Jcdk8lJL0CUebqK164= golang.org/x/time v0.12.0 h1:ScB/8o8olJvc+CQPWrK3fPZNfh7qgwCrY0zJmoEQLSE= golang.org/x/time v0.12.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= @@ -554,8 +554,8 @@ golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58= golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= -golang.org/x/tools v0.39.0 h1:ik4ho21kwuQln40uelmciQPp9SipgNDdrafrYA4TmQQ= -golang.org/x/tools v0.39.0/go.mod h1:JnefbkDPyD8UU2kI5fuf8ZX4/yUeh9W877ZeBONxUqQ= +golang.org/x/tools v0.43.0 h1:12BdW9CeB3Z+J/I/wj34VMl8X+fEXBxVR90JeMX5E7s= +golang.org/x/tools v0.43.0/go.mod h1:uHkMso649BX2cZK6+RpuIPXS3ho2hZo4FVwfoy1vIk0= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= From 040ed6e767b169980f5088fc6ff668e43294ec3f Mon Sep 17 00:00:00 2001 From: Yama Date: Tue, 19 May 2026 11:45:27 +0200 Subject: [PATCH 05/13] test(e2e): sanitize hyphens in shard test record name --- e2e/ruler_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/e2e/ruler_test.go b/e2e/ruler_test.go index 977f586861..6c9216f796 100644 --- a/e2e/ruler_test.go +++ b/e2e/ruler_test.go @@ -1078,7 +1078,7 @@ func largeRules(namespace, name, groupName, expr string) *monitoringv1.Rules { { Name: groupName, Rules: []monitoringv1.Rule{ - {Record: "shard_test_" + name, Expr: expr}, + {Record: "shard_test_" + strings.ReplaceAll(name, "-", "_"), Expr: expr}, }, }, }, From 11f3a780be1228916a0b8b006844f653ef0525a0 Mon Sep 17 00:00:00 2001 From: Yama Date: Tue, 19 May 2026 11:58:33 +0200 Subject: [PATCH 06/13] refactor(operator): preserve legacy rules-generated ConfigMap for safe rollback --- pkg/operator/rules.go | 16 +++++++--------- pkg/operator/rules_test.go | 10 ++++++---- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/pkg/operator/rules.go b/pkg/operator/rules.go index e5bb3d5dcf..58f9785b36 100644 --- a/pkg/operator/rules.go +++ b/pkg/operator/rules.go @@ -37,7 +37,6 @@ import ( ) const ( - nameRulesGenerated = "rules-generated" nameRulesGeneratedPrefix = "rules-generated-" maxShardDataBytes = 800 * 1024 // headroom below 1MB etcd limit for metadata overhead labelRulesShardType = "monitoring.googleapis.com/rules-shard" @@ -450,6 +449,13 @@ func (r *rulesReconciler) upsertShards(ctx context.Context, entries []ruleEntry, return shardIdx + 1, nil } +// deleteStaleShards removes shard ConfigMaps from previous reconciles that +// the current ruleset no longer needs. +// +// The legacy single "rules-generated" ConfigMap is intentionally not deleted +// here — leaving it in place keeps rollback to a pre-sharding operator binary +// safe for users under the etcd size limit. Remove in a follow-up release +// after this version has soaked. func (r *rulesReconciler) deleteStaleShards(ctx context.Context, logger logr.Logger, numShards int) error { active := make(map[string]bool, numShards) for i := range numShards { @@ -472,13 +478,5 @@ func (r *rulesReconciler) deleteStaleShards(ctx context.Context, logger logr.Log } } } - - // Clean up legacy single ConfigMap from before sharding was introduced. - var legacy corev1.ConfigMap - legacy.Namespace = r.opts.OperatorNamespace - legacy.Name = nameRulesGenerated - if err := r.client.Delete(ctx, &legacy); err != nil && !apierrors.IsNotFound(err) { - logger.Error(err, "delete legacy rules-generated") - } return nil } diff --git a/pkg/operator/rules_test.go b/pkg/operator/rules_test.go index edda9e1672..27c9d97837 100644 --- a/pkg/operator/rules_test.go +++ b/pkg/operator/rules_test.go @@ -761,7 +761,7 @@ func TestEnsureRuleConfigs_CleanupStaleShards(t *testing.T) { } } -func TestEnsureRuleConfigs_CleanupLegacy(t *testing.T) { +func TestEnsureRuleConfigs_PreservesLegacy(t *testing.T) { legacyCM := &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ Name: "rules-generated", @@ -787,9 +787,11 @@ func TestEnsureRuleConfigs_CleanupLegacy(t *testing.T) { } var cm corev1.ConfigMap - err := kubeClient.Get(t.Context(), client.ObjectKey{Namespace: "gmp-system", Name: "rules-generated"}, &cm) - if !apierrors.IsNotFound(err) { - t.Errorf("expected legacy rules-generated to be deleted, got err: %v", err) + if err := kubeClient.Get(t.Context(), client.ObjectKey{Namespace: "gmp-system", Name: "rules-generated"}, &cm); err != nil { + t.Fatalf("legacy rules-generated should be preserved for rollback safety: %v", err) + } + if cm.Data["old.yaml"] != "old data" { + t.Errorf("legacy rules-generated data was modified: %v", cm.Data) } } From b63f30e72cd0527905c8fb13dea11722b9b800c9 Mon Sep 17 00:00:00 2001 From: Yama Date: Tue, 19 May 2026 14:30:54 +0200 Subject: [PATCH 07/13] test(e2e): allow non-string fields in config-reloader log lines --- e2e/ruler_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/e2e/ruler_test.go b/e2e/ruler_test.go index 6c9216f796..4e8a9bcc27 100644 --- a/e2e/ruler_test.go +++ b/e2e/ruler_test.go @@ -844,11 +844,11 @@ func logsError(logs string) (string, error) { if line == "" { continue } - data := map[string]string{} + data := map[string]any{} if err := json.Unmarshal([]byte(line), &data); err != nil { return "", fmt.Errorf("unable to unmarshal log line: %s", err) } - if data["level"] == "error" { + if level, _ := data["level"].(string); level == "error" { return line, nil } } From dfe7ed5a699792b9922695a9ccd14d473ece638c Mon Sep 17 00:00:00 2001 From: Yama Date: Tue, 19 May 2026 16:25:25 +0200 Subject: [PATCH 08/13] fix(rule-evaluator): seed rules-out with placeholder.yaml at startup --- charts/operator/templates/rule-evaluator.yaml | 5 ++++- manifests/operator.yaml | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/charts/operator/templates/rule-evaluator.yaml b/charts/operator/templates/rule-evaluator.yaml index 27473f1a52..75f22a8153 100644 --- a/charts/operator/templates/rule-evaluator.yaml +++ b/charts/operator/templates/rule-evaluator.yaml @@ -44,10 +44,13 @@ spec: initContainers: - name: config-init image: {{.Values.images.bash.image}}:{{.Values.images.bash.tag}} - command: ['/bin/bash', '-c', 'touch /prometheus/config_out/config.yaml'] + # placeholder.yaml keeps rule_files glob non-empty until the syncer populates rules-out. + command: ['/bin/bash', '-c', 'touch /prometheus/config_out/config.yaml /prometheus/rules_out/placeholder.yaml'] volumeMounts: - name: config-out mountPath: /prometheus/config_out + - name: rules-out + mountPath: /prometheus/rules_out securityContext: allowPrivilegeEscalation: false capabilities: diff --git a/manifests/operator.yaml b/manifests/operator.yaml index 11120e850a..5c43474fb0 100644 --- a/manifests/operator.yaml +++ b/manifests/operator.yaml @@ -693,10 +693,13 @@ spec: initContainers: - name: config-init image: gke.gcr.io/gke-distroless/bash:gke_distroless_20260207.00_p0 - command: ['/bin/bash', '-c', 'touch /prometheus/config_out/config.yaml'] + # placeholder.yaml keeps rule_files glob non-empty until the syncer populates rules-out. + command: ['/bin/bash', '-c', 'touch /prometheus/config_out/config.yaml /prometheus/rules_out/placeholder.yaml'] volumeMounts: - name: config-out mountPath: /prometheus/config_out + - name: rules-out + mountPath: /prometheus/rules_out securityContext: allowPrivilegeEscalation: false capabilities: From 2900249b02cd82d7b245d32891f1dccb986ed754 Mon Sep 17 00:00:00 2001 From: Yama Date: Wed, 20 May 2026 14:24:09 +0200 Subject: [PATCH 09/13] test(e2e): surface evaluator state on multi-shard rule poll failure --- cmd/config-reloader/internal/syncer.go | 7 ++++- cmd/config-reloader/internal/syncer_test.go | 2 +- e2e/ruler_test.go | 30 ++++++++++++++++----- 3 files changed, 31 insertions(+), 8 deletions(-) diff --git a/cmd/config-reloader/internal/syncer.go b/cmd/config-reloader/internal/syncer.go index 06f4f7bb1f..ee1b415d9b 100644 --- a/cmd/config-reloader/internal/syncer.go +++ b/cmd/config-reloader/internal/syncer.go @@ -1,4 +1,4 @@ -// Copyright 2024 Google LLC +// Copyright 2026 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -30,6 +30,8 @@ import ( "k8s.io/client-go/rest" ) +// ConfigMapSyncer materializes ConfigMaps matched by a label selector into +// files under outputDir, so the thanos reloader can pick them up. type ConfigMapSyncer struct { client kubernetes.Interface namespace string @@ -41,6 +43,7 @@ type ConfigMapSyncer struct { lastHash string } +// NewConfigMapSyncer constructs a syncer using in-cluster credentials. func NewConfigMapSyncer(namespace, selector, outputDir string, interval time.Duration, logger log.Logger) (*ConfigMapSyncer, error) { cfg, err := rest.InClusterConfig() if err != nil { @@ -64,6 +67,7 @@ func newConfigMapSyncerWithClient(client kubernetes.Interface, namespace, select } } +// Sync runs one list-and-write cycle. It returns whether any file changed. func (s *ConfigMapSyncer) Sync(ctx context.Context) (bool, error) { cmList, err := s.client.CoreV1().ConfigMaps(s.namespace).List(ctx, metav1.ListOptions{ LabelSelector: s.selector, @@ -98,6 +102,7 @@ func (s *ConfigMapSyncer) Sync(ctx context.Context) (bool, error) { return true, nil } +// Run does an initial Sync and then re-syncs on every interval until ctx is cancelled. func (s *ConfigMapSyncer) Run(ctx context.Context) error { // Best-effort initial sync; the reloader will pick up files on its next poll cycle. if _, err := s.Sync(ctx); err != nil { diff --git a/cmd/config-reloader/internal/syncer_test.go b/cmd/config-reloader/internal/syncer_test.go index 65c83518da..4dbbc79d0a 100644 --- a/cmd/config-reloader/internal/syncer_test.go +++ b/cmd/config-reloader/internal/syncer_test.go @@ -1,4 +1,4 @@ -// Copyright 2024 Google LLC +// Copyright 2026 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/e2e/ruler_test.go b/e2e/ruler_test.go index 4e8a9bcc27..a9ca0a2022 100644 --- a/e2e/ruler_test.go +++ b/e2e/ruler_test.go @@ -1037,19 +1037,27 @@ func testCreateMultiShardRules( t.Logf("unable to get rules: %s", err) return false, nil } - gotGroups := make(map[string]bool, len(rules.Groups)) + gotGroups := make(map[string]string, len(rules.Groups)) for _, g := range rules.Groups { - gotGroups[g.Name] = true + gotGroups[g.Name] = g.File } + var missing []string for name := range wantGroupNames { - if !gotGroups[name] { - t.Logf("rule group %q not yet loaded", name) - return false, nil + if _, ok := gotGroups[name]; !ok { + missing = append(missing, name) } } - return true, nil + if rt, rterr := getRuntimeInfo(ctx, httpClient, pod, 19092); rterr == nil { + t.Logf("rules poll: got=%v missing=%v reloadSuccess=%t lastConfigTime=%s", + gotGroups, missing, rt.ReloadConfigSuccess, rt.LastConfigTime.Format(time.RFC3339)) + } else { + t.Logf("rules poll: got=%v missing=%v runtimeInfo_err=%s", gotGroups, missing, rterr) + } + return len(missing) == 0, nil }) if err != nil { + dumpContainerLogs(ctx, t, restConfig, pod, operator.RuleEvaluatorContainerName) + dumpContainerLogs(ctx, t, restConfig, pod, configReloaderContainerName) t.Fatalf("not all rule groups appeared in evaluator API: %s", err) } @@ -1067,6 +1075,16 @@ func testCreateMultiShardRules( } } +func dumpContainerLogs(ctx context.Context, t *testing.T, restConfig *rest.Config, pod *corev1.Pod, container string) { + t.Helper() + logs, err := kube.PodLogs(ctx, restConfig, pod.Namespace, pod.Name, container) + if err != nil { + t.Logf("debug: fetch %s logs failed: %s", container, err) + return + } + t.Logf("debug: %s container logs follow:\n%s", container, logs) +} + func largeRules(namespace, name, groupName, expr string) *monitoringv1.Rules { return &monitoringv1.Rules{ ObjectMeta: metav1.ObjectMeta{ From d325876ab0a57f8ede3956999753c3df8c12bf9b Mon Sep 17 00:00:00 2001 From: Yama Date: Wed, 20 May 2026 14:50:03 +0200 Subject: [PATCH 10/13] fix(config-reloader): ignore ConfigMaps with unexpected names --- cmd/config-reloader/internal/syncer.go | 44 ++++++++++------- cmd/config-reloader/internal/syncer_test.go | 55 ++++++++++++++++++--- cmd/config-reloader/main.go | 3 +- 3 files changed, 76 insertions(+), 26 deletions(-) diff --git a/cmd/config-reloader/internal/syncer.go b/cmd/config-reloader/internal/syncer.go index ee1b415d9b..b5d5465ad2 100644 --- a/cmd/config-reloader/internal/syncer.go +++ b/cmd/config-reloader/internal/syncer.go @@ -21,6 +21,7 @@ import ( "os" "path/filepath" "sort" + "strings" "time" "github.com/go-kit/log" @@ -30,21 +31,24 @@ import ( "k8s.io/client-go/rest" ) -// ConfigMapSyncer materializes ConfigMaps matched by a label selector into -// files under outputDir, so the thanos reloader can pick them up. +// ConfigMapSyncer materializes ConfigMaps matched by selector into files +// under outputDir. ConfigMaps whose name does not start with namePrefix are +// skipped. type ConfigMapSyncer struct { - client kubernetes.Interface - namespace string - selector string - outputDir string - logger log.Logger - interval time.Duration + client kubernetes.Interface + namespace string + selector string + namePrefix string + outputDir string + logger log.Logger + interval time.Duration lastHash string } // NewConfigMapSyncer constructs a syncer using in-cluster credentials. -func NewConfigMapSyncer(namespace, selector, outputDir string, interval time.Duration, logger log.Logger) (*ConfigMapSyncer, error) { +// Empty namePrefix disables the name check. +func NewConfigMapSyncer(namespace, selector, namePrefix, outputDir string, interval time.Duration, logger log.Logger) (*ConfigMapSyncer, error) { cfg, err := rest.InClusterConfig() if err != nil { return nil, err @@ -53,17 +57,18 @@ func NewConfigMapSyncer(namespace, selector, outputDir string, interval time.Dur if err != nil { return nil, err } - return newConfigMapSyncerWithClient(client, namespace, selector, outputDir, interval, logger), nil + return newConfigMapSyncerWithClient(client, namespace, selector, namePrefix, outputDir, interval, logger), nil } -func newConfigMapSyncerWithClient(client kubernetes.Interface, namespace, selector, outputDir string, interval time.Duration, logger log.Logger) *ConfigMapSyncer { +func newConfigMapSyncerWithClient(client kubernetes.Interface, namespace, selector, namePrefix, outputDir string, interval time.Duration, logger log.Logger) *ConfigMapSyncer { return &ConfigMapSyncer{ - client: client, - namespace: namespace, - selector: selector, - outputDir: outputDir, - interval: interval, - logger: logger, + client: client, + namespace: namespace, + selector: selector, + namePrefix: namePrefix, + outputDir: outputDir, + interval: interval, + logger: logger, } } @@ -79,6 +84,11 @@ func (s *ConfigMapSyncer) Sync(ctx context.Context) (bool, error) { files := make(map[string][]byte) for i := range cmList.Items { cm := &cmList.Items[i] + if s.namePrefix != "" && !strings.HasPrefix(cm.Name, s.namePrefix) { + //nolint:errcheck + level.Warn(s.logger).Log("msg", "ignoring configmap with unexpected name", "name", cm.Name, "want_prefix", s.namePrefix) + continue + } for k, v := range cm.Data { files[cm.Name+"__"+k] = []byte(v) } diff --git a/cmd/config-reloader/internal/syncer_test.go b/cmd/config-reloader/internal/syncer_test.go index 4dbbc79d0a..c492482109 100644 --- a/cmd/config-reloader/internal/syncer_test.go +++ b/cmd/config-reloader/internal/syncer_test.go @@ -43,7 +43,7 @@ func TestConfigMapSyncer_BasicSync(t *testing.T) { } client := fake.NewSimpleClientset(cm) - syncer := newConfigMapSyncerWithClient(client, "gmp-system", "monitoring.googleapis.com/rules-shard=true", outputDir, time.Second, log.NewNopLogger()) + syncer := newConfigMapSyncerWithClient(client, "gmp-system", "monitoring.googleapis.com/rules-shard=true", "rules-generated-", outputDir, time.Second, log.NewNopLogger()) changed, err := syncer.Sync(t.Context()) if err != nil { @@ -79,7 +79,7 @@ func TestConfigMapSyncer_NoChangeOnSecondSync(t *testing.T) { } client := fake.NewSimpleClientset(cm) - syncer := newConfigMapSyncerWithClient(client, "gmp-system", "monitoring.googleapis.com/rules-shard=true", outputDir, time.Second, log.NewNopLogger()) + syncer := newConfigMapSyncerWithClient(client, "gmp-system", "monitoring.googleapis.com/rules-shard=true", "rules-generated-", outputDir, time.Second, log.NewNopLogger()) if _, err := syncer.Sync(t.Context()); err != nil { t.Fatalf("first sync: %v", err) @@ -116,7 +116,7 @@ func TestConfigMapSyncer_StaleFileRemoval(t *testing.T) { } client := fake.NewSimpleClientset(cm) - syncer := newConfigMapSyncerWithClient(client, "gmp-system", "monitoring.googleapis.com/rules-shard=true", outputDir, time.Second, log.NewNopLogger()) + syncer := newConfigMapSyncerWithClient(client, "gmp-system", "monitoring.googleapis.com/rules-shard=true", "rules-generated-", outputDir, time.Second, log.NewNopLogger()) if _, err := syncer.Sync(t.Context()); err != nil { t.Fatal(err) @@ -155,7 +155,7 @@ func TestConfigMapSyncer_MultipleConfigMaps(t *testing.T) { } client := fake.NewSimpleClientset(cm0, cm1) - syncer := newConfigMapSyncerWithClient(client, "gmp-system", "monitoring.googleapis.com/rules-shard=true", outputDir, time.Second, log.NewNopLogger()) + syncer := newConfigMapSyncerWithClient(client, "gmp-system", "monitoring.googleapis.com/rules-shard=true", "rules-generated-", outputDir, time.Second, log.NewNopLogger()) changed, err := syncer.Sync(t.Context()) if err != nil { @@ -189,7 +189,7 @@ func TestConfigMapSyncer_ContentUpdateDetection(t *testing.T) { } client := fake.NewSimpleClientset(cm) - syncer := newConfigMapSyncerWithClient(client, "gmp-system", "monitoring.googleapis.com/rules-shard=true", outputDir, time.Second, log.NewNopLogger()) + syncer := newConfigMapSyncerWithClient(client, "gmp-system", "monitoring.googleapis.com/rules-shard=true", "rules-generated-", outputDir, time.Second, log.NewNopLogger()) if _, err := syncer.Sync(t.Context()); err != nil { t.Fatalf("first sync: %v", err) @@ -235,7 +235,7 @@ func TestConfigMapSyncer_MixedDataAndBinaryData(t *testing.T) { } client := fake.NewSimpleClientset(cm) - syncer := newConfigMapSyncerWithClient(client, "gmp-system", "monitoring.googleapis.com/rules-shard=true", outputDir, time.Second, log.NewNopLogger()) + syncer := newConfigMapSyncerWithClient(client, "gmp-system", "monitoring.googleapis.com/rules-shard=true", "rules-generated-", outputDir, time.Second, log.NewNopLogger()) if _, err := syncer.Sync(t.Context()); err != nil { t.Fatal(err) @@ -295,7 +295,7 @@ func TestConfigMapSyncer_SelectorFiltering(t *testing.T) { } client := fake.NewSimpleClientset(matching, nonMatching, wrongNamespace) - syncer := newConfigMapSyncerWithClient(client, "gmp-system", "monitoring.googleapis.com/rules-shard=true", outputDir, time.Second, log.NewNopLogger()) + syncer := newConfigMapSyncerWithClient(client, "gmp-system", "monitoring.googleapis.com/rules-shard=true", "rules-generated-", outputDir, time.Second, log.NewNopLogger()) if _, err := syncer.Sync(t.Context()); err != nil { t.Fatal(err) @@ -331,7 +331,7 @@ func TestConfigMapSyncer_ConfigMapRemoved(t *testing.T) { } client := fake.NewSimpleClientset(cm) - syncer := newConfigMapSyncerWithClient(client, "gmp-system", "monitoring.googleapis.com/rules-shard=true", outputDir, time.Second, log.NewNopLogger()) + syncer := newConfigMapSyncerWithClient(client, "gmp-system", "monitoring.googleapis.com/rules-shard=true", "rules-generated-", outputDir, time.Second, log.NewNopLogger()) if _, err := syncer.Sync(t.Context()); err != nil { t.Fatal(err) @@ -360,3 +360,42 @@ func TestConfigMapSyncer_ConfigMapRemoved(t *testing.T) { t.Errorf("expected 0 files after ConfigMap removed, got %d", len(entries)) } } + +func TestConfigMapSyncer_IgnoresUnexpectedNames(t *testing.T) { + outputDir := t.TempDir() + + shard := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "rules-generated-0", + Namespace: "gmp-system", + Labels: map[string]string{ + "monitoring.googleapis.com/rules-shard": "true", + }, + }, + Data: map[string]string{"rules__default__test.yaml": "groups: []\n"}, + } + rogue := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "tenant-injected", + Namespace: "gmp-system", + Labels: map[string]string{ + "monitoring.googleapis.com/rules-shard": "true", + }, + }, + Data: map[string]string{"evil.yaml": "groups: [{name: evil, rules: []}]\n"}, + } + + client := fake.NewSimpleClientset(shard, rogue) + syncer := newConfigMapSyncerWithClient(client, "gmp-system", "monitoring.googleapis.com/rules-shard=true", "rules-generated-", outputDir, time.Second, log.NewNopLogger()) + + if _, err := syncer.Sync(t.Context()); err != nil { + t.Fatal(err) + } + + if _, err := os.Stat(filepath.Join(outputDir, "rules-generated-0__rules__default__test.yaml")); err != nil { + t.Errorf("expected shard file to be written: %v", err) + } + if _, err := os.Stat(filepath.Join(outputDir, "tenant-injected__evil.yaml")); !os.IsNotExist(err) { + t.Errorf("expected rogue ConfigMap to be skipped, got err=%v", err) + } +} diff --git a/cmd/config-reloader/main.go b/cmd/config-reloader/main.go index a8c1e98b83..9d5ce4097f 100644 --- a/cmd/config-reloader/main.go +++ b/cmd/config-reloader/main.go @@ -253,7 +253,8 @@ func setupCfgDirs(configMapSelector, configMapNamespace, configDir, configDirOut return nil, nil, errors.New("--config-dir-output required when using --config-dir-from-configmap-selector") } - syncer, err := crinternal.NewConfigMapSyncer(configMapNamespace, configMapSelector, configDir, interval, logger) + // Only operator-generated shards (rules-generated-N) are materialized. + syncer, err := crinternal.NewConfigMapSyncer(configMapNamespace, configMapSelector, "rules-generated-", configDir, interval, logger) if err != nil { return nil, nil, err } From 24248289ed78246d7fe58e96b8b7c73a6e215690 Mon Sep 17 00:00:00 2001 From: Yama Date: Wed, 20 May 2026 14:54:16 +0200 Subject: [PATCH 11/13] fix(rule-evaluator): route /api/v1/rules through the live rule manager --- cmd/rule-evaluator/main.go | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/cmd/rule-evaluator/main.go b/cmd/rule-evaluator/main.go index 053e4af313..bfb9917b29 100644 --- a/cmd/rule-evaluator/main.go +++ b/cmd/rule-evaluator/main.go @@ -380,7 +380,7 @@ func main() { http.HandleFunc("/api/v1/status/buildinfo", buildInfoHandler) // https://prometheus.io/docs/prometheus/latest/querying/api/#rules - apiHandler := internal.NewAPI(logger, ruleEvaluator.rulesManager) + apiHandler := internal.NewAPI(logger, ruleEvaluator) http.HandleFunc("/api/v1/rules", apiHandler.HandleRulesEndpoint) http.HandleFunc("/api/v1/rules/", http.NotFound) @@ -1054,6 +1054,28 @@ func (e *ruleEvaluator) Stop() { e.rulesManager = nil } +// RuleGroups reads the live rulesManager pointer so the API sees the manager after any ApplyConfig swap. +func (e *ruleEvaluator) RuleGroups() []*rules.Group { + e.mtx.Lock() + curr := e.rulesManager + e.mtx.Unlock() + if curr == nil { + return nil + } + return curr.RuleGroups() +} + +// AlertingRules returns the currently loaded alerting rules. See RuleGroups. +func (e *ruleEvaluator) AlertingRules() []*rules.AlertingRule { + e.mtx.Lock() + curr := e.rulesManager + e.mtx.Unlock() + if curr == nil { + return nil + } + return curr.AlertingRules() +} + func newQueryFunc(logger log.Logger, v1api v1.API) rules.QueryFunc { return func(ctx context.Context, q string, t time.Time) (promql.Vector, error) { v, warnings, err := QueryFunc(ctx, q, t, v1api) From 907c2e98419d784b4e6620b6c9772e62a2d684ac Mon Sep 17 00:00:00 2001 From: Yama Date: Wed, 20 May 2026 15:58:08 +0200 Subject: [PATCH 12/13] chore(lint): exclude go-kit Logger.Log from errcheck --- .golangci.yml | 1 + cmd/config-reloader/internal/syncer.go | 4 ---- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/.golangci.yml b/.golangci.yml index 0273fcac8f..a1333c9c5c 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -89,6 +89,7 @@ linters: - io/ioutil.ReadFile - io.Copy(*bytes.Buffer) - io.Copy(os.Stdout) + - (github.com/go-kit/log.Logger).Log # Display function signature instead of selector. # Default: false verbose: true diff --git a/cmd/config-reloader/internal/syncer.go b/cmd/config-reloader/internal/syncer.go index b5d5465ad2..12ee587a7d 100644 --- a/cmd/config-reloader/internal/syncer.go +++ b/cmd/config-reloader/internal/syncer.go @@ -85,7 +85,6 @@ func (s *ConfigMapSyncer) Sync(ctx context.Context) (bool, error) { for i := range cmList.Items { cm := &cmList.Items[i] if s.namePrefix != "" && !strings.HasPrefix(cm.Name, s.namePrefix) { - //nolint:errcheck level.Warn(s.logger).Log("msg", "ignoring configmap with unexpected name", "name", cm.Name, "want_prefix", s.namePrefix) continue } @@ -107,7 +106,6 @@ func (s *ConfigMapSyncer) Sync(ctx context.Context) (bool, error) { } s.lastHash = hash - //nolint:errcheck level.Info(s.logger).Log("msg", "synced configmap rules", "configmaps", len(cmList.Items), "files", len(files)) return true, nil } @@ -116,7 +114,6 @@ func (s *ConfigMapSyncer) Sync(ctx context.Context) (bool, error) { func (s *ConfigMapSyncer) Run(ctx context.Context) error { // Best-effort initial sync; the reloader will pick up files on its next poll cycle. if _, err := s.Sync(ctx); err != nil { - //nolint:errcheck level.Warn(s.logger).Log("msg", "initial configmap sync failed", "err", err) } ticker := time.NewTicker(s.interval) @@ -127,7 +124,6 @@ func (s *ConfigMapSyncer) Run(ctx context.Context) error { return nil case <-ticker.C: if _, err := s.Sync(ctx); err != nil { - //nolint:errcheck level.Warn(s.logger).Log("msg", "configmap sync failed", "err", err) } } From 6b78901dba7a1b813fbdb6b011e76f998bab0afe Mon Sep 17 00:00:00 2001 From: Yama Date: Wed, 20 May 2026 16:01:35 +0200 Subject: [PATCH 13/13] docs(config-reloader): generalize --config-dir-from-configmap-selector help --- cmd/config-reloader/README.md | 2 +- cmd/config-reloader/main.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/config-reloader/README.md b/cmd/config-reloader/README.md index 51a805c836..badd697fb6 100644 --- a/cmd/config-reloader/README.md +++ b/cmd/config-reloader/README.md @@ -12,7 +12,7 @@ Usage of config-reloader: -config-dir-from-configmap-namespace string namespace to list ConfigMaps from (required when --config-dir-from-configmap-selector is set) -config-dir-from-configmap-selector string - label selector to discover rule ConfigMaps via K8s API (e.g. monitoring.googleapis.com/rules-shard=true). When set, replaces --config-dir for rule file discovery. + label selector to discover ConfigMaps via K8s API (e.g. monitoring.googleapis.com/rules-shard=true). When set, materialized ConfigMap entries are written into --config-dir-output alongside any files from --config-dir. -config-dir-output string config directory to write with interpolated environment variables -config-file string diff --git a/cmd/config-reloader/main.go b/cmd/config-reloader/main.go index 9d5ce4097f..43042ad2bc 100644 --- a/cmd/config-reloader/main.go +++ b/cmd/config-reloader/main.go @@ -46,7 +46,7 @@ func main() { configDir = flag.String("config-dir", "", "config directory to watch for changes") configDirOutput = flag.String("config-dir-output", "", "config directory to write with interpolated environment variables") - configMapSelector = flag.String("config-dir-from-configmap-selector", "", "label selector to discover rule ConfigMaps via K8s API (e.g. monitoring.googleapis.com/rules-shard=true). When set, replaces --config-dir for rule file discovery.") + configMapSelector = flag.String("config-dir-from-configmap-selector", "", "label selector to discover ConfigMaps via K8s API (e.g. monitoring.googleapis.com/rules-shard=true). When set, materialized ConfigMap entries are written into --config-dir-output alongside any files from --config-dir.") configMapNamespace = flag.String("config-dir-from-configmap-namespace", "", "namespace to list ConfigMaps from (required when --config-dir-from-configmap-selector is set)") // Ready and reload endpoints should be compatible with Prometheus-style