diff --git a/charts/operator/crds/monitoring.googleapis.com_operatorconfigs.yaml b/charts/operator/crds/monitoring.googleapis.com_operatorconfigs.yaml index dc435aa295..b63e0e53b8 100644 --- a/charts/operator/crds/monitoring.googleapis.com_operatorconfigs.yaml +++ b/charts/operator/crds/monitoring.googleapis.com_operatorconfigs.yaml @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +--- apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: @@ -237,6 +239,279 @@ spec: If no URL is provided, Alertmanager will point to the Google Cloud Metric Explorer page. type: string + storage: + description: |- + Storage opts the managed Alertmanager into a PersistentVolumeClaim-backed + data directory. When unset, Alertmanager uses an ephemeral emptyDir volume + and all silences, notification log entries, and inhibitions are lost on + pod restart. When set, the operator creates a PVC in the operator + namespace and mounts it at the Alertmanager data path so this state + survives pod churn. + + See https://github.com/GoogleCloudPlatform/prometheus-engine/issues/685. + properties: + volumeClaim: + description: |- + VolumeClaim describes the desired PersistentVolumeClaim. The + embedded structure exposes both `metadata` (so callers can attach + labels and annotations, e.g. for volume-snapshot tooling) and `spec` + (so every Kubernetes PVC field — accessModes, storageClassName, + resources, selector, volumeMode, dataSource, dataSourceRef — is + configurable). The operator overwrites the claim's name and + namespace; everything else is taken from the caller-provided spec + modulo Kubernetes-enforced immutability. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. 
+ Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + description: |- + EmbeddedObjectMetadata contains labels, annotations and finalizers + applied to the generated PersistentVolumeClaim. Other ObjectMeta + fields are ignored. + properties: + annotations: + additionalProperties: + type: string + description: |- + Annotations applied to the generated resource. Useful for + integrations such as VolumeSnapshot controllers or storage-class + provisioners that read annotations from the claim. + type: object + finalizers: + description: |- + Finalizers applied to the generated resource on creation. The + operator does not strip user-managed finalizers it did not add, so + removing entries from this list does not remove them from the live + object. + items: + type: string + type: array + labels: + additionalProperties: + type: string + description: |- + Labels applied to the generated resource. Merged with the + operator's default labels; on conflict the operator's value wins. + type: object + type: object + spec: + description: |- + Spec defines the desired characteristics of the volume. At minimum, + `resources.requests.storage` must be set. 
See the Kubernetes + PersistentVolumeClaim documentation for the full field reference: + https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.30/#persistentvolumeclaimspec-v1-core + properties: + accessModes: + description: |- + accessModes contains the desired access modes the volume should have. + More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1 + items: + type: string + type: array + x-kubernetes-list-type: atomic + dataSource: + description: |- + dataSource field can be used to specify either: + * An existing VolumeSnapshot object (snapshot.storage.k8s.io/VolumeSnapshot) + * An existing PVC (PersistentVolumeClaim) + If the provisioner or an external controller can support the specified data source, + it will create a new volume based on the contents of the specified data source. + When the AnyVolumeDataSource feature gate is enabled, dataSource contents will be copied to dataSourceRef, + and dataSourceRef contents will be copied to dataSource when dataSourceRef.namespace is not specified. + If the namespace is specified, then dataSourceRef will not be copied to dataSource. + properties: + apiGroup: + description: |- + APIGroup is the group for the resource being referenced. + If APIGroup is not specified, the specified Kind must be in the core API group. + For any other third-party types, APIGroup is required. + type: string + kind: + description: Kind is the type of resource being referenced + type: string + name: + description: Name is the name of resource being referenced + type: string + required: + - kind + - name + type: object + x-kubernetes-map-type: atomic + dataSourceRef: + description: |- + dataSourceRef specifies the object from which to populate the volume with data, if a non-empty + volume is desired. This may be any object from a non-empty API group (non + core object) or a PersistentVolumeClaim object. 
+ When this field is specified, volume binding will only succeed if the type of + the specified object matches some installed volume populator or dynamic + provisioner. + This field will replace the functionality of the dataSource field and as such + if both fields are non-empty, they must have the same value. For backwards + compatibility, when namespace isn't specified in dataSourceRef, + both fields (dataSource and dataSourceRef) will be set to the same + value automatically if one of them is empty and the other is non-empty. + When namespace is specified in dataSourceRef, + dataSource isn't set to the same value and must be empty. + There are three important differences between dataSource and dataSourceRef: + * While dataSource only allows two specific types of objects, dataSourceRef + allows any non-core object, as well as PersistentVolumeClaim objects. + * While dataSource ignores disallowed values (dropping them), dataSourceRef + preserves all values, and generates an error if a disallowed value is + specified. + * While dataSource only allows local objects, dataSourceRef allows objects + in any namespaces. + (Beta) Using this field requires the AnyVolumeDataSource feature gate to be enabled. + (Alpha) Using the namespace field of dataSourceRef requires the CrossNamespaceVolumeDataSource feature gate to be enabled. + properties: + apiGroup: + description: |- + APIGroup is the group for the resource being referenced. + If APIGroup is not specified, the specified Kind must be in the core API group. + For any other third-party types, APIGroup is required. 
+ type: string + kind: + description: Kind is the type of resource being referenced + type: string + name: + description: Name is the name of resource being referenced + type: string + namespace: + description: |- + Namespace is the namespace of resource being referenced + Note that when a namespace is specified, a gateway.networking.k8s.io/ReferenceGrant object is required in the referent namespace to allow that namespace's owner to accept the reference. See the ReferenceGrant documentation for details. + (Alpha) This field requires the CrossNamespaceVolumeDataSource feature gate to be enabled. + type: string + required: + - kind + - name + type: object + resources: + description: |- + resources represents the minimum resources the volume should have. + If RecoverVolumeExpansionFailure feature is enabled users are allowed to specify resource requirements + that are lower than previous value but must still be higher than capacity recorded in the + status field of the claim. + More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#resources + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. 
Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + selector: + description: selector is a label query over volumes to + consider for binding. + properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector + applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + storageClassName: + description: |- + storageClassName is the name of the StorageClass required by the claim. + More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#class-1 + type: string + volumeAttributesClassName: + description: |- + volumeAttributesClassName may be used to set the VolumeAttributesClass used by this claim. 
+ If specified, the CSI driver will create or update the volume with the attributes defined + in the corresponding VolumeAttributesClass. This has a different purpose than storageClassName, + it can be changed after the claim is created. An empty string value means that no VolumeAttributesClass + will be applied to the claim but it's not allowed to reset this field to empty string once it is set. + If unspecified and the PersistentVolumeClaim is unbound, the default VolumeAttributesClass + will be set by the persistentvolume controller if it exists. + If the resource referred to by volumeAttributesClass does not exist, this PersistentVolumeClaim will be + set to a Pending state, as reflected by the modifyVolumeStatus field, until such as a resource + exists. + More info: https://kubernetes.io/docs/concepts/storage/volume-attributes-classes/ + (Alpha) Using this field requires the VolumeAttributesClass feature gate to be enabled. + type: string + volumeMode: + description: |- + volumeMode defines what type of volume is required by the claim. + Value of Filesystem is implied when not included in claim spec. + type: string + volumeName: + description: volumeName is the binding reference to the + PersistentVolume backing this claim. + type: string + type: object + type: object + required: + - volumeClaim + type: object type: object metadata: type: object diff --git a/doc/api.md b/doc/api.md index 9e576cfcc6..4a460b388d 100644 --- a/doc/api.md +++ b/doc/api.md @@ -52,6 +52,12 @@ Resource Types:
  • LabelMapping
  • +AlertmanagerStorageSpec +
  • +EmbeddedObjectMetadata +
  • +EmbeddedPersistentVolumeClaim +
  • ManagedAlertmanagerSpec
  • MonitoringCRD @@ -1373,6 +1379,189 @@ be derived automatically.

    If no URL is provided, Alertmanager will point to the Google Cloud Metric Explorer page.

    + + +storage
    + + +AlertmanagerStorageSpec + + + + +

    Storage opts the managed Alertmanager into a PersistentVolumeClaim-backed +data directory. When unset, Alertmanager uses an ephemeral emptyDir volume +and all silences, notification log entries, and inhibitions are lost on +pod restart. When set, the operator creates a PVC in the operator +namespace and mounts it at the Alertmanager data path so this state +survives pod churn.

    +

    See issue #685.

    + + + + +

    +AlertmanagerStorageSpec + +

    +

    +(Appears in: ManagedAlertmanagerSpec) +

    +
    +

    AlertmanagerStorageSpec configures persistent storage for the managed +Alertmanager. The operator provisions a single PersistentVolumeClaim named +"alertmanager-data" in the operator namespace using the supplied spec and +mounts it at the Alertmanager data path. The managed Alertmanager runs with +a single replica, so a ReadWriteOnce access mode is sufficient; multi-replica +support would require migrating to volumeClaimTemplates and is out of scope +here.

    +

    Changing this spec after creation triggers a rolling restart of the +Alertmanager StatefulSet. Most PersistentVolumeClaim fields are immutable +once the claim is bound — only resources.requests.storage can be +expanded (and only if the StorageClass allows volume expansion). The +operator logs and ignores shrink requests and other mutations to +immutable fields; the existing PVC must be deleted manually to fully +reset (silences will be lost).

    +
    + + + + + + + + + + + + + +
    FieldDescription
    +volumeClaim
    + + +EmbeddedPersistentVolumeClaim + + +
    +

    VolumeClaim describes the desired PersistentVolumeClaim. The embedded +structure exposes both metadata (so callers can attach labels and +annotations, e.g. for volume-snapshot tooling) and spec (so every +Kubernetes PVC field — accessModes, storageClassName, resources, selector, +volumeMode, dataSource, dataSourceRef — is configurable). The operator +overwrites the claim's name and namespace; everything else is taken from +the caller-provided spec modulo Kubernetes-enforced immutability.

    +
    +

    +EmbeddedPersistentVolumeClaim + +

    +

    +(Appears in: AlertmanagerStorageSpec) +

    +
    +

    EmbeddedPersistentVolumeClaim is a PersistentVolumeClaim definition +embedded directly in a parent resource's spec. It mirrors +prometheus-operator's type of the same name so user-facing YAML feels familiar.

    +
    + + + + + + + + + + + + + + + + + +
    FieldDescription
    +metadata
    + + +EmbeddedObjectMetadata + + +
    +

    EmbeddedObjectMetadata contains labels, annotations and finalizers +applied to the generated PersistentVolumeClaim. Other ObjectMeta +fields are ignored.

    +
    +spec
    + + +Kubernetes core/v1.PersistentVolumeClaimSpec + + +
    +

    Spec defines the desired characteristics of the volume. At minimum, +resources.requests.storage must be set.

    +
    +

    +EmbeddedObjectMetadata + +

    +

    +(Appears in: EmbeddedPersistentVolumeClaim) +

    +
    +

    EmbeddedObjectMetadata is a subset of metav1.ObjectMeta containing only +the fields that make sense to set on an operator-managed child resource. +Setting name or namespace here has no effect — the operator owns +those.

    +
    + + + + + + + + + + + + + + + + + + + +
    FieldDescription
    +labels
    + +map[string]string + +
    +

    Labels applied to the generated resource. Merged with the operator's +default labels; on conflict the operator's value wins.

    +
    +annotations
    + +map[string]string + +
    +

    Annotations applied to the generated resource. Useful for integrations +such as VolumeSnapshot controllers or storage-class provisioners that +read annotations from the claim.

    +
    +finalizers
    + +[]string + +
    +

    Finalizers applied to the generated resource on creation. The operator +does not strip user-managed finalizers it did not add, so removing +entries from this list does not remove them from the live object.

    +

    diff --git a/pkg/operator/apis/monitoring/v1/operator_types.go b/pkg/operator/apis/monitoring/v1/operator_types.go index a3fb792188..4fbc7009c0 100644 --- a/pkg/operator/apis/monitoring/v1/operator_types.go +++ b/pkg/operator/apis/monitoring/v1/operator_types.go @@ -289,6 +289,87 @@ type ManagedAlertmanagerSpec struct { // // If no URL is provided, Alertmanager will point to the Google Cloud Metric Explorer page. ExternalURL string `json:"externalURL,omitempty"` + // Storage opts the managed Alertmanager into a PersistentVolumeClaim-backed + // data directory. When unset, Alertmanager uses an ephemeral emptyDir volume + // and all silences, notification log entries, and inhibitions are lost on + // pod restart. When set, the operator creates a PVC in the operator + // namespace and mounts it at the Alertmanager data path so this state + // survives pod churn. + // + // See https://github.com/GoogleCloudPlatform/prometheus-engine/issues/685. + Storage *AlertmanagerStorageSpec `json:"storage,omitempty"` +} + +// AlertmanagerStorageSpec configures persistent storage for the managed +// Alertmanager. The operator provisions a single PersistentVolumeClaim named +// "alertmanager-data" in the operator namespace using the supplied spec and +// mounts it at the Alertmanager data path. The managed Alertmanager runs with +// a single replica, so a ReadWriteOnce access mode is sufficient; multi-replica +// support would require migrating to volumeClaimTemplates and is out of scope +// here. +// +// Changing this spec after creation triggers a rolling restart of the +// Alertmanager StatefulSet. Most PersistentVolumeClaim fields are immutable +// once the claim is bound — only `resources.requests.storage` can be +// expanded (and only if the StorageClass allows volume expansion). The +// operator logs and ignores shrink requests and other mutations to +// immutable fields; the existing PVC must be deleted manually to fully +// reset (silences will be lost). 
+type AlertmanagerStorageSpec struct { + // VolumeClaim describes the desired PersistentVolumeClaim. The + // embedded structure exposes both `metadata` (so callers can attach + // labels and annotations, e.g. for volume-snapshot tooling) and `spec` + // (so every Kubernetes PVC field — accessModes, storageClassName, + // resources, selector, volumeMode, dataSource, dataSourceRef — is + // configurable). The operator overwrites the claim's name and + // namespace; everything else is taken from the caller-provided spec + // modulo Kubernetes-enforced immutability. + VolumeClaim EmbeddedPersistentVolumeClaim `json:"volumeClaim"` +} + +// EmbeddedPersistentVolumeClaim is a PersistentVolumeClaim definition +// embedded directly in a parent resource's spec. It mirrors +// prometheus-operator's type of the same name so user-facing YAML feels familiar. +// +// Only ObjectMeta fields that customise the claim itself (labels, +// annotations, finalizers) are honoured; name and namespace are owned by +// the operator. +type EmbeddedPersistentVolumeClaim struct { + metav1.TypeMeta `json:",inline"` + + // EmbeddedObjectMetadata contains labels, annotations and finalizers + // applied to the generated PersistentVolumeClaim. Other ObjectMeta + // fields are ignored. + // +optional + EmbeddedObjectMetadata `json:"metadata,omitempty"` + + // Spec defines the desired characteristics of the volume. At minimum, + // `resources.requests.storage` must be set. See the Kubernetes + // PersistentVolumeClaim documentation for the full field reference: + // https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.30/#persistentvolumeclaimspec-v1-core + Spec corev1.PersistentVolumeClaimSpec `json:"spec,omitempty"` +} + +// EmbeddedObjectMetadata is a subset of metav1.ObjectMeta containing only +// the fields that make sense to set on an operator-managed child resource. +// Setting `name` or `namespace` here has no effect — the operator owns +// those. 
+type EmbeddedObjectMetadata struct { + // Labels applied to the generated resource. Merged with the + // operator's default labels; on conflict the operator's value wins. + // +optional + Labels map[string]string `json:"labels,omitempty"` + // Annotations applied to the generated resource. Useful for + // integrations such as VolumeSnapshot controllers or storage-class + // provisioners that read annotations from the claim. + // +optional + Annotations map[string]string `json:"annotations,omitempty"` + // Finalizers applied to the generated resource on creation. The + // operator does not strip user-managed finalizers it did not add, so + // removing entries from this list does not remove them from the live + // object. + // +optional + Finalizers []string `json:"finalizers,omitempty"` } // AlertmanagerEndpoints defines a selection of a single Endpoints object diff --git a/pkg/operator/apis/monitoring/v1/zz_generated.deepcopy.go b/pkg/operator/apis/monitoring/v1/zz_generated.deepcopy.go index c6a5d372f2..d722b44836 100644 --- a/pkg/operator/apis/monitoring/v1/zz_generated.deepcopy.go +++ b/pkg/operator/apis/monitoring/v1/zz_generated.deepcopy.go @@ -661,6 +661,11 @@ func (in *ManagedAlertmanagerSpec) DeepCopyInto(out *ManagedAlertmanagerSpec) { *out = new(corev1.SecretKeySelector) (*in).DeepCopyInto(*out) } + if in.Storage != nil { + in, out := &in.Storage, &out.Storage + *out = new(AlertmanagerStorageSpec) + (*in).DeepCopyInto(*out) + } return } @@ -674,6 +679,77 @@ func (in *ManagedAlertmanagerSpec) DeepCopy() *ManagedAlertmanagerSpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AlertmanagerStorageSpec) DeepCopyInto(out *AlertmanagerStorageSpec) { + *out = *in + in.VolumeClaim.DeepCopyInto(&out.VolumeClaim) + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AlertmanagerStorageSpec. 
+func (in *AlertmanagerStorageSpec) DeepCopy() *AlertmanagerStorageSpec { + if in == nil { + return nil + } + out := new(AlertmanagerStorageSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *EmbeddedPersistentVolumeClaim) DeepCopyInto(out *EmbeddedPersistentVolumeClaim) { + *out = *in + out.TypeMeta = in.TypeMeta + in.EmbeddedObjectMetadata.DeepCopyInto(&out.EmbeddedObjectMetadata) + in.Spec.DeepCopyInto(&out.Spec) + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EmbeddedPersistentVolumeClaim. +func (in *EmbeddedPersistentVolumeClaim) DeepCopy() *EmbeddedPersistentVolumeClaim { + if in == nil { + return nil + } + out := new(EmbeddedPersistentVolumeClaim) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *EmbeddedObjectMetadata) DeepCopyInto(out *EmbeddedObjectMetadata) { + *out = *in + if in.Labels != nil { + in, out := &in.Labels, &out.Labels + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } + if in.Annotations != nil { + in, out := &in.Annotations, &out.Annotations + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } + if in.Finalizers != nil { + in, out := &in.Finalizers, &out.Finalizers + *out = make([]string, len(*in)) + copy(*out, *in) + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EmbeddedObjectMetadata. +func (in *EmbeddedObjectMetadata) DeepCopy() *EmbeddedObjectMetadata { + if in == nil { + return nil + } + out := new(EmbeddedObjectMetadata) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *MonitoringCondition) DeepCopyInto(out *MonitoringCondition) { *out = *in diff --git a/pkg/operator/operator_config.go b/pkg/operator/operator_config.go index 61417e62d8..f7a32994cf 100644 --- a/pkg/operator/operator_config.go +++ b/pkg/operator/operator_config.go @@ -38,6 +38,7 @@ import ( appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" @@ -512,7 +513,235 @@ func (r *operatorConfigReconciler) ensureAlertmanagerStatefulSet(ctx context.Con logger.Error(err, "Alertmanager StatefulSet does not exist") return nil } - return err + if err != nil { + return err + } + + return r.reconcileAlertmanagerStorage(ctx, &sset, spec.Storage) +} + +// alertmanagerDataVolumeName is the name of the volume backing Alertmanager's +// --storage.path. Matches manifests/operator.yaml. Used as both the volume +// name on the StatefulSet pod template and the PVC name in the operator +// namespace when persistent storage is configured. +const alertmanagerDataVolumeName = "alertmanager-data" + +// reconcileAlertmanagerStorage swaps the Alertmanager data volume between an +// ephemeral emptyDir and a PVC-backed claim depending on whether the user +// configured persistent storage. +// +// When spec is non-nil, the operator owns a PVC named "alertmanager-data" in +// the operator namespace and the StatefulSet's pod template references it via +// `volumes[name=alertmanager-data].persistentVolumeClaim`. The PVC spec is +// kept in sync with the user-provided spec, modulo Kubernetes' restriction +// that most PVC fields are immutable after creation — for those the operator +// logs a warning and leaves the existing PVC alone. 
+//
+// A nil spec switches the StatefulSet back to the manifest default
+// (emptyDir). Any PVC the operator created earlier is deliberately kept so
+// that turning persistence off cannot destroy data; reclaiming the storage
+// requires deleting the PVC by hand.
+func (r *operatorConfigReconciler) reconcileAlertmanagerStorage(ctx context.Context, sset *appsv1.StatefulSet, spec *monitoringv1.AlertmanagerStorageSpec) error {
+	logger, _ := logr.FromContext(ctx)
+
+	// Both outcomes replace the same named volume; only its source differs.
+	dataVolume := corev1.Volume{Name: alertmanagerDataVolumeName}
+
+	if spec == nil {
+		// Persistence is off: put the emptyDir back in case an earlier spec
+		// had swapped in a PVC reference. This lets users disable
+		// persistence without editing the StatefulSet themselves; the PVC
+		// itself is left behind on purpose.
+		dataVolume.EmptyDir = &corev1.EmptyDirVolumeSource{}
+		return r.setAlertmanagerDataVolume(ctx, sset, dataVolume)
+	}
+
+	// Create or update the claim first, then point the pod template at it.
+	if err := r.ensureAlertmanagerPVC(ctx, spec); err != nil {
+		return fmt.Errorf("ensure alertmanager PVC: %w", err)
+	}
+
+	logger.Info("alertmanager storage reconciled", "claim", alertmanagerDataVolumeName, "namespace", r.opts.OperatorNamespace)
+
+	dataVolume.PersistentVolumeClaim = &corev1.PersistentVolumeClaimVolumeSource{
+		ClaimName: alertmanagerDataVolumeName,
+	}
+	return r.setAlertmanagerDataVolume(ctx, sset, dataVolume)
+}
+
+// ensureAlertmanagerPVC creates or updates the PVC backing Alertmanager's
+// data directory. Most PVC fields are immutable post-creation (access modes,
+// storage class, volume name); only the storage request is patchable via
+// Kubernetes' PVC resize support. We update only that field on existing
+// claims to avoid validation errors. Caller-supplied labels and annotations
+// are merged on every reconciliation so they can be added or updated after
+// the PVC is bound.
+func (r *operatorConfigReconciler) ensureAlertmanagerPVC(ctx context.Context, spec *monitoringv1.AlertmanagerStorageSpec) error {
+	logger, _ := logr.FromContext(ctx)
+
+	// Apply defaults on a copy so the caller's spec is never mutated.
+	desiredSpec := spec.VolumeClaim.Spec.DeepCopy()
+	if len(desiredSpec.AccessModes) == 0 {
+		desiredSpec.AccessModes = []corev1.PersistentVolumeAccessMode{corev1.ReadWriteOnce}
+	}
+
+	pvcKey := client.ObjectKey{Namespace: r.opts.OperatorNamespace, Name: alertmanagerDataVolumeName}
+	var existing corev1.PersistentVolumeClaim
+	err := r.client.Get(ctx, pvcKey, &existing)
+	if apierrors.IsNotFound(err) {
+		// First reconciliation: create the claim. Operator-owned labels are
+		// passed last so they win on conflict, exactly as on the update path
+		// below. Finalizers are only ever applied here, at creation.
+		pvc := corev1.PersistentVolumeClaim{
+			ObjectMeta: metav1.ObjectMeta{
+				Namespace:   pvcKey.Namespace,
+				Name:        pvcKey.Name,
+				Labels:      mergeLabels(spec.VolumeClaim.Labels, componentLabels(NameAlertmanager)),
+				Annotations: copyMap(spec.VolumeClaim.Annotations),
+				Finalizers:  append([]string(nil), spec.VolumeClaim.Finalizers...),
+			},
+			Spec: *desiredSpec,
+		}
+		return r.client.Create(ctx, &pvc)
+	}
+	if err != nil {
+		return err
+	}
+
+	patch := existing.DeepCopy()
+	mutated := false
+
+	// Reconcile mutable metadata. Operator-owned label keys always win;
+	// every other user-supplied label/annotation is propagated. Keys the
+	// user set earlier and has since removed are NOT deleted: doing that
+	// safely would require tracking which keys the operator owns, which is
+	// more surface than needed for the v1 of this feature.
+	if labels := mergeLabels(existing.Labels, spec.VolumeClaim.Labels, componentLabels(NameAlertmanager)); !mapsEqual(labels, existing.Labels) {
+		patch.Labels = labels
+		mutated = true
+	}
+	if anns := mergeLabels(existing.Annotations, spec.VolumeClaim.Annotations); !mapsEqual(anns, existing.Annotations) {
+		patch.Annotations = anns
+		mutated = true
+	}
+
+	// Only the storage request is changed on an existing claim. Updates to
+	// the other spec fields (access modes, storage class, selector) would be
+	// rejected by the API server, so drift in those is deliberately left
+	// unpatched.
+	wantStorage := desiredSpec.Resources.Requests[corev1.ResourceStorage]
+	gotStorage := existing.Spec.Resources.Requests[corev1.ResourceStorage]
+	switch wantStorage.Cmp(gotStorage) {
+	case 1:
+		if patch.Spec.Resources.Requests == nil {
+			patch.Spec.Resources.Requests = corev1.ResourceList{}
+		}
+		patch.Spec.Resources.Requests[corev1.ResourceStorage] = wantStorage
+		mutated = true
+	case -1:
+		// An unset request compares as zero; that is "no opinion", not a
+		// shrink, so only a real, smaller request is worth reporting.
+		if !wantStorage.IsZero() {
+			logger.Info("ignoring requested PVC shrink; Kubernetes does not support PVC shrinking",
+				"have", gotStorage.String(), "want", wantStorage.String())
+		}
+	}
+
+	if !mutated {
+		return nil
+	}
+	return r.client.Patch(ctx, patch, client.MergeFrom(&existing))
+}
+
+// componentLabels returns the standard label set used by gmp-operator-managed
+// resources for a given component name. Kept local to operator_config.go to
+// avoid leaking into the broader API surface.
+func componentLabels(component string) map[string]string {
+	return map[string]string{
+		LabelAppName: component,
+	}
+}
+
+// mergeLabels merges any number of string maps into a fresh map. Later maps
+// take precedence over earlier ones, so operator-owned labels must be passed
+// last to override any user-supplied conflicting values. It returns nil when
+// the merged result is empty.
+func mergeLabels(in ...map[string]string) map[string]string {
+	out := map[string]string{}
+	for _, m := range in {
+		maps.Copy(out, m)
+	}
+	if len(out) == 0 {
+		return nil
+	}
+	return out
+}
+
+// copyMap returns a copy of m that shares no storage with it. A nil map is
+// returned as nil.
+func copyMap(m map[string]string) map[string]string {
+	return maps.Clone(m)
+}
+
+// mapsEqual reports whether a and b contain exactly the same key/value
+// pairs. A nil map and an empty map are equal. Key presence is checked
+// explicitly, so {"x": ""} and {"y": "z"} are not mistaken for equal just
+// because the missing key reads back as the empty string.
+func mapsEqual(a, b map[string]string) bool {
+	return maps.Equal(a, b)
+}
+
+// setAlertmanagerDataVolume replaces the named volume on the Alertmanager
+// pod template, appending it when no volume of that name exists. It is a
+// no-op when the existing volume already matches the desired source, so
+// steady-state reconciliations don't churn the StatefulSet. The mutation is
+// sent as a JSON merge patch (client.MergeFrom) computed against a snapshot
+// of the original object, so only fields that actually changed are sent and
+// fields owned by another controller (e.g. addon-manager-set annotations)
+// are left alone. No optimistic lock is requested, so the patch does not
+// fail with a conflict when another writer updates the object concurrently.
+func (r *operatorConfigReconciler) setAlertmanagerDataVolume(ctx context.Context, sset *appsv1.StatefulSet, desired corev1.Volume) error {
+	volumes := sset.Spec.Template.Spec.Volumes
+
+	// Locate the first volume carrying the desired name, if any.
+	idx := -1
+	for i := range volumes {
+		if volumes[i].Name == desired.Name {
+			idx = i
+			break
+		}
+	}
+	if idx >= 0 && volumeSourcesEqual(volumes[idx].VolumeSource, desired.VolumeSource) {
+		return nil
+	}
+
+	// Take the snapshot only once a change is certain, and before mutating,
+	// so the no-op path does not pay for a deep copy.
+	original := sset.DeepCopy()
+	if idx >= 0 {
+		sset.Spec.Template.Spec.Volumes[idx] = desired
+	} else {
+		// Volume not present — append. Should not happen with the shipped
+		// manifest, but keeps the operator self-healing if someone strips it.
+		sset.Spec.Template.Spec.Volumes = append(volumes, desired)
+	}
+	return r.client.Patch(ctx, sset, client.MergeFrom(original))
+}
+
+// volumeSourcesEqual checks whether two VolumeSource values describe the
+// same underlying storage. It compares the kind of source plus every field
+// the operator manages (and might therefore need to correct on drift), so a
+// manually-edited `medium`, `sizeLimit`, or `readOnly` reconciles back to
+// the operator-desired shape rather than being silently preserved.
+func volumeSourcesEqual(a, b corev1.VolumeSource) bool {
+	// Sources of different kinds (or kinds not handled here) never match.
+	if x, y := a.EmptyDir, b.EmptyDir; x != nil && y != nil {
+		return x.Medium == y.Medium && resourceQuantityPtrEqual(x.SizeLimit, y.SizeLimit)
+	}
+	if x, y := a.PersistentVolumeClaim, b.PersistentVolumeClaim; x != nil && y != nil {
+		return x.ClaimName == y.ClaimName && x.ReadOnly == y.ReadOnly
+	}
+	return false
+}
+
+// resourceQuantityPtrEqual reports whether two *resource.Quantity pointers
+// are both nil or both point at quantities that compare equal. A nil pointer
+// never equals a non-nil one.
+func resourceQuantityPtrEqual(a, b *resource.Quantity) bool {
+	if a != nil && b != nil {
+		return a.Cmp(*b) == 0
+	}
+	return a == b
 }
 
 // ensureRuleEvaluatorDeployment reconciles the Deployment for rule-evaluator.
diff --git a/pkg/operator/operator_config_test.go b/pkg/operator/operator_config_test.go
index 7627e251ac..0f69bedfb7 100644
--- a/pkg/operator/operator_config_test.go
+++ b/pkg/operator/operator_config_test.go
@@ -15,6 +15,7 @@
 package operator
 
 import (
+	"fmt"
 	"testing"
 
 	monitoringv1 "github.com/GoogleCloudPlatform/prometheus-engine/pkg/operator/apis/monitoring/v1"
@@ -25,7 +26,10 @@ import (
 	"github.com/prometheus/prometheus/google/export"
 	"github.com/stretchr/testify/require"
 	"gopkg.in/yaml.v3"
+	appsv1 "k8s.io/api/apps/v1"
 	corev1 "k8s.io/api/core/v1"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/api/resource"
 	v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
 	"sigs.k8s.io/controller-runtime/pkg/client"
@@ -433,3 +437,179 @@ route:
 		})
 	}
 }
+
+// TestEnsureAlertmanagerStatefulSet_Storage drives ensureAlertmanagerStatefulSet
+// against a fake client and checks how the Alertmanager data volume and its
+// PVC react to the storage spec being unset, set, grown, shrunk and removed.
+func TestEnsureAlertmanagerStatefulSet_Storage(t *testing.T) {
+	opts := Options{
+		ProjectID:         "test-project",
+		Location:          "us-central1-c",
+		Cluster:           "test-cluster",
+		PublicNamespace:   DefaultPublicNamespace,
+		OperatorNamespace: DefaultOperatorNamespace,
+	}
+	claimKey := client.ObjectKey{Namespace: DefaultOperatorNamespace, Name: alertmanagerDataVolumeName}
+
+	// dataVolume wraps a volume source under the Alertmanager data volume name.
+	dataVolume := func(src corev1.VolumeSource) corev1.Volume {
+		return corev1.Volume{Name: alertmanagerDataVolumeName, VolumeSource: src}
+	}
+	// statefulSetWith builds the Alertmanager StatefulSet with vol as its only volume.
+	statefulSetWith := func(vol corev1.Volume) *appsv1.StatefulSet {
+		sts := &appsv1.StatefulSet{}
+		sts.Namespace = DefaultOperatorNamespace
+		sts.Name = NameAlertmanager
+		sts.Spec.Template.Spec.Volumes = []corev1.Volume{vol}
+		return sts
+	}
+	// emptyDirStatefulSet mirrors the shipped manifest: an emptyDir data volume.
+	emptyDirStatefulSet := func() *appsv1.StatefulSet {
+		return statefulSetWith(dataVolume(corev1.VolumeSource{EmptyDir: &corev1.EmptyDirVolumeSource{}}))
+	}
+	// claimSpec requests gb GiB of storage with the given access modes (none when omitted).
+	claimSpec := func(gb int, modes ...corev1.PersistentVolumeAccessMode) corev1.PersistentVolumeClaimSpec {
+		var out corev1.PersistentVolumeClaimSpec
+		out.AccessModes = modes
+		out.Resources.Requests = corev1.ResourceList{
+			corev1.ResourceStorage: resource.MustParse(fmt.Sprintf("%dGi", gb)),
+		}
+		return out
+	}
+	// existingClaim is a PVC already present in the cluster under the operator's claim name.
+	existingClaim := func(spec corev1.PersistentVolumeClaimSpec) *corev1.PersistentVolumeClaim {
+		return &corev1.PersistentVolumeClaim{
+			ObjectMeta: v1.ObjectMeta{Namespace: claimKey.Namespace, Name: claimKey.Name},
+			Spec:       spec,
+		}
+	}
+	// withStorage wraps a claim template into the Alertmanager spec under test.
+	withStorage := func(claim monitoringv1.EmbeddedPersistentVolumeClaim) *monitoringv1.ManagedAlertmanagerSpec {
+		return &monitoringv1.ManagedAlertmanagerSpec{
+			Storage: &monitoringv1.AlertmanagerStorageSpec{VolumeClaim: claim},
+		}
+	}
+	// fetchClaim reads the operator-managed PVC back from the fake cluster.
+	fetchClaim := func(t *testing.T, c client.Reader) (corev1.PersistentVolumeClaim, error) {
+		var pvc corev1.PersistentVolumeClaim
+		err := c.Get(t.Context(), claimKey, &pvc)
+		return pvc, err
+	}
+	// fetchDataVolume reads the StatefulSet back and returns its first volume.
+	fetchDataVolume := func(t *testing.T, c client.Reader, sts *appsv1.StatefulSet) corev1.Volume {
+		t.Helper()
+		var got appsv1.StatefulSet
+		require.NoError(t, c.Get(t.Context(), client.ObjectKeyFromObject(sts), &got))
+		return got.Spec.Template.Spec.Volumes[0]
+	}
+
+	t.Run("nil storage leaves emptyDir intact", func(t *testing.T) {
+		sts := emptyDirStatefulSet()
+		kube := newFakeClientBuilder().WithObjects(sts).Build()
+		rec := newOperatorConfigReconciler(kube, opts)
+
+		require.NoError(t, rec.ensureAlertmanagerStatefulSet(t.Context(), &monitoringv1.ManagedAlertmanagerSpec{}))
+
+		vol := fetchDataVolume(t, kube, sts)
+		require.NotNil(t, vol.EmptyDir, "emptyDir volume must be preserved when no storage spec is set")
+		require.Nil(t, vol.PersistentVolumeClaim)
+
+		// No PVC should have been created.
+		_, err := fetchClaim(t, kube)
+		require.True(t, apierrors.IsNotFound(err), "PVC must not exist when storage is unset; got err=%v", err)
+	})
+
+	t.Run("storage set provisions PVC and swaps volume to PVC reference", func(t *testing.T) {
+		sts := emptyDirStatefulSet()
+		kube := newFakeClientBuilder().WithObjects(sts).Build()
+		rec := newOperatorConfigReconciler(kube, opts)
+
+		claim := monitoringv1.EmbeddedPersistentVolumeClaim{Spec: claimSpec(5)}
+		claim.Labels = map[string]string{"team": "platform"}
+		claim.Annotations = map[string]string{"backup.example.com/enabled": "true"}
+		require.NoError(t, rec.ensureAlertmanagerStatefulSet(t.Context(), withStorage(claim)))
+
+		pvc, err := fetchClaim(t, kube)
+		require.NoError(t, err)
+		require.Equal(t, []corev1.PersistentVolumeAccessMode{corev1.ReadWriteOnce}, pvc.Spec.AccessModes, "access mode must default to ReadWriteOnce when caller omits it")
+		require.Equal(t, "5Gi", pvc.Spec.Resources.Requests.Storage().String())
+		require.Equal(t, "platform", pvc.Labels["team"])
+		require.Equal(t, NameAlertmanager, pvc.Labels[LabelAppName], "operator-owned label must be set")
+		require.Equal(t, "true", pvc.Annotations["backup.example.com/enabled"])
+
+		vol := fetchDataVolume(t, kube, sts)
+		require.NotNil(t, vol.PersistentVolumeClaim, "data volume must now reference the PVC")
+		require.Equal(t, alertmanagerDataVolumeName, vol.PersistentVolumeClaim.ClaimName)
+		require.Nil(t, vol.EmptyDir)
+	})
+
+	// Resize cases: a claim of haveGB already exists (simulating a
+	// steady-state cluster) and the spec then asks for wantGB.
+	resizeCases := []struct {
+		name           string
+		haveGB, wantGB int
+		expectSize     string
+		failureMessage string
+	}{
+		{
+			name:           "expanding storage request patches the PVC",
+			haveGB:         5,
+			wantGB:         10,
+			expectSize:     "10Gi",
+			failureMessage: "PVC must be expanded to match requested size",
+		},
+		{
+			name:           "shrink request is ignored",
+			haveGB:         10,
+			wantGB:         2,
+			expectSize:     "10Gi",
+			failureMessage: "PVC must not shrink; Kubernetes does not allow this",
+		},
+	}
+	for _, tc := range resizeCases {
+		t.Run(tc.name, func(t *testing.T) {
+			sts := emptyDirStatefulSet()
+			have := existingClaim(claimSpec(tc.haveGB, corev1.ReadWriteOnce))
+			kube := newFakeClientBuilder().WithObjects(sts, have).Build()
+			rec := newOperatorConfigReconciler(kube, opts)
+
+			want := monitoringv1.EmbeddedPersistentVolumeClaim{Spec: claimSpec(tc.wantGB)}
+			require.NoError(t, rec.ensureAlertmanagerStatefulSet(t.Context(), withStorage(want)))
+
+			pvc, err := fetchClaim(t, kube)
+			require.NoError(t, err)
+			require.Equal(t, tc.expectSize, pvc.Spec.Resources.Requests.Storage().String(), tc.failureMessage)
+		})
+	}
+
+	t.Run("removing storage spec falls back to emptyDir and leaves PVC in place", func(t *testing.T) {
+		// Start from a StatefulSet that already references the claim.
+		sts := statefulSetWith(dataVolume(corev1.VolumeSource{
+			PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{ClaimName: alertmanagerDataVolumeName},
+		}))
+		kube := newFakeClientBuilder().WithObjects(sts, existingClaim(claimSpec(5))).Build()
+		rec := newOperatorConfigReconciler(kube, opts)
+
+		require.NoError(t, rec.ensureAlertmanagerStatefulSet(t.Context(), &monitoringv1.ManagedAlertmanagerSpec{}))
+
+		vol := fetchDataVolume(t, kube, sts)
+		require.NotNil(t, vol.EmptyDir, "removing storage spec must restore emptyDir")
+
+		_, err := fetchClaim(t, kube)
+		require.NoError(t, err, "PVC must remain so silences survive accidental config removal")
+	})
+}