Skip to content

Commit 1698305

Browse files
committed
Implement rabbitmquser finalizer management for edpm nodes
Add dataplane-specific logic to track and manage RabbitMQ user finalizers for OpenStackDataPlaneNodeSet services. This includes: - Service finalizer tracking in NodeSet status - RabbitMQ user creation and cleanup logic - Multi-cluster RabbitMQ support - Comprehensive tests for finalizer management This preserves the webhooks_warnings approach for webhook defaulting and services controllers while adding only the dataplane-specific RabbitMQ finalizer management functionality.
1 parent 4ed90c4 commit 1698305

17 files changed

Lines changed: 4463 additions & 3 deletions

api/bases/dataplane.openstack.org_openstackdataplanenodesets.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1973,6 +1973,14 @@ spec:
19731973
items:
19741974
type: string
19751975
type: array
1976+
finalizerHash:
1977+
description: |-
1978+
FinalizerHash is a short, deterministic hash derived from the nodeset name.
1979+
Used to create unique, collision-free finalizer names for RabbitMQ users.
1980+
Format: first 8 characters of SHA256(nodeset.metadata.name)
1981+
Example: "a3f2b5c8"
1982+
This allows easy lookup of which nodeset owns a specific finalizer.
1983+
type: string
19761984
inventorySecretName:
19771985
description: InventorySecretName Name of a secret containing the ansible
19781986
inventory

api/dataplane/v1beta1/openstackdataplanenodeset_types.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,13 @@ type OpenStackDataPlaneNodeSetStatus struct {
160160

161161
//DeployedBmhHash - Hash of BMHs deployed
162162
DeployedBmhHash string `json:"deployedBmhHash,omitempty"`
163+
164+
// FinalizerHash is a short, deterministic hash derived from the nodeset name.
165+
// Used to create unique, collision-free finalizer names for RabbitMQ users.
166+
// Format: first 8 characters of SHA256(nodeset.metadata.name)
167+
// Example: "a3f2b5c8"
168+
// This allows easy lookup of which nodeset owns a specific finalizer.
169+
FinalizerHash string `json:"finalizerHash,omitempty"`
163170
}
164171

165172
// +kubebuilder:object:root=true

api/dataplane/v1beta1/openstackdataplanenodeset_webhook.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import (
2727
apierrors "k8s.io/apimachinery/pkg/api/errors"
2828
"k8s.io/apimachinery/pkg/runtime"
2929
"k8s.io/apimachinery/pkg/runtime/schema"
30+
"k8s.io/apimachinery/pkg/types"
3031
apimachineryvalidation "k8s.io/apimachinery/pkg/util/validation"
3132
"k8s.io/apimachinery/pkg/util/validation/field"
3233
"sigs.k8s.io/controller-runtime/pkg/client"
@@ -165,6 +166,27 @@ func (r *OpenStackDataPlaneNodeSet) ValidateUpdate(ctx context.Context, old runt
165166
for deployName, deployConditions := range oldNodeSet.Status.DeploymentStatuses {
166167
deployCondition := deployConditions.Get(NodeSetDeploymentReadyCondition)
167168
if !deployConditions.IsTrue(NodeSetDeploymentReadyCondition) && !condition.IsError(deployCondition) {
169+
// Check if the deployment is being deleted - if so, allow the NodeSet update
170+
deployment := &OpenStackDataPlaneDeployment{}
171+
err := c.Get(ctx, types.NamespacedName{Name: deployName, Namespace: r.Namespace}, deployment)
172+
if err != nil {
173+
if apierrors.IsNotFound(err) {
174+
// Deployment no longer exists, allow the update
175+
continue
176+
}
177+
// If we can't check the deployment, log but don't block
178+
openstackdataplanenodesetlog.Info("could not check deployment status during validation",
179+
"deployment", deployName, "error", err)
180+
continue
181+
}
182+
183+
// If deployment is being deleted, allow the NodeSet update
184+
if deployment.DeletionTimestamp != nil {
185+
openstackdataplanenodesetlog.Info("allowing NodeSet update because deployment is being deleted",
186+
"deployment", deployName)
187+
continue
188+
}
189+
168190
return nil, apierrors.NewConflict(
169191
schema.GroupResource{Group: "dataplane.openstack.org", Resource: "OpenStackDataPlaneNodeSet"},
170192
r.Name,

config/crd/bases/dataplane.openstack.org_openstackdataplanenodesets.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1973,6 +1973,14 @@ spec:
19731973
items:
19741974
type: string
19751975
type: array
1976+
finalizerHash:
1977+
description: |-
1978+
FinalizerHash is a short, deterministic hash derived from the nodeset name.
1979+
Used to create unique, collision-free finalizer names for RabbitMQ users.
1980+
Format: first 8 characters of SHA256(nodeset.metadata.name)
1981+
Example: "a3f2b5c8"
1982+
This allows easy lookup of which nodeset owns a specific finalizer.
1983+
type: string
19761984
inventorySecretName:
19771985
description: InventorySecretName Name of a secret containing the ansible
19781986
inventory

config/operator/deployment/kustomization.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,15 @@ patches:
2727
kind: Deployment
2828
name: openstack-operator-controller-init
2929
namespace: system
30+
- patch: '[{"op": "replace", "path": "/spec/template/spec/containers/0/env/0", "value":
31+
{"name": "OPENSTACK_RELEASE_VERSION", "value": "0.1.17-1768984468"}}]'
32+
target:
33+
kind: Deployment
34+
name: openstack-operator-controller-operator
35+
namespace: system
36+
- patch: '[{"op": "replace", "path": "/spec/template/spec/containers/0/env/0", "value":
37+
{"name": "OPENSTACK_RELEASE_VERSION", "value": "0.5.0-1770051580"}}]'
38+
target:
39+
kind: Deployment
40+
name: openstack-operator-controller-init
41+
namespace: system

config/rbac/role.yaml

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -596,6 +596,23 @@ rules:
596596
- patch
597597
- update
598598
- watch
599+
- apiGroups:
600+
- rabbitmq.openstack.org
601+
resources:
602+
- rabbitmqusers
603+
verbs:
604+
- get
605+
- list
606+
- patch
607+
- update
608+
- watch
609+
- apiGroups:
610+
- rabbitmq.openstack.org
611+
resources:
612+
- rabbitmqusers/finalizers
613+
verbs:
614+
- patch
615+
- update
599616
- apiGroups:
600617
- rbac.authorization.k8s.io
601618
resources:
Lines changed: 249 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,249 @@
1+
# DataPlane RabbitMQ User Finalizer Management
2+
3+
## Overview
4+
5+
The OpenStack DataPlane operator manages RabbitMQ user finalizers to prevent deletion of RabbitMQ users that are still in use by compute nodes. This is critical during rolling upgrades and credential rotation scenarios.
6+
7+
## Problem Statement
8+
9+
During RabbitMQ credential rotation:
10+
1. Nova creates a new RabbitMQ user (e.g., `user2`) with new credentials
11+
2. The nova-cell1-compute-config secret is updated with the new `transport_url`
12+
3. Compute nodes are updated incrementally (using `ansibleLimit` for rolling upgrades)
13+
4. The old RabbitMQ user (e.g., `user1`) must remain until **ALL** nodes are updated
14+
5. Only after complete migration should the old user's finalizer be removed
15+
16+
**Challenge**: With ansibleLimit deployments, nodes are updated in batches. We must track which nodes have been updated across multiple deployments.
17+
18+
## How It Works
19+
20+
### 1. Secret Change Detection
21+
22+
The controller monitors nova-cell compute-config secrets for changes:
23+
24+
```go
25+
// When a secret's transport_url changes, the NovaCellSecretHash is updated
26+
// This triggers UpdatedNodesAfterSecretChange to be reset to empty
27+
if newHash != instance.Status.NovaCellSecretHash {
28+
instance.Status.NovaCellSecretHash = newHash
29+
instance.Status.UpdatedNodesAfterSecretChange = []string{}
30+
}
31+
```
32+
33+
**Location**: `openstackdataplanenodeset_controller.go:523-545`
34+
35+
### 2. Node Coverage Tracking
36+
37+
As deployments complete, the controller tracks which nodes have been updated:
38+
39+
```go
40+
func updateNodeCoverage(deployment, nodeset, secretsLastModified) {
41+
// Check deployment completion time (not creation time!)
42+
deploymentCompletedTime := readyCondition.LastTransitionTime.Time
43+
44+
// Only track nodes from deployments that completed AFTER secret change
45+
if deploymentCompletedTime.Before(secretModTime) {
46+
return // Skip old deployments
47+
}
48+
49+
// Add nodes from this deployment to UpdatedNodesAfterSecretChange
50+
for _, nodeName := range deployment.AnsibleLimit {
51+
instance.Status.UpdatedNodesAfterSecretChange = append(...)
52+
}
53+
}
54+
```
55+
56+
**Location**: `openstackdataplanenodeset_controller.go:750-793`
57+
58+
**Key Design Decision**: Uses deployment completion timestamp (`NodeSetDeploymentReadyCondition.LastTransitionTime`) rather than creation timestamp, because deployments can be created before they run and rotate secrets.
59+
60+
### 3. Finalizer Management Decision
61+
62+
Finalizers are only managed when ALL safety conditions are met:
63+
64+
```go
65+
// Multi-layer protection system
66+
if novaServiceDeployed && // Nova service was successfully deployed
67+
isLatestDeployment && // This is the most recent deployment
68+
!isNodeSetDeploymentRunning && // No deployment currently running
69+
!isDeploymentBeingDeleted && // Deployment not being deleted
70+
hasActiveDeployments { // At least one active deployment exists
71+
72+
// Safety check 1: Secret hash must be set
73+
if instance.Status.NovaCellSecretHash == "" {
74+
return // Skip - tracking not initialized
75+
}
76+
77+
// Safety check 2: All nodes must be accounted for
78+
allNodeNames := getAllNodeNamesFromNodeset(instance)
79+
if len(instance.Status.UpdatedNodesAfterSecretChange) != len(allNodeNames) {
80+
return // Skip - not all nodes updated yet
81+
}
82+
83+
// Safety check 3: All nodesets using same cluster must be updated
84+
if !allNodesetsUsingClusterUpdated(instance) {
85+
return // Skip - other nodesets still pending
86+
}
87+
88+
// All checks passed - safe to manage finalizers
89+
manageRabbitMQUserFinalizers(...)
90+
}
91+
```
92+
93+
**Location**: `openstackdataplanenodeset_controller.go:669-719`
94+
95+
### 4. RabbitMQ Cluster Identification
96+
97+
The controller extracts RabbitMQ cluster information from transport_url:
98+
99+
```go
100+
// Parse transport_url from nova config files (01-nova.conf or custom.conf)
101+
// Format: rabbit://username:password@rabbitmq-cell1.openstack.svc:5672/?ssl=1
102+
transportURL := extractTransportURLFromConfig(configData)
103+
cluster := extractClusterFromTransportURL(transportURL) // Returns "rabbitmq-cell1"
104+
```
105+
106+
**Location**: `rabbitmq.go:278-372`
107+
108+
**Important**: The transport_url is embedded in config files (01-nova.conf), not as a top-level secret field.
109+
110+
## Safety Mechanisms
111+
112+
### 1. Deployment Completion Timestamp Check
113+
Prevents old deployments (created before secret change) from incorrectly populating the updated nodes list.
114+
115+
### 2. Complete Node Coverage
116+
Ensures ALL nodes in the nodeset are accounted for before removing old user finalizers.
117+
118+
### 3. Secret Hash Initialization
119+
Requires the tracking system to be properly initialized before managing finalizers.
120+
121+
### 4. Cross-NodeSet Validation
122+
When multiple nodesets share a RabbitMQ cluster, ensures all are updated before finalizer removal.
123+
124+
### 5. Deployment Deletion Protection
125+
Prevents finalizer changes when ansibleLimit deployments are being deleted, avoiding incorrect state from old deployments.
126+
127+
### 6. Active Deployment Requirement
128+
Requires at least one active deployment to ensure reliable state tracking.
129+
130+
### 7. Accurate Node Counting
131+
Counts only actual node names (map keys), excluding IP addresses and hostnames which are just aliases.
132+
133+
**Location**: `openstackdataplanenodeset_controller.go:930-945`
134+
135+
## Example Scenario: AnsibleLimit Rolling Upgrade
136+
137+
### Initial State
138+
- NodeSet: `edpm-compute` with 2 nodes (compute-0, compute-1)
139+
- RabbitMQ user: `user1`
140+
- Secret: nova-cell1-compute-config (transport_url uses user1)
141+
142+
### Step 1: Secret Updated
143+
```
144+
Secret updated: transport_url changed to use user2
145+
NovaCellSecretHash: updated from hash1 to hash2
146+
UpdatedNodesAfterSecretChange: [] (reset to empty)
147+
```
148+
149+
### Step 2: First AnsibleLimit Deployment
150+
```
151+
Deployment: edpm-deployment-compute-0
152+
AnsibleLimit: "edpm-compute-0"
153+
Status: Running → Completed
154+
UpdatedNodesAfterSecretChange: ["edpm-compute-0"]
155+
156+
Finalizer Action: NONE (only 1 of 2 nodes updated)
157+
```
158+
159+
### Step 3: Second AnsibleLimit Deployment
160+
```
161+
Deployment: edpm-deployment-compute-1
162+
AnsibleLimit: "edpm-compute-1"
163+
Status: Running → Completed
164+
UpdatedNodesAfterSecretChange: ["edpm-compute-0", "edpm-compute-1"]
165+
166+
Finalizer Action: EXECUTE
167+
- Add finalizer to user2 (new user)
168+
- Remove finalizer from user1 (old user can now be deleted)
169+
```
170+
171+
## Edge Cases Handled
172+
173+
### 1. Deployment Created Before Secret Change
174+
If a deployment is created at T1, but runs and rotates the secret at T2, it correctly tracks as completing after the secret change.
175+
176+
### 2. Cluster Identification Failure
177+
If transport_url cannot be parsed (missing or malformed), the controller logs a warning and skips finalizer management rather than risking incorrect cleanup.
178+
179+
**Location**: `openstackdataplanenodeset_controller.go:860-866`
180+
181+
### 3. AnsibleLimit Deployments Deleted
182+
If ansibleLimit deployments are deleted after completing, the old deployment (created before secret change) does NOT trigger finalizer management, preventing incorrect state restoration.
183+
184+
### 4. Secret Rotation During Deployment
185+
If the secret changes while a deployment is running, the hash change resets tracking and the deployment must re-run to populate the updated nodes list.
186+
187+
### 5. In-Memory vs Cluster State Race Condition (Fixed)
188+
**Problem**: When a deployment completes, the reconciliation loop:
189+
1. Calls `updateNodeCoverage()` to update `UpdatedNodesAfterSecretChange` in memory
190+
2. Calls `allNodesetsUsingClusterUpdated()` which reads nodesets from the cluster
191+
3. The cluster still has the old status (before the update in step 1)
192+
4. Finalizer management is skipped due to stale data
193+
194+
**Solution**: The controller now uses the in-memory nodeset when checking the current nodeset, avoiding stale cluster reads.
195+
196+
**Location**: `openstackdataplanenodeset_controller.go:897-902`
197+
198+
## Code References
199+
200+
### Controller Logic
201+
- Main reconciliation: `openstackdataplanenodeset_controller.go:380-750`
202+
- Node coverage tracking: `openstackdataplanenodeset_controller.go:750-793`
203+
- Finalizer management: `openstackdataplanenodeset_controller.go:669-719`
204+
- Cross-nodeset validation: `openstackdataplanenodeset_controller.go:795-920`
205+
206+
### RabbitMQ Utilities
207+
- Cluster identification: `rabbitmq.go:278-372`
208+
- Username extraction: `rabbitmq.go:33-122`
209+
- Cell name extraction: `rabbitmq.go:151-202`
210+
- Secret hash computation: `rabbitmq.go:204-276`
211+
212+
## Testing
213+
214+
### Unit Test
215+
- **Test**: "Should correctly count nodes without IP address aliases"
216+
- **Validates**: Node counting excludes hostName and ansibleHost aliases
217+
- **Location**: `test/functional/dataplane/openstackdataplanenodeset_rabbitmq_finalizer_test.go:141-149`
218+
219+
### Integration Testing
220+
Complex deployment workflows require testing in a real cluster environment where multiple controllers (NodeSet, Deployment, Ansibleee) work together.
221+
222+
## Debugging
223+
224+
### Check Node Coverage Status
225+
```bash
226+
kubectl get openstackdataplanenodeset edpm-compute -n openstack -o jsonpath='{.status.updatedNodesAfterSecretChange}'
227+
```
228+
229+
### Check Secret Hash
230+
```bash
231+
kubectl get openstackdataplanenodeset edpm-compute -n openstack -o jsonpath='{.status.novaCellSecretHash}'
232+
```
233+
234+
### Check Deployment Completion Time
235+
```bash
236+
kubectl get openstackdataplanedeployment edpm-deployment -n openstack -o jsonpath='{.status.nodeSetConditions.edpm-compute[?(@.type=="NodeSetDeploymentReady")].lastTransitionTime}'
237+
```
238+
239+
### Check RabbitMQ User Finalizers
240+
```bash
241+
kubectl get rabbitmquser nova-cell1-transport-user1 -n openstack -o jsonpath='{.metadata.finalizers}'
242+
```
243+
244+
## Future Considerations
245+
246+
1. **Metrics**: Add Prometheus metrics for tracking finalizer management events
247+
2. **Events**: Emit Kubernetes events when finalizers are added/removed for better visibility
248+
3. **Status Conditions**: Add dedicated condition type for RabbitMQ user finalizer management
249+
4. **Multi-Cell Support**: Current implementation handles multiple cells through separate nodesets

0 commit comments

Comments
 (0)