Skip to content

Commit 2733dc4

Browse files
Fix race conditions on unmount, and verify volume is actually attached to the correct node (#7)
* Fix unmount race conditions * Return nil in case of failure * return error on publish volume failure * Remove error from unpublish --------- Co-authored-by: Brian Model <brian.model@gmail.com>
1 parent a222b07 commit 2733dc4

1 file changed

Lines changed: 96 additions & 8 deletions

File tree

pkg/driver/controllerserver.go

Lines changed: 96 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -199,15 +199,46 @@ func (cs *controllerServer) ControllerPublishVolume(ctx context.Context, req *cs
199199
return nil, status.Errorf(codes.NotFound, "ControllerPublishVolume: GetVolume returned nil volume")
200200
}
201201
klog.Infof("ControllerPublishVolume: GetVolume succeeded -\nStatus: %s\nName: %s\nID: %d\nSize:%d", *getVolume.Status, *getVolume.Name, *getVolume.Id, *getVolume.Size)
202-
if *getVolume.Status == "in-use" { //Volume is already attached
202+
if *getVolume.Status == "in-use" {
203203
klog.Infof("ControllerPublishVolume: Volume %s is already in use", *getVolume.Name)
204204
if len(*getVolume.Attachments) > 0 {
205-
klog.Infof("ControllerPublishVolume: Volume %s is already attached to node %s", *getVolume.Name, *(*getVolume.Attachments)[0].InstanceId)
206-
return &csi.ControllerPublishVolumeResponse{
207-
PublishContext: map[string]string{
208-
volNameKeyFromControllerPublishVolume: *(*getVolume.Attachments)[0].Device,
209-
},
210-
}, nil
205+
attachedInstanceIdStr := *(*getVolume.Attachments)[0].InstanceId
206+
attachedInstanceId, err := strconv.Atoi(attachedInstanceIdStr)
207+
if err != nil {
208+
klog.Errorf("ControllerPublishVolume: Failed to convert attached instance ID to int: %v", err)
209+
return nil, status.Errorf(codes.Internal, "Failed to convert attached instance ID to int: %v", err)
210+
}
211+
klog.Infof("ControllerPublishVolume: Volume %s is already attached to node %d", *getVolume.Name, attachedInstanceId)
212+
if attachedInstanceId == vmId {
213+
return &csi.ControllerPublishVolumeResponse{
214+
PublishContext: map[string]string{
215+
volNameKeyFromControllerPublishVolume: *(*getVolume.Attachments)[0].Device,
216+
},
217+
}, nil
218+
}
219+
klog.Infof("ControllerPublishVolume: Volume attached to wrong node (attached: %d, requested: %d), detaching", attachedInstanceId, vmId)
220+
_, err = cloud.DetachVolumeFromNode(ctx, attachedInstanceId, volumeIDInt)
221+
if err != nil {
222+
klog.Errorf("ControllerPublishVolume: Failed to detach from wrong node: %v", err)
223+
return nil, status.Errorf(codes.Internal, "ControllerPublishVolume: Failed to detach from wrong node: %v", err)
224+
}
225+
maxAttempts := 30
226+
for i := 0; i < maxAttempts; i++ {
227+
v, err := cloud.GetVolume(ctx, volumeIDInt)
228+
if err != nil || v == nil {
229+
time.Sleep(2 * time.Second)
230+
continue
231+
}
232+
if *v.Status == "available" {
233+
klog.Infof("ControllerPublishVolume: Volume detached from wrong node, now available")
234+
getVolume = v
235+
break
236+
}
237+
time.Sleep(2 * time.Second)
238+
}
239+
if *getVolume.Status != "available" {
240+
return nil, status.Error(codes.DeadlineExceeded, "ControllerPublishVolume: Timeout waiting for detachment from wrong node")
241+
}
211242
}
212243
}
213244
if *getVolume.Status == "available" {
@@ -217,8 +248,42 @@ func (cs *controllerServer) ControllerPublishVolume(ctx context.Context, req *cs
217248
return nil, status.Errorf(codes.Internal, "ControllerPublishVolume: Failed to AttachVolumeToNode: %v", err)
218249
}
219250
klog.Infof("ControllerPublishVolume: AttachVolumeToNode succeeded -\nID: %v\nInstance id ID: %v\nStatus: %v\nVolume ID: %v", *attachVolume.Id, *attachVolume.InstanceId, *attachVolume.Status, *attachVolume.VolumeId)
251+
252+
maxAttempts := 30
253+
klog.Infof("ControllerPublishVolume: Polling for attachment to complete with max attempts: %d", maxAttempts)
254+
for i := 0; i < maxAttempts; i++ {
255+
klog.Infof("ControllerPublishVolume: Polling for attachment %d/%d", i+1, maxAttempts)
256+
v, err := cloud.GetVolume(ctx, volumeIDInt)
257+
if err != nil {
258+
klog.Errorf("ControllerPublishVolume: Failed to GetVolume while polling: %v", err)
259+
time.Sleep(2 * time.Second)
260+
continue
261+
}
262+
if v == nil {
263+
klog.Warningf("ControllerPublishVolume: GetVolume attempt %d returned nil volume", i+1)
264+
time.Sleep(2 * time.Second)
265+
continue
266+
}
267+
if *v.Status == "in-use" && len(*v.Attachments) > 0 {
268+
attachment := (*v.Attachments)[0]
269+
attachedId, err := strconv.Atoi(*attachment.InstanceId)
270+
if err == nil && attachedId == vmId && attachment.Device != nil {
271+
klog.Infof("ControllerPublishVolume: Volume attached to correct node with device %s", *attachment.Device)
272+
return &csi.ControllerPublishVolumeResponse{
273+
PublishContext: map[string]string{
274+
volNameKeyFromControllerPublishVolume: *attachment.Device,
275+
},
276+
}, nil
277+
}
278+
}
279+
time.Sleep(2 * time.Second)
280+
}
281+
return nil, status.Error(codes.DeadlineExceeded, "ControllerPublishVolume: Timeout waiting for attachment to complete")
220282
}
221-
return &csi.ControllerPublishVolumeResponse{}, nil
283+
284+
klog.Errorf("ControllerPublishVolume: Volume %s in unexpected state: %s", *getVolume.Name, *getVolume.Status)
285+
return nil, status.Errorf(codes.FailedPrecondition,
286+
"Volume in unexpected state: %s (expected 'available' or 'in-use')", *getVolume.Status)
222287
}
223288

224289
func (cs *controllerServer) ControllerUnpublishVolume(ctx context.Context, req *csi.ControllerUnpublishVolumeRequest) (*csi.ControllerUnpublishVolumeResponse, error) {
@@ -254,6 +319,29 @@ func (cs *controllerServer) ControllerUnpublishVolume(ctx context.Context, req *
254319
return nil, status.Errorf(codes.Internal, "ControllerUnpublishVolume: Failed to DetachVolumeFromNode: %v", err)
255320
}
256321
klog.Infof("ControllerUnpublishVolume: DetachVolumeFromNode succeeded -\nMessage: %v\nStatus: %v\nVolume Attachments: %v", *detachVolume.Message, *detachVolume.Status, *detachVolume.VolumeAttachments)
322+
323+
maxAttempts := 30
324+
klog.Infof("ControllerUnpublishVolume: Polling for volume to become available with max attempts: %d", maxAttempts)
325+
for i := 0; i < maxAttempts; i++ {
326+
klog.Infof("ControllerUnpublishVolume: Polling for volume to become available %d/%d", i+1, maxAttempts)
327+
v, err := cloud.GetVolume(ctx, volumeIDInt)
328+
if err != nil {
329+
klog.Errorf("ControllerUnpublishVolume: Failed to GetVolume while polling: %v", err)
330+
time.Sleep(2 * time.Second)
331+
continue
332+
}
333+
if v == nil {
334+
klog.Warningf("ControllerUnpublishVolume: GetVolume attempt %d returned nil volume", i+1)
335+
time.Sleep(2 * time.Second)
336+
continue
337+
}
338+
if *v.Status == "available" {
339+
klog.Infof("ControllerUnpublishVolume: Volume is now available after detachment")
340+
return &csi.ControllerUnpublishVolumeResponse{}, nil
341+
}
342+
time.Sleep(2 * time.Second)
343+
}
344+
return nil, status.Error(codes.DeadlineExceeded, "ControllerUnpublishVolume: Timeout waiting for volume to detach")
257345
}
258346
return &csi.ControllerUnpublishVolumeResponse{}, nil
259347
}

0 commit comments

Comments
 (0)