From 714815e540034d978e00b8fbdc9fbc2c64677b72 Mon Sep 17 00:00:00 2001 From: Corey Christous Date: Tue, 23 Jun 2026 11:30:44 -0400 Subject: [PATCH 1/6] fix(vpc): tolerate transient API errors while polling resource status The resource-status poll loops aborted the whole build on a single transient IBM VPC API error (5xx/429 or a network blip), even during long-running (~48 min) bakes. A flaky GetImage during the post-capture "wait for AVAILABLE" poll would fail an otherwise-successful build. Classify poll errors as transient (5xx/429, or no/zero-status response = network-level failure) vs fatal (4xx). Transient errors are retried up to maxConsecutiveTransientPollFailures (5) consecutive times, resetting on any successful poll; fatal errors still abort immediately. The overall StateTimeout remains the outer bound. Applied to all three poll loops, which previously treated any error as fatal: waitForResourceReady/isResourceReady and waitForResourceDown/ isResourceDown (now share a single pollUntil helper), and waitForExportJobToSucceed (the 5-45 min image-export poll). The previously-discarded *core.DetailedResponse is captured to read the status code. A pollInterval seam was added so the loops are unit-testable without real 10s sleeps. Signed-off-by: Corey Christous --- builder/ibmcloud/vpc/client.go | 211 ++++++++------ builder/ibmcloud/vpc/client_test.go | 264 ++++++++++++++++++ builder/ibmcloud/vpc/step_image_export.go | 38 ++- .../ibmcloud/vpc/step_image_export_test.go | 73 +++++ 4 files changed, 503 insertions(+), 83 deletions(-) create mode 100644 builder/ibmcloud/vpc/client_test.go create mode 100644 builder/ibmcloud/vpc/step_image_export_test.go diff --git a/builder/ibmcloud/vpc/client.go b/builder/ibmcloud/vpc/client.go index ec00075..2d4e781 100755 --- a/builder/ibmcloud/vpc/client.go +++ b/builder/ibmcloud/vpc/client.go @@ -1,6 +1,7 @@ package vpc import ( + "errors" "fmt" "log" "net/http" @@ -8,17 +9,78 @@ import ( "os/exec" "time" + "github.com/IBM/go-sdk-core/v5/core" "github.com/IBM/vpc-go-sdk/vpcv1" "github.com/hashicorp/packer-plugin-sdk/multistep" "github.com/hashicorp/packer-plugin-sdk/packer" ) +// maxConsecutiveTransientPollFailures bounds how many consecutive transient +// errors (5xx responses or network-level failures) waitForResourceReady will +// tolerate while polling a resource's status before giving up. This keeps a +// genuinely unhealthy API from spinning until the overall StateTimeout while +// still riding out the occasional blip during a long-running bake. +const maxConsecutiveTransientPollFailures = 5 + +// defaultPollInterval is the wait between status polls when a client does not +// set one explicitly. +const defaultPollInterval = 10 * time.Second + +// transientPollError wraps an error from a single resource-status poll that is +// considered transient (a retryable 5xx/429 response or a network-level +// failure) and therefore safe to retry rather than fail the build outright. +type transientPollError struct { + err error +} + +func (e *transientPollError) Error() string { return e.err.Error() } +func (e *transientPollError) Unwrap() error { return e.err } + +// isTransientPollError reports whether an error returned by an IBM VPC status +// call should be retried. resp may be nil when the request never reached the +// server (network timeout, connection reset, EOF), which is itself transient. +func isTransientPollError(resp *core.DetailedResponse, err error) bool { + if err == nil { + return false + } + if resp != nil && resp.StatusCode != 0 { + switch resp.StatusCode { + case http.StatusInternalServerError, // 500 + http.StatusBadGateway, // 502 + http.StatusServiceUnavailable, // 503 + http.StatusGatewayTimeout, // 504 + http.StatusTooManyRequests: // 429 + return true + default: + // Any other response carrying a status code (e.g. 4xx) is fatal. + return false + } + } + // No HTTP response with a usable status code means the request failed at + // the network level before the server answered. Treat that as transient. + return true +} + +// classifyPollError returns wrapped wrapped in a *transientPollError when the +// underlying failure is retryable, and wrapped unchanged when it is fatal. This +// lets waitForResourceReady distinguish "retry" from "abort the build". +func classifyPollError(resp *core.DetailedResponse, err error, wrapped error) error { + if isTransientPollError(resp, err) { + return &transientPollError{err: wrapped} + } + return wrapped +} + type IBMCloudClient struct { // // The http client for communicating http *http.Client // Credentials IBMApiKey string + + // pollInterval is the wait between resource-status polls. When zero, + // defaultPollInterval is used. Primarily a seam for tests. + pollInterval time.Duration } func (client IBMCloudClient) New(IBMApiKey string) *IBMCloudClient { @@ -33,13 +95,37 @@ func (client IBMCloudClient) New(IBMApiKey string) *IBMCloudClient { } func (client IBMCloudClient) waitForResourceReady(resourceID string, resourceType string, timeout time.Duration, state multistep.StateBag) error { + return client.pollUntil(resourceID, resourceType, "ready", timeout, state, client.isResourceReady) +} + +// pollUntil repeatedly invokes check until it reports the resource has reached +// its goal state, the timeout elapses, or check returns a fatal (non-transient) +// error. Transient errors (5xx/429 responses or network blips) are retried up to +// maxConsecutiveTransientPollFailures consecutive times so a single flaky API +// response doesn't abort an otherwise-healthy, long-running build; the streak +// resets on any successful poll. goal is used only in log/timeout messages +// (e.g. "ready", "stopped"). +func (client IBMCloudClient) pollUntil( + resourceID string, + resourceType string, + goal string, + timeout time.Duration, + state multistep.StateBag, + check func(resourceID string, resourceType string, state multistep.StateBag) (bool, error), +) error { ui := state.Get("ui").(packer.Ui) done := make(chan struct{}) defer close(done) result := make(chan error, 1) + interval := client.pollInterval + if interval <= 0 { + interval = defaultPollInterval + } + go func() { attempts := 0 + consecutiveTransientFailures := 0 for { attempts += 1 if attempts%6 == 0 { @@ -49,20 +135,36 @@ func (client IBMCloudClient) waitForResourceReady(resourceID string, resourceTyp } log.Printf("Checking resource state... (attempt: %d)", attempts) - isReady, err := client.isResourceReady(resourceID, resourceType, state) + reached, err := check(resourceID, resourceType, state) if err != nil { - result <- err - return - } - - if isReady { - result <- nil - return + var transient *transientPollError + if errors.As(err, &transient) { + consecutiveTransientFailures++ + if consecutiveTransientFailures > maxConsecutiveTransientPollFailures { + result <- fmt.Errorf("giving up after %d consecutive transient errors polling %s status: %w", + consecutiveTransientFailures, resourceType, err) + return + } + ui.Say(fmt.Sprintf("Transient error polling %s status (%d/%d), retrying: %s", + resourceType, consecutiveTransientFailures, maxConsecutiveTransientPollFailures, err)) + log.Printf("transient error polling %s status (attempt %d, consecutive %d/%d): %s", + resourceType, attempts, consecutiveTransientFailures, maxConsecutiveTransientPollFailures, err) + } else { + result <- err + return + } + } else { + // A successful poll clears the transient-failure streak. + consecutiveTransientFailures = 0 + if reached { + result <- nil + return + } } - // Wait 10 seconds in between - time.Sleep(10 * time.Second) + // Wait in between polls. + time.Sleep(interval) // Verify we shouldn't exit select { @@ -75,12 +177,12 @@ func (client IBMCloudClient) waitForResourceReady(resourceID string, resourceTyp } }() - log.Printf("Waiting for up to %d seconds for resource to become ready", timeout/time.Second) + log.Printf("Waiting for up to %d seconds for resource to become %s", timeout/time.Second, goal) select { case err := <-result: return err case <-time.After(timeout): - err := fmt.Errorf("timeout while waiting to for the resource to become ready") + err := fmt.Errorf("timeout while waiting for the resource to become %s", goal) return err } } @@ -94,10 +196,10 @@ func (client IBMCloudClient) isResourceReady(resourceID string, resourceType str if resourceType == "instances" { options := vpcService.NewGetInstanceOptions(resourceID) - instance, _, err := vpcService.GetInstance(options) + instance, resp, err := vpcService.GetInstance(options) if err != nil { - err := fmt.Errorf("[ERROR] Error occurred while getting instance information. Error: %s", err) - return false, err + wrapped := fmt.Errorf("[ERROR] Error occurred while getting instance information. Error: %s", err) + return false, classifyPollError(resp, err, wrapped) } status := *instance.Status if status == "failed" { @@ -108,30 +210,30 @@ func (client IBMCloudClient) isResourceReady(resourceID string, resourceType str return ready, err } else if resourceType == "floating_ips" { options := vpcService.NewGetFloatingIPOptions(resourceID) - floatingIP, _, err := vpcService.GetFloatingIP(options) + floatingIP, resp, err := vpcService.GetFloatingIP(options) if err != nil { - err := fmt.Errorf("[ERROR] Error occurred while getting floating ip information. Error: %s", err) - return false, err + wrapped := fmt.Errorf("[ERROR] Error occurred while getting floating ip information. Error: %s", err) + return false, classifyPollError(resp, err, wrapped) } status := *floatingIP.Status ready = status == "available" return ready, err } else if resourceType == "subnets" { options := vpcService.NewGetSubnetOptions(resourceID) - subnet, _, err := vpcService.GetSubnet(options) + subnet, resp, err := vpcService.GetSubnet(options) if err != nil { - err := fmt.Errorf("[ERROR] Error occurred while getting subnet information. Error: %s", err) - return false, err + wrapped := fmt.Errorf("[ERROR] Error occurred while getting subnet information. Error: %s", err) + return false, classifyPollError(resp, err, wrapped) } status := *subnet.Status ready = status == "available" return ready, err } else if resourceType == "images" { options := vpcService.NewGetImageOptions(resourceID) - image, _, err := vpcService.GetImage(options) + image, resp, err := vpcService.GetImage(options) if err != nil { - err := fmt.Errorf("[ERROR] Error occurred while getting image information. Error: %s", err) - return false, err + wrapped := fmt.Errorf("[ERROR] Error occurred while getting image information. Error: %s", err) + return false, classifyPollError(resp, err, wrapped) } status := *image.Status ready = status == "available" @@ -144,56 +246,7 @@ func (client IBMCloudClient) isResourceReady(resourceID string, resourceType str } func (client IBMCloudClient) waitForResourceDown(resourceID string, resourceType string, timeout time.Duration, state multistep.StateBag) error { - ui := state.Get("ui").(packer.Ui) - done := make(chan struct{}) - defer close(done) - result := make(chan error, 1) - - go func() { - attempts := 0 - for { - attempts += 1 - if attempts%6 == 0 { - ui.Say(fmt.Sprintf("Waiting time: %d minutes", attempts/6)) - } else { - ui.Say(".") - } - - log.Printf("Checking resource state... (attempt: %d)", attempts) - isDown, err := client.isResourceDown(resourceID, resourceType, state) - - if err != nil { - result <- err - return - } - - if isDown { - result <- nil - return - } - - // Wait 10 seconds in between - time.Sleep(10 * time.Second) - - // Verify we shouldn't exit - select { - case <-done: - // We finished, so just exit the go routine - return - default: - // Keep going - } - } - }() - - log.Printf("Waiting for up to %d seconds for resource to be stopped", timeout/time.Second) - select { - case err := <-result: - return err - case <-time.After(timeout): - err := fmt.Errorf("timeout while waiting to for the resource to be stopped") - return err - } + return client.pollUntil(resourceID, resourceType, "stopped", timeout, state, client.isResourceDown) } func (client IBMCloudClient) isResourceDown(resourceID string, resourceType string, state multistep.StateBag) (bool, error) { @@ -207,12 +260,12 @@ func (client IBMCloudClient) isResourceDown(resourceID string, resourceType stri if resourceType == "instances" { options := &vpcv1.GetInstanceOptions{} options.SetID(resourceID) - instance, _, err := vpcService.GetInstance(options) + instance, resp, err := vpcService.GetInstance(options) if err != nil { - err := fmt.Errorf("[ERROR] Failed retrieving resource information. Error: %s", err) - ui.Error(err.Error()) - log.Println(err.Error()) - return false, err + wrapped := fmt.Errorf("[ERROR] Failed retrieving resource information. Error: %s", err) + ui.Error(wrapped.Error()) + log.Println(wrapped.Error()) + return false, classifyPollError(resp, err, wrapped) } status := *instance.Status down = status == "stopped" diff --git a/builder/ibmcloud/vpc/client_test.go b/builder/ibmcloud/vpc/client_test.go new file mode 100644 index 0000000..9eb886e --- /dev/null +++ b/builder/ibmcloud/vpc/client_test.go @@ -0,0 +1,264 @@ +package vpc + +import ( + "errors" + "net/http" + "net/http/httptest" + "strings" + "sync/atomic" + "testing" + "time" + + "github.com/IBM/go-sdk-core/v5/core" + "github.com/IBM/vpc-go-sdk/vpcv1" + "github.com/hashicorp/packer-plugin-sdk/multistep" + "github.com/hashicorp/packer-plugin-sdk/packer" +) + +// newTestVpcService builds a vpcv1.VpcV1 pointed at an httptest server with a no-op +// authenticator so the resource-polling helpers can be exercised end to end. +func newTestVpcService(t *testing.T, url string) *vpcv1.VpcV1 { + t.Helper() + svc, err := vpcv1.NewVpcV1(&vpcv1.VpcV1Options{ + Authenticator: stubAuthenticator{}, + URL: url, + }) + if err != nil { + t.Fatalf("failed to build test vpc service: %s", err) + } + return svc +} + +func TestIsTransientPollError(t *testing.T) { + tests := []struct { + name string + resp *core.DetailedResponse + err error + want bool + }{ + {name: "no error is not transient", resp: &core.DetailedResponse{StatusCode: 200}, err: nil, want: false}, + {name: "500 is transient", resp: &core.DetailedResponse{StatusCode: 500}, err: errors.New("boom"), want: true}, + {name: "502 is transient", resp: &core.DetailedResponse{StatusCode: 502}, err: errors.New("bad gateway"), want: true}, + {name: "503 is transient", resp: &core.DetailedResponse{StatusCode: 503}, err: errors.New("unavailable"), want: true}, + {name: "504 is transient", resp: &core.DetailedResponse{StatusCode: 504}, err: errors.New("timeout"), want: true}, + {name: "429 is transient", resp: &core.DetailedResponse{StatusCode: 429}, err: errors.New("rate limited"), want: true}, + {name: "404 is fatal", resp: &core.DetailedResponse{StatusCode: 404}, err: errors.New("not found"), want: false}, + {name: "401 is fatal", resp: &core.DetailedResponse{StatusCode: 401}, err: errors.New("unauthorized"), want: false}, + {name: "403 is fatal", resp: &core.DetailedResponse{StatusCode: 403}, err: errors.New("forbidden"), want: false}, + {name: "400 is fatal", resp: &core.DetailedResponse{StatusCode: 400}, err: errors.New("bad request"), want: false}, + {name: "network error with no response is transient", resp: nil, err: errors.New("connection reset by peer"), want: true}, + {name: "response with no status code is transient", resp: &core.DetailedResponse{StatusCode: 0}, err: errors.New("eof"), want: true}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := isTransientPollError(tt.resp, tt.err); got != tt.want { + t.Fatalf("isTransientPollError() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestIsResourceReadyImageClassification(t *testing.T) { + tests := []struct { + name string + status int + body string + wantReady bool + wantErr bool + wantTransient bool + }{ + {name: "available image is ready", status: http.StatusOK, body: `{"id":"img-1","status":"available"}`, wantReady: true}, + {name: "pending image is not ready", status: http.StatusOK, body: `{"id":"img-1","status":"pending"}`, wantReady: false}, + {name: "failed image is an error", status: http.StatusOK, body: `{"id":"img-1","status":"failed"}`, wantReady: false, wantErr: true}, + {name: "502 is a transient error", status: http.StatusBadGateway, body: `{}`, wantErr: true, wantTransient: true}, + {name: "503 is a transient error", status: http.StatusServiceUnavailable, body: `{}`, wantErr: true, wantTransient: true}, + {name: "404 is a fatal error", status: http.StatusNotFound, body: `{}`, wantErr: true, wantTransient: false}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(tt.status) + _, _ = w.Write([]byte(tt.body)) + })) + defer srv.Close() + + state := new(multistep.BasicStateBag) + state.Put("vpcService", newTestVpcService(t, srv.URL)) + client := IBMCloudClient{} + + ready, err := client.isResourceReady("img-1", "images", state) + + if ready != tt.wantReady { + t.Errorf("ready = %v, want %v", ready, tt.wantReady) + } + if (err != nil) != tt.wantErr { + t.Fatalf("err = %v, wantErr = %v", err, tt.wantErr) + } + var tpe *transientPollError + if got := errors.As(err, &tpe); got != tt.wantTransient { + t.Errorf("errors.As transientPollError = %v, want %v (err: %v)", got, tt.wantTransient, err) + } + }) + } +} + +func TestWaitForResourceReadyToleratesTransientFailures(t *testing.T) { + var calls int32 + // Return three transient 502s, then report the image as available. + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + if atomic.AddInt32(&calls, 1) <= 3 { + w.WriteHeader(http.StatusBadGateway) + _, _ = w.Write([]byte(`{}`)) + return + } + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte(`{"id":"img-1","status":"available"}`)) + })) + defer srv.Close() + + state := new(multistep.BasicStateBag) + state.Put("ui", packer.TestUi(t)) + state.Put("vpcService", newTestVpcService(t, srv.URL)) + client := IBMCloudClient{pollInterval: time.Millisecond} + + if err := client.waitForResourceReady("img-1", "images", 30*time.Second, state); err != nil { + t.Fatalf("waitForResourceReady returned error after transient failures: %s", err) + } +} + +func TestWaitForResourceReadyGivesUpAfterTooManyTransientFailures(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusBadGateway) + _, _ = w.Write([]byte(`{}`)) + })) + defer srv.Close() + + state := new(multistep.BasicStateBag) + state.Put("ui", packer.TestUi(t)) + state.Put("vpcService", newTestVpcService(t, srv.URL)) + client := IBMCloudClient{pollInterval: time.Millisecond} + + err := client.waitForResourceReady("img-1", "images", 30*time.Second, state) + if err == nil { + t.Fatal("expected an error after exceeding the consecutive transient failure cap") + } + if !strings.Contains(err.Error(), "transient") { + t.Errorf("error should mention transient failures, got: %s", err) + } +} + +func TestIsResourceDownClassification(t *testing.T) { + tests := []struct { + name string + status int + body string + wantDown bool + wantErr bool + wantTransient bool + }{ + {name: "stopped instance is down", status: http.StatusOK, body: `{"id":"i-1","status":"stopped"}`, wantDown: true}, + {name: "running instance is not down", status: http.StatusOK, body: `{"id":"i-1","status":"running"}`, wantDown: false}, + {name: "502 is a transient error", status: http.StatusBadGateway, body: `{}`, wantErr: true, wantTransient: true}, + {name: "404 is a fatal error", status: http.StatusNotFound, body: `{}`, wantErr: true, wantTransient: false}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(tt.status) + _, _ = w.Write([]byte(tt.body)) + })) + defer srv.Close() + + state := new(multistep.BasicStateBag) + state.Put("ui", packer.TestUi(t)) + state.Put("vpcService", newTestVpcService(t, srv.URL)) + client := IBMCloudClient{} + + down, err := client.isResourceDown("i-1", "instances", state) + + if down != tt.wantDown { + t.Errorf("down = %v, want %v", down, tt.wantDown) + } + if (err != nil) != tt.wantErr { + t.Fatalf("err = %v, wantErr = %v", err, tt.wantErr) + } + var tpe *transientPollError + if got := errors.As(err, &tpe); got != tt.wantTransient { + t.Errorf("errors.As transientPollError = %v, want %v (err: %v)", got, tt.wantTransient, err) + } + }) + } +} + +func TestWaitForResourceDownToleratesTransientFailures(t *testing.T) { + var calls int32 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + if atomic.AddInt32(&calls, 1) <= 3 { + w.WriteHeader(http.StatusServiceUnavailable) + _, _ = w.Write([]byte(`{}`)) + return + } + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte(`{"id":"i-1","status":"stopped"}`)) + })) + defer srv.Close() + + state := new(multistep.BasicStateBag) + state.Put("ui", packer.TestUi(t)) + state.Put("vpcService", newTestVpcService(t, srv.URL)) + client := IBMCloudClient{pollInterval: time.Millisecond} + + if err := client.waitForResourceDown("i-1", "instances", 30*time.Second, state); err != nil { + t.Fatalf("waitForResourceDown returned error after transient failures: %s", err) + } +} + +func TestWaitForResourceDownGivesUpAfterTooManyTransientFailures(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusBadGateway) + _, _ = w.Write([]byte(`{}`)) + })) + defer srv.Close() + + state := new(multistep.BasicStateBag) + state.Put("ui", packer.TestUi(t)) + state.Put("vpcService", newTestVpcService(t, srv.URL)) + client := IBMCloudClient{pollInterval: time.Millisecond} + + err := client.waitForResourceDown("i-1", "instances", 30*time.Second, state) + if err == nil { + t.Fatal("expected an error after exceeding the consecutive transient failure cap") + } + if !strings.Contains(err.Error(), "transient") { + t.Errorf("error should mention transient failures, got: %s", err) + } +} + +func TestWaitForResourceReadyFatalErrorFailsImmediately(t *testing.T) { + var calls int32 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + atomic.AddInt32(&calls, 1) + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusNotFound) + _, _ = w.Write([]byte(`{}`)) + })) + defer srv.Close() + + state := new(multistep.BasicStateBag) + state.Put("ui", packer.TestUi(t)) + state.Put("vpcService", newTestVpcService(t, srv.URL)) + client := IBMCloudClient{pollInterval: time.Millisecond} + + if err := client.waitForResourceReady("img-1", "images", 30*time.Second, state); err == nil { + t.Fatal("expected a fatal error for a 404 response") + } + if got := atomic.LoadInt32(&calls); got != 1 { + t.Errorf("expected exactly one poll for a fatal error, got %d", got) + } +} diff --git a/builder/ibmcloud/vpc/step_image_export.go b/builder/ibmcloud/vpc/step_image_export.go index 2591c95..9323e06 100644 --- a/builder/ibmcloud/vpc/step_image_export.go +++ b/builder/ibmcloud/vpc/step_image_export.go @@ -59,7 +59,7 @@ func (step *StepImageExport) Run(_ context.Context, state multistep.StateBag) mu } jobId := *imageExportJob.ID ui.Say("Waiting for the Export image to SUCCEED...") - err3 := waitForExportJobToSucceed(config.ImageID, jobId, vpcService, exportTimeout, state) + err3 := waitForExportJobToSucceed(config.ImageID, jobId, vpcService, exportTimeout, 0, state) if err3 != nil { err := fmt.Errorf("[ERROR] Error waiting for the Image export job to succeed: %s", err3) state.Put("error", err) @@ -75,7 +75,7 @@ func (step *StepImageExport) Run(_ context.Context, state multistep.StateBag) mu return multistep.ActionContinue } -func waitForExportJobToSucceed(imageId, exportJobId string, vpcService *vpcv1.VpcV1, timeout time.Duration, state multistep.StateBag) error { +func waitForExportJobToSucceed(imageId, exportJobId string, vpcService *vpcv1.VpcV1, timeout time.Duration, pollInterval time.Duration, state multistep.StateBag) error { ui := state.Get("ui").(packer.Ui) done := make(chan struct{}) defer close(done) @@ -88,6 +88,11 @@ func waitForExportJobToSucceed(imageId, exportJobId string, vpcService *vpcv1.Vp ui.Say(fmt.Sprintf("Using %v timeout for image export", timeout)) } + interval := pollInterval + if interval <= 0 { + interval = defaultPollInterval + } + options := vpcv1.GetImageExportJobOptions{ ImageID: &imageId, ID: &exportJobId, @@ -95,6 +100,7 @@ func waitForExportJobToSucceed(imageId, exportJobId string, vpcService *vpcv1.Vp go func() { attempts := 0 + consecutiveTransientFailures := 0 for { attempts += 1 if attempts%6 == 0 { @@ -104,14 +110,38 @@ func waitForExportJobToSucceed(imageId, exportJobId string, vpcService *vpcv1.Vp } log.Printf("Checking export job status ... (attempt: %d)", attempts) - expJob, _, err := vpcService.GetImageExportJob(&options) + expJob, resp, err := vpcService.GetImageExportJob(&options) if err != nil { + // Export jobs run for many minutes, so ride out a transient + // 5xx/429 or network blip rather than failing the whole export. + if isTransientPollError(resp, err) { + consecutiveTransientFailures++ + if consecutiveTransientFailures > maxConsecutiveTransientPollFailures { + result <- fmt.Errorf("giving up after %d consecutive transient errors polling export job status: %w", + consecutiveTransientFailures, err) + return + } + ui.Say(fmt.Sprintf("Transient error polling export job status (%d/%d), retrying: %v", + consecutiveTransientFailures, maxConsecutiveTransientPollFailures, err)) + log.Printf("transient error polling export job status (attempt %d, consecutive %d/%d): %v", + attempts, consecutiveTransientFailures, maxConsecutiveTransientPollFailures, err) + time.Sleep(interval) + select { + case <-done: + return + default: + } + continue + } ui.Say(fmt.Sprintf("Error fetching image export job: %v", err)) result <- err return } + // A successful poll clears the transient-failure streak. + consecutiveTransientFailures = 0 + if expJob.Status != nil { log.Printf("Export job status: %s", *expJob.Status) if attempts%6 == 0 { // Every 1 minutes @@ -130,7 +160,7 @@ func waitForExportJobToSucceed(imageId, exportJobId string, vpcService *vpcv1.Vp return } - time.Sleep(10 * time.Second) + time.Sleep(interval) select { case <-done: diff --git a/builder/ibmcloud/vpc/step_image_export_test.go b/builder/ibmcloud/vpc/step_image_export_test.go new file mode 100644 index 0000000..c05d6bc --- /dev/null +++ b/builder/ibmcloud/vpc/step_image_export_test.go @@ -0,0 +1,73 @@ +package vpc + +import ( + "net/http" + "net/http/httptest" + "strings" + "sync/atomic" + "testing" + "time" + + "github.com/hashicorp/packer-plugin-sdk/multistep" + "github.com/hashicorp/packer-plugin-sdk/packer" +) + +func TestWaitForExportJobToleratesTransientFailures(t *testing.T) { + var calls int32 + // Three transient 502s, then the export job reports success. + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + if atomic.AddInt32(&calls, 1) <= 3 { + w.WriteHeader(http.StatusBadGateway) + _, _ = w.Write([]byte(`{}`)) + return + } + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte(`{"status":"succeeded"}`)) + })) + defer srv.Close() + + state := new(multistep.BasicStateBag) + state.Put("ui", packer.TestUi(t)) + + err := waitForExportJobToSucceed("img-1", "job-1", newTestVpcService(t, srv.URL), 30*time.Second, time.Millisecond, state) + if err != nil { + t.Fatalf("waitForExportJobToSucceed returned error after transient failures: %s", err) + } +} + +func TestWaitForExportJobGivesUpAfterTooManyTransientFailures(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusBadGateway) + _, _ = w.Write([]byte(`{}`)) + })) + defer srv.Close() + + state := new(multistep.BasicStateBag) + state.Put("ui", packer.TestUi(t)) + + err := waitForExportJobToSucceed("img-1", "job-1", newTestVpcService(t, srv.URL), 30*time.Second, time.Millisecond, state) + if err == nil { + t.Fatal("expected an error after exceeding the consecutive transient failure cap") + } + if !strings.Contains(err.Error(), "transient") { + t.Errorf("error should mention transient failures, got: %s", err) + } +} + +func TestWaitForExportJobFatalErrorFailsImmediately(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusNotFound) + _, _ = w.Write([]byte(`{}`)) + })) + defer srv.Close() + + state := new(multistep.BasicStateBag) + state.Put("ui", packer.TestUi(t)) + + if err := waitForExportJobToSucceed("img-1", "job-1", newTestVpcService(t, srv.URL), 30*time.Second, time.Millisecond, state); err == nil { + t.Fatal("expected a fatal error for a 404 response") + } +} From c53378ace9666b5b65670c38f6ea4abb4fcb47aa Mon Sep 17 00:00:00 2001 From: Corey Christous Date: Tue, 23 Jun 2026 11:36:05 -0400 Subject: [PATCH 2/6] fix(vpc): address review feedback on transient-poll tolerance - Drop ui.Error/log.Println from isResourceDown's error path so a retried transient blip no longer prints a misleading [ERROR] line before the "retrying" message (matches isResourceReady, which only returns the error). - Extract sleepOrDone helper to remove the duplicated sleep+select between pollUntil and the export-job loop. - Generalize the maxConsecutiveTransientPollFailures/classifyPollError doc comments (they no longer name waitForResourceReady specifically; mention 429) and fix the "wrapped wrapped" typo. - Align the export loop's error formatting to %s. - Add tests: streak-reset (5 transient -> success -> 5 transient -> ready, pinning both the cap at 5 and reset-on-success) and transient/fatal classification for the floating_ips and subnets branches. Signed-off-by: Corey Christous --- builder/ibmcloud/vpc/client.go | 43 ++++++++------ builder/ibmcloud/vpc/client_test.go | 72 +++++++++++++++++++++++ builder/ibmcloud/vpc/step_image_export.go | 18 ++---- 3 files changed, 101 insertions(+), 32 deletions(-) diff --git a/builder/ibmcloud/vpc/client.go b/builder/ibmcloud/vpc/client.go index 2d4e781..6974ed4 100755 --- a/builder/ibmcloud/vpc/client.go +++ b/builder/ibmcloud/vpc/client.go @@ -16,13 +16,14 @@ import ( ) // maxConsecutiveTransientPollFailures bounds how many consecutive transient -// errors (5xx responses or network-level failures) waitForResourceReady will -// tolerate while polling a resource's status before giving up. This keeps a -// genuinely unhealthy API from spinning until the overall StateTimeout while -// still riding out the occasional blip during a long-running bake. +// errors (5xx/429 responses or network-level failures) a status-poll loop will +// tolerate before giving up. This keeps a genuinely unhealthy API from spinning +// until the overall StateTimeout while still riding out the occasional blip +// during a long-running bake. It applies to every poll loop in this package +// (pollUntil and the image-export poll). const maxConsecutiveTransientPollFailures = 5 -// defaultPollInterval is the wait between status polls when a client does not +// defaultPollInterval is the wait between status polls when a caller does not // set one explicitly. const defaultPollInterval = 10 * time.Second @@ -61,9 +62,9 @@ func isTransientPollError(resp *core.DetailedResponse, err error) bool { return true } -// classifyPollError returns wrapped wrapped in a *transientPollError when the +// classifyPollError returns wrapped enclosed in a *transientPollError when the // underlying failure is retryable, and wrapped unchanged when it is fatal. This -// lets waitForResourceReady distinguish "retry" from "abort the build". +// lets the poll loops distinguish "retry" from "abort the build". func classifyPollError(resp *core.DetailedResponse, err error, wrapped error) error { if isTransientPollError(resp, err) { return &transientPollError{err: wrapped} @@ -71,6 +72,18 @@ func classifyPollError(resp *core.DetailedResponse, err error, wrapped error) er return wrapped } +// sleepOrDone waits interval between polls and then reports whether the poll +// goroutine should stop because its parent has already returned (done closed). +func sleepOrDone(interval time.Duration, done <-chan struct{}) (stop bool) { + time.Sleep(interval) + select { + case <-done: + return true + default: + return false + } +} + type IBMCloudClient struct { // // The http client for communicating http *http.Client @@ -163,16 +176,8 @@ func (client IBMCloudClient) pollUntil( } } - // Wait in between polls. - time.Sleep(interval) - - // Verify we shouldn't exit - select { - case <-done: - // We finished, so just exit the go routine + if sleepOrDone(interval, done) { return - default: - // Keep going } } }() @@ -252,7 +257,6 @@ func (client IBMCloudClient) waitForResourceDown(resourceID string, resourceType func (client IBMCloudClient) isResourceDown(resourceID string, resourceType string, state multistep.StateBag) (bool, error) { var down bool - ui := state.Get("ui").(packer.Ui) var vpcService *vpcv1.VpcV1 if state.Get("vpcService") != nil { vpcService = state.Get("vpcService").(*vpcv1.VpcV1) @@ -262,9 +266,10 @@ func (client IBMCloudClient) isResourceDown(resourceID string, resourceType stri options.SetID(resourceID) instance, resp, err := vpcService.GetInstance(options) if err != nil { + // Return the classified error and let pollUntil decide whether to + // retry (transient) or surface it; don't ui.Error/log here, since a + // retried transient blip shouldn't print a scary [ERROR] line. wrapped := fmt.Errorf("[ERROR] Failed retrieving resource information. Error: %s", err) - ui.Error(wrapped.Error()) - log.Println(wrapped.Error()) return false, classifyPollError(resp, err, wrapped) } status := *instance.Status diff --git a/builder/ibmcloud/vpc/client_test.go b/builder/ibmcloud/vpc/client_test.go index 9eb886e..276c54f 100644 --- a/builder/ibmcloud/vpc/client_test.go +++ b/builder/ibmcloud/vpc/client_test.go @@ -150,6 +150,78 @@ func TestWaitForResourceReadyGivesUpAfterTooManyTransientFailures(t *testing.T) } } +func TestWaitForResourceReadyResetsTransientStreak(t *testing.T) { + // 5x502 (the full tolerated streak), then a successful-but-not-ready poll + // that resets the counter, then 5x502 again, then available. Ten transient + // errors total but never 6 in a row, so the build must NOT abort. This pins + // both the cap (5 consecutive tolerated) and the reset-on-success behavior. + var calls int32 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + n := atomic.AddInt32(&calls, 1) + switch { + case n <= 5: // first transient burst (5 tolerated) + w.WriteHeader(http.StatusBadGateway) + _, _ = w.Write([]byte(`{}`)) + case n == 6: // successful poll, not ready yet -> resets the streak + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte(`{"id":"img-1","status":"pending"}`)) + case n <= 11: // second transient burst (5 more) + w.WriteHeader(http.StatusBadGateway) + _, _ = w.Write([]byte(`{}`)) + default: + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte(`{"id":"img-1","status":"available"}`)) + } + })) + defer srv.Close() + + state := new(multistep.BasicStateBag) + state.Put("ui", packer.TestUi(t)) + state.Put("vpcService", newTestVpcService(t, srv.URL)) + client := IBMCloudClient{pollInterval: time.Millisecond} + + if err := client.waitForResourceReady("img-1", "images", 30*time.Second, state); err != nil { + t.Fatalf("waitForResourceReady aborted despite the transient streak resetting: %s", err) + } +} + +func TestIsResourceReadyTransientClassificationByType(t *testing.T) { + for _, resourceType := range []string{"floating_ips", "subnets"} { + tests := []struct { + name string + status int + wantTransient bool + }{ + {name: "502 is transient", status: http.StatusBadGateway, wantTransient: true}, + {name: "404 is fatal", status: http.StatusNotFound, wantTransient: false}, + } + for _, tt := range tests { + t.Run(resourceType+" "+tt.name, func(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(tt.status) + _, _ = w.Write([]byte(`{}`)) + })) + defer srv.Close() + + state := new(multistep.BasicStateBag) + state.Put("vpcService", newTestVpcService(t, srv.URL)) + client := IBMCloudClient{} + + _, err := client.isResourceReady("r-1", resourceType, state) + if err == nil { + t.Fatalf("expected an error for status %d", tt.status) + } + var tpe *transientPollError + if got := errors.As(err, &tpe); got != tt.wantTransient { + t.Errorf("errors.As transientPollError = %v, want %v (err: %v)", got, tt.wantTransient, err) + } + }) + } + } +} + func TestIsResourceDownClassification(t *testing.T) { tests := []struct { name string diff --git a/builder/ibmcloud/vpc/step_image_export.go b/builder/ibmcloud/vpc/step_image_export.go index 9323e06..6346ca0 100644 --- a/builder/ibmcloud/vpc/step_image_export.go +++ b/builder/ibmcloud/vpc/step_image_export.go @@ -122,19 +122,16 @@ func waitForExportJobToSucceed(imageId, exportJobId string, vpcService *vpcv1.Vp consecutiveTransientFailures, err) return } - ui.Say(fmt.Sprintf("Transient error polling export job status (%d/%d), retrying: %v", + ui.Say(fmt.Sprintf("Transient error polling export job status (%d/%d), retrying: %s", consecutiveTransientFailures, maxConsecutiveTransientPollFailures, err)) - log.Printf("transient error polling export job status (attempt %d, consecutive %d/%d): %v", + log.Printf("transient error polling export job status (attempt %d, consecutive %d/%d): %s", attempts, consecutiveTransientFailures, maxConsecutiveTransientPollFailures, err) - time.Sleep(interval) - select { - case <-done: + if sleepOrDone(interval, done) { return - default: } continue } - ui.Say(fmt.Sprintf("Error fetching image export job: %v", err)) + ui.Say(fmt.Sprintf("Error fetching image export job: %s", err)) result <- err return } @@ -160,13 +157,8 @@ func waitForExportJobToSucceed(imageId, exportJobId string, vpcService *vpcv1.Vp return } - time.Sleep(interval) - - select { - case <-done: + if sleepOrDone(interval, done) { return - default: - // Keep going } } }() From 621ba8ed6a27a0107b6d48179231a22246fb02c3 Mon Sep 17 00:00:00 2001 From: Corey Christous Date: Fri, 26 Jun 2026 10:24:10 -0400 Subject: [PATCH 3/6] refactor(vpc): tolerate transient API errors via SDK EnableRetries Replace the hand-rolled poll-loop transient classifier with the IBM Cloud SDK's built-in request retries (go-sdk-core EnableRetries), enabled once at VPC service construction (StepCreateVPCServiceInstance, used by both the builder and the export post-processor). The SDK retries 429/5xx/network errors, honors Retry-After, and backs off exponentially, covering every VPC call -- one-shot creates (CreateKey, CreateInstance, ...) and status polls alike -- so a single transient 502 no longer aborts a bake. Removes the now-redundant transientPollError / classifyPollError / isTransientPollError / maxConsecutiveTransientPollFailures machinery and the consecutive-failure counters in pollUntil and waitForExportJobToSucceed; those loops now just re-check status on the configured cadence and surface any genuine error. pollUntil's generalization (shared check func) is kept. Tests now exercise the SDK retry behavior end-to-end (502->success, give-up after the configured cap, fatal 4xx fails fast). Signed-off-by: Corey Christous --- builder/ibmcloud/vpc/client.go | 145 ++------- builder/ibmcloud/vpc/client_test.go | 283 +++--------------- .../vpc/step_create_vpc_service_instance.go | 14 + builder/ibmcloud/vpc/step_image_export.go | 27 +- .../ibmcloud/vpc/step_image_export_test.go | 38 ++- 5 files changed, 107 insertions(+), 400 deletions(-) diff --git a/builder/ibmcloud/vpc/client.go b/builder/ibmcloud/vpc/client.go index 6974ed4..5dab51c 100755 --- a/builder/ibmcloud/vpc/client.go +++ b/builder/ibmcloud/vpc/client.go @@ -1,7 +1,6 @@ package vpc import ( - "errors" "fmt" "log" "net/http" @@ -9,69 +8,17 @@ import ( "os/exec" "time" - "github.com/IBM/go-sdk-core/v5/core" "github.com/IBM/vpc-go-sdk/vpcv1" "github.com/hashicorp/packer-plugin-sdk/multistep" "github.com/hashicorp/packer-plugin-sdk/packer" ) -// maxConsecutiveTransientPollFailures bounds how many consecutive transient -// errors (5xx/429 responses or network-level failures) a status-poll loop will -// tolerate before giving up. This keeps a genuinely unhealthy API from spinning -// until the overall StateTimeout while still riding out the occasional blip -// during a long-running bake. It applies to every poll loop in this package -// (pollUntil and the image-export poll). -const maxConsecutiveTransientPollFailures = 5 - // defaultPollInterval is the wait between status polls when a caller does not -// set one explicitly. +// set one explicitly. Transient API errors are handled by the SDK's request +// retries (see vpcRetryMaxAttempts); this interval is purely the cadence at +// which we re-check a resource's status while waiting for it to reach its goal. const defaultPollInterval = 10 * time.Second -// transientPollError wraps an error from a single resource-status poll that is -// considered transient (a retryable 5xx/429 response or a network-level -// failure) and therefore safe to retry rather than fail the build outright. -type transientPollError struct { - err error -} - -func (e *transientPollError) Error() string { return e.err.Error() } -func (e *transientPollError) Unwrap() error { return e.err } - -// isTransientPollError reports whether an error returned by an IBM VPC status -// call should be retried. resp may be nil when the request never reached the -// server (network timeout, connection reset, EOF), which is itself transient. -func isTransientPollError(resp *core.DetailedResponse, err error) bool { - if err == nil { - return false - } - if resp != nil && resp.StatusCode != 0 { - switch resp.StatusCode { - case http.StatusInternalServerError, // 500 - http.StatusBadGateway, // 502 - http.StatusServiceUnavailable, // 503 - http.StatusGatewayTimeout, // 504 - http.StatusTooManyRequests: // 429 - return true - default: - // Any other response carrying a status code (e.g. 4xx) is fatal. - return false - } - } - // No HTTP response with a usable status code means the request failed at - // the network level before the server answered. Treat that as transient. - return true -} - -// classifyPollError returns wrapped enclosed in a *transientPollError when the -// underlying failure is retryable, and wrapped unchanged when it is fatal. This -// lets the poll loops distinguish "retry" from "abort the build". -func classifyPollError(resp *core.DetailedResponse, err error, wrapped error) error { - if isTransientPollError(resp, err) { - return &transientPollError{err: wrapped} - } - return wrapped -} - // sleepOrDone waits interval between polls and then reports whether the poll // goroutine should stop because its parent has already returned (done closed). func sleepOrDone(interval time.Duration, done <-chan struct{}) (stop bool) { @@ -90,10 +37,6 @@ type IBMCloudClient struct { // Credentials IBMApiKey string - - // pollInterval is the wait between resource-status polls. When zero, - // defaultPollInterval is used. Primarily a seam for tests. - pollInterval time.Duration } func (client IBMCloudClient) New(IBMApiKey string) *IBMCloudClient { @@ -112,12 +55,11 @@ func (client IBMCloudClient) waitForResourceReady(resourceID string, resourceTyp } // pollUntil repeatedly invokes check until it reports the resource has reached -// its goal state, the timeout elapses, or check returns a fatal (non-transient) -// error. Transient errors (5xx/429 responses or network blips) are retried up to -// maxConsecutiveTransientPollFailures consecutive times so a single flaky API -// response doesn't abort an otherwise-healthy, long-running build; the streak -// resets on any successful poll. goal is used only in log/timeout messages -// (e.g. "ready", "stopped"). +// its goal state, the timeout elapses, or check returns an error. Transient API +// errors (5xx/429/network blips) are absorbed beneath check by the SDK's request +// retries (see vpcRetryMaxAttempts), so an error surfacing here is genuine and +// aborts the wait. goal is used only in log/timeout messages (e.g. "ready", +// "stopped"). func (client IBMCloudClient) pollUntil( resourceID string, resourceType string, @@ -131,14 +73,8 @@ func (client IBMCloudClient) pollUntil( defer close(done) result := make(chan error, 1) - interval := client.pollInterval - if interval <= 0 { - interval = defaultPollInterval - } - go func() { attempts := 0 - consecutiveTransientFailures := 0 for { attempts += 1 if attempts%6 == 0 { @@ -149,34 +85,16 @@ func (client IBMCloudClient) pollUntil( log.Printf("Checking resource state... (attempt: %d)", attempts) reached, err := check(resourceID, resourceType, state) - if err != nil { - var transient *transientPollError - if errors.As(err, &transient) { - consecutiveTransientFailures++ - if consecutiveTransientFailures > maxConsecutiveTransientPollFailures { - result <- fmt.Errorf("giving up after %d consecutive transient errors polling %s status: %w", - consecutiveTransientFailures, resourceType, err) - return - } - ui.Say(fmt.Sprintf("Transient error polling %s status (%d/%d), retrying: %s", - resourceType, consecutiveTransientFailures, maxConsecutiveTransientPollFailures, err)) - log.Printf("transient error polling %s status (attempt %d, consecutive %d/%d): %s", - resourceType, attempts, consecutiveTransientFailures, maxConsecutiveTransientPollFailures, err) - } else { - result <- err - return - } - } else { - // A successful poll clears the transient-failure streak. - consecutiveTransientFailures = 0 - if reached { - result <- nil - return - } + result <- err + return + } + if reached { + result <- nil + return } - if sleepOrDone(interval, done) { + if sleepOrDone(defaultPollInterval, done) { return } } @@ -201,10 +119,10 @@ func (client IBMCloudClient) isResourceReady(resourceID string, resourceType str if resourceType == "instances" { options := vpcService.NewGetInstanceOptions(resourceID) - instance, resp, err := vpcService.GetInstance(options) + instance, _, err := vpcService.GetInstance(options) if err != nil { - wrapped := fmt.Errorf("[ERROR] Error occurred while getting instance information. Error: %s", err) - return false, classifyPollError(resp, err, wrapped) + err := fmt.Errorf("[ERROR] Error occurred while getting instance information. Error: %s", err) + return false, err } status := *instance.Status if status == "failed" { @@ -215,30 +133,30 @@ func (client IBMCloudClient) isResourceReady(resourceID string, resourceType str return ready, err } else if resourceType == "floating_ips" { options := vpcService.NewGetFloatingIPOptions(resourceID) - floatingIP, resp, err := vpcService.GetFloatingIP(options) + floatingIP, _, err := vpcService.GetFloatingIP(options) if err != nil { - wrapped := fmt.Errorf("[ERROR] Error occurred while getting floating ip information. Error: %s", err) - return false, classifyPollError(resp, err, wrapped) + err := fmt.Errorf("[ERROR] Error occurred while getting floating ip information. Error: %s", err) + return false, err } status := *floatingIP.Status ready = status == "available" return ready, err } else if resourceType == "subnets" { options := vpcService.NewGetSubnetOptions(resourceID) - subnet, resp, err := vpcService.GetSubnet(options) + subnet, _, err := vpcService.GetSubnet(options) if err != nil { - wrapped := fmt.Errorf("[ERROR] Error occurred while getting subnet information. Error: %s", err) - return false, classifyPollError(resp, err, wrapped) + err := fmt.Errorf("[ERROR] Error occurred while getting subnet information. Error: %s", err) + return false, err } status := *subnet.Status ready = status == "available" return ready, err } else if resourceType == "images" { options := vpcService.NewGetImageOptions(resourceID) - image, resp, err := vpcService.GetImage(options) + image, _, err := vpcService.GetImage(options) if err != nil { - wrapped := fmt.Errorf("[ERROR] Error occurred while getting image information. Error: %s", err) - return false, classifyPollError(resp, err, wrapped) + err := fmt.Errorf("[ERROR] Error occurred while getting image information. Error: %s", err) + return false, err } status := *image.Status ready = status == "available" @@ -264,13 +182,10 @@ func (client IBMCloudClient) isResourceDown(resourceID string, resourceType stri if resourceType == "instances" { options := &vpcv1.GetInstanceOptions{} options.SetID(resourceID) - instance, resp, err := vpcService.GetInstance(options) + instance, _, err := vpcService.GetInstance(options) if err != nil { - // Return the classified error and let pollUntil decide whether to - // retry (transient) or surface it; don't ui.Error/log here, since a - // retried transient blip shouldn't print a scary [ERROR] line. - wrapped := fmt.Errorf("[ERROR] Failed retrieving resource information. Error: %s", err) - return false, classifyPollError(resp, err, wrapped) + err := fmt.Errorf("[ERROR] Failed retrieving resource information. Error: %s", err) + return false, err } status := *instance.Status down = status == "stopped" diff --git a/builder/ibmcloud/vpc/client_test.go b/builder/ibmcloud/vpc/client_test.go index 276c54f..b3f6f3f 100644 --- a/builder/ibmcloud/vpc/client_test.go +++ b/builder/ibmcloud/vpc/client_test.go @@ -1,22 +1,18 @@ package vpc import ( - "errors" "net/http" "net/http/httptest" - "strings" "sync/atomic" "testing" "time" - "github.com/IBM/go-sdk-core/v5/core" "github.com/IBM/vpc-go-sdk/vpcv1" "github.com/hashicorp/packer-plugin-sdk/multistep" - "github.com/hashicorp/packer-plugin-sdk/packer" ) -// newTestVpcService builds a vpcv1.VpcV1 pointed at an httptest server with a no-op -// authenticator so the resource-polling helpers can be exercised end to end. +// newTestVpcService builds a vpcv1.VpcV1 pointed at an httptest server with a +// no-op authenticator so the client helpers can be exercised end to end. func newTestVpcService(t *testing.T, url string) *vpcv1.VpcV1 { t.Helper() svc, err := vpcv1.NewVpcV1(&vpcv1.VpcV1Options{ @@ -29,56 +25,26 @@ func newTestVpcService(t *testing.T, url string) *vpcv1.VpcV1 { return svc } -func TestIsTransientPollError(t *testing.T) { +// TestIsResourceReadyImageStatus pins the status→ready/error mapping. Transient +// HTTP errors are no longer classified here (the SDK's request retries absorb +// them, see TestVPCServiceRetriesTransientErrors); this only covers the +// application-level status handling on a successful response. +func TestIsResourceReadyImageStatus(t *testing.T) { tests := []struct { - name string - resp *core.DetailedResponse - err error - want bool + name string + body string + wantReady bool + wantErr bool }{ - {name: "no error is not transient", resp: &core.DetailedResponse{StatusCode: 200}, err: nil, want: false}, - {name: "500 is transient", resp: &core.DetailedResponse{StatusCode: 500}, err: errors.New("boom"), want: true}, - {name: "502 is transient", resp: &core.DetailedResponse{StatusCode: 502}, err: errors.New("bad gateway"), want: true}, - {name: "503 is transient", resp: &core.DetailedResponse{StatusCode: 503}, err: errors.New("unavailable"), want: true}, - {name: "504 is transient", resp: &core.DetailedResponse{StatusCode: 504}, err: errors.New("timeout"), want: true}, - {name: "429 is transient", resp: &core.DetailedResponse{StatusCode: 429}, err: errors.New("rate limited"), want: true}, - {name: "404 is fatal", resp: &core.DetailedResponse{StatusCode: 404}, err: errors.New("not found"), want: false}, - {name: "401 is fatal", resp: &core.DetailedResponse{StatusCode: 401}, err: errors.New("unauthorized"), want: false}, - {name: "403 is fatal", resp: &core.DetailedResponse{StatusCode: 403}, err: errors.New("forbidden"), want: false}, - {name: "400 is fatal", resp: &core.DetailedResponse{StatusCode: 400}, err: errors.New("bad request"), want: false}, - {name: "network error with no response is transient", resp: nil, err: errors.New("connection reset by peer"), want: true}, - {name: "response with no status code is transient", resp: &core.DetailedResponse{StatusCode: 0}, err: errors.New("eof"), want: true}, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got := isTransientPollError(tt.resp, tt.err); got != tt.want { - t.Fatalf("isTransientPollError() = %v, want %v", got, tt.want) - } - }) - } -} - -func TestIsResourceReadyImageClassification(t *testing.T) { - tests := []struct { - name string - status int - body string - wantReady bool - wantErr bool - wantTransient bool - }{ - {name: "available image is ready", status: http.StatusOK, body: `{"id":"img-1","status":"available"}`, wantReady: true}, - {name: "pending image is not ready", status: http.StatusOK, body: `{"id":"img-1","status":"pending"}`, wantReady: false}, - {name: "failed image is an error", status: http.StatusOK, body: `{"id":"img-1","status":"failed"}`, wantReady: false, wantErr: true}, - {name: "502 is a transient error", status: http.StatusBadGateway, body: `{}`, wantErr: true, wantTransient: true}, - {name: "503 is a transient error", status: http.StatusServiceUnavailable, body: `{}`, wantErr: true, wantTransient: true}, - {name: "404 is a fatal error", status: http.StatusNotFound, body: `{}`, wantErr: true, wantTransient: false}, + {name: "available image is ready", body: `{"id":"img-1","status":"available"}`, wantReady: true}, + {name: "pending image is not ready", body: `{"id":"img-1","status":"pending"}`, wantReady: false}, + {name: "failed image is an error", body: `{"id":"img-1","status":"failed"}`, wantReady: false, wantErr: true}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json") - w.WriteHeader(tt.status) + w.WriteHeader(http.StatusOK) _, _ = w.Write([]byte(tt.body)) })) defer srv.Close() @@ -88,27 +54,25 @@ func TestIsResourceReadyImageClassification(t *testing.T) { client := IBMCloudClient{} ready, err := client.isResourceReady("img-1", "images", state) - if ready != tt.wantReady { t.Errorf("ready = %v, want %v", ready, tt.wantReady) } if (err != nil) != tt.wantErr { t.Fatalf("err = %v, wantErr = %v", err, tt.wantErr) } - var tpe *transientPollError - if got := errors.As(err, &tpe); got != tt.wantTransient { - t.Errorf("errors.As transientPollError = %v, want %v (err: %v)", got, tt.wantTransient, err) - } }) } } -func TestWaitForResourceReadyToleratesTransientFailures(t *testing.T) { +// TestVPCServiceRetriesTransientErrors proves the retry configuration we apply +// in StepCreateVPCServiceInstance (vpcRetryMaxAttempts) makes the IBM SDK ride +// out a transient 5xx and ultimately succeed. The interval is overridden to 1ms +// purely to keep the test fast — it caps the exponential backoff. +func TestVPCServiceRetriesTransientErrors(t *testing.T) { var calls int32 - // Return three transient 502s, then report the image as available. srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json") - if atomic.AddInt32(&calls, 1) <= 3 { + if atomic.AddInt32(&calls, 1) <= 2 { w.WriteHeader(http.StatusBadGateway) _, _ = w.Write([]byte(`{}`)) return @@ -118,201 +82,40 @@ func TestWaitForResourceReadyToleratesTransientFailures(t *testing.T) { })) defer srv.Close() - state := new(multistep.BasicStateBag) - state.Put("ui", packer.TestUi(t)) - state.Put("vpcService", newTestVpcService(t, srv.URL)) - client := IBMCloudClient{pollInterval: time.Millisecond} - - if err := client.waitForResourceReady("img-1", "images", 30*time.Second, state); err != nil { - t.Fatalf("waitForResourceReady returned error after transient failures: %s", err) - } -} - -func TestWaitForResourceReadyGivesUpAfterTooManyTransientFailures(t *testing.T) { - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - w.Header().Set("Content-Type", "application/json") - w.WriteHeader(http.StatusBadGateway) - _, _ = w.Write([]byte(`{}`)) - })) - defer srv.Close() - - state := new(multistep.BasicStateBag) - state.Put("ui", packer.TestUi(t)) - state.Put("vpcService", newTestVpcService(t, srv.URL)) - client := IBMCloudClient{pollInterval: time.Millisecond} - - err := client.waitForResourceReady("img-1", "images", 30*time.Second, state) - if err == nil { - t.Fatal("expected an error after exceeding the consecutive transient failure cap") - } - if !strings.Contains(err.Error(), "transient") { - t.Errorf("error should mention transient failures, got: %s", err) - } -} - -func TestWaitForResourceReadyResetsTransientStreak(t *testing.T) { - // 5x502 (the full tolerated streak), then a successful-but-not-ready poll - // that resets the counter, then 5x502 again, then available. Ten transient - // errors total but never 6 in a row, so the build must NOT abort. This pins - // both the cap (5 consecutive tolerated) and the reset-on-success behavior. - var calls int32 - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - w.Header().Set("Content-Type", "application/json") - n := atomic.AddInt32(&calls, 1) - switch { - case n <= 5: // first transient burst (5 tolerated) - w.WriteHeader(http.StatusBadGateway) - _, _ = w.Write([]byte(`{}`)) - case n == 6: // successful poll, not ready yet -> resets the streak - w.WriteHeader(http.StatusOK) - _, _ = w.Write([]byte(`{"id":"img-1","status":"pending"}`)) - case n <= 11: // second transient burst (5 more) - w.WriteHeader(http.StatusBadGateway) - _, _ = w.Write([]byte(`{}`)) - default: - w.WriteHeader(http.StatusOK) - _, _ = w.Write([]byte(`{"id":"img-1","status":"available"}`)) - } - })) - defer srv.Close() - - state := new(multistep.BasicStateBag) - state.Put("ui", packer.TestUi(t)) - state.Put("vpcService", newTestVpcService(t, srv.URL)) - client := IBMCloudClient{pollInterval: time.Millisecond} - - if err := client.waitForResourceReady("img-1", "images", 30*time.Second, state); err != nil { - t.Fatalf("waitForResourceReady aborted despite the transient streak resetting: %s", err) - } -} - -func TestIsResourceReadyTransientClassificationByType(t *testing.T) { - for _, resourceType := range []string{"floating_ips", "subnets"} { - tests := []struct { - name string - status int - wantTransient bool - }{ - {name: "502 is transient", status: http.StatusBadGateway, wantTransient: true}, - {name: "404 is fatal", status: http.StatusNotFound, wantTransient: false}, - } - for _, tt := range tests { - t.Run(resourceType+" "+tt.name, func(t *testing.T) { - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - w.Header().Set("Content-Type", "application/json") - w.WriteHeader(tt.status) - _, _ = w.Write([]byte(`{}`)) - })) - defer srv.Close() - - state := new(multistep.BasicStateBag) - state.Put("vpcService", newTestVpcService(t, srv.URL)) - client := IBMCloudClient{} + svc := newTestVpcService(t, srv.URL) + svc.EnableRetries(vpcRetryMaxAttempts, time.Millisecond) - _, err := client.isResourceReady("r-1", resourceType, state) - if err == nil { - t.Fatalf("expected an error for status %d", tt.status) - } - var tpe *transientPollError - if got := errors.As(err, &tpe); got != tt.wantTransient { - t.Errorf("errors.As transientPollError = %v, want %v (err: %v)", got, tt.wantTransient, err) - } - }) - } + if _, _, err := svc.GetImage(svc.NewGetImageOptions("img-1")); err != nil { + t.Fatalf("GetImage failed despite retries: %s", err) } -} - -func TestIsResourceDownClassification(t *testing.T) { - tests := []struct { - name string - status int - body string - wantDown bool - wantErr bool - wantTransient bool - }{ - {name: "stopped instance is down", status: http.StatusOK, body: `{"id":"i-1","status":"stopped"}`, wantDown: true}, - {name: "running instance is not down", status: http.StatusOK, body: `{"id":"i-1","status":"running"}`, wantDown: false}, - {name: "502 is a transient error", status: http.StatusBadGateway, body: `{}`, wantErr: true, wantTransient: true}, - {name: "404 is a fatal error", status: http.StatusNotFound, body: `{}`, wantErr: true, wantTransient: false}, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - w.Header().Set("Content-Type", "application/json") - w.WriteHeader(tt.status) - _, _ = w.Write([]byte(tt.body)) - })) - defer srv.Close() - - state := new(multistep.BasicStateBag) - state.Put("ui", packer.TestUi(t)) - state.Put("vpcService", newTestVpcService(t, srv.URL)) - client := IBMCloudClient{} - - down, err := client.isResourceDown("i-1", "instances", state) - - if down != tt.wantDown { - t.Errorf("down = %v, want %v", down, tt.wantDown) - } - if (err != nil) != tt.wantErr { - t.Fatalf("err = %v, wantErr = %v", err, tt.wantErr) - } - var tpe *transientPollError - if got := errors.As(err, &tpe); got != tt.wantTransient { - t.Errorf("errors.As transientPollError = %v, want %v (err: %v)", got, tt.wantTransient, err) - } - }) + if got := atomic.LoadInt32(&calls); got != 3 { + t.Errorf("expected 3 requests (2 transient + 1 success), got %d", got) } } -func TestWaitForResourceDownToleratesTransientFailures(t *testing.T) { +func TestVPCServiceGivesUpAfterRetries(t *testing.T) { var calls int32 srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - w.Header().Set("Content-Type", "application/json") - if atomic.AddInt32(&calls, 1) <= 3 { - w.WriteHeader(http.StatusServiceUnavailable) - _, _ = w.Write([]byte(`{}`)) - return - } - w.WriteHeader(http.StatusOK) - _, _ = w.Write([]byte(`{"id":"i-1","status":"stopped"}`)) - })) - defer srv.Close() - - state := new(multistep.BasicStateBag) - state.Put("ui", packer.TestUi(t)) - state.Put("vpcService", newTestVpcService(t, srv.URL)) - client := IBMCloudClient{pollInterval: time.Millisecond} - - if err := client.waitForResourceDown("i-1", "instances", 30*time.Second, state); err != nil { - t.Fatalf("waitForResourceDown returned error after transient failures: %s", err) - } -} - -func TestWaitForResourceDownGivesUpAfterTooManyTransientFailures(t *testing.T) { - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + atomic.AddInt32(&calls, 1) w.Header().Set("Content-Type", "application/json") w.WriteHeader(http.StatusBadGateway) _, _ = w.Write([]byte(`{}`)) })) defer srv.Close() - state := new(multistep.BasicStateBag) - state.Put("ui", packer.TestUi(t)) - state.Put("vpcService", newTestVpcService(t, srv.URL)) - client := IBMCloudClient{pollInterval: time.Millisecond} + svc := newTestVpcService(t, srv.URL) + svc.EnableRetries(vpcRetryMaxAttempts, time.Millisecond) - err := client.waitForResourceDown("i-1", "instances", 30*time.Second, state) - if err == nil { - t.Fatal("expected an error after exceeding the consecutive transient failure cap") + if _, _, err := svc.GetImage(svc.NewGetImageOptions("img-1")); err == nil { + t.Fatal("expected an error after retries are exhausted") } - if !strings.Contains(err.Error(), "transient") { - t.Errorf("error should mention transient failures, got: %s", err) + // One initial attempt plus vpcRetryMaxAttempts retries. + if want := int32(vpcRetryMaxAttempts + 1); atomic.LoadInt32(&calls) != want { + t.Errorf("expected %d requests, got %d", want, atomic.LoadInt32(&calls)) } } -func TestWaitForResourceReadyFatalErrorFailsImmediately(t *testing.T) { +func TestVPCServiceDoesNotRetryFatalErrors(t *testing.T) { var calls int32 srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { atomic.AddInt32(&calls, 1) @@ -322,15 +125,13 @@ func TestWaitForResourceReadyFatalErrorFailsImmediately(t *testing.T) { })) defer srv.Close() - state := new(multistep.BasicStateBag) - state.Put("ui", packer.TestUi(t)) - state.Put("vpcService", newTestVpcService(t, srv.URL)) - client := IBMCloudClient{pollInterval: time.Millisecond} + svc := newTestVpcService(t, srv.URL) + svc.EnableRetries(vpcRetryMaxAttempts, time.Millisecond) - if err := client.waitForResourceReady("img-1", "images", 30*time.Second, state); err == nil { - t.Fatal("expected a fatal error for a 404 response") + if _, _, err := svc.GetImage(svc.NewGetImageOptions("img-1")); err == nil { + t.Fatal("expected a fatal 404 error") } if got := atomic.LoadInt32(&calls); got != 1 { - t.Errorf("expected exactly one poll for a fatal error, got %d", got) + t.Errorf("expected exactly one request for a fatal 404, got %d", got) } } diff --git a/builder/ibmcloud/vpc/step_create_vpc_service_instance.go b/builder/ibmcloud/vpc/step_create_vpc_service_instance.go index df24181..52c0297 100644 --- a/builder/ibmcloud/vpc/step_create_vpc_service_instance.go +++ b/builder/ibmcloud/vpc/step_create_vpc_service_instance.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "log" + "time" "github.com/IBM/go-sdk-core/v5/core" "github.com/IBM/vpc-go-sdk/vpcv1" @@ -11,6 +12,17 @@ import ( "github.com/hashicorp/packer-plugin-sdk/packer" ) +// vpcRetryMaxAttempts and vpcRetryMaxInterval configure the IBM Cloud SDK's +// built-in request retries (go-sdk-core EnableRetries). The SDK retries 429 and +// 5xx (except 501) responses and network-level failures, honors the Retry-After +// header, and backs off exponentially up to vpcRetryMaxInterval. This rides out +// transient API blips on every VPC call — both one-shot creates and status +// polls — so a single 502 no longer aborts an otherwise-healthy bake. +const ( + vpcRetryMaxAttempts = 5 + vpcRetryMaxInterval = 30 * time.Second +) + type StepCreateVPCServiceInstance struct { } @@ -61,6 +73,8 @@ func (step *StepCreateVPCServiceInstance) Run(_ context.Context, state multistep return multistep.ActionHalt } + vpcService.EnableRetries(vpcRetryMaxAttempts, vpcRetryMaxInterval) + state.Put("vpcService", vpcService) ui.Say("VPC service creation successful!") return multistep.ActionContinue diff --git a/builder/ibmcloud/vpc/step_image_export.go b/builder/ibmcloud/vpc/step_image_export.go index 6346ca0..7d11778 100644 --- a/builder/ibmcloud/vpc/step_image_export.go +++ b/builder/ibmcloud/vpc/step_image_export.go @@ -100,7 +100,6 @@ func waitForExportJobToSucceed(imageId, exportJobId string, vpcService *vpcv1.Vp go func() { attempts := 0 - consecutiveTransientFailures := 0 for { attempts += 1 if attempts%6 == 0 { @@ -110,35 +109,15 @@ func waitForExportJobToSucceed(imageId, exportJobId string, vpcService *vpcv1.Vp } log.Printf("Checking export job status ... (attempt: %d)", attempts) - expJob, resp, err := vpcService.GetImageExportJob(&options) - + expJob, _, err := vpcService.GetImageExportJob(&options) if err != nil { - // Export jobs run for many minutes, so ride out a transient - // 5xx/429 or network blip rather than failing the whole export. - if isTransientPollError(resp, err) { - consecutiveTransientFailures++ - if consecutiveTransientFailures > maxConsecutiveTransientPollFailures { - result <- fmt.Errorf("giving up after %d consecutive transient errors polling export job status: %w", - consecutiveTransientFailures, err) - return - } - ui.Say(fmt.Sprintf("Transient error polling export job status (%d/%d), retrying: %s", - consecutiveTransientFailures, maxConsecutiveTransientPollFailures, err)) - log.Printf("transient error polling export job status (attempt %d, consecutive %d/%d): %s", - attempts, consecutiveTransientFailures, maxConsecutiveTransientPollFailures, err) - if sleepOrDone(interval, done) { - return - } - continue - } + // Transient 5xx/429/network blips are absorbed by the SDK's + // request retries; an error here is genuine, so stop waiting. ui.Say(fmt.Sprintf("Error fetching image export job: %s", err)) result <- err return } - // A successful poll clears the transient-failure streak. - consecutiveTransientFailures = 0 - if expJob.Status != nil { log.Printf("Export job status: %s", *expJob.Status) if attempts%6 == 0 { // Every 1 minutes diff --git a/builder/ibmcloud/vpc/step_image_export_test.go b/builder/ibmcloud/vpc/step_image_export_test.go index c05d6bc..9e172d7 100644 --- a/builder/ibmcloud/vpc/step_image_export_test.go +++ b/builder/ibmcloud/vpc/step_image_export_test.go @@ -3,7 +3,6 @@ package vpc import ( "net/http" "net/http/httptest" - "strings" "sync/atomic" "testing" "time" @@ -12,17 +11,16 @@ import ( "github.com/hashicorp/packer-plugin-sdk/packer" ) -func TestWaitForExportJobToleratesTransientFailures(t *testing.T) { +func TestWaitForExportJobSucceeds(t *testing.T) { var calls int32 - // Three transient 502s, then the export job reports success. + // Report "running" twice, then "succeeded" — exercises the poll-again loop. srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json") - if atomic.AddInt32(&calls, 1) <= 3 { - w.WriteHeader(http.StatusBadGateway) - _, _ = w.Write([]byte(`{}`)) + w.WriteHeader(http.StatusOK) + if atomic.AddInt32(&calls, 1) <= 2 { + _, _ = w.Write([]byte(`{"status":"running"}`)) return } - w.WriteHeader(http.StatusOK) _, _ = w.Write([]byte(`{"status":"succeeded"}`)) })) defer srv.Close() @@ -30,33 +28,33 @@ func TestWaitForExportJobToleratesTransientFailures(t *testing.T) { state := new(multistep.BasicStateBag) state.Put("ui", packer.TestUi(t)) - err := waitForExportJobToSucceed("img-1", "job-1", newTestVpcService(t, srv.URL), 30*time.Second, time.Millisecond, state) - if err != nil { - t.Fatalf("waitForExportJobToSucceed returned error after transient failures: %s", err) + if err := waitForExportJobToSucceed("img-1", "job-1", newTestVpcService(t, srv.URL), 30*time.Second, time.Millisecond, state); err != nil { + t.Fatalf("waitForExportJobToSucceed returned error: %s", err) + } + if got := atomic.LoadInt32(&calls); got != 3 { + t.Errorf("expected 3 polls (2 running + 1 succeeded), got %d", got) } } -func TestWaitForExportJobGivesUpAfterTooManyTransientFailures(t *testing.T) { +func TestWaitForExportJobFailsOnFailedStatus(t *testing.T) { srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json") - w.WriteHeader(http.StatusBadGateway) - _, _ = w.Write([]byte(`{}`)) + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte(`{"status":"failed"}`)) })) defer srv.Close() state := new(multistep.BasicStateBag) state.Put("ui", packer.TestUi(t)) - err := waitForExportJobToSucceed("img-1", "job-1", newTestVpcService(t, srv.URL), 30*time.Second, time.Millisecond, state) - if err == nil { - t.Fatal("expected an error after exceeding the consecutive transient failure cap") - } - if !strings.Contains(err.Error(), "transient") { - t.Errorf("error should mention transient failures, got: %s", err) + if err := waitForExportJobToSucceed("img-1", "job-1", newTestVpcService(t, srv.URL), 30*time.Second, time.Millisecond, state); err == nil { + t.Fatal("expected an error for a failed export job") } } -func TestWaitForExportJobFatalErrorFailsImmediately(t *testing.T) { +// A non-transient API error (e.g. 404) surfaces immediately; transient 5xx/429 +// blips are absorbed beneath this function by the SDK's request retries. +func TestWaitForExportJobFailsOnAPIError(t *testing.T) { srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json") w.WriteHeader(http.StatusNotFound) From ffefe04a1959ae105e8585b9a92fbfe3862e4ab4 Mon Sep 17 00:00:00 2001 From: Corey Christous Date: Fri, 26 Jun 2026 10:51:39 -0400 Subject: [PATCH 4/6] test(vpc): add EnableRetries wiring test; document export poll seam - Add TestStepCreateVPCServiceInstanceEnablesRetries: runs the construction step and asserts the service has a retryablehttp transport, so removing the EnableRetries call (which would silently drop transient tolerance for every VPC call) fails the build. The behavioral retry tests enable retries on a fresh service themselves and would not catch that regression. - Document why waitForExportJobToSucceed keeps a pollInterval seam while pollUntil hardcodes defaultPollInterval (its multi-poll loop is tested). Signed-off-by: Corey Christous --- .../step_create_vpc_service_instance_test.go | 35 +++++++++++++++++++ builder/ibmcloud/vpc/step_image_export.go | 3 ++ 2 files changed, 38 insertions(+) create mode 100644 builder/ibmcloud/vpc/step_create_vpc_service_instance_test.go diff --git a/builder/ibmcloud/vpc/step_create_vpc_service_instance_test.go b/builder/ibmcloud/vpc/step_create_vpc_service_instance_test.go new file mode 100644 index 0000000..ba5ea7b --- /dev/null +++ b/builder/ibmcloud/vpc/step_create_vpc_service_instance_test.go @@ -0,0 +1,35 @@ +package vpc + +import ( + "context" + "fmt" + "strings" + "testing" + + "github.com/IBM/vpc-go-sdk/vpcv1" + "github.com/hashicorp/packer-plugin-sdk/multistep" + "github.com/hashicorp/packer-plugin-sdk/packer" +) + +// TestStepCreateVPCServiceInstanceEnablesRetries guards the wiring: transient- +// error tolerance for every VPC call depends on the EnableRetries call in Run. +// The behavioral retry tests (client_test.go) enable retries themselves, so they +// would still pass if that line were removed — this test fails if it is. +func TestStepCreateVPCServiceInstanceEnablesRetries(t *testing.T) { + state := new(multistep.BasicStateBag) + state.Put("ui", packer.TestUi(t)) + state.Put("client", &IBMCloudClient{IBMApiKey: "dummy-key"}) + state.Put("config", Config{}) + + step := &StepCreateVPCServiceInstance{} + if action := step.Run(context.Background(), state); action != multistep.ActionContinue { + t.Fatalf("Run returned %v, want ActionContinue", action) + } + + svc := state.Get("vpcService").(*vpcv1.VpcV1) + // EnableRetries swaps the service's HTTP client for a retryablehttp-backed + // one; assert that transport is in place so dropping EnableRetries is caught. + if got := fmt.Sprintf("%T", svc.Service.Client.Transport); !strings.Contains(got, "retryablehttp") { + t.Errorf("VPC service transport = %s, want a retryablehttp transport (EnableRetries not wired)", got) + } +} diff --git a/builder/ibmcloud/vpc/step_image_export.go b/builder/ibmcloud/vpc/step_image_export.go index 7d11778..c21ad4c 100644 --- a/builder/ibmcloud/vpc/step_image_export.go +++ b/builder/ibmcloud/vpc/step_image_export.go @@ -75,6 +75,9 @@ func (step *StepImageExport) Run(_ context.Context, state multistep.StateBag) mu return multistep.ActionContinue } +// pollInterval is the cadence between status checks; production passes 0 to use +// defaultPollInterval. It exists as a parameter (unlike pollUntil, which hardcodes +// the constant) so tests can drive the multi-poll loop without a real 10s wait. func waitForExportJobToSucceed(imageId, exportJobId string, vpcService *vpcv1.VpcV1, timeout time.Duration, pollInterval time.Duration, state multistep.StateBag) error { ui := state.Get("ui").(packer.Ui) done := make(chan struct{}) From 0aba9e66cb49677a742034db6896a84472fbe306 Mon Sep 17 00:00:00 2001 From: Corey Christous Date: Fri, 26 Jun 2026 10:53:52 -0400 Subject: [PATCH 5/6] docs(vpc): clarify retry backoff comment; fix stale export-timeout comment Address review-pass findings: - The SDK honors a server-sent Retry-After uncapped; vpcRetryMaxInterval caps only the exponential backoff. Reword the constant comment so it no longer implies Retry-After is bounded by vpcRetryMaxInterval. - Fix a stale comment claiming a '45 minutes' default export timeout where the code uses 5 minutes. Signed-off-by: Corey Christous --- builder/ibmcloud/vpc/step_create_vpc_service_instance.go | 9 +++++---- builder/ibmcloud/vpc/step_image_export.go | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/builder/ibmcloud/vpc/step_create_vpc_service_instance.go b/builder/ibmcloud/vpc/step_create_vpc_service_instance.go index 52c0297..6914214 100644 --- a/builder/ibmcloud/vpc/step_create_vpc_service_instance.go +++ b/builder/ibmcloud/vpc/step_create_vpc_service_instance.go @@ -14,10 +14,11 @@ import ( // vpcRetryMaxAttempts and vpcRetryMaxInterval configure the IBM Cloud SDK's // built-in request retries (go-sdk-core EnableRetries). The SDK retries 429 and -// 5xx (except 501) responses and network-level failures, honors the Retry-After -// header, and backs off exponentially up to vpcRetryMaxInterval. This rides out -// transient API blips on every VPC call — both one-shot creates and status -// polls — so a single 502 no longer aborts an otherwise-healthy bake. +// 5xx (except 501) responses and network-level failures. It honors a server-sent +// Retry-After header (as given); otherwise it backs off exponentially, with +// vpcRetryMaxInterval capping that exponential wait. This rides out transient API +// blips on every VPC call — both one-shot creates and status polls — so a single +// 502 no longer aborts an otherwise-healthy bake. const ( vpcRetryMaxAttempts = 5 vpcRetryMaxInterval = 30 * time.Second diff --git a/builder/ibmcloud/vpc/step_image_export.go b/builder/ibmcloud/vpc/step_image_export.go index c21ad4c..29496a1 100644 --- a/builder/ibmcloud/vpc/step_image_export.go +++ b/builder/ibmcloud/vpc/step_image_export.go @@ -85,7 +85,7 @@ func waitForExportJobToSucceed(imageId, exportJobId string, vpcService *vpcv1.Vp result := make(chan error, 1) if timeout < 1*time.Minute { - timeout = 5 * time.Minute // Default to 45 minutes for image exports + timeout = 5 * time.Minute // Default when no (or sub-minute) export timeout is configured ui.Say(fmt.Sprintf("Using default %v timeout for image export", timeout)) } else { ui.Say(fmt.Sprintf("Using %v timeout for image export", timeout)) From 2a1e9e3262dd05be840ecde8a01804d9c9d34425 Mon Sep 17 00:00:00 2001 From: Corey Christous Date: Fri, 26 Jun 2026 11:11:37 -0400 Subject: [PATCH 6/6] test(vpc): cover poll-loop abort/ready paths; clarify retry comment Address final-round review findings: - Add TestWaitForResourceReadyAbortsOnAPIError and TestWaitForResourceReadyReturnsWhenReady so the production pollUntil loop is exercised end-to-end (the prior transient tests went through the SDK client directly). Both terminate on the first check, so no real poll wait. - Reword pollUntil's comment: transient errors are retried per-request by the SDK (up to vpcRetryMaxAttempts each), not absorbed across the poll loop, so an error surfacing here means retries were exhausted or it's fatal. Signed-off-by: Corey Christous --- builder/ibmcloud/vpc/client.go | 9 ++++--- builder/ibmcloud/vpc/client_test.go | 42 +++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 4 deletions(-) diff --git a/builder/ibmcloud/vpc/client.go b/builder/ibmcloud/vpc/client.go index 5dab51c..9945c34 100755 --- a/builder/ibmcloud/vpc/client.go +++ b/builder/ibmcloud/vpc/client.go @@ -56,10 +56,11 @@ func (client IBMCloudClient) waitForResourceReady(resourceID string, resourceTyp // pollUntil repeatedly invokes check until it reports the resource has reached // its goal state, the timeout elapses, or check returns an error. Transient API -// errors (5xx/429/network blips) are absorbed beneath check by the SDK's request -// retries (see vpcRetryMaxAttempts), so an error surfacing here is genuine and -// aborts the wait. goal is used only in log/timeout messages (e.g. "ready", -// "stopped"). +// errors (5xx/429/network blips) are retried beneath check by the SDK's +// per-request retries (up to vpcRetryMaxAttempts each, see EnableRetries); an +// error surfacing here therefore means those retries were exhausted or the error +// is fatal, so the wait aborts rather than re-checking. goal is used only in +// log/timeout messages (e.g. "ready", "stopped"). func (client IBMCloudClient) pollUntil( resourceID string, resourceType string, diff --git a/builder/ibmcloud/vpc/client_test.go b/builder/ibmcloud/vpc/client_test.go index b3f6f3f..214c7bc 100644 --- a/builder/ibmcloud/vpc/client_test.go +++ b/builder/ibmcloud/vpc/client_test.go @@ -9,6 +9,7 @@ import ( "github.com/IBM/vpc-go-sdk/vpcv1" "github.com/hashicorp/packer-plugin-sdk/multistep" + "github.com/hashicorp/packer-plugin-sdk/packer" ) // newTestVpcService builds a vpcv1.VpcV1 pointed at an httptest server with a @@ -115,6 +116,47 @@ func TestVPCServiceGivesUpAfterRetries(t *testing.T) { } } +// TestWaitForResourceReadyAbortsOnAPIError exercises the production poll loop +// end to end: with no SDK retries on this test client, an API error surfaces +// from the check immediately and pollUntil aborts the wait (in production the +// SDK would have retried transient errors before this point). This locks the +// post-rework semantics — pollUntil no longer rides out errors itself. +func TestWaitForResourceReadyAbortsOnAPIError(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusBadGateway) + _, _ = w.Write([]byte(`{}`)) + })) + defer srv.Close() + + state := new(multistep.BasicStateBag) + state.Put("ui", packer.TestUi(t)) + state.Put("vpcService", newTestVpcService(t, srv.URL)) + client := IBMCloudClient{} + + if err := client.waitForResourceReady("i-1", "instances", 30*time.Second, state); err == nil { + t.Fatal("expected pollUntil to abort the wait when the API errors") + } +} + +func TestWaitForResourceReadyReturnsWhenReady(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte(`{"id":"i-1","status":"running"}`)) + })) + defer srv.Close() + + state := new(multistep.BasicStateBag) + state.Put("ui", packer.TestUi(t)) + state.Put("vpcService", newTestVpcService(t, srv.URL)) + client := IBMCloudClient{} + + if err := client.waitForResourceReady("i-1", "instances", 30*time.Second, state); err != nil { + t.Fatalf("waitForResourceReady returned error when the resource was ready: %s", err) + } +} + func TestVPCServiceDoesNotRetryFatalErrors(t *testing.T) { var calls int32 srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {