diff --git a/builder/ibmcloud/vpc/client.go b/builder/ibmcloud/vpc/client.go index ec00075..9945c34 100755 --- a/builder/ibmcloud/vpc/client.go +++ b/builder/ibmcloud/vpc/client.go @@ -13,6 +13,24 @@ import ( "github.com/hashicorp/packer-plugin-sdk/packer" ) +// defaultPollInterval is the wait between status polls when a caller does not +// set one explicitly. Transient API errors are handled by the SDK's request +// retries (see vpcRetryMaxAttempts); this interval is purely the cadence at +// which we re-check a resource's status while waiting for it to reach its goal. +const defaultPollInterval = 10 * time.Second + +// sleepOrDone waits interval between polls and then reports whether the poll +// goroutine should stop because its parent has already returned (done closed). +func sleepOrDone(interval time.Duration, done <-chan struct{}) (stop bool) { + time.Sleep(interval) + select { + case <-done: + return true + default: + return false + } +} + type IBMCloudClient struct { // // The http client for communicating http *http.Client @@ -33,6 +51,24 @@ func (client IBMCloudClient) New(IBMApiKey string) *IBMCloudClient { } func (client IBMCloudClient) waitForResourceReady(resourceID string, resourceType string, timeout time.Duration, state multistep.StateBag) error { + return client.pollUntil(resourceID, resourceType, "ready", timeout, state, client.isResourceReady) +} + +// pollUntil repeatedly invokes check until it reports the resource has reached +// its goal state, the timeout elapses, or check returns an error. Transient API +// errors (5xx/429/network blips) are retried beneath check by the SDK's +// per-request retries (up to vpcRetryMaxAttempts each, see EnableRetries); an +// error surfacing here therefore means those retries were exhausted or the error +// is fatal, so the wait aborts rather than re-checking. goal is used only in +// log/timeout messages (e.g. "ready", "stopped"). +func (client IBMCloudClient) pollUntil( + resourceID string, + resourceType string, + goal string, + timeout time.Duration, + state multistep.StateBag, + check func(resourceID string, resourceType string, state multistep.StateBag) (bool, error), +) error { ui := state.Get("ui").(packer.Ui) done := make(chan struct{}) defer close(done) @@ -49,38 +85,28 @@ func (client IBMCloudClient) waitForResourceReady(resourceID string, resourceTyp } log.Printf("Checking resource state... (attempt: %d)", attempts) - isReady, err := client.isResourceReady(resourceID, resourceType, state) - + reached, err := check(resourceID, resourceType, state) if err != nil { result <- err return } - - if isReady { + if reached { result <- nil return } - // Wait 10 seconds in between - time.Sleep(10 * time.Second) - - // Verify we shouldn't exit - select { - case <-done: - // We finished, so just exit the go routine + if sleepOrDone(defaultPollInterval, done) { return - default: - // Keep going } } }() - log.Printf("Waiting for up to %d seconds for resource to become ready", timeout/time.Second) + log.Printf("Waiting for up to %d seconds for resource to become %s", timeout/time.Second, goal) select { case err := <-result: return err case <-time.After(timeout): - err := fmt.Errorf("timeout while waiting to for the resource to become ready") + err := fmt.Errorf("timeout while waiting for the resource to become %s", goal) return err } } @@ -144,62 +170,12 @@ func (client IBMCloudClient) isResourceReady(resourceID string, resourceType str } func (client IBMCloudClient) waitForResourceDown(resourceID string, resourceType string, timeout time.Duration, state multistep.StateBag) error { - ui := state.Get("ui").(packer.Ui) - done := make(chan struct{}) - defer close(done) - result := make(chan error, 1) - - go func() { - attempts := 0 - for { - attempts += 1 - if attempts%6 == 0 { - ui.Say(fmt.Sprintf("Waiting time: %d minutes", attempts/6)) - } else { - ui.Say(".") - } - - log.Printf("Checking resource state... (attempt: %d)", attempts) - isDown, err := client.isResourceDown(resourceID, resourceType, state) - - if err != nil { - result <- err - return - } - - if isDown { - result <- nil - return - } - - // Wait 10 seconds in between - time.Sleep(10 * time.Second) - - // Verify we shouldn't exit - select { - case <-done: - // We finished, so just exit the go routine - return - default: - // Keep going - } - } - }() - - log.Printf("Waiting for up to %d seconds for resource to be stopped", timeout/time.Second) - select { - case err := <-result: - return err - case <-time.After(timeout): - err := fmt.Errorf("timeout while waiting to for the resource to be stopped") - return err - } + return client.pollUntil(resourceID, resourceType, "stopped", timeout, state, client.isResourceDown) } func (client IBMCloudClient) isResourceDown(resourceID string, resourceType string, state multistep.StateBag) (bool, error) { var down bool - ui := state.Get("ui").(packer.Ui) var vpcService *vpcv1.VpcV1 if state.Get("vpcService") != nil { vpcService = state.Get("vpcService").(*vpcv1.VpcV1) @@ -210,8 +186,6 @@ func (client IBMCloudClient) isResourceDown(resourceID string, resourceType stri instance, _, err := vpcService.GetInstance(options) if err != nil { err := fmt.Errorf("[ERROR] Failed retrieving resource information. Error: %s", err) - ui.Error(err.Error()) - log.Println(err.Error()) return false, err } status := *instance.Status diff --git a/builder/ibmcloud/vpc/client_test.go b/builder/ibmcloud/vpc/client_test.go new file mode 100644 index 0000000..214c7bc --- /dev/null +++ b/builder/ibmcloud/vpc/client_test.go @@ -0,0 +1,179 @@ +package vpc + +import ( + "net/http" + "net/http/httptest" + "sync/atomic" + "testing" + "time" + + "github.com/IBM/vpc-go-sdk/vpcv1" + "github.com/hashicorp/packer-plugin-sdk/multistep" + "github.com/hashicorp/packer-plugin-sdk/packer" +) + +// newTestVpcService builds a vpcv1.VpcV1 pointed at an httptest server with a +// no-op authenticator so the client helpers can be exercised end to end. +func newTestVpcService(t *testing.T, url string) *vpcv1.VpcV1 { + t.Helper() + svc, err := vpcv1.NewVpcV1(&vpcv1.VpcV1Options{ + Authenticator: stubAuthenticator{}, + URL: url, + }) + if err != nil { + t.Fatalf("failed to build test vpc service: %s", err) + } + return svc +} + +// TestIsResourceReadyImageStatus pins the status→ready/error mapping. Transient +// HTTP errors are no longer classified here (the SDK's request retries absorb +// them, see TestVPCServiceRetriesTransientErrors); this only covers the +// application-level status handling on a successful response. +func TestIsResourceReadyImageStatus(t *testing.T) { + tests := []struct { + name string + body string + wantReady bool + wantErr bool + }{ + {name: "available image is ready", body: `{"id":"img-1","status":"available"}`, wantReady: true}, + {name: "pending image is not ready", body: `{"id":"img-1","status":"pending"}`, wantReady: false}, + {name: "failed image is an error", body: `{"id":"img-1","status":"failed"}`, wantReady: false, wantErr: true}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte(tt.body)) + })) + defer srv.Close() + + state := new(multistep.BasicStateBag) + state.Put("vpcService", newTestVpcService(t, srv.URL)) + client := IBMCloudClient{} + + ready, err := client.isResourceReady("img-1", "images", state) + if ready != tt.wantReady { + t.Errorf("ready = %v, want %v", ready, tt.wantReady) + } + if (err != nil) != tt.wantErr { + t.Fatalf("err = %v, wantErr = %v", err, tt.wantErr) + } + }) + } +} + +// TestVPCServiceRetriesTransientErrors proves the retry configuration we apply +// in StepCreateVPCServiceInstance (vpcRetryMaxAttempts) makes the IBM SDK ride +// out a transient 5xx and ultimately succeed. The interval is overridden to 1ms +// purely to keep the test fast — it caps the exponential backoff. +func TestVPCServiceRetriesTransientErrors(t *testing.T) { + var calls int32 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + if atomic.AddInt32(&calls, 1) <= 2 { + w.WriteHeader(http.StatusBadGateway) + _, _ = w.Write([]byte(`{}`)) + return + } + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte(`{"id":"img-1","status":"available"}`)) + })) + defer srv.Close() + + svc := newTestVpcService(t, srv.URL) + svc.EnableRetries(vpcRetryMaxAttempts, time.Millisecond) + + if _, _, err := svc.GetImage(svc.NewGetImageOptions("img-1")); err != nil { + t.Fatalf("GetImage failed despite retries: %s", err) + } + if got := atomic.LoadInt32(&calls); got != 3 { + t.Errorf("expected 3 requests (2 transient + 1 success), got %d", got) + } +} + +func TestVPCServiceGivesUpAfterRetries(t *testing.T) { + var calls int32 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + atomic.AddInt32(&calls, 1) + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusBadGateway) + _, _ = w.Write([]byte(`{}`)) + })) + defer srv.Close() + + svc := newTestVpcService(t, srv.URL) + svc.EnableRetries(vpcRetryMaxAttempts, time.Millisecond) + + if _, _, err := svc.GetImage(svc.NewGetImageOptions("img-1")); err == nil { + t.Fatal("expected an error after retries are exhausted") + } + // One initial attempt plus vpcRetryMaxAttempts retries. + if want := int32(vpcRetryMaxAttempts + 1); atomic.LoadInt32(&calls) != want { + t.Errorf("expected %d requests, got %d", want, atomic.LoadInt32(&calls)) + } +} + +// TestWaitForResourceReadyAbortsOnAPIError exercises the production poll loop +// end to end: with no SDK retries on this test client, an API error surfaces +// from the check immediately and pollUntil aborts the wait (in production the +// SDK would have retried transient errors before this point). This locks the +// post-rework semantics — pollUntil no longer rides out errors itself. +func TestWaitForResourceReadyAbortsOnAPIError(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusBadGateway) + _, _ = w.Write([]byte(`{}`)) + })) + defer srv.Close() + + state := new(multistep.BasicStateBag) + state.Put("ui", packer.TestUi(t)) + state.Put("vpcService", newTestVpcService(t, srv.URL)) + client := IBMCloudClient{} + + if err := client.waitForResourceReady("i-1", "instances", 30*time.Second, state); err == nil { + t.Fatal("expected pollUntil to abort the wait when the API errors") + } +} + +func TestWaitForResourceReadyReturnsWhenReady(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte(`{"id":"i-1","status":"running"}`)) + })) + defer srv.Close() + + state := new(multistep.BasicStateBag) + state.Put("ui", packer.TestUi(t)) + state.Put("vpcService", newTestVpcService(t, srv.URL)) + client := IBMCloudClient{} + + if err := client.waitForResourceReady("i-1", "instances", 30*time.Second, state); err != nil { + t.Fatalf("waitForResourceReady returned error when the resource was ready: %s", err) + } +} + +func TestVPCServiceDoesNotRetryFatalErrors(t *testing.T) { + var calls int32 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + atomic.AddInt32(&calls, 1) + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusNotFound) + _, _ = w.Write([]byte(`{}`)) + })) + defer srv.Close() + + svc := newTestVpcService(t, srv.URL) + svc.EnableRetries(vpcRetryMaxAttempts, time.Millisecond) + + if _, _, err := svc.GetImage(svc.NewGetImageOptions("img-1")); err == nil { + t.Fatal("expected a fatal 404 error") + } + if got := atomic.LoadInt32(&calls); got != 1 { + t.Errorf("expected exactly one request for a fatal 404, got %d", got) + } +} diff --git a/builder/ibmcloud/vpc/step_create_vpc_service_instance.go b/builder/ibmcloud/vpc/step_create_vpc_service_instance.go index df24181..6914214 100644 --- a/builder/ibmcloud/vpc/step_create_vpc_service_instance.go +++ b/builder/ibmcloud/vpc/step_create_vpc_service_instance.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "log" + "time" "github.com/IBM/go-sdk-core/v5/core" "github.com/IBM/vpc-go-sdk/vpcv1" @@ -11,6 +12,18 @@ import ( "github.com/hashicorp/packer-plugin-sdk/packer" ) +// vpcRetryMaxAttempts and vpcRetryMaxInterval configure the IBM Cloud SDK's +// built-in request retries (go-sdk-core EnableRetries). The SDK retries 429 and +// 5xx (except 501) responses and network-level failures. It honors a server-sent +// Retry-After header (as given); otherwise it backs off exponentially, with +// vpcRetryMaxInterval capping that exponential wait. This rides out transient API +// blips on every VPC call — both one-shot creates and status polls — so a single +// 502 no longer aborts an otherwise-healthy bake. +const ( + vpcRetryMaxAttempts = 5 + vpcRetryMaxInterval = 30 * time.Second +) + type StepCreateVPCServiceInstance struct { } @@ -61,6 +74,8 @@ func (step *StepCreateVPCServiceInstance) Run(_ context.Context, state multistep return multistep.ActionHalt } + vpcService.EnableRetries(vpcRetryMaxAttempts, vpcRetryMaxInterval) + state.Put("vpcService", vpcService) ui.Say("VPC service creation successful!") return multistep.ActionContinue diff --git a/builder/ibmcloud/vpc/step_create_vpc_service_instance_test.go b/builder/ibmcloud/vpc/step_create_vpc_service_instance_test.go new file mode 100644 index 0000000..ba5ea7b --- /dev/null +++ b/builder/ibmcloud/vpc/step_create_vpc_service_instance_test.go @@ -0,0 +1,35 @@ +package vpc + +import ( + "context" + "fmt" + "strings" + "testing" + + "github.com/IBM/vpc-go-sdk/vpcv1" + "github.com/hashicorp/packer-plugin-sdk/multistep" + "github.com/hashicorp/packer-plugin-sdk/packer" +) + +// TestStepCreateVPCServiceInstanceEnablesRetries guards the wiring: transient- +// error tolerance for every VPC call depends on the EnableRetries call in Run. +// The behavioral retry tests (client_test.go) enable retries themselves, so they +// would still pass if that line were removed — this test fails if it is. +func TestStepCreateVPCServiceInstanceEnablesRetries(t *testing.T) { + state := new(multistep.BasicStateBag) + state.Put("ui", packer.TestUi(t)) + state.Put("client", &IBMCloudClient{IBMApiKey: "dummy-key"}) + state.Put("config", Config{}) + + step := &StepCreateVPCServiceInstance{} + if action := step.Run(context.Background(), state); action != multistep.ActionContinue { + t.Fatalf("Run returned %v, want ActionContinue", action) + } + + svc := state.Get("vpcService").(*vpcv1.VpcV1) + // EnableRetries swaps the service's HTTP client for a retryablehttp-backed + // one; assert that transport is in place so dropping EnableRetries is caught. + if got := fmt.Sprintf("%T", svc.Service.Client.Transport); !strings.Contains(got, "retryablehttp") { + t.Errorf("VPC service transport = %s, want a retryablehttp transport (EnableRetries not wired)", got) + } +} diff --git a/builder/ibmcloud/vpc/step_image_export.go b/builder/ibmcloud/vpc/step_image_export.go index 2591c95..29496a1 100644 --- a/builder/ibmcloud/vpc/step_image_export.go +++ b/builder/ibmcloud/vpc/step_image_export.go @@ -59,7 +59,7 @@ func (step *StepImageExport) Run(_ context.Context, state multistep.StateBag) mu } jobId := *imageExportJob.ID ui.Say("Waiting for the Export image to SUCCEED...") - err3 := waitForExportJobToSucceed(config.ImageID, jobId, vpcService, exportTimeout, state) + err3 := waitForExportJobToSucceed(config.ImageID, jobId, vpcService, exportTimeout, 0, state) if err3 != nil { err := fmt.Errorf("[ERROR] Error waiting for the Image export job to succeed: %s", err3) state.Put("error", err) @@ -75,19 +75,27 @@ func (step *StepImageExport) Run(_ context.Context, state multistep.StateBag) mu return multistep.ActionContinue } -func waitForExportJobToSucceed(imageId, exportJobId string, vpcService *vpcv1.VpcV1, timeout time.Duration, state multistep.StateBag) error { +// pollInterval is the cadence between status checks; production passes 0 to use +// defaultPollInterval. It exists as a parameter (unlike pollUntil, which hardcodes +// the constant) so tests can drive the multi-poll loop without a real 10s wait. +func waitForExportJobToSucceed(imageId, exportJobId string, vpcService *vpcv1.VpcV1, timeout time.Duration, pollInterval time.Duration, state multistep.StateBag) error { ui := state.Get("ui").(packer.Ui) done := make(chan struct{}) defer close(done) result := make(chan error, 1) if timeout < 1*time.Minute { - timeout = 5 * time.Minute // Default to 45 minutes for image exports + timeout = 5 * time.Minute // Default when no (or sub-minute) export timeout is configured ui.Say(fmt.Sprintf("Using default %v timeout for image export", timeout)) } else { ui.Say(fmt.Sprintf("Using %v timeout for image export", timeout)) } + interval := pollInterval + if interval <= 0 { + interval = defaultPollInterval + } + options := vpcv1.GetImageExportJobOptions{ ImageID: &imageId, ID: &exportJobId, @@ -105,9 +113,10 @@ func waitForExportJobToSucceed(imageId, exportJobId string, vpcService *vpcv1.Vp log.Printf("Checking export job status ... (attempt: %d)", attempts) expJob, _, err := vpcService.GetImageExportJob(&options) - if err != nil { - ui.Say(fmt.Sprintf("Error fetching image export job: %v", err)) + // Transient 5xx/429/network blips are absorbed by the SDK's + // request retries; an error here is genuine, so stop waiting. + ui.Say(fmt.Sprintf("Error fetching image export job: %s", err)) result <- err return } @@ -130,13 +139,8 @@ func waitForExportJobToSucceed(imageId, exportJobId string, vpcService *vpcv1.Vp return } - time.Sleep(10 * time.Second) - - select { - case <-done: + if sleepOrDone(interval, done) { return - default: - // Keep going } } }() diff --git a/builder/ibmcloud/vpc/step_image_export_test.go b/builder/ibmcloud/vpc/step_image_export_test.go new file mode 100644 index 0000000..9e172d7 --- /dev/null +++ b/builder/ibmcloud/vpc/step_image_export_test.go @@ -0,0 +1,71 @@ +package vpc + +import ( + "net/http" + "net/http/httptest" + "sync/atomic" + "testing" + "time" + + "github.com/hashicorp/packer-plugin-sdk/multistep" + "github.com/hashicorp/packer-plugin-sdk/packer" +) + +func TestWaitForExportJobSucceeds(t *testing.T) { + var calls int32 + // Report "running" twice, then "succeeded" — exercises the poll-again loop. + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + if atomic.AddInt32(&calls, 1) <= 2 { + _, _ = w.Write([]byte(`{"status":"running"}`)) + return + } + _, _ = w.Write([]byte(`{"status":"succeeded"}`)) + })) + defer srv.Close() + + state := new(multistep.BasicStateBag) + state.Put("ui", packer.TestUi(t)) + + if err := waitForExportJobToSucceed("img-1", "job-1", newTestVpcService(t, srv.URL), 30*time.Second, time.Millisecond, state); err != nil { + t.Fatalf("waitForExportJobToSucceed returned error: %s", err) + } + if got := atomic.LoadInt32(&calls); got != 3 { + t.Errorf("expected 3 polls (2 running + 1 succeeded), got %d", got) + } +} + +func TestWaitForExportJobFailsOnFailedStatus(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte(`{"status":"failed"}`)) + })) + defer srv.Close() + + state := new(multistep.BasicStateBag) + state.Put("ui", packer.TestUi(t)) + + if err := waitForExportJobToSucceed("img-1", "job-1", newTestVpcService(t, srv.URL), 30*time.Second, time.Millisecond, state); err == nil { + t.Fatal("expected an error for a failed export job") + } +} + +// A non-transient API error (e.g. 404) surfaces immediately; transient 5xx/429 +// blips are absorbed beneath this function by the SDK's request retries. +func TestWaitForExportJobFailsOnAPIError(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusNotFound) + _, _ = w.Write([]byte(`{}`)) + })) + defer srv.Close() + + state := new(multistep.BasicStateBag) + state.Put("ui", packer.TestUi(t)) + + if err := waitForExportJobToSucceed("img-1", "job-1", newTestVpcService(t, srv.URL), 30*time.Second, time.Millisecond, state); err == nil { + t.Fatal("expected a fatal error for a 404 response") + } +}