Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 42 additions & 68 deletions builder/ibmcloud/vpc/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,24 @@ import (
"github.com/hashicorp/packer-plugin-sdk/packer"
)

// defaultPollInterval is the wait between status polls when a caller does not
// set one explicitly. Transient API errors are handled by the SDK's request
// retries (see vpcRetryMaxAttempts); this interval is purely the cadence at
// which we re-check a resource's status while waiting for it to reach its goal.
const defaultPollInterval = 10 * time.Second

// sleepOrDone waits interval between polls and then reports whether the poll
// goroutine should stop because its parent has already returned (done closed).
func sleepOrDone(interval time.Duration, done <-chan struct{}) (stop bool) {
time.Sleep(interval)
select {
case <-done:
return true
default:
return false
}
}

type IBMCloudClient struct {
// // The http client for communicating
http *http.Client
Expand All @@ -33,6 +51,24 @@ func (client IBMCloudClient) New(IBMApiKey string) *IBMCloudClient {
}

func (client IBMCloudClient) waitForResourceReady(resourceID string, resourceType string, timeout time.Duration, state multistep.StateBag) error {
return client.pollUntil(resourceID, resourceType, "ready", timeout, state, client.isResourceReady)
}

// pollUntil repeatedly invokes check until it reports the resource has reached
// its goal state, the timeout elapses, or check returns an error. Transient API
// errors (5xx/429/network blips) are retried beneath check by the SDK's
// per-request retries (up to vpcRetryMaxAttempts each, see EnableRetries); an
// error surfacing here therefore means those retries were exhausted or the error
// is fatal, so the wait aborts rather than re-checking. goal is used only in
// log/timeout messages (e.g. "ready", "stopped").
func (client IBMCloudClient) pollUntil(
resourceID string,
resourceType string,
goal string,
timeout time.Duration,
state multistep.StateBag,
check func(resourceID string, resourceType string, state multistep.StateBag) (bool, error),
) error {
ui := state.Get("ui").(packer.Ui)
done := make(chan struct{})
defer close(done)
Expand All @@ -49,38 +85,28 @@ func (client IBMCloudClient) waitForResourceReady(resourceID string, resourceTyp
}

log.Printf("Checking resource state... (attempt: %d)", attempts)
isReady, err := client.isResourceReady(resourceID, resourceType, state)

reached, err := check(resourceID, resourceType, state)
if err != nil {
result <- err
return
}

if isReady {
if reached {
result <- nil
return
}

// Wait 10 seconds in between
time.Sleep(10 * time.Second)

// Verify we shouldn't exit
select {
case <-done:
// We finished, so just exit the go routine
if sleepOrDone(defaultPollInterval, done) {
return
default:
// Keep going
}
}
}()

log.Printf("Waiting for up to %d seconds for resource to become ready", timeout/time.Second)
log.Printf("Waiting for up to %d seconds for resource to become %s", timeout/time.Second, goal)
select {
case err := <-result:
return err
case <-time.After(timeout):
err := fmt.Errorf("timeout while waiting to for the resource to become ready")
err := fmt.Errorf("timeout while waiting for the resource to become %s", goal)
return err
}
}
Expand Down Expand Up @@ -144,62 +170,12 @@ func (client IBMCloudClient) isResourceReady(resourceID string, resourceType str
}

func (client IBMCloudClient) waitForResourceDown(resourceID string, resourceType string, timeout time.Duration, state multistep.StateBag) error {
ui := state.Get("ui").(packer.Ui)
done := make(chan struct{})
defer close(done)
result := make(chan error, 1)

go func() {
attempts := 0
for {
attempts += 1
if attempts%6 == 0 {
ui.Say(fmt.Sprintf("Waiting time: %d minutes", attempts/6))
} else {
ui.Say(".")
}

log.Printf("Checking resource state... (attempt: %d)", attempts)
isDown, err := client.isResourceDown(resourceID, resourceType, state)

if err != nil {
result <- err
return
}

if isDown {
result <- nil
return
}

// Wait 10 seconds in between
time.Sleep(10 * time.Second)

// Verify we shouldn't exit
select {
case <-done:
// We finished, so just exit the go routine
return
default:
// Keep going
}
}
}()

log.Printf("Waiting for up to %d seconds for resource to be stopped", timeout/time.Second)
select {
case err := <-result:
return err
case <-time.After(timeout):
err := fmt.Errorf("timeout while waiting to for the resource to be stopped")
return err
}
return client.pollUntil(resourceID, resourceType, "stopped", timeout, state, client.isResourceDown)
}

func (client IBMCloudClient) isResourceDown(resourceID string, resourceType string, state multistep.StateBag) (bool, error) {
var down bool

ui := state.Get("ui").(packer.Ui)
var vpcService *vpcv1.VpcV1
if state.Get("vpcService") != nil {
vpcService = state.Get("vpcService").(*vpcv1.VpcV1)
Expand All @@ -210,8 +186,6 @@ func (client IBMCloudClient) isResourceDown(resourceID string, resourceType stri
instance, _, err := vpcService.GetInstance(options)
if err != nil {
err := fmt.Errorf("[ERROR] Failed retrieving resource information. Error: %s", err)
ui.Error(err.Error())
log.Println(err.Error())
return false, err
}
status := *instance.Status
Expand Down
179 changes: 179 additions & 0 deletions builder/ibmcloud/vpc/client_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
package vpc

import (
"net/http"
"net/http/httptest"
"sync/atomic"
"testing"
"time"

"github.com/IBM/vpc-go-sdk/vpcv1"
"github.com/hashicorp/packer-plugin-sdk/multistep"
"github.com/hashicorp/packer-plugin-sdk/packer"
)

// newTestVpcService builds a vpcv1.VpcV1 pointed at an httptest server with a
// no-op authenticator so the client helpers can be exercised end to end.
func newTestVpcService(t *testing.T, url string) *vpcv1.VpcV1 {
t.Helper()
svc, err := vpcv1.NewVpcV1(&vpcv1.VpcV1Options{
Authenticator: stubAuthenticator{},
URL: url,
})
if err != nil {
t.Fatalf("failed to build test vpc service: %s", err)
}
return svc
}

// TestIsResourceReadyImageStatus pins the status→ready/error mapping. Transient
// HTTP errors are no longer classified here (the SDK's request retries absorb
// them, see TestVPCServiceRetriesTransientErrors); this only covers the
// application-level status handling on a successful response.
func TestIsResourceReadyImageStatus(t *testing.T) {
tests := []struct {
name string
body string
wantReady bool
wantErr bool
}{
{name: "available image is ready", body: `{"id":"img-1","status":"available"}`, wantReady: true},
{name: "pending image is not ready", body: `{"id":"img-1","status":"pending"}`, wantReady: false},
{name: "failed image is an error", body: `{"id":"img-1","status":"failed"}`, wantReady: false, wantErr: true},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusOK)
_, _ = w.Write([]byte(tt.body))
}))
defer srv.Close()

state := new(multistep.BasicStateBag)
state.Put("vpcService", newTestVpcService(t, srv.URL))
client := IBMCloudClient{}

ready, err := client.isResourceReady("img-1", "images", state)
if ready != tt.wantReady {
t.Errorf("ready = %v, want %v", ready, tt.wantReady)
}
if (err != nil) != tt.wantErr {
t.Fatalf("err = %v, wantErr = %v", err, tt.wantErr)
}
})
}
}

// TestVPCServiceRetriesTransientErrors proves the retry configuration we apply
// in StepCreateVPCServiceInstance (vpcRetryMaxAttempts) makes the IBM SDK ride
// out a transient 5xx and ultimately succeed. The interval is overridden to 1ms
// purely to keep the test fast — it caps the exponential backoff.
func TestVPCServiceRetriesTransientErrors(t *testing.T) {
var calls int32
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
if atomic.AddInt32(&calls, 1) <= 2 {
w.WriteHeader(http.StatusBadGateway)
_, _ = w.Write([]byte(`{}`))
return
}
w.WriteHeader(http.StatusOK)
_, _ = w.Write([]byte(`{"id":"img-1","status":"available"}`))
}))
defer srv.Close()

svc := newTestVpcService(t, srv.URL)
svc.EnableRetries(vpcRetryMaxAttempts, time.Millisecond)

if _, _, err := svc.GetImage(svc.NewGetImageOptions("img-1")); err != nil {
t.Fatalf("GetImage failed despite retries: %s", err)
}
if got := atomic.LoadInt32(&calls); got != 3 {
t.Errorf("expected 3 requests (2 transient + 1 success), got %d", got)
}
}

func TestVPCServiceGivesUpAfterRetries(t *testing.T) {
var calls int32
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
atomic.AddInt32(&calls, 1)
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusBadGateway)
_, _ = w.Write([]byte(`{}`))
}))
defer srv.Close()

svc := newTestVpcService(t, srv.URL)
svc.EnableRetries(vpcRetryMaxAttempts, time.Millisecond)

if _, _, err := svc.GetImage(svc.NewGetImageOptions("img-1")); err == nil {
t.Fatal("expected an error after retries are exhausted")
}
// One initial attempt plus vpcRetryMaxAttempts retries.
if want := int32(vpcRetryMaxAttempts + 1); atomic.LoadInt32(&calls) != want {
t.Errorf("expected %d requests, got %d", want, atomic.LoadInt32(&calls))
}
}

// TestWaitForResourceReadyAbortsOnAPIError exercises the production poll loop
// end to end: with no SDK retries on this test client, an API error surfaces
// from the check immediately and pollUntil aborts the wait (in production the
// SDK would have retried transient errors before this point). This locks the
// post-rework semantics — pollUntil no longer rides out errors itself.
func TestWaitForResourceReadyAbortsOnAPIError(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusBadGateway)
_, _ = w.Write([]byte(`{}`))
}))
defer srv.Close()

state := new(multistep.BasicStateBag)
state.Put("ui", packer.TestUi(t))
state.Put("vpcService", newTestVpcService(t, srv.URL))
client := IBMCloudClient{}

if err := client.waitForResourceReady("i-1", "instances", 30*time.Second, state); err == nil {
t.Fatal("expected pollUntil to abort the wait when the API errors")
}
}

func TestWaitForResourceReadyReturnsWhenReady(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusOK)
_, _ = w.Write([]byte(`{"id":"i-1","status":"running"}`))
}))
defer srv.Close()

state := new(multistep.BasicStateBag)
state.Put("ui", packer.TestUi(t))
state.Put("vpcService", newTestVpcService(t, srv.URL))
client := IBMCloudClient{}

if err := client.waitForResourceReady("i-1", "instances", 30*time.Second, state); err != nil {
t.Fatalf("waitForResourceReady returned error when the resource was ready: %s", err)
}
}

func TestVPCServiceDoesNotRetryFatalErrors(t *testing.T) {
var calls int32
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
atomic.AddInt32(&calls, 1)
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusNotFound)
_, _ = w.Write([]byte(`{}`))
}))
defer srv.Close()

svc := newTestVpcService(t, srv.URL)
svc.EnableRetries(vpcRetryMaxAttempts, time.Millisecond)

if _, _, err := svc.GetImage(svc.NewGetImageOptions("img-1")); err == nil {
t.Fatal("expected a fatal 404 error")
}
if got := atomic.LoadInt32(&calls); got != 1 {
t.Errorf("expected exactly one request for a fatal 404, got %d", got)
}
}
15 changes: 15 additions & 0 deletions builder/ibmcloud/vpc/step_create_vpc_service_instance.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,26 @@ import (
"context"
"fmt"
"log"
"time"

"github.com/IBM/go-sdk-core/v5/core"
"github.com/IBM/vpc-go-sdk/vpcv1"
"github.com/hashicorp/packer-plugin-sdk/multistep"
"github.com/hashicorp/packer-plugin-sdk/packer"
)

// vpcRetryMaxAttempts and vpcRetryMaxInterval configure the IBM Cloud SDK's
// built-in request retries (go-sdk-core EnableRetries). The SDK retries 429 and
// 5xx (except 501) responses and network-level failures. It honors a server-sent
// Retry-After header (as given); otherwise it backs off exponentially, with
// vpcRetryMaxInterval capping that exponential wait. This rides out transient API
// blips on every VPC call — both one-shot creates and status polls — so a single
// 502 no longer aborts an otherwise-healthy bake.
const (
vpcRetryMaxAttempts = 5
vpcRetryMaxInterval = 30 * time.Second
)

type StepCreateVPCServiceInstance struct {
}

Expand Down Expand Up @@ -61,6 +74,8 @@ func (step *StepCreateVPCServiceInstance) Run(_ context.Context, state multistep
return multistep.ActionHalt
}

vpcService.EnableRetries(vpcRetryMaxAttempts, vpcRetryMaxInterval)

state.Put("vpcService", vpcService)
ui.Say("VPC service creation successful!")
return multistep.ActionContinue
Expand Down
Loading
Loading